1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "AArch64GlobalISelUtils.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "MCTargetDesc/AArch64MCTargetDesc.h"
24 #include "llvm/ADT/Optional.h"
25 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
27 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineConstantPool.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstr.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineMemOperand.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetOpcodes.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/PatternMatch.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/IntrinsicsAArch64.h"
44 #include "llvm/Pass.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/raw_ostream.h"
47
48 #define DEBUG_TYPE "aarch64-isel"
49
50 using namespace llvm;
51 using namespace MIPatternMatch;
52 using namespace AArch64GISelUtils;
53
54 namespace llvm {
55 class BlockFrequencyInfo;
56 class ProfileSummaryInfo;
57 }
58
59 namespace {
60
61 #define GET_GLOBALISEL_PREDICATE_BITSET
62 #include "AArch64GenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATE_BITSET
64
65 class AArch64InstructionSelector : public InstructionSelector {
66 public:
67 AArch64InstructionSelector(const AArch64TargetMachine &TM,
68 const AArch64Subtarget &STI,
69 const AArch64RegisterBankInfo &RBI);
70
71 bool select(MachineInstr &I) override;
72   static const char *getName() { return DEBUG_TYPE; }
73
74   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
75 CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
76 BlockFrequencyInfo *BFI) override {
77 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
78 MIB.setMF(MF);
79
80 // hasFnAttribute() is expensive to call on every BRCOND selection, so
81 // cache it here for each run of the selector.
82 ProduceNonFlagSettingCondBr =
83 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
84 MFReturnAddr = Register();
85
86 processPHIs(MF);
87 }
88
89 private:
90 /// tblgen-erated 'select' implementation, used as the initial selector for
91 /// the patterns that don't require complex C++.
92 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
93
94 // A lowering phase that runs before any selection attempts.
95 // Returns true if the instruction was modified.
96 bool preISelLower(MachineInstr &I);
97
98 // An early selection function that runs before the selectImpl() call.
99 bool earlySelect(MachineInstr &I);
100
101 // Do some preprocessing of G_PHIs before we begin selection.
102 void processPHIs(MachineFunction &MF);
103
104 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
105
106 /// Eliminate same-sized cross-bank copies into stores before selectImpl().
107 bool contractCrossBankCopyIntoStore(MachineInstr &I,
108 MachineRegisterInfo &MRI);
109
110 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
111
112 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
113 MachineRegisterInfo &MRI) const;
114 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
115 MachineRegisterInfo &MRI) const;
116
117 ///@{
118 /// Helper functions for selectCompareBranch.
119 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
120 MachineIRBuilder &MIB) const;
121 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
122 MachineIRBuilder &MIB) const;
123 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
124 MachineIRBuilder &MIB) const;
125 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
126 MachineBasicBlock *DstMBB,
127 MachineIRBuilder &MIB) const;
128 ///@}
129
130 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
131 MachineRegisterInfo &MRI);
132
133 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
134 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
135
136 // Helper to generate an equivalent of scalar_to_vector into a new register,
137 // returned via 'Dst'.
138 MachineInstr *emitScalarToVector(unsigned EltSize,
139 const TargetRegisterClass *DstRC,
140 Register Scalar,
141 MachineIRBuilder &MIRBuilder) const;
142
143 /// Emit a lane insert into \p DstReg, or a new vector register if None is
144 /// provided.
145 ///
146 /// The lane inserted into is defined by \p LaneIdx. The vector source
147 /// register is given by \p SrcReg. The register containing the element is
148 /// given by \p EltReg.
149 MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
150 Register EltReg, unsigned LaneIdx,
151 const RegisterBank &RB,
152 MachineIRBuilder &MIRBuilder) const;
153
154 /// Emit a sequence of instructions representing a constant \p CV for a
155 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
156 ///
157 /// \returns the last instruction in the sequence on success, and nullptr
158 /// otherwise.
159 MachineInstr *emitConstantVector(Register Dst, Constant *CV,
160 MachineIRBuilder &MIRBuilder,
161 MachineRegisterInfo &MRI);
162
163 bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
164 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
165 MachineRegisterInfo &MRI);
166 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
167 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
168 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
169
170 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
171 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
172 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
173 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
174 bool selectIntrinsicWithSideEffects(MachineInstr &I,
175 MachineRegisterInfo &MRI);
176 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
177 bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
178 bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
179 bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
180 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
181 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
182 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
183 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
184
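  /// Emit a constant pool entry for \p CPVal in \p MF and return its index.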
185 unsigned emitConstantPoolEntry(const Constant *CPVal,
186 MachineFunction &MF) const;
187 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
188 MachineIRBuilder &MIRBuilder) const;
189
190 // Emit a vector concat operation.
191 MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
192 Register Op2,
193 MachineIRBuilder &MIRBuilder) const;
194
195 // Emit an integer compare between LHS and RHS, which checks for Predicate.
196 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
197 MachineOperand &Predicate,
198 MachineIRBuilder &MIRBuilder) const;
199
200 /// Emit a floating point comparison between \p LHS and \p RHS.
201 /// \p Pred, if provided, is the predicate to use.
202 MachineInstr *emitFPCompare(Register LHS, Register RHS,
203 MachineIRBuilder &MIRBuilder,
204 Optional<CmpInst::Predicate> = None) const;
205
206 MachineInstr *emitInstr(unsigned Opcode,
207 std::initializer_list<llvm::DstOp> DstOps,
208 std::initializer_list<llvm::SrcOp> SrcOps,
209 MachineIRBuilder &MIRBuilder,
210 const ComplexRendererFns &RenderFns = None) const;
211 /// Helper function to emit an add or sub instruction.
212 ///
213 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
214 /// listed below, in a specific order.
215 ///
216 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
217 ///
218 /// \code
219 /// const std::array<std::array<unsigned, 2>, 4> Table {
220 /// {{AArch64::ADDXri, AArch64::ADDWri},
221 /// {AArch64::ADDXrs, AArch64::ADDWrs},
222 /// {AArch64::ADDXrr, AArch64::ADDWrr},
223 /// {AArch64::SUBXri, AArch64::SUBWri},
224 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
225 /// \endcode
226 ///
227 /// Each row in the table corresponds to a different addressing mode. Each
228 /// column corresponds to a different register size.
229 ///
230 /// \attention Rows must be structured as follows:
231 /// - Row 0: The ri opcode variants
232 /// - Row 1: The rs opcode variants
233 /// - Row 2: The rr opcode variants
234 /// - Row 3: The ri opcode variants for negative immediates
235 /// - Row 4: The rx opcode variants
236 ///
237 /// \attention Columns must be structured as follows:
238 /// - Column 0: The 64-bit opcode variants
239 /// - Column 1: The 32-bit opcode variants
240 ///
241 /// \p Dst is the destination register of the binop to emit.
242 /// \p LHS is the left-hand operand of the binop to emit.
243 /// \p RHS is the right-hand operand of the binop to emit.
244 MachineInstr *emitAddSub(
245 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
246 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
247 MachineIRBuilder &MIRBuilder) const;
248 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
249 MachineOperand &RHS,
250 MachineIRBuilder &MIRBuilder) const;
251 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
252 MachineIRBuilder &MIRBuilder) const;
253 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
254 MachineIRBuilder &MIRBuilder) const;
255 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
256 MachineIRBuilder &MIRBuilder) const;
257 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
258 MachineIRBuilder &MIRBuilder) const;
259 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
260 AArch64CC::CondCode CC,
261 MachineIRBuilder &MIRBuilder) const;
262 MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
263 const RegisterBank &DstRB, LLT ScalarTy,
264 Register VecReg, unsigned LaneIdx,
265 MachineIRBuilder &MIRBuilder) const;
266
267 /// Emit a CSet for an integer compare.
268 ///
269 /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
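  /// (On AArch64, a CSet is materialized as a CSINC of WZR under the inverted
  /// condition.)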
270 MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
271 MachineIRBuilder &MIRBuilder,
272 Register SrcReg = AArch64::WZR) const;
273 /// Emit a CSet for a FP compare.
274 ///
275 /// \p Dst is expected to be a 32-bit scalar register.
276 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
277 MachineIRBuilder &MIRBuilder) const;
278
279 /// Emit the overflow op for \p Opcode.
280 ///
281 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
282 /// G_USUBO, etc.
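  /// E.g., a G_UADDO is emitted as an ADDS, whose unsigned-overflow condition
  /// is AArch64CC::HS (carry set).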
283 std::pair<MachineInstr *, AArch64CC::CondCode>
284 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
285 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
286
287 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
288 /// \p IsNegative is true if the test should be "not zero".
289 /// This will also optimize the test bit instruction when possible.
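  /// E.g., testing bit 3 with \p IsNegative = true emits roughly
  /// TBNZW \p TestReg, #3, \p DstMBB.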
290 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
291 MachineBasicBlock *DstMBB,
292 MachineIRBuilder &MIB) const;
293
294 /// Emit a CB(N)Z instruction which branches to \p DestMBB.
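  /// With \p IsNegative = true this emits a CBNZ instead.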
295 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
296 MachineBasicBlock *DestMBB,
297 MachineIRBuilder &MIB) const;
298
299 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
300 // We use these manually instead of using the importer since it doesn't
301 // support SDNodeXForm.
302 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
303 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
304 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
305 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
306
307 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
308 ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
309 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
310
311 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
312 unsigned Size) const;
313
314   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
315 return selectAddrModeUnscaled(Root, 1);
316 }
317   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
318 return selectAddrModeUnscaled(Root, 2);
319 }
320   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
321 return selectAddrModeUnscaled(Root, 4);
322 }
323   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
324 return selectAddrModeUnscaled(Root, 8);
325 }
326   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
327 return selectAddrModeUnscaled(Root, 16);
328 }
329
330 /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
331 /// from complex pattern matchers like selectAddrModeIndexed().
332 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
333 MachineRegisterInfo &MRI) const;
334
335 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
336 unsigned Size) const;
337 template <int Width>
338   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
339 return selectAddrModeIndexed(Root, Width / 8);
340 }
341
342 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
343 const MachineRegisterInfo &MRI) const;
344 ComplexRendererFns
345 selectAddrModeShiftedExtendXReg(MachineOperand &Root,
346 unsigned SizeInBytes) const;
347
348 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
349 /// or not a shift + extend should be folded into an addressing mode. Returns
350 /// None when this is not profitable or possible.
351 ComplexRendererFns
352 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
353 MachineOperand &Offset, unsigned SizeInBytes,
354 bool WantsExt) const;
355 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
356 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
357 unsigned SizeInBytes) const;
358 template <int Width>
359   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
360 return selectAddrModeXRO(Root, Width / 8);
361 }
362
363 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
364 unsigned SizeInBytes) const;
365 template <int Width>
366   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
367 return selectAddrModeWRO(Root, Width / 8);
368 }
369
370 ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
371
372   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
373 return selectShiftedRegister(Root);
374 }
375
376   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
377 // TODO: selectShiftedRegister should allow for rotates on logical shifts.
378 // For now, make them the same. The only difference between the two is that
379 // logical shifts are allowed to fold in rotates. Otherwise, these are
380 // functionally the same.
381 return selectShiftedRegister(Root);
382 }
383
384 /// Given an extend instruction, determine the correct shift-extend type for
385 /// that instruction.
386 ///
387 /// If the instruction is going to be used in a load or store, pass
388 /// \p IsLoadStore = true.
389 AArch64_AM::ShiftExtendType
390 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
391 bool IsLoadStore = false) const;
392
393 /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
394 ///
395 /// \returns Either \p Reg if no change was necessary, or the new register
396 /// created by moving \p Reg.
397 ///
398 /// Note: This uses emitCopy right now.
399 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
400 MachineIRBuilder &MIB) const;
401
402 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
403
404 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
405 int OpIdx = -1) const;
406 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
407 int OpIdx = -1) const;
408 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
409 int OpIdx = -1) const;
410 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
411 int OpIdx = -1) const;
412 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
413 int OpIdx = -1) const;
414 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
415 int OpIdx = -1) const;
416
417 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
418 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
419
420 // Optimization methods.
421 bool tryOptSelect(MachineInstr &MI);
422 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
423 MachineOperand &Predicate,
424 MachineIRBuilder &MIRBuilder) const;
425
426 /// Return true if \p MI is a load or store of \p NumBytes bytes.
427 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
428
429 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
430 /// register zeroed out. In other words, the result of MI has been explicitly
431 /// zero extended.
432 bool isDef32(const MachineInstr &MI) const;
433
434 const AArch64TargetMachine &TM;
435 const AArch64Subtarget &STI;
436 const AArch64InstrInfo &TII;
437 const AArch64RegisterInfo &TRI;
438 const AArch64RegisterBankInfo &RBI;
439
440 bool ProduceNonFlagSettingCondBr = false;
441
442 // Some cached values used during selection.
443 // We use LR as a live-in register, and we keep track of it here as it can be
444 // clobbered by calls.
445 Register MFReturnAddr;
446
447 MachineIRBuilder MIB;
448
449 #define GET_GLOBALISEL_PREDICATES_DECL
450 #include "AArch64GenGlobalISel.inc"
451 #undef GET_GLOBALISEL_PREDICATES_DECL
452
453 // We declare the temporaries used by selectImpl() in the class to minimize the
454 // cost of constructing placeholder values.
455 #define GET_GLOBALISEL_TEMPORARIES_DECL
456 #include "AArch64GenGlobalISel.inc"
457 #undef GET_GLOBALISEL_TEMPORARIES_DECL
458 };
459
460 } // end anonymous namespace
461
462 #define GET_GLOBALISEL_IMPL
463 #include "AArch64GenGlobalISel.inc"
464 #undef GET_GLOBALISEL_IMPL
465
466 AArch64InstructionSelector::AArch64InstructionSelector(
467 const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
468 const AArch64RegisterBankInfo &RBI)
469 : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
470 TRI(*STI.getRegisterInfo()), RBI(RBI),
471 #define GET_GLOBALISEL_PREDICATES_INIT
472 #include "AArch64GenGlobalISel.inc"
473 #undef GET_GLOBALISEL_PREDICATES_INIT
474 #define GET_GLOBALISEL_TEMPORARIES_INIT
475 #include "AArch64GenGlobalISel.inc"
476 #undef GET_GLOBALISEL_TEMPORARIES_INIT
477 {
478 }
479
480 // FIXME: This should be target-independent, inferred from the types declared
481 // for each class in the bank.
482 static const TargetRegisterClass *
483 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
484 const RegisterBankInfo &RBI,
485 bool GetAllRegSet = false) {
486 if (RB.getID() == AArch64::GPRRegBankID) {
487 if (Ty.getSizeInBits() <= 32)
488 return GetAllRegSet ? &AArch64::GPR32allRegClass
489 : &AArch64::GPR32RegClass;
490 if (Ty.getSizeInBits() == 64)
491 return GetAllRegSet ? &AArch64::GPR64allRegClass
492 : &AArch64::GPR64RegClass;
493 if (Ty.getSizeInBits() == 128)
494 return &AArch64::XSeqPairsClassRegClass;
495 return nullptr;
496 }
497
498 if (RB.getID() == AArch64::FPRRegBankID) {
499 if (Ty.getSizeInBits() <= 16)
500 return &AArch64::FPR16RegClass;
501 if (Ty.getSizeInBits() == 32)
502 return &AArch64::FPR32RegClass;
503 if (Ty.getSizeInBits() == 64)
504 return &AArch64::FPR64RegClass;
505 if (Ty.getSizeInBits() == 128)
506 return &AArch64::FPR128RegClass;
507 return nullptr;
508 }
509
510 return nullptr;
511 }
512
513 /// Given a register bank, and size in bits, return the smallest register class
514 /// that can represent that combination.
515 static const TargetRegisterClass *
516 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
517 bool GetAllRegSet = false) {
518 unsigned RegBankID = RB.getID();
519
520 if (RegBankID == AArch64::GPRRegBankID) {
521 if (SizeInBits <= 32)
522 return GetAllRegSet ? &AArch64::GPR32allRegClass
523 : &AArch64::GPR32RegClass;
524 if (SizeInBits == 64)
525 return GetAllRegSet ? &AArch64::GPR64allRegClass
526 : &AArch64::GPR64RegClass;
527 if (SizeInBits == 128)
528 return &AArch64::XSeqPairsClassRegClass;
529 }
530
531 if (RegBankID == AArch64::FPRRegBankID) {
532 switch (SizeInBits) {
533 default:
534 return nullptr;
535 case 8:
536 return &AArch64::FPR8RegClass;
537 case 16:
538 return &AArch64::FPR16RegClass;
539 case 32:
540 return &AArch64::FPR32RegClass;
541 case 64:
542 return &AArch64::FPR64RegClass;
543 case 128:
544 return &AArch64::FPR128RegClass;
545 }
546 }
547
548 return nullptr;
549 }
550
551 /// Returns the correct subregister to use for a given register class.
552 static bool getSubRegForClass(const TargetRegisterClass *RC,
553 const TargetRegisterInfo &TRI, unsigned &SubReg) {
554 switch (TRI.getRegSizeInBits(*RC)) {
555 case 8:
556 SubReg = AArch64::bsub;
557 break;
558 case 16:
559 SubReg = AArch64::hsub;
560 break;
561 case 32:
562 if (RC != &AArch64::FPR32RegClass)
563 SubReg = AArch64::sub_32;
564 else
565 SubReg = AArch64::ssub;
566 break;
567 case 64:
568 SubReg = AArch64::dsub;
569 break;
570 default:
571 LLVM_DEBUG(
572 dbgs() << "Couldn't find appropriate subregister for register class.");
573 return false;
574 }
575
576 return true;
577 }
578
579 /// Returns the minimum size the given register bank can hold.
580 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
581 switch (RB.getID()) {
582 case AArch64::GPRRegBankID:
583 return 32;
584 case AArch64::FPRRegBankID:
585 return 8;
586 default:
587 llvm_unreachable("Tried to get minimum size for unknown register bank.");
588 }
589 }
590
591 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
592 /// Helper function for functions like createDTuple and createQTuple.
593 ///
594 /// \p RegClassIDs - The list of register class IDs available for some tuple of
595 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
596 /// expected to contain between 2 and 4 tuple classes.
597 ///
598 /// \p SubRegs - The list of subregister classes associated with each register
599 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
600 /// subregister class. The index of each subregister class is expected to
601 /// correspond with the index of each register class.
602 ///
603 /// \returns Either the destination register of REG_SEQUENCE instruction that
604 /// was created, or the 0th element of \p Regs if \p Regs contains a single
605 /// element.
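/// E.g., for two D registers coming from createDTuple, this builds roughly:
///
/// \code
///   %tuple:dd = REG_SEQUENCE %regs[0], dsub0, %regs[1], dsub1
/// \endcode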
606 static Register createTuple(ArrayRef<Register> Regs,
607 const unsigned RegClassIDs[],
608 const unsigned SubRegs[], MachineIRBuilder &MIB) {
609 unsigned NumRegs = Regs.size();
610 if (NumRegs == 1)
611 return Regs[0];
612 assert(NumRegs >= 2 && NumRegs <= 4 &&
613 "Only support between two and 4 registers in a tuple!");
614 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
615 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
616 auto RegSequence =
617 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
618 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
619 RegSequence.addUse(Regs[I]);
620 RegSequence.addImm(SubRegs[I]);
621 }
622 return RegSequence.getReg(0);
623 }
624
625 /// Create a tuple of D-registers using the registers in \p Regs.
626 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
627 static const unsigned RegClassIDs[] = {
628 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
629 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
630 AArch64::dsub2, AArch64::dsub3};
631 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
632 }
633
634 /// Create a tuple of Q-registers using the registers in \p Regs.
635 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
636 static const unsigned RegClassIDs[] = {
637 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
638 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
639 AArch64::qsub2, AArch64::qsub3};
640 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
641 }
642
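/// Extract a constant value from \p Root, which may be an immediate, a
/// ConstantInt, or a virtual register defined by a constant (looking through
/// copies). Returns None if no constant can be found.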
643 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
644 auto &MI = *Root.getParent();
645 auto &MBB = *MI.getParent();
646 auto &MF = *MBB.getParent();
647 auto &MRI = MF.getRegInfo();
648 uint64_t Immed;
649 if (Root.isImm())
650 Immed = Root.getImm();
651 else if (Root.isCImm())
652 Immed = Root.getCImm()->getZExtValue();
653 else if (Root.isReg()) {
654 auto ValAndVReg =
655 getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
656 if (!ValAndVReg)
657 return None;
658 Immed = ValAndVReg->Value.getSExtValue();
659 } else
660 return None;
661 return Immed;
662 }
663
664 /// Check whether \p I is a currently unsupported binary operation:
665 /// - it has an unsized type
666 /// - an operand is not a vreg
667 /// - its operands are not all in the same bank
668 /// These are checks that should someday live in the verifier, but right now,
669 /// these are mostly limitations of the aarch64 selector.
670 static bool unsupportedBinOp(const MachineInstr &I,
671 const AArch64RegisterBankInfo &RBI,
672 const MachineRegisterInfo &MRI,
673 const AArch64RegisterInfo &TRI) {
674 LLT Ty = MRI.getType(I.getOperand(0).getReg());
675 if (!Ty.isValid()) {
676 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
677 return true;
678 }
679
680 const RegisterBank *PrevOpBank = nullptr;
681 for (auto &MO : I.operands()) {
682 // FIXME: Support non-register operands.
683 if (!MO.isReg()) {
684 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
685 return true;
686 }
687
688 // FIXME: Can generic operations have physical registers operands? If
689 // so, this will need to be taught about that, and we'll need to get the
690 // bank out of the minimal class for the register.
691 // Either way, this needs to be documented (and possibly verified).
692 if (!Register::isVirtualRegister(MO.getReg())) {
693 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
694 return true;
695 }
696
697 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
698 if (!OpBank) {
699 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
700 return true;
701 }
702
703 if (PrevOpBank && OpBank != PrevOpBank) {
704 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
705 return true;
706 }
707 PrevOpBank = OpBank;
708 }
709 return false;
710 }
711
712 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
713 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
714 /// and of size \p OpSize.
715 /// \returns \p GenericOpc if the combination is unsupported.
716 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
717 unsigned OpSize) {
718 switch (RegBankID) {
719 case AArch64::GPRRegBankID:
720 if (OpSize == 32) {
721 switch (GenericOpc) {
722 case TargetOpcode::G_SHL:
723 return AArch64::LSLVWr;
724 case TargetOpcode::G_LSHR:
725 return AArch64::LSRVWr;
726 case TargetOpcode::G_ASHR:
727 return AArch64::ASRVWr;
728 default:
729 return GenericOpc;
730 }
731 } else if (OpSize == 64) {
732 switch (GenericOpc) {
733 case TargetOpcode::G_PTR_ADD:
734 return AArch64::ADDXrr;
735 case TargetOpcode::G_SHL:
736 return AArch64::LSLVXr;
737 case TargetOpcode::G_LSHR:
738 return AArch64::LSRVXr;
739 case TargetOpcode::G_ASHR:
740 return AArch64::ASRVXr;
741 default:
742 return GenericOpc;
743 }
744 }
745 break;
746 case AArch64::FPRRegBankID:
747 switch (OpSize) {
748 case 32:
749 switch (GenericOpc) {
750 case TargetOpcode::G_FADD:
751 return AArch64::FADDSrr;
752 case TargetOpcode::G_FSUB:
753 return AArch64::FSUBSrr;
754 case TargetOpcode::G_FMUL:
755 return AArch64::FMULSrr;
756 case TargetOpcode::G_FDIV:
757 return AArch64::FDIVSrr;
758 default:
759 return GenericOpc;
760 }
761 case 64:
762 switch (GenericOpc) {
763 case TargetOpcode::G_FADD:
764 return AArch64::FADDDrr;
765 case TargetOpcode::G_FSUB:
766 return AArch64::FSUBDrr;
767 case TargetOpcode::G_FMUL:
768 return AArch64::FMULDrr;
769 case TargetOpcode::G_FDIV:
770 return AArch64::FDIVDrr;
771 case TargetOpcode::G_OR:
772 return AArch64::ORRv8i8;
773 default:
774 return GenericOpc;
775 }
776 }
777 break;
778 }
779 return GenericOpc;
780 }
781
782 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
783 /// appropriate for the (value) register bank \p RegBankID and of memory access
784 /// size \p OpSize. This returns the variant with the base+unsigned-immediate
785 /// addressing mode (e.g., LDRXui).
786 /// \returns \p GenericOpc if the combination is unsupported.
787 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
788 unsigned OpSize) {
789 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
790 switch (RegBankID) {
791 case AArch64::GPRRegBankID:
792 switch (OpSize) {
793 case 8:
794 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
795 case 16:
796 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
797 case 32:
798 return isStore ? AArch64::STRWui : AArch64::LDRWui;
799 case 64:
800 return isStore ? AArch64::STRXui : AArch64::LDRXui;
801 }
802 break;
803 case AArch64::FPRRegBankID:
804 switch (OpSize) {
805 case 8:
806 return isStore ? AArch64::STRBui : AArch64::LDRBui;
807 case 16:
808 return isStore ? AArch64::STRHui : AArch64::LDRHui;
809 case 32:
810 return isStore ? AArch64::STRSui : AArch64::LDRSui;
811 case 64:
812 return isStore ? AArch64::STRDui : AArch64::LDRDui;
813 }
814 break;
815 }
816 return GenericOpc;
817 }
818
819 #ifndef NDEBUG
820 /// Helper function that verifies that we have a valid copy at the end of
821 /// selectCopy. Verifies that the source and dest have the expected sizes and
822 /// then returns true.
823 static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
824 const MachineRegisterInfo &MRI,
825 const TargetRegisterInfo &TRI,
826 const RegisterBankInfo &RBI) {
827 const Register DstReg = I.getOperand(0).getReg();
828 const Register SrcReg = I.getOperand(1).getReg();
829 const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
830 const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
831
832 // Make sure the size of the source and dest line up.
833 assert(
834 (DstSize == SrcSize ||
835 // Copies are a mean to setup initial types, the number of
836 // bits may not exactly match.
837 (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
838 // Copies are a mean to copy bits around, as long as we are
839 // on the same register class, that's fine. Otherwise, that
840 // means we need some SUBREG_TO_REG or AND & co.
841 (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
842 "Copy with different width?!");
843
844 // Check the size of the destination.
845 assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
846 "GPRs cannot get more than 64-bit width values");
847
848 return true;
849 }
850 #endif
851
852 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
853 /// to \p *To.
854 ///
855 /// E.g "To = COPY SrcReg:SubReg"
856 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
857 const RegisterBankInfo &RBI, Register SrcReg,
858 const TargetRegisterClass *To, unsigned SubReg) {
859 assert(SrcReg.isValid() && "Expected a valid source register?");
860 assert(To && "Destination register class cannot be null");
861 assert(SubReg && "Expected a valid subregister");
862
863 MachineIRBuilder MIB(I);
864 auto SubRegCopy =
865 MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
866 MachineOperand &RegOp = I.getOperand(1);
867 RegOp.setReg(SubRegCopy.getReg(0));
868
869 // It's possible that the destination register won't be constrained. Make
870 // sure that happens.
871 if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
872 RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
873
874 return true;
875 }
876
877 /// Helper function to get the source and destination register classes for a
878 /// copy. Returns a std::pair containing the source register class for the
879 /// copy, and the destination register class for the copy. If a register class
880 /// cannot be determined, then it will be nullptr.
881 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
882 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
883 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
884 const RegisterBankInfo &RBI) {
885 Register DstReg = I.getOperand(0).getReg();
886 Register SrcReg = I.getOperand(1).getReg();
887 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
888 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
889 unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
890 unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
891
892 // Special casing for cross-bank copies of s1s. We can technically represent
893 // a 1-bit value with any size of register. The minimum size for a GPR is 32
894 // bits. So, we need to put the FPR on 32 bits as well.
895 //
896 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
897 // then we can pull it into the helpers that get the appropriate class for a
898 // register bank. Or make a new helper that carries along some constraint
899 // information.
900 if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
901 SrcSize = DstSize = 32;
902
903 return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
904 getMinClassForRegBank(DstRegBank, DstSize, true)};
905 }
906
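/// Select a COPY (or a G_ZEXT being lowered as a copy), inserting subregister
/// copies or a SUBREG_TO_REG when the source and destination sizes differ.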
907 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
908 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
909 const RegisterBankInfo &RBI) {
910 Register DstReg = I.getOperand(0).getReg();
911 Register SrcReg = I.getOperand(1).getReg();
912 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
913 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
914
915 // Find the correct register classes for the source and destination registers.
916 const TargetRegisterClass *SrcRC;
917 const TargetRegisterClass *DstRC;
918 std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
919
920 if (!DstRC) {
921 LLVM_DEBUG(dbgs() << "Unexpected dest size "
922 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
923 return false;
924 }
925
926 // A couple helpers below, for making sure that the copy we produce is valid.
927
928 // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
929 // to verify that the src and dst are the same size, since that's handled by
930 // the SUBREG_TO_REG.
931 bool KnownValid = false;
932
933 // Returns true, or asserts if something we don't expect happens. Instead of
934 // returning true, we return isValidCopy() to ensure that we verify the
935 // result.
936 auto CheckCopy = [&]() {
937 // If we have a bitcast or something, we can't have physical registers.
938 assert((I.isCopy() ||
939 (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
940 !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
941 "No phys reg on generic operator!");
942 bool ValidCopy = true;
943 #ifndef NDEBUG
944 ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
945 assert(ValidCopy && "Invalid copy.");
946 #endif
947 (void)KnownValid;
948 return ValidCopy;
949 };
950
951 // Is this a copy? If so, then we may need to insert a subregister copy.
952 if (I.isCopy()) {
953 // Yes. Check if there's anything to fix up.
954 if (!SrcRC) {
955 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
956 return false;
957 }
958
959 unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
960 unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
961 unsigned SubReg;
962
963 // If the source bank doesn't support a subregister copy small enough,
964 // then we first need to copy to the destination bank.
965 if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
966 const TargetRegisterClass *DstTempRC =
967 getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
968 getSubRegForClass(DstRC, TRI, SubReg);
969
970 MachineIRBuilder MIB(I);
971 auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
972 copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
973 } else if (SrcSize > DstSize) {
974 // If the source register is bigger than the destination we need to
975 // perform a subregister copy.
976 const TargetRegisterClass *SubRegRC =
977 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
978 getSubRegForClass(SubRegRC, TRI, SubReg);
979 copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
980 } else if (DstSize > SrcSize) {
981 // If the destination register is bigger than the source we need to do
982 // a promotion using SUBREG_TO_REG.
983 const TargetRegisterClass *PromotionRC =
984 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
985 getSubRegForClass(SrcRC, TRI, SubReg);
986
987 Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
988 BuildMI(*I.getParent(), I, I.getDebugLoc(),
989 TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
990 .addImm(0)
991 .addUse(SrcReg)
992 .addImm(SubReg);
993 MachineOperand &RegOp = I.getOperand(1);
994 RegOp.setReg(PromoteReg);
995
996 // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
997 KnownValid = true;
998 }
999
1000 // If the destination is a physical register, then there's nothing to
1001 // change, so we're done.
1002 if (Register::isPhysicalRegister(DstReg))
1003 return CheckCopy();
1004 }
1005
1006 // No need to constrain SrcReg. It will get constrained when we hit another
1007 // of its uses or defs. Copies do not have constraints.
1008 if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1009 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1010 << " operand\n");
1011 return false;
1012 }
1013
1014 // If this is a GPR ZEXT that we want to just reduce down into a copy.
1015 // The sizes will be mismatched with the source < 32b but that's ok.
1016 if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1017 I.setDesc(TII.get(AArch64::COPY));
1018 assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1019 return selectCopy(I, TII, MRI, TRI, RBI);
1020 }
1021
1022 I.setDesc(TII.get(AArch64::COPY));
1023 return CheckCopy();
1024 }
1025
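/// Select the AArch64 opcode for a scalar FP/integer conversion (G_SITOFP,
/// G_UITOFP, G_FPTOSI, G_FPTOUI) given the destination and source types.
/// \returns \p GenericOpc if the combination is unsupported.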
1026 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1027 if (!DstTy.isScalar() || !SrcTy.isScalar())
1028 return GenericOpc;
1029
1030 const unsigned DstSize = DstTy.getSizeInBits();
1031 const unsigned SrcSize = SrcTy.getSizeInBits();
1032
1033 switch (DstSize) {
1034 case 32:
1035 switch (SrcSize) {
1036 case 32:
1037 switch (GenericOpc) {
1038 case TargetOpcode::G_SITOFP:
1039 return AArch64::SCVTFUWSri;
1040 case TargetOpcode::G_UITOFP:
1041 return AArch64::UCVTFUWSri;
1042 case TargetOpcode::G_FPTOSI:
1043 return AArch64::FCVTZSUWSr;
1044 case TargetOpcode::G_FPTOUI:
1045 return AArch64::FCVTZUUWSr;
1046 default:
1047 return GenericOpc;
1048 }
1049 case 64:
1050 switch (GenericOpc) {
1051 case TargetOpcode::G_SITOFP:
1052 return AArch64::SCVTFUXSri;
1053 case TargetOpcode::G_UITOFP:
1054 return AArch64::UCVTFUXSri;
1055 case TargetOpcode::G_FPTOSI:
1056 return AArch64::FCVTZSUWDr;
1057 case TargetOpcode::G_FPTOUI:
1058 return AArch64::FCVTZUUWDr;
1059 default:
1060 return GenericOpc;
1061 }
1062 default:
1063 return GenericOpc;
1064 }
1065 case 64:
1066 switch (SrcSize) {
1067 case 32:
1068 switch (GenericOpc) {
1069 case TargetOpcode::G_SITOFP:
1070 return AArch64::SCVTFUWDri;
1071 case TargetOpcode::G_UITOFP:
1072 return AArch64::UCVTFUWDri;
1073 case TargetOpcode::G_FPTOSI:
1074 return AArch64::FCVTZSUXSr;
1075 case TargetOpcode::G_FPTOUI:
1076 return AArch64::FCVTZUUXSr;
1077 default:
1078 return GenericOpc;
1079 }
1080 case 64:
1081 switch (GenericOpc) {
1082 case TargetOpcode::G_SITOFP:
1083 return AArch64::SCVTFUXDri;
1084 case TargetOpcode::G_UITOFP:
1085 return AArch64::UCVTFUXDri;
1086 case TargetOpcode::G_FPTOSI:
1087 return AArch64::FCVTZSUXDr;
1088 case TargetOpcode::G_FPTOUI:
1089 return AArch64::FCVTZUUXDr;
1090 default:
1091 return GenericOpc;
1092 }
1093 default:
1094 return GenericOpc;
1095 }
1096 default:
1097 return GenericOpc;
1098 }
1099 return GenericOpc;
1100 }
1101
1102 MachineInstr *
1103 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1104 Register False, AArch64CC::CondCode CC,
1105 MachineIRBuilder &MIB) const {
1106 MachineRegisterInfo &MRI = *MIB.getMRI();
1107 assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1108 RBI.getRegBank(True, MRI, TRI)->getID() &&
1109 "Expected both select operands to have the same regbank?");
1110 LLT Ty = MRI.getType(True);
1111 if (Ty.isVector())
1112 return nullptr;
1113 const unsigned Size = Ty.getSizeInBits();
1114 assert((Size == 32 || Size == 64) &&
1115 "Expected 32 bit or 64 bit select only?");
1116 const bool Is32Bit = Size == 32;
1117 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1118 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1119 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1120 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1121 return &*FCSel;
1122 }
1123
1124 // By default, we'll try and emit a CSEL.
1125 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1126 bool Optimized = false;
1127 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1128 &Optimized](Register &Reg, Register &OtherReg,
1129 bool Invert) {
1130 if (Optimized)
1131 return false;
1132
1133 // Attempt to fold:
1134 //
1135 // %sub = G_SUB 0, %x
1136 // %select = G_SELECT cc, %reg, %sub
1137 //
1138 // Into:
1139 // %select = CSNEG %reg, %x, cc
1140 Register MatchReg;
1141 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1142 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1143 Reg = MatchReg;
1144 if (Invert) {
1145 CC = AArch64CC::getInvertedCondCode(CC);
1146 std::swap(Reg, OtherReg);
1147 }
1148 return true;
1149 }
1150
1151 // Attempt to fold:
1152 //
1153 // %xor = G_XOR %x, -1
1154 // %select = G_SELECT cc, %reg, %xor
1155 //
1156 // Into:
1157 // %select = CSINV %reg, %x, cc
1158 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1159 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1160 Reg = MatchReg;
1161 if (Invert) {
1162 CC = AArch64CC::getInvertedCondCode(CC);
1163 std::swap(Reg, OtherReg);
1164 }
1165 return true;
1166 }
1167
1168 // Attempt to fold:
1169 //
1170 // %add = G_ADD %x, 1
1171 // %select = G_SELECT cc, %reg, %add
1172 //
1173 // Into:
1174 // %select = CSINC %reg, %x, cc
1175 if (mi_match(Reg, MRI,
1176 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1177 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1178 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1179 Reg = MatchReg;
1180 if (Invert) {
1181 CC = AArch64CC::getInvertedCondCode(CC);
1182 std::swap(Reg, OtherReg);
1183 }
1184 return true;
1185 }
1186
1187 return false;
1188 };
1189
1190 // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1191 // true/false values are constants.
1192 // FIXME: All of these patterns already exist in tablegen. We should be
1193 // able to import these.
1194 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1195 &Optimized]() {
1196 if (Optimized)
1197 return false;
1198 auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
1199 auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
1200 if (!TrueCst && !FalseCst)
1201 return false;
1202
1203 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1204 if (TrueCst && FalseCst) {
1205 int64_t T = TrueCst->Value.getSExtValue();
1206 int64_t F = FalseCst->Value.getSExtValue();
1207
1208 if (T == 0 && F == 1) {
1209 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1210 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1211 True = ZReg;
1212 False = ZReg;
1213 return true;
1214 }
1215
1216 if (T == 0 && F == -1) {
1217 // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1218 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1219 True = ZReg;
1220 False = ZReg;
1221 return true;
1222 }
1223 }
1224
1225 if (TrueCst) {
1226 int64_t T = TrueCst->Value.getSExtValue();
1227 if (T == 1) {
1228 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1229 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1230 True = False;
1231 False = ZReg;
1232 CC = AArch64CC::getInvertedCondCode(CC);
1233 return true;
1234 }
1235
1236 if (T == -1) {
1237 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1238 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1239 True = False;
1240 False = ZReg;
1241 CC = AArch64CC::getInvertedCondCode(CC);
1242 return true;
1243 }
1244 }
1245
1246 if (FalseCst) {
1247 int64_t F = FalseCst->Value.getSExtValue();
1248 if (F == 1) {
1249 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1250 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1251 False = ZReg;
1252 return true;
1253 }
1254
1255 if (F == -1) {
1256 // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1257 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1258 False = ZReg;
1259 return true;
1260 }
1261 }
1262 return false;
1263 };
1264
1265 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1266 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1267 Optimized |= TryOptSelectCst();
1268 auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1269 constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1270 return &*SelectInst;
1271 }
1272
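/// Convert an IR integer comparison predicate to the equivalent AArch64
/// condition code.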
1273 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1274 switch (P) {
1275 default:
1276 llvm_unreachable("Unknown condition code!");
1277 case CmpInst::ICMP_NE:
1278 return AArch64CC::NE;
1279 case CmpInst::ICMP_EQ:
1280 return AArch64CC::EQ;
1281 case CmpInst::ICMP_SGT:
1282 return AArch64CC::GT;
1283 case CmpInst::ICMP_SGE:
1284 return AArch64CC::GE;
1285 case CmpInst::ICMP_SLT:
1286 return AArch64CC::LT;
1287 case CmpInst::ICMP_SLE:
1288 return AArch64CC::LE;
1289 case CmpInst::ICMP_UGT:
1290 return AArch64CC::HI;
1291 case CmpInst::ICMP_UGE:
1292 return AArch64CC::HS;
1293 case CmpInst::ICMP_ULT:
1294 return AArch64CC::LO;
1295 case CmpInst::ICMP_ULE:
1296 return AArch64CC::LS;
1297 }
1298 }
1299
1300 /// Return a register which can be used as a bit to test in a TB(N)Z.
1301 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1302 MachineRegisterInfo &MRI) {
1303 assert(Reg.isValid() && "Expected valid register!");
1304 bool HasZext = false;
1305 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1306 unsigned Opc = MI->getOpcode();
1307
1308 if (!MI->getOperand(0).isReg() ||
1309 !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1310 break;
1311
1312 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1313 //
1314 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1315 // on the truncated x is the same as the bit number on x.
1316 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1317 Opc == TargetOpcode::G_TRUNC) {
1318 if (Opc == TargetOpcode::G_ZEXT)
1319 HasZext = true;
1320
1321 Register NextReg = MI->getOperand(1).getReg();
1322 // Did we find something worth folding?
1323 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1324 break;
1325
1326 // NextReg is worth folding. Keep looking.
1327 Reg = NextReg;
1328 continue;
1329 }
1330
1331 // Attempt to find a suitable operation with a constant on one side.
1332 Optional<uint64_t> C;
1333 Register TestReg;
1334 switch (Opc) {
1335 default:
1336 break;
1337 case TargetOpcode::G_AND:
1338 case TargetOpcode::G_XOR: {
1339 TestReg = MI->getOperand(1).getReg();
1340 Register ConstantReg = MI->getOperand(2).getReg();
1341 auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1342 if (!VRegAndVal) {
1343 // AND commutes, check the other side for a constant.
1344 // FIXME: Can we canonicalize the constant so that it's always on the
1345 // same side at some point earlier?
1346 std::swap(ConstantReg, TestReg);
1347 VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1348 }
1349 if (VRegAndVal) {
1350 if (HasZext)
1351 C = VRegAndVal->Value.getZExtValue();
1352 else
1353 C = VRegAndVal->Value.getSExtValue();
1354 }
1355 break;
1356 }
1357 case TargetOpcode::G_ASHR:
1358 case TargetOpcode::G_LSHR:
1359 case TargetOpcode::G_SHL: {
1360 TestReg = MI->getOperand(1).getReg();
1361 auto VRegAndVal =
1362 getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1363 if (VRegAndVal)
1364 C = VRegAndVal->Value.getSExtValue();
1365 break;
1366 }
1367 }
1368
1369 // Didn't find a constant or viable register. Bail out of the loop.
1370 if (!C || !TestReg.isValid())
1371 break;
1372
1373 // We found a suitable instruction with a constant. Check to see if we can
1374 // walk through the instruction.
1375 Register NextReg;
1376 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1377 switch (Opc) {
1378 default:
1379 break;
1380 case TargetOpcode::G_AND:
1381 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1382 if ((*C >> Bit) & 1)
1383 NextReg = TestReg;
1384 break;
1385 case TargetOpcode::G_SHL:
1386 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1387 // the type of the register.
1388 if (*C <= Bit && (Bit - *C) < TestRegSize) {
1389 NextReg = TestReg;
1390 Bit = Bit - *C;
1391 }
1392 break;
1393 case TargetOpcode::G_ASHR:
1394 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1395 // in x
1396 NextReg = TestReg;
1397 Bit = Bit + *C;
1398 if (Bit >= TestRegSize)
1399 Bit = TestRegSize - 1;
1400 break;
1401 case TargetOpcode::G_LSHR:
1402 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1403 if ((Bit + *C) < TestRegSize) {
1404 NextReg = TestReg;
1405 Bit = Bit + *C;
1406 }
1407 break;
1408 case TargetOpcode::G_XOR:
1409 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1410 // appropriate.
1411 //
1412 // e.g. If x' = xor x, c, and the b-th bit is set in c then
1413 //
1414 // tbz x', b -> tbnz x, b
1415 //
1416 // Because x' only has the b-th bit set if x does not.
1417 if ((*C >> Bit) & 1)
1418 Invert = !Invert;
1419 NextReg = TestReg;
1420 break;
1421 }
1422
1423 // Check if we found anything worth folding.
1424 if (!NextReg.isValid())
1425 return Reg;
1426 Reg = NextReg;
1427 }
1428
1429 return Reg;
1430 }
1431
1432 MachineInstr *AArch64InstructionSelector::emitTestBit(
1433 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1434 MachineIRBuilder &MIB) const {
1435 assert(TestReg.isValid());
1436 assert(ProduceNonFlagSettingCondBr &&
1437 "Cannot emit TB(N)Z with speculation tracking!");
1438 MachineRegisterInfo &MRI = *MIB.getMRI();
1439
1440 // Attempt to optimize the test bit by walking over instructions.
1441 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1442 LLT Ty = MRI.getType(TestReg);
1443 unsigned Size = Ty.getSizeInBits();
1444 assert(!Ty.isVector() && "Expected a scalar!");
1445 assert(Bit < 64 && "Bit is too large!");
1446
1447 // When the test register is a 64-bit register, we have to narrow to make
1448 // TBNZW work.
1449 bool UseWReg = Bit < 32;
1450 unsigned NecessarySize = UseWReg ? 32 : 64;
1451 if (Size != NecessarySize)
1452 TestReg = moveScalarRegClass(
1453 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1454 MIB);
1455
1456 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1457 {AArch64::TBZW, AArch64::TBNZW}};
1458 unsigned Opc = OpcTable[UseWReg][IsNegative];
1459 auto TestBitMI =
1460 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1461 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1462 return &*TestBitMI;
1463 }
1464
1465 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1466 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1467 MachineIRBuilder &MIB) const {
1468 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1469 // Given something like this:
1470 //
1471 // %x = ...Something...
1472 // %one = G_CONSTANT i64 1
1473 // %zero = G_CONSTANT i64 0
1474 // %and = G_AND %x, %one
1475 // %cmp = G_ICMP intpred(ne), %and, %zero
1476 // %cmp_trunc = G_TRUNC %cmp
1477 // G_BRCOND %cmp_trunc, %bb.3
1478 //
1479 // We want to try and fold the AND into the G_BRCOND and produce either a
1480 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1481 //
1482 // In this case, we'd get
1483 //
1484 // TBNZ %x %bb.3
1485 //
1486
1487 // Check if the AND has a constant on its RHS which we can use as a mask.
1488 // If it's a power of 2, then it's the same as checking a specific bit.
1489   // (e.g., ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1490 auto MaybeBit = getConstantVRegValWithLookThrough(
1491 AndInst.getOperand(2).getReg(), *MIB.getMRI());
1492 if (!MaybeBit)
1493 return false;
1494
1495 int32_t Bit = MaybeBit->Value.exactLogBase2();
1496 if (Bit < 0)
1497 return false;
1498
1499 Register TestReg = AndInst.getOperand(1).getReg();
1500
1501 // Emit a TB(N)Z.
1502 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1503 return true;
1504 }
1505
1506 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1507 bool IsNegative,
1508 MachineBasicBlock *DestMBB,
1509 MachineIRBuilder &MIB) const {
1510 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1511 MachineRegisterInfo &MRI = *MIB.getMRI();
1512 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1513 AArch64::GPRRegBankID &&
1514 "Expected GPRs only?");
1515 auto Ty = MRI.getType(CompareReg);
1516 unsigned Width = Ty.getSizeInBits();
1517 assert(!Ty.isVector() && "Expected scalar only?");
1518 assert(Width <= 64 && "Expected width to be at most 64?");
1519 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1520 {AArch64::CBNZW, AArch64::CBNZX}};
1521 unsigned Opc = OpcTable[IsNegative][Width == 64];
1522 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1523 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1524 return &*BranchMI;
1525 }
1526
1527 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1528 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1529 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1530 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1531 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1532 // totally clean. Some of them require two branches to implement.
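  // For example (illustrative, not exhaustive): an unordered-or-equal compare
  // typically needs two branches, e.g. a B.EQ followed by a B.VS, whereas most
  // ordered predicates map to a single condition code.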
1533 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1534 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1535 Pred);
1536 AArch64CC::CondCode CC1, CC2;
1537 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1538 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1539 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1540 if (CC2 != AArch64CC::AL)
1541 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1542 I.eraseFromParent();
1543 return true;
1544 }
1545
1546 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1547 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1548 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1549 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1550 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1551 //
1552 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1553 // instructions will not be produced, as they are conditional branch
1554 // instructions that do not set flags.
1555 if (!ProduceNonFlagSettingCondBr)
1556 return false;
1557
1558 MachineRegisterInfo &MRI = *MIB.getMRI();
1559 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1560 auto Pred =
1561 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1562 Register LHS = ICmp.getOperand(2).getReg();
1563 Register RHS = ICmp.getOperand(3).getReg();
1564
1565 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1566 auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1567 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1568
1569 // When we can emit a TB(N)Z, prefer that.
1570 //
1571 // Handle non-commutative condition codes first.
1572 // Note that we don't want to do this when we have a G_AND because it can
1573 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1574 if (VRegAndVal && !AndInst) {
1575 int64_t C = VRegAndVal->Value.getSExtValue();
1576
1577 // When we have a greater-than comparison, we can just test if the msb is
1578 // zero.
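    // For example (sketch): G_BRCOND (G_ICMP sgt %x, -1), %bb becomes
    // TBZ %x, #(size - 1), %bb, since a non-negative value has its msb clear.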
1579 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1580 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1581 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1582 I.eraseFromParent();
1583 return true;
1584 }
1585
1586 // When we have a less than comparison, we can just test if the msb is not
1587 // zero.
1588 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1589 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1590 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1591 I.eraseFromParent();
1592 return true;
1593 }
1594 }
1595
1596 // Attempt to handle commutative condition codes. Right now, that's only
1597 // eq/ne.
1598 if (ICmpInst::isEquality(Pred)) {
1599 if (!VRegAndVal) {
1600 std::swap(RHS, LHS);
1601 VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1602 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1603 }
1604
1605 if (VRegAndVal && VRegAndVal->Value == 0) {
1606 // If there's a G_AND feeding into this branch, try to fold it away by
1607 // emitting a TB(N)Z instead.
1608 //
1609 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1610 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1611 // would be redundant.
1612 if (AndInst &&
1613 tryOptAndIntoCompareBranch(
1614 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1615 I.eraseFromParent();
1616 return true;
1617 }
1618
1619 // Otherwise, try to emit a CB(N)Z instead.
1620 auto LHSTy = MRI.getType(LHS);
1621 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1622 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1623 I.eraseFromParent();
1624 return true;
1625 }
1626 }
1627 }
1628
1629 return false;
1630 }
1631
1632 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1633 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1634 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1635 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1636 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1637 return true;
1638
1639 // Couldn't optimize. Emit a compare + a Bcc.
1640 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1641 auto PredOp = ICmp.getOperand(1);
1642 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1643 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1644 static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1645 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1646 I.eraseFromParent();
1647 return true;
1648 }
1649
1650 bool AArch64InstructionSelector::selectCompareBranch(
1651 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1652 Register CondReg = I.getOperand(0).getReg();
1653 MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1654 if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
1655 CondReg = CCMI->getOperand(1).getReg();
1656 CCMI = MRI.getVRegDef(CondReg);
1657 }
1658
1659 // Try to select the G_BRCOND using whatever is feeding the condition if
1660 // possible.
1661 unsigned CCMIOpc = CCMI->getOpcode();
1662 if (CCMIOpc == TargetOpcode::G_FCMP)
1663 return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1664 if (CCMIOpc == TargetOpcode::G_ICMP)
1665 return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1666
1667 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1668 // instructions will not be produced, as they are conditional branch
1669 // instructions that do not set flags.
1670 if (ProduceNonFlagSettingCondBr) {
1671 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1672 I.getOperand(1).getMBB(), MIB);
1673 I.eraseFromParent();
1674 return true;
1675 }
1676
1677 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1678 auto TstMI =
1679 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1680 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1681 auto Bcc = MIB.buildInstr(AArch64::Bcc)
1682 .addImm(AArch64CC::EQ)
1683 .addMBB(I.getOperand(1).getMBB());
1684 I.eraseFromParent();
1685 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1686 }
1687
1688 /// Returns the element immediate value of a vector shift operand if found.
1689 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1690 static Optional<int64_t> getVectorShiftImm(Register Reg,
1691 MachineRegisterInfo &MRI) {
1692 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1693 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1694 assert(OpMI && "Expected to find a vreg def for vector shift operand");
1695 return getAArch64VectorSplatScalar(*OpMI, MRI);
1696 }
1697
1698 /// Matches and returns the shift immediate value for a SHL instruction given
1699 /// a shift operand.
1700 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1701 Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1702 if (!ShiftImm)
1703 return None;
1704 // Check the immediate is in range for a SHL.
1705 int64_t Imm = *ShiftImm;
1706 if (Imm < 0)
1707 return None;
1708 switch (SrcTy.getElementType().getSizeInBits()) {
1709 default:
1710 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1711 return None;
1712 case 8:
1713 if (Imm > 7)
1714 return None;
1715 break;
1716 case 16:
1717 if (Imm > 15)
1718 return None;
1719 break;
1720 case 32:
1721 if (Imm > 31)
1722 return None;
1723 break;
1724 case 64:
1725 if (Imm > 63)
1726 return None;
1727 break;
1728 }
1729 return Imm;
1730 }
1731
1732 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1733 MachineRegisterInfo &MRI) {
1734 assert(I.getOpcode() == TargetOpcode::G_SHL);
1735 Register DstReg = I.getOperand(0).getReg();
1736 const LLT Ty = MRI.getType(DstReg);
1737 Register Src1Reg = I.getOperand(1).getReg();
1738 Register Src2Reg = I.getOperand(2).getReg();
1739
1740 if (!Ty.isVector())
1741 return false;
1742
1743 // Check if we have a vector of constants on RHS that we can select as the
1744 // immediate form.
1745 Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1746
1747 unsigned Opc = 0;
1748 if (Ty == LLT::fixed_vector(2, 64)) {
1749 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1750 } else if (Ty == LLT::fixed_vector(4, 32)) {
1751 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1752 } else if (Ty == LLT::fixed_vector(2, 32)) {
1753 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1754 } else if (Ty == LLT::fixed_vector(4, 16)) {
1755 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1756 } else if (Ty == LLT::fixed_vector(8, 16)) {
1757 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1758 } else if (Ty == LLT::fixed_vector(16, 8)) {
1759 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1760 } else if (Ty == LLT::fixed_vector(8, 8)) {
1761 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1762 } else {
1763 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1764 return false;
1765 }
1766
1767 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1768 if (ImmVal)
1769 Shl.addImm(*ImmVal);
1770 else
1771 Shl.addUse(Src2Reg);
1772 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1773 I.eraseFromParent();
1774 return true;
1775 }
1776
1777 bool AArch64InstructionSelector::selectVectorAshrLshr(
1778 MachineInstr &I, MachineRegisterInfo &MRI) {
1779 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1780 I.getOpcode() == TargetOpcode::G_LSHR);
1781 Register DstReg = I.getOperand(0).getReg();
1782 const LLT Ty = MRI.getType(DstReg);
1783 Register Src1Reg = I.getOperand(1).getReg();
1784 Register Src2Reg = I.getOperand(2).getReg();
1785
1786 if (!Ty.isVector())
1787 return false;
1788
1789 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1790
1791   // We expect the immediate case to be lowered in the PostLegalizerCombiner to
1792   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1793
1794   // There is no shift-right-by-register instruction, but the shift-left-by-
1795   // register instruction takes a signed shift amount, where a negative amount
1796   // specifies a right shift.
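  // For example (sketch): a G_ASHR %v, %amt is emitted below as
  //   %neg = NEG %amt
  //   SSHL %v, %neg
  // (and USHL instead of SSHL for G_LSHR).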
1797
1798 unsigned Opc = 0;
1799 unsigned NegOpc = 0;
1800 const TargetRegisterClass *RC =
1801 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
1802 if (Ty == LLT::fixed_vector(2, 64)) {
1803 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1804 NegOpc = AArch64::NEGv2i64;
1805 } else if (Ty == LLT::fixed_vector(4, 32)) {
1806 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1807 NegOpc = AArch64::NEGv4i32;
1808 } else if (Ty == LLT::fixed_vector(2, 32)) {
1809 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1810 NegOpc = AArch64::NEGv2i32;
1811 } else if (Ty == LLT::fixed_vector(4, 16)) {
1812 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1813 NegOpc = AArch64::NEGv4i16;
1814 } else if (Ty == LLT::fixed_vector(8, 16)) {
1815 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1816 NegOpc = AArch64::NEGv8i16;
1817 } else if (Ty == LLT::fixed_vector(16, 8)) {
1818 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1819 NegOpc = AArch64::NEGv16i8;
1820 } else if (Ty == LLT::fixed_vector(8, 8)) {
1821 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1822 NegOpc = AArch64::NEGv8i8;
1823 } else {
1824 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1825 return false;
1826 }
1827
1828 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1829 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1830 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1831 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1832 I.eraseFromParent();
1833 return true;
1834 }
1835
1836 bool AArch64InstructionSelector::selectVaStartAAPCS(
1837 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1838 return false;
1839 }
1840
1841 bool AArch64InstructionSelector::selectVaStartDarwin(
1842 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1843 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1844 Register ListReg = I.getOperand(0).getReg();
1845
1846 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1847
1848 auto MIB =
1849 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1850 .addDef(ArgsAddrReg)
1851 .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1852 .addImm(0)
1853 .addImm(0);
1854
1855 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1856
1857 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1858 .addUse(ArgsAddrReg)
1859 .addUse(ListReg)
1860 .addImm(0)
1861 .addMemOperand(*I.memoperands_begin());
1862
1863 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1864 I.eraseFromParent();
1865 return true;
1866 }
1867
1868 void AArch64InstructionSelector::materializeLargeCMVal(
1869 MachineInstr &I, const Value *V, unsigned OpFlags) {
1870 MachineBasicBlock &MBB = *I.getParent();
1871 MachineFunction &MF = *MBB.getParent();
1872 MachineRegisterInfo &MRI = MF.getRegInfo();
1873
1874 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1875 MovZ->addOperand(MF, I.getOperand(1));
1876 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1877 AArch64II::MO_NC);
1878 MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1879 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1880
1881 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1882 Register ForceDstReg) {
1883 Register DstReg = ForceDstReg
1884 ? ForceDstReg
1885 : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1886 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1887 if (auto *GV = dyn_cast<GlobalValue>(V)) {
1888 MovI->addOperand(MF, MachineOperand::CreateGA(
1889 GV, MovZ->getOperand(1).getOffset(), Flags));
1890 } else {
1891 MovI->addOperand(
1892 MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1893 MovZ->getOperand(1).getOffset(), Flags));
1894 }
1895 MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1896 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1897 return DstReg;
1898 };
1899 Register DstReg = BuildMovK(MovZ.getReg(0),
1900 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1901 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1902 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1903 }
1904
1905 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1906 MachineBasicBlock &MBB = *I.getParent();
1907 MachineFunction &MF = *MBB.getParent();
1908 MachineRegisterInfo &MRI = MF.getRegInfo();
1909
1910 switch (I.getOpcode()) {
1911 case TargetOpcode::G_SHL:
1912 case TargetOpcode::G_ASHR:
1913 case TargetOpcode::G_LSHR: {
1914 // These shifts are legalized to have 64 bit shift amounts because we want
1915 // to take advantage of the existing imported selection patterns that assume
1916 // the immediates are s64s. However, if the shifted type is 32 bits and for
1917 // some reason we receive input GMIR that has an s64 shift amount that's not
1918 // a G_CONSTANT, insert a truncate so that we can still select the s32
1919 // register-register variant.
1920 Register SrcReg = I.getOperand(1).getReg();
1921 Register ShiftReg = I.getOperand(2).getReg();
1922 const LLT ShiftTy = MRI.getType(ShiftReg);
1923 const LLT SrcTy = MRI.getType(SrcReg);
1924 if (SrcTy.isVector())
1925 return false;
1926 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
1927 if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
1928 return false;
1929 auto *AmtMI = MRI.getVRegDef(ShiftReg);
1930 assert(AmtMI && "could not find a vreg definition for shift amount");
1931 if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
1932 // Insert a subregister copy to implement a 64->32 trunc
1933 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
1934 .addReg(ShiftReg, 0, AArch64::sub_32);
1935 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
1936 I.getOperand(2).setReg(Trunc.getReg(0));
1937 }
1938 return true;
1939 }
1940 case TargetOpcode::G_STORE: {
1941 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1942 MachineOperand &SrcOp = I.getOperand(0);
1943 if (MRI.getType(SrcOp.getReg()).isPointer()) {
1944 // Allow matching with imported patterns for stores of pointers. Unlike
1945 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1946 // and constrain.
1947 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1948 Register NewSrc = Copy.getReg(0);
1949 SrcOp.setReg(NewSrc);
1950 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1951 Changed = true;
1952 }
1953 return Changed;
1954 }
1955 case TargetOpcode::G_PTR_ADD:
1956 return convertPtrAddToAdd(I, MRI);
1957 case TargetOpcode::G_LOAD: {
1958 // For scalar loads of pointers, we try to convert the dest type from p0
1959 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1960 // conversion, this should be ok because all users should have been
1961 // selected already, so the type doesn't matter for them.
1962 Register DstReg = I.getOperand(0).getReg();
1963 const LLT DstTy = MRI.getType(DstReg);
1964 if (!DstTy.isPointer())
1965 return false;
1966 MRI.setType(DstReg, LLT::scalar(64));
1967 return true;
1968 }
1969 case AArch64::G_DUP: {
1970 // Convert the type from p0 to s64 to help selection.
1971 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1972 if (!DstTy.getElementType().isPointer())
1973 return false;
1974 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1975 MRI.setType(I.getOperand(0).getReg(),
1976 DstTy.changeElementType(LLT::scalar(64)));
1977 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
1978 I.getOperand(1).setReg(NewSrc.getReg(0));
1979 return true;
1980 }
1981 case TargetOpcode::G_UITOFP:
1982 case TargetOpcode::G_SITOFP: {
1983 // If both source and destination regbanks are FPR, then convert the opcode
1984 // to G_SITOF so that the importer can select it to an fpr variant.
1985 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
1986 // copy.
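    // For example (sketch): %dst:fpr(s32) = G_SITOFP %src:fpr(s32) is rewritten
    // to G_SITOF so the imported patterns can pick an FPR-source SCVTF rather
    // than a GPR-source variant plus a cross-bank copy.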
1987 Register SrcReg = I.getOperand(1).getReg();
1988 LLT SrcTy = MRI.getType(SrcReg);
1989 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1990 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
1991 return false;
1992
1993 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
1994 if (I.getOpcode() == TargetOpcode::G_SITOFP)
1995 I.setDesc(TII.get(AArch64::G_SITOF));
1996 else
1997 I.setDesc(TII.get(AArch64::G_UITOF));
1998 return true;
1999 }
2000 return false;
2001 }
2002 default:
2003 return false;
2004 }
2005 }
2006
2007 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2008 /// them to a standard G_ADD with a COPY on the source.
2009 ///
2010 /// The motivation behind this is to expose the add semantics to the imported
2011 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2012 /// because the selector works bottom up, uses before defs. By the time we
2013 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2014 /// fold this into addressing modes and were therefore unsuccessful.
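///
/// For example (illustrative MIR):
///   %dst:gpr(p0) = G_PTR_ADD %base(p0), %off(s64)
/// becomes
///   %intbase:gpr(s64) = G_PTRTOINT %base(p0)
///   %dst:gpr(s64) = G_ADD %intbase, %off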
2015 bool AArch64InstructionSelector::convertPtrAddToAdd(
2016 MachineInstr &I, MachineRegisterInfo &MRI) {
2017 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2018 Register DstReg = I.getOperand(0).getReg();
2019 Register AddOp1Reg = I.getOperand(1).getReg();
2020 const LLT PtrTy = MRI.getType(DstReg);
2021 if (PtrTy.getAddressSpace() != 0)
2022 return false;
2023
2024 const LLT CastPtrTy =
2025 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2026 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2027 // Set regbanks on the registers.
2028 if (PtrTy.isVector())
2029 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2030 else
2031 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2032
2033 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2034 // %dst(intty) = G_ADD %intbase, off
2035 I.setDesc(TII.get(TargetOpcode::G_ADD));
2036 MRI.setType(DstReg, CastPtrTy);
2037 I.getOperand(1).setReg(PtrToInt.getReg(0));
2038 if (!select(*PtrToInt)) {
2039 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2040 return false;
2041 }
2042
2043 // Also take the opportunity here to try to do some optimization.
2044 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2045 Register NegatedReg;
2046 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2047 return true;
2048 I.getOperand(2).setReg(NegatedReg);
2049 I.setDesc(TII.get(TargetOpcode::G_SUB));
2050 return true;
2051 }
2052
2053 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2054 MachineRegisterInfo &MRI) {
2055 // We try to match the immediate variant of LSL, which is actually an alias
2056 // for a special case of UBFM. Otherwise, we fall back to the imported
2057 // selector which will match the register variant.
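  // For example, on 64-bit values "lsl x0, x1, #3" is the alias
  // "ubfm x0, x1, #61, #60", i.e. immr = 64 - shift and imms = 63 - shift.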
2058 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2059 const auto &MO = I.getOperand(2);
2060 auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
2061 if (!VRegAndVal)
2062 return false;
2063
2064 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2065 if (DstTy.isVector())
2066 return false;
2067 bool Is64Bit = DstTy.getSizeInBits() == 64;
2068 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2069 auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2070
2071 if (!Imm1Fn || !Imm2Fn)
2072 return false;
2073
2074 auto NewI =
2075 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2076 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2077
2078 for (auto &RenderFn : *Imm1Fn)
2079 RenderFn(NewI);
2080 for (auto &RenderFn : *Imm2Fn)
2081 RenderFn(NewI);
2082
2083 I.eraseFromParent();
2084 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2085 }
2086
2087 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2088 MachineInstr &I, MachineRegisterInfo &MRI) {
2089 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2090 // If we're storing a scalar, it doesn't matter what register bank that
2091 // scalar is on. All that matters is the size.
2092 //
2093 // So, if we see something like this (with a 32-bit scalar as an example):
2094 //
2095 // %x:gpr(s32) = ... something ...
2096 // %y:fpr(s32) = COPY %x:gpr(s32)
2097 // G_STORE %y:fpr(s32)
2098 //
2099 // We can fix this up into something like this:
2100 //
2101 // G_STORE %x:gpr(s32)
2102 //
2103 // And then continue the selection process normally.
2104 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2105 if (!DefDstReg.isValid())
2106 return false;
2107 LLT DefDstTy = MRI.getType(DefDstReg);
2108 Register StoreSrcReg = I.getOperand(0).getReg();
2109 LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2110
2111 // If we get something strange like a physical register, then we shouldn't
2112 // go any further.
2113 if (!DefDstTy.isValid())
2114 return false;
2115
2116 // Are the source and dst types the same size?
2117 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2118 return false;
2119
2120 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2121 RBI.getRegBank(DefDstReg, MRI, TRI))
2122 return false;
2123
2124 // We have a cross-bank copy, which is entering a store. Let's fold it.
2125 I.getOperand(0).setReg(DefDstReg);
2126 return true;
2127 }
2128
2129 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2130 assert(I.getParent() && "Instruction should be in a basic block!");
2131 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2132
2133 MachineBasicBlock &MBB = *I.getParent();
2134 MachineFunction &MF = *MBB.getParent();
2135 MachineRegisterInfo &MRI = MF.getRegInfo();
2136
2137 switch (I.getOpcode()) {
2138 case AArch64::G_DUP: {
2139 // Before selecting a DUP instruction, check if it is better selected as a
2140 // MOV or load from a constant pool.
2141 Register Src = I.getOperand(1).getReg();
2142 auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
2143 if (!ValAndVReg)
2144 return false;
2145 LLVMContext &Ctx = MF.getFunction().getContext();
2146 Register Dst = I.getOperand(0).getReg();
2147 auto *CV = ConstantDataVector::getSplat(
2148 MRI.getType(Dst).getNumElements(),
2149 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2150 ValAndVReg->Value));
2151 if (!emitConstantVector(Dst, CV, MIB, MRI))
2152 return false;
2153 I.eraseFromParent();
2154 return true;
2155 }
2156 case TargetOpcode::G_BR: {
2157 // If the branch jumps to the fallthrough block, don't bother emitting it.
2158     // Only do this at -O0, where it gives a good code size improvement; when
2159     // optimizations are enabled we want to leave this choice to
2160     // MachineBlockPlacement.
2161 bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
2162 if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
2163 return false;
2164 I.eraseFromParent();
2165 return true;
2166 }
2167 case TargetOpcode::G_SHL:
2168 return earlySelectSHL(I, MRI);
2169 case TargetOpcode::G_CONSTANT: {
2170 bool IsZero = false;
2171 if (I.getOperand(1).isCImm())
2172 IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2173 else if (I.getOperand(1).isImm())
2174 IsZero = I.getOperand(1).getImm() == 0;
2175
2176 if (!IsZero)
2177 return false;
2178
2179 Register DefReg = I.getOperand(0).getReg();
2180 LLT Ty = MRI.getType(DefReg);
2181 if (Ty.getSizeInBits() == 64) {
2182 I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2183 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2184 } else if (Ty.getSizeInBits() == 32) {
2185 I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2186 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2187 } else
2188 return false;
2189
2190 I.setDesc(TII.get(TargetOpcode::COPY));
2191 return true;
2192 }
2193
2194 case TargetOpcode::G_ADD: {
2195 // Check if this is being fed by a G_ICMP on either side.
2196 //
2197 // (cmp pred, x, y) + z
2198 //
2199 // In the above case, when the cmp is true, we increment z by 1. So, we can
2200 // fold the add into the cset for the cmp by using cinc.
2201 //
2202 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2203 Register X = I.getOperand(1).getReg();
2204
2205 // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
2206 // early if we see it.
2207 LLT Ty = MRI.getType(X);
2208 if (Ty.isVector() || Ty.getSizeInBits() != 32)
2209 return false;
2210
2211 Register CmpReg = I.getOperand(2).getReg();
2212 MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2213 if (!Cmp) {
2214 std::swap(X, CmpReg);
2215 Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2216 if (!Cmp)
2217 return false;
2218 }
2219 auto Pred =
2220 static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
2221 emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
2222 Cmp->getOperand(1), MIB);
2223 emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
2224 I.eraseFromParent();
2225 return true;
2226 }
2227 case TargetOpcode::G_OR: {
2228 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2229 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2230 // shifting and masking that we can replace with a BFI (encoded as a BFM).
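    // For example (sketch), with a 32-bit destination and ShiftImm = 16:
    //   %dst = G_OR (G_SHL %hi, 16), (G_AND %lo, 0xffff)
    // becomes BFMWri %dst, %lo, %hi, 16, 15, i.e. the BFI alias that inserts
    // the low 16 bits of %hi above the low 16 bits of %lo.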
2231 Register Dst = I.getOperand(0).getReg();
2232 LLT Ty = MRI.getType(Dst);
2233
2234 if (!Ty.isScalar())
2235 return false;
2236
2237 unsigned Size = Ty.getSizeInBits();
2238 if (Size != 32 && Size != 64)
2239 return false;
2240
2241 Register ShiftSrc;
2242 int64_t ShiftImm;
2243 Register MaskSrc;
2244 int64_t MaskImm;
2245 if (!mi_match(
2246 Dst, MRI,
2247 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2248 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2249 return false;
2250
2251 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2252 return false;
2253
2254 int64_t Immr = Size - ShiftImm;
2255 int64_t Imms = Size - ShiftImm - 1;
2256 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2257 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2258 I.eraseFromParent();
2259 return true;
2260 }
2261 default:
2262 return false;
2263 }
2264 }
2265
2266 bool AArch64InstructionSelector::select(MachineInstr &I) {
2267 assert(I.getParent() && "Instruction should be in a basic block!");
2268 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2269
2270 MachineBasicBlock &MBB = *I.getParent();
2271 MachineFunction &MF = *MBB.getParent();
2272 MachineRegisterInfo &MRI = MF.getRegInfo();
2273
2274 const AArch64Subtarget *Subtarget =
2275 &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
2276 if (Subtarget->requiresStrictAlign()) {
2277 // We don't support this feature yet.
2278 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2279 return false;
2280 }
2281
2282 MIB.setInstrAndDebugLoc(I);
2283
2284 unsigned Opcode = I.getOpcode();
2285 // G_PHI requires same handling as PHI
2286 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2287 // Certain non-generic instructions also need some special handling.
2288
2289 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2290 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2291
2292 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2293 const Register DefReg = I.getOperand(0).getReg();
2294 const LLT DefTy = MRI.getType(DefReg);
2295
2296 const RegClassOrRegBank &RegClassOrBank =
2297 MRI.getRegClassOrRegBank(DefReg);
2298
2299 const TargetRegisterClass *DefRC
2300 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2301 if (!DefRC) {
2302 if (!DefTy.isValid()) {
2303 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2304 return false;
2305 }
2306 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2307 DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
2308 if (!DefRC) {
2309 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2310 return false;
2311 }
2312 }
2313
2314 I.setDesc(TII.get(TargetOpcode::PHI));
2315
2316 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2317 }
2318
2319 if (I.isCopy())
2320 return selectCopy(I, TII, MRI, TRI, RBI);
2321
2322 return true;
2323 }
2324
2325
2326 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2327 LLVM_DEBUG(
2328 dbgs() << "Generic instruction has unexpected implicit operands\n");
2329 return false;
2330 }
2331
2332 // Try to do some lowering before we start instruction selecting. These
2333 // lowerings are purely transformations on the input G_MIR and so selection
2334 // must continue after any modification of the instruction.
2335 if (preISelLower(I)) {
2336 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2337 }
2338
2339 // There may be patterns where the importer can't deal with them optimally,
2340 // but does select it to a suboptimal sequence so our custom C++ selection
2341 // code later never has a chance to work on it. Therefore, we have an early
2342 // selection attempt here to give priority to certain selection routines
2343 // over the imported ones.
2344 if (earlySelect(I))
2345 return true;
2346
2347 if (selectImpl(I, *CoverageInfo))
2348 return true;
2349
2350 LLT Ty =
2351 I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2352
2353 switch (Opcode) {
2354 case TargetOpcode::G_SBFX:
2355 case TargetOpcode::G_UBFX: {
2356 static const unsigned OpcTable[2][2] = {
2357 {AArch64::UBFMWri, AArch64::UBFMXri},
2358 {AArch64::SBFMWri, AArch64::SBFMXri}};
2359 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2360 unsigned Size = Ty.getSizeInBits();
2361 unsigned Opc = OpcTable[IsSigned][Size == 64];
2362 auto Cst1 =
2363 getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2364 assert(Cst1 && "Should have gotten a constant for src 1?");
2365 auto Cst2 =
2366 getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2367 assert(Cst2 && "Should have gotten a constant for src 2?");
2368 auto LSB = Cst1->Value.getZExtValue();
2369 auto Width = Cst2->Value.getZExtValue();
2370 auto BitfieldInst =
2371 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2372 .addImm(LSB)
2373 .addImm(LSB + Width - 1);
2374 I.eraseFromParent();
2375 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2376 }
2377 case TargetOpcode::G_BRCOND:
2378 return selectCompareBranch(I, MF, MRI);
2379
2380 case TargetOpcode::G_BRINDIRECT: {
2381 I.setDesc(TII.get(AArch64::BR));
2382 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2383 }
2384
2385 case TargetOpcode::G_BRJT:
2386 return selectBrJT(I, MRI);
2387
2388 case AArch64::G_ADD_LOW: {
2389     // This op may have been separated from its ADRP companion by the localizer
2390     // or some other code motion pass. Given that many CPUs will try to
2391     // macro-fuse these operations anyway, select this into a MOVaddr pseudo
2392 // which will later be expanded into an ADRP+ADD pair after scheduling.
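    // (The MOVaddr pseudo itself eventually becomes an "adrp xN, sym" plus
    // "add xN, xN, :lo12:sym" pair.)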
2393 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2394 if (BaseMI->getOpcode() != AArch64::ADRP) {
2395 I.setDesc(TII.get(AArch64::ADDXri));
2396 I.addOperand(MachineOperand::CreateImm(0));
2397 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2398 }
2399 assert(TM.getCodeModel() == CodeModel::Small &&
2400 "Expected small code model");
2401 auto Op1 = BaseMI->getOperand(1);
2402 auto Op2 = I.getOperand(2);
2403 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2404 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2405 Op1.getTargetFlags())
2406 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2407 Op2.getTargetFlags());
2408 I.eraseFromParent();
2409 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2410 }
2411
2412 case TargetOpcode::G_BSWAP: {
2413 // Handle vector types for G_BSWAP directly.
2414 Register DstReg = I.getOperand(0).getReg();
2415 LLT DstTy = MRI.getType(DstReg);
2416
2417 // We should only get vector types here; everything else is handled by the
2418 // importer right now.
2419 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2420 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2421 return false;
2422 }
2423
2424 // Only handle 4 and 2 element vectors for now.
2425 // TODO: 16-bit elements.
2426 unsigned NumElts = DstTy.getNumElements();
2427 if (NumElts != 4 && NumElts != 2) {
2428 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2429 return false;
2430 }
2431
2432 // Choose the correct opcode for the supported types. Right now, that's
2433 // v2s32, v4s32, and v2s64.
2434 unsigned Opc = 0;
2435 unsigned EltSize = DstTy.getElementType().getSizeInBits();
2436 if (EltSize == 32)
2437 Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2438 : AArch64::REV32v16i8;
2439 else if (EltSize == 64)
2440 Opc = AArch64::REV64v16i8;
2441
2442 // We should always get something by the time we get here...
2443 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2444
2445 I.setDesc(TII.get(Opc));
2446 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2447 }
2448
2449 case TargetOpcode::G_FCONSTANT:
2450 case TargetOpcode::G_CONSTANT: {
2451 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2452
2453 const LLT s8 = LLT::scalar(8);
2454 const LLT s16 = LLT::scalar(16);
2455 const LLT s32 = LLT::scalar(32);
2456 const LLT s64 = LLT::scalar(64);
2457 const LLT s128 = LLT::scalar(128);
2458 const LLT p0 = LLT::pointer(0, 64);
2459
2460 const Register DefReg = I.getOperand(0).getReg();
2461 const LLT DefTy = MRI.getType(DefReg);
2462 const unsigned DefSize = DefTy.getSizeInBits();
2463 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2464
2465 // FIXME: Redundant check, but even less readable when factored out.
2466 if (isFP) {
2467 if (Ty != s32 && Ty != s64 && Ty != s128) {
2468 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2469 << " constant, expected: " << s32 << " or " << s64
2470 << " or " << s128 << '\n');
2471 return false;
2472 }
2473
2474 if (RB.getID() != AArch64::FPRRegBankID) {
2475 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2476 << " constant on bank: " << RB
2477 << ", expected: FPR\n");
2478 return false;
2479 }
2480
2481 // The case when we have 0.0 is covered by tablegen. Reject it here so we
2482 // can be sure tablegen works correctly and isn't rescued by this code.
2483 // 0.0 is not covered by tablegen for FP128. So we will handle this
2484 // scenario in the code here.
2485 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2486 return false;
2487 } else {
2488 // s32 and s64 are covered by tablegen.
2489 if (Ty != p0 && Ty != s8 && Ty != s16) {
2490 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2491 << " constant, expected: " << s32 << ", " << s64
2492 << ", or " << p0 << '\n');
2493 return false;
2494 }
2495
2496 if (RB.getID() != AArch64::GPRRegBankID) {
2497 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2498 << " constant on bank: " << RB
2499 << ", expected: GPR\n");
2500 return false;
2501 }
2502 }
2503
2504 // We allow G_CONSTANT of types < 32b.
2505 const unsigned MovOpc =
2506 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2507
2508 if (isFP) {
2509 // Either emit a FMOV, or emit a copy to emit a normal mov.
2510 const TargetRegisterClass &GPRRC =
2511 DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
2512 const TargetRegisterClass &FPRRC =
2513 DefSize == 32 ? AArch64::FPR32RegClass
2514 : (DefSize == 64 ? AArch64::FPR64RegClass
2515 : AArch64::FPR128RegClass);
2516
2517 // For 64b values, emit a constant pool load instead.
2518 // For s32, use a cp load if we have optsize/minsize.
2519 if (DefSize == 64 || DefSize == 128 ||
2520 (DefSize == 32 && shouldOptForSize(&MF))) {
2521 auto *FPImm = I.getOperand(1).getFPImm();
2522 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2523 if (!LoadMI) {
2524 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2525 return false;
2526 }
2527 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2528 I.eraseFromParent();
2529 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2530 }
2531
2532 // Nope. Emit a copy and use a normal mov instead.
2533 const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
2534 MachineOperand &RegOp = I.getOperand(0);
2535 RegOp.setReg(DefGPRReg);
2536 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2537 MIB.buildCopy({DefReg}, {DefGPRReg});
2538
2539 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2540 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2541 return false;
2542 }
2543
2544 MachineOperand &ImmOp = I.getOperand(1);
2545 // FIXME: Is going through int64_t always correct?
2546 ImmOp.ChangeToImmediate(
2547 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2548 } else if (I.getOperand(1).isCImm()) {
2549 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2550 I.getOperand(1).ChangeToImmediate(Val);
2551 } else if (I.getOperand(1).isImm()) {
2552 uint64_t Val = I.getOperand(1).getImm();
2553 I.getOperand(1).ChangeToImmediate(Val);
2554 }
2555
2556 I.setDesc(TII.get(MovOpc));
2557 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2558 return true;
2559 }
2560 case TargetOpcode::G_EXTRACT: {
2561 Register DstReg = I.getOperand(0).getReg();
2562 Register SrcReg = I.getOperand(1).getReg();
2563 LLT SrcTy = MRI.getType(SrcReg);
2564 LLT DstTy = MRI.getType(DstReg);
2565 (void)DstTy;
2566 unsigned SrcSize = SrcTy.getSizeInBits();
2567
2568 if (SrcTy.getSizeInBits() > 64) {
2569 // This should be an extract of an s128, which is like a vector extract.
2570 if (SrcTy.getSizeInBits() != 128)
2571 return false;
2572 // Only support extracting 64 bits from an s128 at the moment.
2573 if (DstTy.getSizeInBits() != 64)
2574 return false;
2575
2576 unsigned Offset = I.getOperand(2).getImm();
2577 if (Offset % 64 != 0)
2578 return false;
2579
2580 // Check we have the right regbank always.
2581 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2582 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2583 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2584
2585 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2586 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2587 .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2588 I.eraseFromParent();
2589 return true;
2590 }
2591
2592 // Emit the same code as a vector extract.
2593 // Offset must be a multiple of 64.
2594 unsigned LaneIdx = Offset / 64;
2595 MachineInstr *Extract = emitExtractVectorElt(
2596 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2597 if (!Extract)
2598 return false;
2599 I.eraseFromParent();
2600 return true;
2601 }
2602
2603 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2604 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2605 Ty.getSizeInBits() - 1);
2606
2607 if (SrcSize < 64) {
2608 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2609 "unexpected G_EXTRACT types");
2610 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2611 }
2612
2613 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2614 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2615 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2616 .addReg(DstReg, 0, AArch64::sub_32);
2617 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2618 AArch64::GPR32RegClass, MRI);
2619 I.getOperand(0).setReg(DstReg);
2620
2621 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2622 }
2623
2624 case TargetOpcode::G_INSERT: {
2625 LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2626 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2627 unsigned DstSize = DstTy.getSizeInBits();
2628 // Larger inserts are vectors, same-size ones should be something else by
2629 // now (split up or turned into COPYs).
2630 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2631 return false;
2632
2633 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2634 unsigned LSB = I.getOperand(3).getImm();
2635 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2636 I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2637 MachineInstrBuilder(MF, I).addImm(Width - 1);
2638
2639 if (DstSize < 64) {
2640 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2641 "unexpected G_INSERT types");
2642 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2643 }
2644
2645 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2646 BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2647 TII.get(AArch64::SUBREG_TO_REG))
2648 .addDef(SrcReg)
2649 .addImm(0)
2650 .addUse(I.getOperand(2).getReg())
2651 .addImm(AArch64::sub_32);
2652 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2653 AArch64::GPR32RegClass, MRI);
2654 I.getOperand(2).setReg(SrcReg);
2655
2656 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2657 }
2658 case TargetOpcode::G_FRAME_INDEX: {
2659 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2660 if (Ty != LLT::pointer(0, 64)) {
2661 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2662 << ", expected: " << LLT::pointer(0, 64) << '\n');
2663 return false;
2664 }
2665 I.setDesc(TII.get(AArch64::ADDXri));
2666
2667 // MOs for a #0 shifted immediate.
2668 I.addOperand(MachineOperand::CreateImm(0));
2669 I.addOperand(MachineOperand::CreateImm(0));
2670
2671 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2672 }
2673
2674 case TargetOpcode::G_GLOBAL_VALUE: {
2675 auto GV = I.getOperand(1).getGlobal();
2676 if (GV->isThreadLocal())
2677 return selectTLSGlobalValue(I, MRI);
2678
2679 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2680 if (OpFlags & AArch64II::MO_GOT) {
2681 I.setDesc(TII.get(AArch64::LOADgot));
2682 I.getOperand(1).setTargetFlags(OpFlags);
2683 } else if (TM.getCodeModel() == CodeModel::Large) {
2684 // Materialize the global using movz/movk instructions.
2685 materializeLargeCMVal(I, GV, OpFlags);
2686 I.eraseFromParent();
2687 return true;
2688 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2689 I.setDesc(TII.get(AArch64::ADR));
2690 I.getOperand(1).setTargetFlags(OpFlags);
2691 } else {
2692 I.setDesc(TII.get(AArch64::MOVaddr));
2693 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2694 MachineInstrBuilder MIB(MF, I);
2695 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2696 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2697 }
2698 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2699 }
2700
2701 case TargetOpcode::G_ZEXTLOAD:
2702 case TargetOpcode::G_LOAD:
2703 case TargetOpcode::G_STORE: {
2704 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2705 LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
2706
2707 if (PtrTy != LLT::pointer(0, 64)) {
2708 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2709 << ", expected: " << LLT::pointer(0, 64) << '\n');
2710 return false;
2711 }
2712
2713 auto &MemOp = **I.memoperands_begin();
2714 uint64_t MemSizeInBytes = MemOp.getSize();
2715 unsigned MemSizeInBits = MemSizeInBytes * 8;
2716 AtomicOrdering Order = MemOp.getSuccessOrdering();
2717
2718 // Need special instructions for atomics that affect ordering.
2719 if (Order != AtomicOrdering::NotAtomic &&
2720 Order != AtomicOrdering::Unordered &&
2721 Order != AtomicOrdering::Monotonic) {
2722 assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
2723 if (MemSizeInBytes > 64)
2724 return false;
2725
2726 if (I.getOpcode() == TargetOpcode::G_LOAD) {
2727 static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
2728 AArch64::LDARW, AArch64::LDARX};
2729 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2730 } else {
2731 static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2732 AArch64::STLRW, AArch64::STLRX};
2733 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2734 }
2735 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2736 return true;
2737 }
2738
2739 #ifndef NDEBUG
2740 const Register PtrReg = I.getOperand(1).getReg();
2741 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2742 // Sanity-check the pointer register.
2743 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2744 "Load/Store pointer operand isn't a GPR");
2745 assert(MRI.getType(PtrReg).isPointer() &&
2746 "Load/Store pointer operand isn't a pointer");
2747 #endif
2748
2749 const Register ValReg = I.getOperand(0).getReg();
2750 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2751
2752 // Helper lambda for partially selecting I. Either returns the original
2753 // instruction with an updated opcode, or a new instruction.
2754 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2755 bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
2756 const unsigned NewOpc =
2757 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2758 if (NewOpc == I.getOpcode())
2759 return nullptr;
2760 // Check if we can fold anything into the addressing mode.
2761 auto AddrModeFns =
2762 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2763 if (!AddrModeFns) {
2764 // Can't fold anything. Use the original instruction.
2765 I.setDesc(TII.get(NewOpc));
2766 I.addOperand(MachineOperand::CreateImm(0));
2767 return &I;
2768 }
2769
2770 // Folded something. Create a new instruction and return it.
2771 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2772 IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
2773 NewInst.cloneMemRefs(I);
2774 for (auto &Fn : *AddrModeFns)
2775 Fn(NewInst);
2776 I.eraseFromParent();
2777 return &*NewInst;
2778 };
2779
2780 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2781 if (!LoadStore)
2782 return false;
2783
2784 // If we're storing a 0, use WZR/XZR.
2785 if (Opcode == TargetOpcode::G_STORE) {
2786 auto CVal = getConstantVRegValWithLookThrough(
2787 LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
2788 /*HandleFConstants = */ false);
2789 if (CVal && CVal->Value == 0) {
2790 switch (LoadStore->getOpcode()) {
2791 case AArch64::STRWui:
2792 case AArch64::STRHHui:
2793 case AArch64::STRBBui:
2794 LoadStore->getOperand(0).setReg(AArch64::WZR);
2795 break;
2796 case AArch64::STRXui:
2797 LoadStore->getOperand(0).setReg(AArch64::XZR);
2798 break;
2799 }
2800 }
2801 }
2802
2803 if (IsZExtLoad) {
2804 // The zextload from a smaller type to i32 should be handled by the
2805 // importer.
2806 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2807 return false;
2808 // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2809 // and zero_extend with SUBREG_TO_REG.
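      // For example (sketch): a 64-bit G_ZEXTLOAD of an s32 becomes an LDRWui
      // into a fresh 32-bit vreg followed by SUBREG_TO_REG 0, %w, sub_32;
      // writing the W register already zeroes the upper 32 bits.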
2810 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2811 Register DstReg = LoadStore->getOperand(0).getReg();
2812 LoadStore->getOperand(0).setReg(LdReg);
2813
2814 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2815 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2816 .addImm(0)
2817 .addUse(LdReg)
2818 .addImm(AArch64::sub_32);
2819 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2820 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2821 MRI);
2822 }
2823 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2824 }
2825
2826 case TargetOpcode::G_SMULH:
2827 case TargetOpcode::G_UMULH: {
2828 // Reject the various things we don't support yet.
2829 if (unsupportedBinOp(I, RBI, MRI, TRI))
2830 return false;
2831
2832 const Register DefReg = I.getOperand(0).getReg();
2833 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2834
2835 if (RB.getID() != AArch64::GPRRegBankID) {
2836 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2837 return false;
2838 }
2839
2840 if (Ty != LLT::scalar(64)) {
2841 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2842 << ", expected: " << LLT::scalar(64) << '\n');
2843 return false;
2844 }
2845
2846 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2847 : AArch64::UMULHrr;
2848 I.setDesc(TII.get(NewOpc));
2849
2850 // Now that we selected an opcode, we need to constrain the register
2851 // operands to use appropriate classes.
2852 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2853 }
2854 case TargetOpcode::G_LSHR:
2855 case TargetOpcode::G_ASHR:
2856 if (MRI.getType(I.getOperand(0).getReg()).isVector())
2857 return selectVectorAshrLshr(I, MRI);
2858 LLVM_FALLTHROUGH;
2859 case TargetOpcode::G_SHL:
2860 if (Opcode == TargetOpcode::G_SHL &&
2861 MRI.getType(I.getOperand(0).getReg()).isVector())
2862 return selectVectorSHL(I, MRI);
2863 LLVM_FALLTHROUGH;
2864 case TargetOpcode::G_FADD:
2865 case TargetOpcode::G_FSUB:
2866 case TargetOpcode::G_FMUL:
2867 case TargetOpcode::G_FDIV:
2868 case TargetOpcode::G_OR: {
2869 // Reject the various things we don't support yet.
2870 if (unsupportedBinOp(I, RBI, MRI, TRI))
2871 return false;
2872
2873 const unsigned OpSize = Ty.getSizeInBits();
2874
2875 const Register DefReg = I.getOperand(0).getReg();
2876 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2877
2878 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
2879 if (NewOpc == I.getOpcode())
2880 return false;
2881
2882 I.setDesc(TII.get(NewOpc));
2883 // FIXME: Should the type be always reset in setDesc?
2884
2885 // Now that we selected an opcode, we need to constrain the register
2886 // operands to use appropriate classes.
2887 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2888 }
2889
2890 case TargetOpcode::G_PTR_ADD: {
2891 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
2892 I.eraseFromParent();
2893 return true;
2894 }
2895 case TargetOpcode::G_SADDO:
2896 case TargetOpcode::G_UADDO:
2897 case TargetOpcode::G_SSUBO:
2898 case TargetOpcode::G_USUBO: {
2899 // Emit the operation and get the correct condition code.
2900 auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
2901 I.getOperand(2), I.getOperand(3), MIB);
2902
2903 // Now, put the overflow result in the register given by the first operand
2904 // to the overflow op. CSINC increments the result when the predicate is
2905 // false, so to get the increment when it's true, we need to use the
2906 // inverse. In this case, we want to increment when carry is set.
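    // For example, G_UADDO's overflow condition is HS (carry set), so we emit
    // CSINC ..., WZR, WZR, LO, which produces 1 exactly when HS holds.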
2907 Register ZReg = AArch64::WZR;
2908 auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
2909 {ZReg, ZReg})
2910 .addImm(getInvertedCondCode(OpAndCC.second));
2911 constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
2912 I.eraseFromParent();
2913 return true;
2914 }
2915
2916 case TargetOpcode::G_PTRMASK: {
2917 Register MaskReg = I.getOperand(2).getReg();
2918 Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
2919 // TODO: Implement arbitrary cases
2920 if (!MaskVal || !isShiftedMask_64(*MaskVal))
2921 return false;
2922
2923 uint64_t Mask = *MaskVal;
2924 I.setDesc(TII.get(AArch64::ANDXri));
2925 I.getOperand(2).ChangeToImmediate(
2926 AArch64_AM::encodeLogicalImmediate(Mask, 64));
2927
2928 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2929 }
2930 case TargetOpcode::G_PTRTOINT:
2931 case TargetOpcode::G_TRUNC: {
2932 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2933 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
2934
2935 const Register DstReg = I.getOperand(0).getReg();
2936 const Register SrcReg = I.getOperand(1).getReg();
2937
2938 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2939 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2940
2941 if (DstRB.getID() != SrcRB.getID()) {
2942 LLVM_DEBUG(
2943 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
2944 return false;
2945 }
2946
2947 if (DstRB.getID() == AArch64::GPRRegBankID) {
2948 const TargetRegisterClass *DstRC =
2949 getRegClassForTypeOnBank(DstTy, DstRB, RBI);
2950 if (!DstRC)
2951 return false;
2952
2953 const TargetRegisterClass *SrcRC =
2954 getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
2955 if (!SrcRC)
2956 return false;
2957
2958 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
2959 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
2960 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
2961 return false;
2962 }
2963
2964 if (DstRC == SrcRC) {
2965 // Nothing to be done
2966 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
2967 SrcTy == LLT::scalar(64)) {
2968 llvm_unreachable("TableGen can import this case");
2969 return false;
2970 } else if (DstRC == &AArch64::GPR32RegClass &&
2971 SrcRC == &AArch64::GPR64RegClass) {
2972 I.getOperand(1).setSubReg(AArch64::sub_32);
2973 } else {
2974 LLVM_DEBUG(
2975 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
2976 return false;
2977 }
2978
2979 I.setDesc(TII.get(TargetOpcode::COPY));
2980 return true;
2981 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
2982 if (DstTy == LLT::fixed_vector(4, 16) &&
2983 SrcTy == LLT::fixed_vector(4, 32)) {
2984 I.setDesc(TII.get(AArch64::XTNv4i16));
2985 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2986 return true;
2987 }
2988
2989 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
2990 MachineInstr *Extract = emitExtractVectorElt(
2991 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
2992 if (!Extract)
2993 return false;
2994 I.eraseFromParent();
2995 return true;
2996 }
2997
2998 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
2999 if (Opcode == TargetOpcode::G_PTRTOINT) {
3000 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3001 I.setDesc(TII.get(TargetOpcode::COPY));
3002 return true;
3003 }
3004 }
3005
3006 return false;
3007 }
3008
3009 case TargetOpcode::G_ANYEXT: {
3010 const Register DstReg = I.getOperand(0).getReg();
3011 const Register SrcReg = I.getOperand(1).getReg();
3012
3013 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3014 if (RBDst.getID() != AArch64::GPRRegBankID) {
3015 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3016 << ", expected: GPR\n");
3017 return false;
3018 }
3019
3020 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3021 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3022 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3023 << ", expected: GPR\n");
3024 return false;
3025 }
3026
3027 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3028
3029 if (DstSize == 0) {
3030 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3031 return false;
3032 }
3033
3034 if (DstSize != 64 && DstSize > 32) {
3035 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3036 << ", expected: 32 or 64\n");
3037 return false;
3038 }
3039 // At this point G_ANYEXT is just like a plain COPY, but we need
3040 // to explicitly form the 64-bit value if any.
3041 if (DstSize > 32) {
3042 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3043 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3044 .addDef(ExtSrc)
3045 .addImm(0)
3046 .addUse(SrcReg)
3047 .addImm(AArch64::sub_32);
3048 I.getOperand(1).setReg(ExtSrc);
3049 }
3050 return selectCopy(I, TII, MRI, TRI, RBI);
3051 }
3052
3053 case TargetOpcode::G_ZEXT:
3054 case TargetOpcode::G_SEXT_INREG:
3055 case TargetOpcode::G_SEXT: {
3056 unsigned Opcode = I.getOpcode();
3057 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3058 const Register DefReg = I.getOperand(0).getReg();
3059 Register SrcReg = I.getOperand(1).getReg();
3060 const LLT DstTy = MRI.getType(DefReg);
3061 const LLT SrcTy = MRI.getType(SrcReg);
3062 unsigned DstSize = DstTy.getSizeInBits();
3063 unsigned SrcSize = SrcTy.getSizeInBits();
3064
3065 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3066 // extended is encoded in the imm.
3067 if (Opcode == TargetOpcode::G_SEXT_INREG)
3068 SrcSize = I.getOperand(2).getImm();
3069
3070 if (DstTy.isVector())
3071 return false; // Should be handled by imported patterns.
3072
3073 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3074 AArch64::GPRRegBankID &&
3075 "Unexpected ext regbank");
3076
3077 MachineInstr *ExtI;
3078
3079     // First, check whether we're extending the result of a load whose dest type
3080     // is smaller than 32 bits; if so, this zext is redundant. GPR32 is the smallest
3081 // GPR register on AArch64 and all loads which are smaller automatically
3082 // zero-extend the upper bits. E.g.
3083 // %v(s8) = G_LOAD %p, :: (load 1)
3084 // %v2(s32) = G_ZEXT %v(s8)
3085 if (!IsSigned) {
3086 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3087 bool IsGPR =
3088 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3089 if (LoadMI && IsGPR) {
3090 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3091 unsigned BytesLoaded = MemOp->getSize();
3092 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3093 return selectCopy(I, TII, MRI, TRI, RBI);
3094 }
3095
3096 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3097 // + SUBREG_TO_REG.
3098 //
3099 // If we are zero extending from 32 bits to 64 bits, it's possible that
3100 // the instruction implicitly does the zero extend for us. In that case,
3101 // we only need the SUBREG_TO_REG.
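      // Roughly: if the source doesn't already zero the upper 32 bits we first
      // emit %or:gpr32 = ORRWrs $wzr, %src, 0, and then in either case emit
      // %dst:gpr64 = SUBREG_TO_REG 0, <%src or %or>, %subreg.sub_32.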
3102 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3103 // Unlike with the G_LOAD case, we don't want to look through copies
3104 // here. (See isDef32.)
3105 MachineInstr *Def = MRI.getVRegDef(SrcReg);
3106 Register SubregToRegSrc = SrcReg;
3107
3108 // Does the instruction implicitly zero extend?
3109 if (!Def || !isDef32(*Def)) {
3110 // No. Zero out using an OR.
3111 Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3112 const Register ZReg = AArch64::WZR;
3113 MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
3114 SubregToRegSrc = OrDst;
3115 }
3116
3117 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3118 .addImm(0)
3119 .addUse(SubregToRegSrc)
3120 .addImm(AArch64::sub_32);
3121
3122 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3123 MRI)) {
3124 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3125 return false;
3126 }
3127
3128 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3129 MRI)) {
3130 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3131 return false;
3132 }
3133
3134 I.eraseFromParent();
3135 return true;
3136 }
3137 }
3138
3139 if (DstSize == 64) {
3140 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3141 // FIXME: Can we avoid manually doing this?
3142 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3143 MRI)) {
3144 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3145 << " operand\n");
3146 return false;
3147 }
3148 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3149 {&AArch64::GPR64RegClass}, {})
3150 .addImm(0)
3151 .addUse(SrcReg)
3152 .addImm(AArch64::sub_32)
3153 .getReg(0);
3154 }
3155
3156 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3157 {DefReg}, {SrcReg})
3158 .addImm(0)
3159 .addImm(SrcSize - 1);
3160 } else if (DstSize <= 32) {
3161 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3162 {DefReg}, {SrcReg})
3163 .addImm(0)
3164 .addImm(SrcSize - 1);
3165 } else {
3166 return false;
3167 }
3168
3169 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3170 I.eraseFromParent();
3171 return true;
3172 }
3173
3174 case TargetOpcode::G_SITOFP:
3175 case TargetOpcode::G_UITOFP:
3176 case TargetOpcode::G_FPTOSI:
3177 case TargetOpcode::G_FPTOUI: {
3178 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3179 SrcTy = MRI.getType(I.getOperand(1).getReg());
3180 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3181 if (NewOpc == Opcode)
3182 return false;
3183
3184 I.setDesc(TII.get(NewOpc));
3185 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3186
3187 return true;
3188 }
3189
3190 case TargetOpcode::G_FREEZE:
3191 return selectCopy(I, TII, MRI, TRI, RBI);
3192
3193 case TargetOpcode::G_INTTOPTR:
3194 // The importer is currently unable to import pointer types since they
3195 // didn't exist in SelectionDAG.
3196 return selectCopy(I, TII, MRI, TRI, RBI);
3197
3198 case TargetOpcode::G_BITCAST:
3199 // Imported SelectionDAG rules can handle every bitcast except those that
3200 // bitcast from a type to the same type. Ideally, these shouldn't occur
3201 // but we might not run an optimizer that deletes them. The other exception
3202 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3203 // of them.
3204 return selectCopy(I, TII, MRI, TRI, RBI);
3205
3206 case TargetOpcode::G_SELECT: {
3207 if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
3208 LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
3209 << ", expected: " << LLT::scalar(1) << '\n');
3210 return false;
3211 }
3212
3213 const Register CondReg = I.getOperand(1).getReg();
3214 const Register TReg = I.getOperand(2).getReg();
3215 const Register FReg = I.getOperand(3).getReg();
3216
3217 if (tryOptSelect(I))
3218 return true;
3219
3220 // Make sure to use an unused vreg instead of wzr, so that the peephole
3221 // optimizations will be able to optimize these.
3222 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3223 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3224 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3225 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3226 if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
3227 return false;
3228 I.eraseFromParent();
3229 return true;
3230 }
3231 case TargetOpcode::G_ICMP: {
3232 if (Ty.isVector())
3233 return selectVectorICmp(I, MRI);
3234
3235 if (Ty != LLT::scalar(32)) {
3236 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3237 << ", expected: " << LLT::scalar(32) << '\n');
3238 return false;
3239 }
3240
3241 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3242 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
3243 MIB);
3244 emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
3245 I.eraseFromParent();
3246 return true;
3247 }
3248
3249 case TargetOpcode::G_FCMP: {
3250 CmpInst::Predicate Pred =
3251 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3252 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3253 Pred) ||
3254 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3255 return false;
3256 I.eraseFromParent();
3257 return true;
3258 }
3259 case TargetOpcode::G_VASTART:
3260 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3261 : selectVaStartAAPCS(I, MF, MRI);
3262 case TargetOpcode::G_INTRINSIC:
3263 return selectIntrinsic(I, MRI);
3264 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3265 return selectIntrinsicWithSideEffects(I, MRI);
3266 case TargetOpcode::G_IMPLICIT_DEF: {
3267 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3268 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3269 const Register DstReg = I.getOperand(0).getReg();
3270 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3271 const TargetRegisterClass *DstRC =
3272 getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3273 RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3274 return true;
3275 }
3276 case TargetOpcode::G_BLOCK_ADDR: {
3277 if (TM.getCodeModel() == CodeModel::Large) {
3278 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3279 I.eraseFromParent();
3280 return true;
3281 } else {
3282 I.setDesc(TII.get(AArch64::MOVaddrBA));
3283 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3284 I.getOperand(0).getReg())
3285 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3286 /* Offset */ 0, AArch64II::MO_PAGE)
3287 .addBlockAddress(
3288 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3289 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3290 I.eraseFromParent();
3291 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3292 }
3293 }
3294 case AArch64::G_DUP: {
3295     // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by the
3296     // imported patterns, so do it manually here. Avoiding generating an s16 gpr
3297     // is difficult because at RBS we may end up pessimizing the fpr case if we
3298     // decided to add an anyextend to fix this. Manual selection is the most
3299 // robust solution for now.
3300 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3301 AArch64::GPRRegBankID)
3302 return false; // We expect the fpr regbank case to be imported.
3303 LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3304 if (VecTy == LLT::fixed_vector(8, 8))
3305 I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3306 else if (VecTy == LLT::fixed_vector(16, 8))
3307 I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3308 else if (VecTy == LLT::fixed_vector(4, 16))
3309 I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3310 else if (VecTy == LLT::fixed_vector(8, 16))
3311 I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3312 else
3313 return false;
3314 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3315 }
3316 case TargetOpcode::G_INTRINSIC_TRUNC:
3317 return selectIntrinsicTrunc(I, MRI);
3318 case TargetOpcode::G_INTRINSIC_ROUND:
3319 return selectIntrinsicRound(I, MRI);
3320 case TargetOpcode::G_BUILD_VECTOR:
3321 return selectBuildVector(I, MRI);
3322 case TargetOpcode::G_MERGE_VALUES:
3323 return selectMergeValues(I, MRI);
3324 case TargetOpcode::G_UNMERGE_VALUES:
3325 return selectUnmergeValues(I, MRI);
3326 case TargetOpcode::G_SHUFFLE_VECTOR:
3327 return selectShuffleVector(I, MRI);
3328 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3329 return selectExtractElt(I, MRI);
3330 case TargetOpcode::G_INSERT_VECTOR_ELT:
3331 return selectInsertElt(I, MRI);
3332 case TargetOpcode::G_CONCAT_VECTORS:
3333 return selectConcatVectors(I, MRI);
3334 case TargetOpcode::G_JUMP_TABLE:
3335 return selectJumpTable(I, MRI);
3336 case TargetOpcode::G_VECREDUCE_FADD:
3337 case TargetOpcode::G_VECREDUCE_ADD:
3338 return selectReduction(I, MRI);
3339 }
3340
3341 return false;
3342 }
3343
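// Select G_VECREDUCE_ADD / G_VECREDUCE_FADD by mapping the vector type onto
// the matching ADDV/ADDP/FADDP instruction.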
3344 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3345 MachineRegisterInfo &MRI) {
3346 Register VecReg = I.getOperand(1).getReg();
3347 LLT VecTy = MRI.getType(VecReg);
3348 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3349 // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3350 // a subregister copy afterwards.
3351 if (VecTy == LLT::fixed_vector(2, 32)) {
3352 Register DstReg = I.getOperand(0).getReg();
3353 auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3354 {VecReg, VecReg});
3355 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3356 .addReg(AddP.getReg(0), 0, AArch64::ssub)
3357 .getReg(0);
3358 RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3359 I.eraseFromParent();
3360 return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3361 }
3362
3363 unsigned Opc = 0;
3364 if (VecTy == LLT::fixed_vector(16, 8))
3365 Opc = AArch64::ADDVv16i8v;
3366 else if (VecTy == LLT::fixed_vector(8, 16))
3367 Opc = AArch64::ADDVv8i16v;
3368 else if (VecTy == LLT::fixed_vector(4, 32))
3369 Opc = AArch64::ADDVv4i32v;
3370 else if (VecTy == LLT::fixed_vector(2, 64))
3371 Opc = AArch64::ADDPv2i64p;
3372 else {
3373 LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3374 return false;
3375 }
3376 I.setDesc(TII.get(Opc));
3377 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3378 }
3379
3380 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3381 unsigned Opc = 0;
3382 if (VecTy == LLT::fixed_vector(2, 32))
3383 Opc = AArch64::FADDPv2i32p;
3384 else if (VecTy == LLT::fixed_vector(2, 64))
3385 Opc = AArch64::FADDPv2i64p;
3386 else {
3387 LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3388 return false;
3389 }
3390 I.setDesc(TII.get(Opc));
3391 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3392 }
3393 return false;
3394 }
3395
3396 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3397 MachineRegisterInfo &MRI) {
3398 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3399 Register JTAddr = I.getOperand(0).getReg();
3400 unsigned JTI = I.getOperand(1).getIndex();
3401 Register Index = I.getOperand(2).getReg();
3402
3403 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3404 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3405
3406 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
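  // JumpTableDest32 computes the branch destination from the table address and
  // the index (using the 4-byte entries registered above) and leaves it in
  // TargetReg; ScratchReg is only used as a temporary.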
3407 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3408 {TargetReg, ScratchReg}, {JTAddr, Index})
3409 .addJumpTableIndex(JTI);
3410 // Build the indirect branch.
3411 MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3412 I.eraseFromParent();
3413 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3414 }
3415
3416 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3417 MachineRegisterInfo &MRI) {
3418 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3419 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3420
3421 Register DstReg = I.getOperand(0).getReg();
3422 unsigned JTI = I.getOperand(1).getIndex();
3423 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3424 auto MovMI =
3425 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3426 .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3427 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3428 I.eraseFromParent();
3429 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3430 }
3431
3432 bool AArch64InstructionSelector::selectTLSGlobalValue(
3433 MachineInstr &I, MachineRegisterInfo &MRI) {
3434 if (!STI.isTargetMachO())
3435 return false;
3436 MachineFunction &MF = *I.getParent()->getParent();
3437 MF.getFrameInfo().setAdjustsStack(true);
3438
3439 const auto &GlobalOp = I.getOperand(1);
3440 assert(GlobalOp.getOffset() == 0 &&
3441 "Shouldn't have an offset on TLS globals!");
3442 const GlobalValue &GV = *GlobalOp.getGlobal();
3443
3444 auto LoadGOT =
3445 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3446 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3447
3448 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3449 {LoadGOT.getReg(0)})
3450 .addImm(0);
3451
3452 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3453 // TLS calls preserve all registers except those that absolutely must be
3454 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3455 // silly).
3456 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3457 .addUse(AArch64::X0, RegState::Implicit)
3458 .addDef(AArch64::X0, RegState::Implicit)
3459 .addRegMask(TRI.getTLSCallPreservedMask());
3460
3461 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3462 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3463 MRI);
3464 I.eraseFromParent();
3465 return true;
3466 }
3467
3468 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3469 MachineInstr &I, MachineRegisterInfo &MRI) const {
3470 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3471
3472 // Select the correct opcode.
3473 unsigned Opc = 0;
3474 if (!SrcTy.isVector()) {
3475 switch (SrcTy.getSizeInBits()) {
3476 default:
3477 case 16:
3478 Opc = AArch64::FRINTZHr;
3479 break;
3480 case 32:
3481 Opc = AArch64::FRINTZSr;
3482 break;
3483 case 64:
3484 Opc = AArch64::FRINTZDr;
3485 break;
3486 }
3487 } else {
3488 unsigned NumElts = SrcTy.getNumElements();
3489 switch (SrcTy.getElementType().getSizeInBits()) {
3490 default:
3491 break;
3492 case 16:
3493 if (NumElts == 4)
3494 Opc = AArch64::FRINTZv4f16;
3495 else if (NumElts == 8)
3496 Opc = AArch64::FRINTZv8f16;
3497 break;
3498 case 32:
3499 if (NumElts == 2)
3500 Opc = AArch64::FRINTZv2f32;
3501 else if (NumElts == 4)
3502 Opc = AArch64::FRINTZv4f32;
3503 break;
3504 case 64:
3505 if (NumElts == 2)
3506 Opc = AArch64::FRINTZv2f64;
3507 break;
3508 }
3509 }
3510
3511 if (!Opc) {
3512 // Didn't get an opcode above, bail.
3513 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3514 return false;
3515 }
3516
3517 // Legalization would have set us up perfectly for this; we just need to
3518 // set the opcode and move on.
3519 I.setDesc(TII.get(Opc));
3520 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3521 }
3522
3523 bool AArch64InstructionSelector::selectIntrinsicRound(
3524 MachineInstr &I, MachineRegisterInfo &MRI) const {
3525 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3526
3527 // Select the correct opcode.
3528 unsigned Opc = 0;
3529 if (!SrcTy.isVector()) {
3530 switch (SrcTy.getSizeInBits()) {
3531 default:
3532 case 16:
3533 Opc = AArch64::FRINTAHr;
3534 break;
3535 case 32:
3536 Opc = AArch64::FRINTASr;
3537 break;
3538 case 64:
3539 Opc = AArch64::FRINTADr;
3540 break;
3541 }
3542 } else {
3543 unsigned NumElts = SrcTy.getNumElements();
3544 switch (SrcTy.getElementType().getSizeInBits()) {
3545 default:
3546 break;
3547 case 16:
3548 if (NumElts == 4)
3549 Opc = AArch64::FRINTAv4f16;
3550 else if (NumElts == 8)
3551 Opc = AArch64::FRINTAv8f16;
3552 break;
3553 case 32:
3554 if (NumElts == 2)
3555 Opc = AArch64::FRINTAv2f32;
3556 else if (NumElts == 4)
3557 Opc = AArch64::FRINTAv4f32;
3558 break;
3559 case 64:
3560 if (NumElts == 2)
3561 Opc = AArch64::FRINTAv2f64;
3562 break;
3563 }
3564 }
3565
3566 if (!Opc) {
3567 // Didn't get an opcode above, bail.
3568 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3569 return false;
3570 }
3571
3572 // Legalization would have set us up perfectly for this; we just need to
3573 // set the opcode and move on.
3574 I.setDesc(TII.get(Opc));
3575 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3576 }
3577
3578 bool AArch64InstructionSelector::selectVectorICmp(
3579 MachineInstr &I, MachineRegisterInfo &MRI) {
3580 Register DstReg = I.getOperand(0).getReg();
3581 LLT DstTy = MRI.getType(DstReg);
3582 Register SrcReg = I.getOperand(2).getReg();
3583 Register Src2Reg = I.getOperand(3).getReg();
3584 LLT SrcTy = MRI.getType(SrcReg);
3585
3586 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3587 unsigned NumElts = DstTy.getNumElements();
3588
3589 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3590 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3591 // Third index is cc opcode:
3592 // 0 == eq
3593 // 1 == ugt
3594 // 2 == uge
3595 // 3 == ult
3596 // 4 == ule
3597 // 5 == sgt
3598 // 6 == sge
3599 // 7 == slt
3600 // 8 == sle
3601 // ne is done by negating 'eq' result.
3602
3603   // The table below assumes that for some comparisons the operands will be
3604 // commuted.
3605 // ult op == commute + ugt op
3606 // ule op == commute + uge op
3607 // slt op == commute + sgt op
3608 // sle op == commute + sge op
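  // e.g. an 'ult' compare of x and y is selected as CMHI y, x.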
3609 unsigned PredIdx = 0;
3610 bool SwapOperands = false;
3611 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3612 switch (Pred) {
3613 case CmpInst::ICMP_NE:
3614 case CmpInst::ICMP_EQ:
3615 PredIdx = 0;
3616 break;
3617 case CmpInst::ICMP_UGT:
3618 PredIdx = 1;
3619 break;
3620 case CmpInst::ICMP_UGE:
3621 PredIdx = 2;
3622 break;
3623 case CmpInst::ICMP_ULT:
3624 PredIdx = 3;
3625 SwapOperands = true;
3626 break;
3627 case CmpInst::ICMP_ULE:
3628 PredIdx = 4;
3629 SwapOperands = true;
3630 break;
3631 case CmpInst::ICMP_SGT:
3632 PredIdx = 5;
3633 break;
3634 case CmpInst::ICMP_SGE:
3635 PredIdx = 6;
3636 break;
3637 case CmpInst::ICMP_SLT:
3638 PredIdx = 7;
3639 SwapOperands = true;
3640 break;
3641 case CmpInst::ICMP_SLE:
3642 PredIdx = 8;
3643 SwapOperands = true;
3644 break;
3645 default:
3646 llvm_unreachable("Unhandled icmp predicate");
3647 return false;
3648 }
3649
3650 // This table obviously should be tablegen'd when we have our GISel native
3651 // tablegen selector.
3652
3653 static const unsigned OpcTable[4][4][9] = {
3654 {
3655 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3656 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3657 0 /* invalid */},
3658 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3659 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3660 0 /* invalid */},
3661 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3662 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3663 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3664 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3665 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3666 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3667 },
3668 {
3669 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3670 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3671 0 /* invalid */},
3672 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3673 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3674 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3675 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3676 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3677 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3678 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3679 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3680 0 /* invalid */}
3681 },
3682 {
3683 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3684 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3685 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3686 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3687 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3688 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3689 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3690 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3691 0 /* invalid */},
3692 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3693 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3694 0 /* invalid */}
3695 },
3696 {
3697 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3698 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3699 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3700 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3701 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3702 0 /* invalid */},
3703 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3704 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3705 0 /* invalid */},
3706 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3707 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3708 0 /* invalid */}
3709 },
3710 };
3711 unsigned EltIdx = Log2_32(SrcEltSize / 8);
3712 unsigned NumEltsIdx = Log2_32(NumElts / 2);
3713 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3714 if (!Opc) {
3715 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3716 return false;
3717 }
3718
3719 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3720 const TargetRegisterClass *SrcRC =
3721 getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
3722 if (!SrcRC) {
3723 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3724 return false;
3725 }
3726
3727 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3728 if (SrcTy.getSizeInBits() == 128)
3729 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3730
3731 if (SwapOperands)
3732 std::swap(SrcReg, Src2Reg);
3733
3734 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3735 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3736
3737 // Invert if we had a 'ne' cc.
3738 if (NotOpc) {
3739 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3740 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3741 } else {
3742 MIB.buildCopy(DstReg, Cmp.getReg(0));
3743 }
3744 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3745 I.eraseFromParent();
3746 return true;
3747 }
3748
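// Put a scalar into lane 0 of a vector register: create an IMPLICIT_DEF of the
// destination class and INSERT_SUBREG the scalar into the h/s/d subregister
// that matches EltSize.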
3749 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3750 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3751 MachineIRBuilder &MIRBuilder) const {
3752 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3753
3754 auto BuildFn = [&](unsigned SubregIndex) {
3755 auto Ins =
3756 MIRBuilder
3757 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3758 .addImm(SubregIndex);
3759 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3760 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3761 return &*Ins;
3762 };
3763
3764 switch (EltSize) {
3765 case 16:
3766 return BuildFn(AArch64::hsub);
3767 case 32:
3768 return BuildFn(AArch64::ssub);
3769 case 64:
3770 return BuildFn(AArch64::dsub);
3771 default:
3772 return nullptr;
3773 }
3774 }
3775
3776 bool AArch64InstructionSelector::selectMergeValues(
3777 MachineInstr &I, MachineRegisterInfo &MRI) {
3778 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3779 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3780 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3781 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3782 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3783
3784 if (I.getNumOperands() != 3)
3785 return false;
3786
3787 // Merging 2 s64s into an s128.
3788 if (DstTy == LLT::scalar(128)) {
3789 if (SrcTy.getSizeInBits() != 64)
3790 return false;
3791 Register DstReg = I.getOperand(0).getReg();
3792 Register Src1Reg = I.getOperand(1).getReg();
3793 Register Src2Reg = I.getOperand(2).getReg();
3794 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3795 MachineInstr *InsMI =
3796 emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3797 if (!InsMI)
3798 return false;
3799 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3800 Src2Reg, /* LaneIdx */ 1, RB, MIB);
3801 if (!Ins2MI)
3802 return false;
3803 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3804 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3805 I.eraseFromParent();
3806 return true;
3807 }
3808
3809 if (RB.getID() != AArch64::GPRRegBankID)
3810 return false;
3811
3812 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3813 return false;
3814
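  // Merge two s32s into an s64 on GPRs: widen both halves with SUBREG_TO_REG,
  // then BFM the second one into bits [63:32] of the first.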
3815 auto *DstRC = &AArch64::GPR64RegClass;
3816 Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3817 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3818 TII.get(TargetOpcode::SUBREG_TO_REG))
3819 .addDef(SubToRegDef)
3820 .addImm(0)
3821 .addUse(I.getOperand(1).getReg())
3822 .addImm(AArch64::sub_32);
3823 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3824 // Need to anyext the second scalar before we can use bfm
3825 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3826 TII.get(TargetOpcode::SUBREG_TO_REG))
3827 .addDef(SubToRegDef2)
3828 .addImm(0)
3829 .addUse(I.getOperand(2).getReg())
3830 .addImm(AArch64::sub_32);
3831 MachineInstr &BFM =
3832 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3833 .addDef(I.getOperand(0).getReg())
3834 .addUse(SubToRegDef)
3835 .addUse(SubToRegDef2)
3836 .addImm(32)
3837 .addImm(31);
3838 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3839 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3840 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3841 I.eraseFromParent();
3842 return true;
3843 }
3844
3845 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3846 const unsigned EltSize) {
3847 // Choose a lane copy opcode and subregister based off of the size of the
3848 // vector's elements.
3849 switch (EltSize) {
3850 case 16:
3851 CopyOpc = AArch64::CPYi16;
3852 ExtractSubReg = AArch64::hsub;
3853 break;
3854 case 32:
3855 CopyOpc = AArch64::CPYi32;
3856 ExtractSubReg = AArch64::ssub;
3857 break;
3858 case 64:
3859 CopyOpc = AArch64::CPYi64;
3860 ExtractSubReg = AArch64::dsub;
3861 break;
3862 default:
3863 // Unknown size, bail out.
3864 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3865 return false;
3866 }
3867 return true;
3868 }
3869
3870 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3871 Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3872 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3873 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3874 unsigned CopyOpc = 0;
3875 unsigned ExtractSubReg = 0;
3876 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
3877 LLVM_DEBUG(
3878 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3879 return nullptr;
3880 }
3881
3882 const TargetRegisterClass *DstRC =
3883 getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
3884 if (!DstRC) {
3885 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3886 return nullptr;
3887 }
3888
3889 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3890 const LLT &VecTy = MRI.getType(VecReg);
3891 const TargetRegisterClass *VecRC =
3892 getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
3893 if (!VecRC) {
3894 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3895 return nullptr;
3896 }
3897
3898 // The register that we're going to copy into.
3899 Register InsertReg = VecReg;
3900 if (!DstReg)
3901 DstReg = MRI.createVirtualRegister(DstRC);
3902 // If the lane index is 0, we just use a subregister COPY.
3903 if (LaneIdx == 0) {
3904 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
3905 .addReg(VecReg, 0, ExtractSubReg);
3906 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3907 return &*Copy;
3908 }
3909
3910 // Lane copies require 128-bit wide registers. If we're dealing with an
3911 // unpacked vector, then we need to move up to that width. Insert an implicit
3912 // def and a subregister insert to get us there.
3913 if (VecTy.getSizeInBits() != 128) {
3914 MachineInstr *ScalarToVector = emitScalarToVector(
3915 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
3916 if (!ScalarToVector)
3917 return nullptr;
3918 InsertReg = ScalarToVector->getOperand(0).getReg();
3919 }
3920
3921 MachineInstr *LaneCopyMI =
3922 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
3923 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
3924
3925 // Make sure that we actually constrain the initial copy.
3926 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3927 return LaneCopyMI;
3928 }
3929
3930 bool AArch64InstructionSelector::selectExtractElt(
3931 MachineInstr &I, MachineRegisterInfo &MRI) {
3932 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
3933 "unexpected opcode!");
3934 Register DstReg = I.getOperand(0).getReg();
3935 const LLT NarrowTy = MRI.getType(DstReg);
3936 const Register SrcReg = I.getOperand(1).getReg();
3937 const LLT WideTy = MRI.getType(SrcReg);
3938 (void)WideTy;
3939 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
3940 "source register size too small!");
3941 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
3942
3943 // Need the lane index to determine the correct copy opcode.
3944 MachineOperand &LaneIdxOp = I.getOperand(2);
3945 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
3946
3947 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
3948 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
3949 return false;
3950 }
3951
3952 // Find the index to extract from.
3953 auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
3954 if (!VRegAndVal)
3955 return false;
3956 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
3957
3958
3959 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3960 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
3961 LaneIdx, MIB);
3962 if (!Extract)
3963 return false;
3964
3965 I.eraseFromParent();
3966 return true;
3967 }
3968
3969 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
3970 MachineInstr &I, MachineRegisterInfo &MRI) {
3971 unsigned NumElts = I.getNumOperands() - 1;
3972 Register SrcReg = I.getOperand(NumElts).getReg();
3973 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
3974 const LLT SrcTy = MRI.getType(SrcReg);
3975
3976 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
3977 if (SrcTy.getSizeInBits() > 128) {
3978 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
3979 return false;
3980 }
3981
3982 // We implement a split vector operation by treating the sub-vectors as
3983 // scalars and extracting them.
3984 const RegisterBank &DstRB =
3985 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
3986 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
3987 Register Dst = I.getOperand(OpIdx).getReg();
3988 MachineInstr *Extract =
3989 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
3990 if (!Extract)
3991 return false;
3992 }
3993 I.eraseFromParent();
3994 return true;
3995 }
3996
3997 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
3998 MachineRegisterInfo &MRI) {
3999 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4000 "unexpected opcode");
4001
4002 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4003 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4004 AArch64::FPRRegBankID ||
4005 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4006 AArch64::FPRRegBankID) {
4007 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4008 "currently unsupported.\n");
4009 return false;
4010 }
4011
4012 // The last operand is the vector source register, and every other operand is
4013 // a register to unpack into.
4014 unsigned NumElts = I.getNumOperands() - 1;
4015 Register SrcReg = I.getOperand(NumElts).getReg();
4016 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4017 const LLT WideTy = MRI.getType(SrcReg);
4018 (void)WideTy;
4019 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4020 "can only unmerge from vector or s128 types!");
4021 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4022 "source register size too small!");
4023
4024 if (!NarrowTy.isScalar())
4025 return selectSplitVectorUnmerge(I, MRI);
4026
4027 // Choose a lane copy opcode and subregister based off of the size of the
4028 // vector's elements.
4029 unsigned CopyOpc = 0;
4030 unsigned ExtractSubReg = 0;
4031 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4032 return false;
4033
4034 // Set up for the lane copies.
4035 MachineBasicBlock &MBB = *I.getParent();
4036
4037 // Stores the registers we'll be copying from.
4038 SmallVector<Register, 4> InsertRegs;
4039
4040 // We'll use the first register twice, so we only need NumElts-1 registers.
4041 unsigned NumInsertRegs = NumElts - 1;
4042
4043 // If our elements fit into exactly 128 bits, then we can copy from the source
4044 // directly. Otherwise, we need to do a bit of setup with some subregister
4045 // inserts.
4046 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4047 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4048 } else {
4049 // No. We have to perform subregister inserts. For each insert, create an
4050 // implicit def and a subregister insert, and save the register we create.
4051 const TargetRegisterClass *RC =
4052 getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
4053 WideTy.getScalarSizeInBits() * NumElts);
4054 unsigned SubReg = 0;
4055 bool Found = getSubRegForClass(RC, TRI, SubReg);
4056 (void)Found;
4057     assert(Found && "expected to find last operand's subreg idx");
4058 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4059 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4060 MachineInstr &ImpDefMI =
4061 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4062 ImpDefReg);
4063
4064 // Now, create the subregister insert from SrcReg.
4065 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4066 MachineInstr &InsMI =
4067 *BuildMI(MBB, I, I.getDebugLoc(),
4068 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4069 .addUse(ImpDefReg)
4070 .addUse(SrcReg)
4071 .addImm(SubReg);
4072
4073 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4074 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4075
4076 // Save the register so that we can copy from it after.
4077 InsertRegs.push_back(InsertReg);
4078 }
4079 }
4080
4081 // Now that we've created any necessary subregister inserts, we can
4082 // create the copies.
4083 //
4084 // Perform the first copy separately as a subregister copy.
4085 Register CopyTo = I.getOperand(0).getReg();
4086 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4087 .addReg(InsertRegs[0], 0, ExtractSubReg);
4088 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4089
4090 // Now, perform the remaining copies as vector lane copies.
4091 unsigned LaneIdx = 1;
4092 for (Register InsReg : InsertRegs) {
4093 Register CopyTo = I.getOperand(LaneIdx).getReg();
4094 MachineInstr &CopyInst =
4095 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4096 .addUse(InsReg)
4097 .addImm(LaneIdx);
4098 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4099 ++LaneIdx;
4100 }
4101
4102 // Separately constrain the first copy's destination. Because of the
4103 // limitation in constrainOperandRegClass, we can't guarantee that this will
4104 // actually be constrained. So, do it ourselves using the second operand.
4105 const TargetRegisterClass *RC =
4106 MRI.getRegClassOrNull(I.getOperand(1).getReg());
4107 if (!RC) {
4108 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4109 return false;
4110 }
4111
4112 RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4113 I.eraseFromParent();
4114 return true;
4115 }
4116
4117 bool AArch64InstructionSelector::selectConcatVectors(
4118 MachineInstr &I, MachineRegisterInfo &MRI) {
4119 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4120 "Unexpected opcode");
4121 Register Dst = I.getOperand(0).getReg();
4122 Register Op1 = I.getOperand(1).getReg();
4123 Register Op2 = I.getOperand(2).getReg();
4124 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4125 if (!ConcatMI)
4126 return false;
4127 I.eraseFromParent();
4128 return true;
4129 }
4130
4131 unsigned
4132 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4133 MachineFunction &MF) const {
4134 Type *CPTy = CPVal->getType();
4135 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4136
4137 MachineConstantPool *MCP = MF.getConstantPool();
4138 return MCP->getConstantPoolIndex(CPVal, Alignment);
4139 }
4140
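// Materialize a constant by loading it from the constant pool: ADRP to the
// pool entry's page followed by a page-offset load of the appropriate width.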
4141 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4142 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4143 auto &MF = MIRBuilder.getMF();
4144 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4145
4146 auto Adrp =
4147 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4148 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4149
4150 MachineInstr *LoadMI = nullptr;
4151 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4152 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4153 switch (Size) {
4154 case 16:
4155 LoadMI =
4156 &*MIRBuilder
4157 .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4158 .addConstantPoolIndex(CPIdx, 0,
4159 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4160 break;
4161 case 8:
4162 LoadMI =
4163 &*MIRBuilder
4164 .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4165 .addConstantPoolIndex(CPIdx, 0,
4166 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4167 break;
4168 case 4:
4169 LoadMI =
4170 &*MIRBuilder
4171 .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4172 .addConstantPoolIndex(CPIdx, 0,
4173 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4174 break;
4175 default:
4176 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4177 << *CPVal->getType());
4178 return nullptr;
4179 }
4180 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4181 MachineMemOperand::MOLoad,
4182 Size, Align(Size)));
4183 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4184 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4185 return LoadMI;
4186 }
4187
4188 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4189 /// size and RB.
4190 static std::pair<unsigned, unsigned>
4191 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4192 unsigned Opc, SubregIdx;
4193 if (RB.getID() == AArch64::GPRRegBankID) {
4194 if (EltSize == 16) {
4195 Opc = AArch64::INSvi16gpr;
4196 SubregIdx = AArch64::ssub;
4197 } else if (EltSize == 32) {
4198 Opc = AArch64::INSvi32gpr;
4199 SubregIdx = AArch64::ssub;
4200 } else if (EltSize == 64) {
4201 Opc = AArch64::INSvi64gpr;
4202 SubregIdx = AArch64::dsub;
4203 } else {
4204 llvm_unreachable("invalid elt size!");
4205 }
4206 } else {
4207 if (EltSize == 8) {
4208 Opc = AArch64::INSvi8lane;
4209 SubregIdx = AArch64::bsub;
4210 } else if (EltSize == 16) {
4211 Opc = AArch64::INSvi16lane;
4212 SubregIdx = AArch64::hsub;
4213 } else if (EltSize == 32) {
4214 Opc = AArch64::INSvi32lane;
4215 SubregIdx = AArch64::ssub;
4216 } else if (EltSize == 64) {
4217 Opc = AArch64::INSvi64lane;
4218 SubregIdx = AArch64::dsub;
4219 } else {
4220 llvm_unreachable("invalid elt size!");
4221 }
4222 }
4223 return std::make_pair(Opc, SubregIdx);
4224 }
4225
4226 MachineInstr *AArch64InstructionSelector::emitInstr(
4227 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4228 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4229 const ComplexRendererFns &RenderFns) const {
4230 assert(Opcode && "Expected an opcode?");
4231 assert(!isPreISelGenericOpcode(Opcode) &&
4232 "Function should only be used to produce selected instructions!");
4233 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4234 if (RenderFns)
4235 for (auto &Fn : *RenderFns)
4236 Fn(MI);
4237 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4238 return &*MI;
4239 }
4240
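// Shared helper for emitADD/emitADDS/emitSUBS. The table rows are the
// addressing-mode variants tried below: [0] ri (positive immediate), [1] rs
// (shifted register), [2] rr (plain register), [3] ri with the opposite
// opcode for a negatable immediate, [4] rx (extended register). Each row is
// {64-bit opcode, 32-bit opcode}.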
4241 MachineInstr *AArch64InstructionSelector::emitAddSub(
4242 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4243 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4244 MachineIRBuilder &MIRBuilder) const {
4245 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4246 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4247 auto Ty = MRI.getType(LHS.getReg());
4248 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4249 unsigned Size = Ty.getSizeInBits();
4250 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4251 bool Is32Bit = Size == 32;
4252
4253 // INSTRri form with positive arithmetic immediate.
4254 if (auto Fns = selectArithImmed(RHS))
4255 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4256 MIRBuilder, Fns);
4257
4258 // INSTRri form with negative arithmetic immediate.
4259 if (auto Fns = selectNegArithImmed(RHS))
4260 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4261 MIRBuilder, Fns);
4262
4263 // INSTRrx form.
4264 if (auto Fns = selectArithExtendedRegister(RHS))
4265 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4266 MIRBuilder, Fns);
4267
4268 // INSTRrs form.
4269 if (auto Fns = selectShiftedRegister(RHS))
4270 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4271 MIRBuilder, Fns);
4272 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4273 MIRBuilder);
4274 }
4275
4276 MachineInstr *
4277 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4278 MachineOperand &RHS,
4279 MachineIRBuilder &MIRBuilder) const {
4280 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4281 {{AArch64::ADDXri, AArch64::ADDWri},
4282 {AArch64::ADDXrs, AArch64::ADDWrs},
4283 {AArch64::ADDXrr, AArch64::ADDWrr},
4284 {AArch64::SUBXri, AArch64::SUBWri},
4285 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4286 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4287 }
4288
4289 MachineInstr *
4290 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4291 MachineOperand &RHS,
4292 MachineIRBuilder &MIRBuilder) const {
4293 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4294 {{AArch64::ADDSXri, AArch64::ADDSWri},
4295 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4296 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4297 {AArch64::SUBSXri, AArch64::SUBSWri},
4298 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4299 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4300 }
4301
4302 MachineInstr *
4303 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4304 MachineOperand &RHS,
4305 MachineIRBuilder &MIRBuilder) const {
4306 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4307 {{AArch64::SUBSXri, AArch64::SUBSWri},
4308 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4309 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4310 {AArch64::ADDSXri, AArch64::ADDSWri},
4311 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4312 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4313 }
4314
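// CMN is ADDS with the result discarded: compare LHS against -RHS by adding
// into a scratch register and keeping only the flags.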
4315 MachineInstr *
4316 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4317 MachineIRBuilder &MIRBuilder) const {
4318 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4319 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4320 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4321 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4322 }
4323
4324 MachineInstr *
4325 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4326 MachineIRBuilder &MIRBuilder) const {
4327 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4328 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4329 LLT Ty = MRI.getType(LHS.getReg());
4330 unsigned RegSize = Ty.getSizeInBits();
4331 bool Is32Bit = (RegSize == 32);
4332 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4333 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4334 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4335 // ANDS needs a logical immediate for its immediate form. Check if we can
4336 // fold one in.
4337 if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4338 int64_t Imm = ValAndVReg->Value.getSExtValue();
4339
4340 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4341 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4342 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4343 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4344 return &*TstMI;
4345 }
4346 }
4347
4348 if (auto Fns = selectLogicalShiftedRegister(RHS))
4349 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4350 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4351 }
4352
4353 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4354 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4355 MachineIRBuilder &MIRBuilder) const {
4356 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4357 assert(Predicate.isPredicate() && "Expected predicate?");
4358 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4359 LLT CmpTy = MRI.getType(LHS.getReg());
4360 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4361 unsigned Size = CmpTy.getSizeInBits();
4362 (void)Size;
4363 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4364 // Fold the compare into a cmn or tst if possible.
4365 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4366 return FoldCmp;
4367 auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4368 return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4369 }
4370
4371 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4372 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4373 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4374 #ifndef NDEBUG
4375 LLT Ty = MRI.getType(Dst);
4376 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4377 "Expected a 32-bit scalar register?");
4378 #endif
4379 const Register ZeroReg = AArch64::WZR;
4380 auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
4381 auto CSet =
4382 MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
4383 .addImm(getInvertedCondCode(CC));
4384 constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
4385 return &*CSet;
4386 };
4387
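  // Some FP predicates (e.g. ONE, UEQ) need two AArch64 condition codes; in
  // that case we materialize a CSET for each and OR them together.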
4388 AArch64CC::CondCode CC1, CC2;
4389 changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4390 if (CC2 == AArch64CC::AL)
4391 return EmitCSet(Dst, CC1);
4392
4393 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4394 Register Def1Reg = MRI.createVirtualRegister(RC);
4395 Register Def2Reg = MRI.createVirtualRegister(RC);
4396 EmitCSet(Def1Reg, CC1);
4397 EmitCSet(Def2Reg, CC2);
4398 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4399 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4400 return &*OrMI;
4401 }
4402
4403 MachineInstr *
4404 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4405 MachineIRBuilder &MIRBuilder,
4406 Optional<CmpInst::Predicate> Pred) const {
4407 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4408 LLT Ty = MRI.getType(LHS);
4409 if (Ty.isVector())
4410 return nullptr;
4411 unsigned OpSize = Ty.getSizeInBits();
4412 if (OpSize != 32 && OpSize != 64)
4413 return nullptr;
4414
4415 // If this is a compare against +0.0, then we don't have
4416 // to explicitly materialize a constant.
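  // For example (sketch only): a compare against +0.0 can then use the
  // immediate form of the instruction,
  //   fcmp s0, #0.0
  // rather than first materializing the zero in a register.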
4417 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4418 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4419
4420 auto IsEqualityPred = [](CmpInst::Predicate P) {
4421 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4422 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4423 };
4424 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4425     // Try commuting the operands.
4426 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4427 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4428 ShouldUseImm = true;
4429 std::swap(LHS, RHS);
4430 }
4431 }
4432 unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4433 {AArch64::FCMPSri, AArch64::FCMPDri}};
4434 unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4435
4436 // Partially build the compare. Decide if we need to add a use for the
4437   // third operand based on whether or not we're comparing against 0.0.
4438 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4439 if (!ShouldUseImm)
4440 CmpMI.addUse(RHS);
4441 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4442 return &*CmpMI;
4443 }
4444
4445 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4446 Optional<Register> Dst, Register Op1, Register Op2,
4447 MachineIRBuilder &MIRBuilder) const {
4448 // We implement a vector concat by:
4449 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4450 // 2. Insert the upper vector into the destination's upper element
4451 // TODO: some of this code is common with G_BUILD_VECTOR handling.
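  // Sketch of the intended shape (illustrative, not taken from a test): for
  // two v2s32 operands this roughly becomes "widen each operand into the low
  // half of a 128-bit register, then mov v0.d[1], v1.d[0]" to place Op2 in
  // the upper 64 bits; the exact opcodes come from getInsertVecEltOpInfo.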
4452 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4453
4454 const LLT Op1Ty = MRI.getType(Op1);
4455 const LLT Op2Ty = MRI.getType(Op2);
4456
4457 if (Op1Ty != Op2Ty) {
4458 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4459 return nullptr;
4460 }
4461 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4462
4463 if (Op1Ty.getSizeInBits() >= 128) {
4464 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4465 return nullptr;
4466 }
4467
4468 // At the moment we just support 64 bit vector concats.
4469 if (Op1Ty.getSizeInBits() != 64) {
4470     LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4471 return nullptr;
4472 }
4473
4474 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4475 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4476 const TargetRegisterClass *DstRC =
4477 getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4478
4479 MachineInstr *WidenedOp1 =
4480 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4481 MachineInstr *WidenedOp2 =
4482 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4483 if (!WidenedOp1 || !WidenedOp2) {
4484 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4485 return nullptr;
4486 }
4487
4488 // Now do the insert of the upper element.
4489 unsigned InsertOpc, InsSubRegIdx;
4490 std::tie(InsertOpc, InsSubRegIdx) =
4491 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4492
4493 if (!Dst)
4494 Dst = MRI.createVirtualRegister(DstRC);
4495 auto InsElt =
4496 MIRBuilder
4497 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4498 .addImm(1) /* Lane index */
4499 .addUse(WidenedOp2->getOperand(0).getReg())
4500 .addImm(0);
4501 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4502 return &*InsElt;
4503 }
4504
4505 MachineInstr *
4506 AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
4507 MachineIRBuilder &MIRBuilder,
4508 Register SrcReg) const {
4509 // CSINC increments the result when the predicate is false. Invert it.
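  // E.g. (sketch, assuming SrcReg is WZR): a cset for "eq" comes out as
  //   csinc w0, wzr, wzr, ne
  // which produces 1 exactly when the inverted condition "ne" is false.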
4510 const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
4511 CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
4512 auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
4513 .addImm(InvCC);
4514 constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
4515 return &*I;
4516 }
4517
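/// Emit the flag-setting add/sub for an overflow operation and return the
/// condition code that signals the overflow. As a sketch (not exhaustive):
/// G_UADDO roughly becomes an "adds" whose unsigned overflow is observed via
/// HS (carry set), while G_SADDO uses VS.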
4518 std::pair<MachineInstr *, AArch64CC::CondCode>
4519 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4520 MachineOperand &LHS,
4521 MachineOperand &RHS,
4522 MachineIRBuilder &MIRBuilder) const {
4523 switch (Opcode) {
4524 default:
4525 llvm_unreachable("Unexpected opcode!");
4526 case TargetOpcode::G_SADDO:
4527 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4528 case TargetOpcode::G_UADDO:
4529 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4530 case TargetOpcode::G_SSUBO:
4531 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4532 case TargetOpcode::G_USUBO:
4533 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4534 }
4535 }
4536
4537 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
4538 MachineRegisterInfo &MRI = *MIB.getMRI();
4539 // We want to recognize this pattern:
4540 //
4541 // $z = G_FCMP pred, $x, $y
4542 // ...
4543 // $w = G_SELECT $z, $a, $b
4544 //
4545 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4546 // some copies/truncs in between.)
4547 //
4548 // If we see this, then we can emit something like this:
4549 //
4550 // fcmp $x, $y
4551 // fcsel $w, $a, $b, pred
4552 //
4553 // Rather than emitting both of the rather long sequences in the standard
4554 // G_FCMP/G_SELECT select methods.
4555
4556 // First, check if the condition is defined by a compare.
4557 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4558 while (CondDef) {
4559 // We can only fold if all of the defs have one use.
4560 Register CondDefReg = CondDef->getOperand(0).getReg();
4561 if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4562 // Unless it's another select.
4563 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4564 if (CondDef == &UI)
4565 continue;
4566 if (UI.getOpcode() != TargetOpcode::G_SELECT)
4567 return false;
4568 }
4569 }
4570
4571 // We can skip over G_TRUNC since the condition is 1-bit.
4572 // Truncating/extending can have no impact on the value.
4573 unsigned Opc = CondDef->getOpcode();
4574 if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
4575 break;
4576
4577 // Can't see past copies from physregs.
4578 if (Opc == TargetOpcode::COPY &&
4579 Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
4580 return false;
4581
4582 CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
4583 }
4584
4585 // Is the condition defined by a compare?
4586 if (!CondDef)
4587 return false;
4588
4589 unsigned CondOpc = CondDef->getOpcode();
4590 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
4591 return false;
4592
4593 AArch64CC::CondCode CondCode;
4594 if (CondOpc == TargetOpcode::G_ICMP) {
4595 auto Pred =
4596 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4597 CondCode = changeICMPPredToAArch64CC(Pred);
4598 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4599 CondDef->getOperand(1), MIB);
4600 } else {
4601 // Get the condition code for the select.
4602 auto Pred =
4603 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4604 AArch64CC::CondCode CondCode2;
4605 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4606
4607 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4608 // instructions to emit the comparison.
4609 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4610 // unnecessary.
4611 if (CondCode2 != AArch64CC::AL)
4612 return false;
4613
4614 if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4615 CondDef->getOperand(3).getReg(), MIB)) {
4616 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4617 return false;
4618 }
4619 }
4620
4621 // Emit the select.
4622 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4623 I.getOperand(3).getReg(), CondCode, MIB);
4624 I.eraseFromParent();
4625 return true;
4626 }
4627
4628 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4629 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4630 MachineIRBuilder &MIRBuilder) const {
4631 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
4632 "Unexpected MachineOperand");
4633 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4634 // We want to find this sort of thing:
4635 // x = G_SUB 0, y
4636 // G_ICMP z, x
4637 //
4638 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
4639 // e.g:
4640 //
4641 // cmn z, y
4642
4643 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
4644 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
4645 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
4646 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
4647 // Given this:
4648 //
4649 // x = G_SUB 0, y
4650 // G_ICMP x, z
4651 //
4652 // Produce this:
4653 //
4654 // cmn y, z
4655 if (isCMN(LHSDef, P, MRI))
4656 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
4657
4658 // Same idea here, but with the RHS of the compare instead:
4659 //
4660 // Given this:
4661 //
4662 // x = G_SUB 0, y
4663 // G_ICMP z, x
4664 //
4665 // Produce this:
4666 //
4667 // cmn z, y
4668 if (isCMN(RHSDef, P, MRI))
4669 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
4670
4671 // Given this:
4672 //
4673 // z = G_AND x, y
4674 // G_ICMP z, 0
4675 //
4676   // Produce this if the compare is not unsigned:
4677 //
4678 // tst x, y
4679 if (!CmpInst::isUnsigned(P) && LHSDef &&
4680 LHSDef->getOpcode() == TargetOpcode::G_AND) {
4681 // Make sure that the RHS is 0.
4682 auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4683 if (!ValAndVReg || ValAndVReg->Value != 0)
4684 return nullptr;
4685
4686 return emitTST(LHSDef->getOperand(1),
4687 LHSDef->getOperand(2), MIRBuilder);
4688 }
4689
4690 return nullptr;
4691 }
4692
4693 bool AArch64InstructionSelector::selectShuffleVector(
4694 MachineInstr &I, MachineRegisterInfo &MRI) {
4695 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4696 Register Src1Reg = I.getOperand(1).getReg();
4697 const LLT Src1Ty = MRI.getType(Src1Reg);
4698 Register Src2Reg = I.getOperand(2).getReg();
4699 const LLT Src2Ty = MRI.getType(Src2Reg);
4700 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
4701
4702 MachineBasicBlock &MBB = *I.getParent();
4703 MachineFunction &MF = *MBB.getParent();
4704 LLVMContext &Ctx = MF.getFunction().getContext();
4705
4706 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
4707   // it originated from a <1 x T> type. Those should have been lowered into
4708 // G_BUILD_VECTOR earlier.
4709 if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
4710 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
4711 return false;
4712 }
4713
4714 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
4715
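  // Sketch of the expansion (illustrative values only): for a v4s32 shuffle,
  // BytesPerElt is 4, so a mask element of 1 contributes the byte indices
  // 4, 5, 6, 7 to the TBL index vector built below.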
4716 SmallVector<Constant *, 64> CstIdxs;
4717 for (int Val : Mask) {
4718 // For now, any undef indexes we'll just assume to be 0. This should be
4719     // optimized in the future, e.g. to select DUP etc.
4720 Val = Val < 0 ? 0 : Val;
4721 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4722 unsigned Offset = Byte + Val * BytesPerElt;
4723 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
4724 }
4725 }
4726
4727 // Use a constant pool to load the index vector for TBL.
4728 Constant *CPVal = ConstantVector::get(CstIdxs);
4729 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
4730 if (!IndexLoad) {
4731 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
4732 return false;
4733 }
4734
4735 if (DstTy.getSizeInBits() != 128) {
4736 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
4737 // This case can be done with TBL1.
4738 MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
4739 if (!Concat) {
4740 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
4741 return false;
4742 }
4743
4744     // The constant pool load will be 64 bits, so we need to convert it to FPR128.
4745 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
4746 IndexLoad->getOperand(0).getReg(), MIB);
4747
4748 auto TBL1 = MIB.buildInstr(
4749 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
4750 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
4751 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
4752
4753 auto Copy =
4754 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
4755 .addReg(TBL1.getReg(0), 0, AArch64::dsub);
4756 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
4757 I.eraseFromParent();
4758 return true;
4759 }
4760
4761 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
4762 // Q registers for regalloc.
4763 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
4764 auto RegSeq = createQTuple(Regs, MIB);
4765 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
4766 {RegSeq, IndexLoad->getOperand(0)});
4767 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
4768 I.eraseFromParent();
4769 return true;
4770 }
4771
4772 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
4773 Optional<Register> DstReg, Register SrcReg, Register EltReg,
4774 unsigned LaneIdx, const RegisterBank &RB,
4775 MachineIRBuilder &MIRBuilder) const {
4776 MachineInstr *InsElt = nullptr;
4777 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4778 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4779
4780 // Create a register to define with the insert if one wasn't passed in.
4781 if (!DstReg)
4782 DstReg = MRI.createVirtualRegister(DstRC);
4783
4784 unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
4785 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
4786
4787 if (RB.getID() == AArch64::FPRRegBankID) {
4788 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
4789 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4790 .addImm(LaneIdx)
4791 .addUse(InsSub->getOperand(0).getReg())
4792 .addImm(0);
4793 } else {
4794 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4795 .addImm(LaneIdx)
4796 .addUse(EltReg);
4797 }
4798
4799 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4800 return InsElt;
4801 }
4802
4803 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
4804 MachineRegisterInfo &MRI) {
4805 assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
4806
4807 // Get information on the destination.
4808 Register DstReg = I.getOperand(0).getReg();
4809 const LLT DstTy = MRI.getType(DstReg);
4810 unsigned VecSize = DstTy.getSizeInBits();
4811
4812 // Get information on the element we want to insert into the destination.
4813 Register EltReg = I.getOperand(2).getReg();
4814 const LLT EltTy = MRI.getType(EltReg);
4815 unsigned EltSize = EltTy.getSizeInBits();
4816 if (EltSize < 16 || EltSize > 64)
4817 return false; // Don't support all element types yet.
4818
4819 // Find the definition of the index. Bail out if it's not defined by a
4820 // G_CONSTANT.
4821 Register IdxReg = I.getOperand(3).getReg();
4822 auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
4823 if (!VRegAndVal)
4824 return false;
4825 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4826
4827 // Perform the lane insert.
4828 Register SrcReg = I.getOperand(1).getReg();
4829 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
4830
4831 if (VecSize < 128) {
4832 // If the vector we're inserting into is smaller than 128 bits, widen it
4833 // to 128 to do the insert.
4834 MachineInstr *ScalarToVec =
4835 emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
4836 if (!ScalarToVec)
4837 return false;
4838 SrcReg = ScalarToVec->getOperand(0).getReg();
4839 }
4840
4841 // Create an insert into a new FPR128 register.
4842 // Note that if our vector is already 128 bits, we end up emitting an extra
4843 // register.
4844 MachineInstr *InsMI =
4845 emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
4846
4847 if (VecSize < 128) {
4848 // If we had to widen to perform the insert, then we have to demote back to
4849 // the original size to get the result we want.
4850 Register DemoteVec = InsMI->getOperand(0).getReg();
4851 const TargetRegisterClass *RC =
4852 getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
4853 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4854 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4855 return false;
4856 }
4857 unsigned SubReg = 0;
4858 if (!getSubRegForClass(RC, TRI, SubReg))
4859 return false;
4860 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
4861 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
4862 << "\n");
4863 return false;
4864 }
4865 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
4866 .addReg(DemoteVec, 0, SubReg);
4867 RBI.constrainGenericRegister(DstReg, *RC, MRI);
4868 } else {
4869 // No widening needed.
4870 InsMI->getOperand(0).setReg(DstReg);
4871 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4872 }
4873
4874 I.eraseFromParent();
4875 return true;
4876 }
4877
4878 MachineInstr *
4879 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
4880 MachineIRBuilder &MIRBuilder,
4881 MachineRegisterInfo &MRI) {
4882 LLT DstTy = MRI.getType(Dst);
4883 unsigned DstSize = DstTy.getSizeInBits();
4884 if (CV->isNullValue()) {
4885 if (DstSize == 128) {
4886 auto Mov =
4887 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
4888 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
4889 return &*Mov;
4890 }
4891
4892 if (DstSize == 64) {
4893 auto Mov =
4894 MIRBuilder
4895 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
4896 .addImm(0);
4897 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
4898 .addReg(Mov.getReg(0), 0, AArch64::dsub);
4899 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
4900 return &*Copy;
4901 }
4902 }
4903
4904 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
4905 if (!CPLoad) {
4906 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
4907 return nullptr;
4908 }
4909
4910 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
4911 RBI.constrainGenericRegister(
4912 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
4913 return &*Copy;
4914 }
4915
4916 bool AArch64InstructionSelector::tryOptConstantBuildVec(
4917 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
4918 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4919 unsigned DstSize = DstTy.getSizeInBits();
4920 assert(DstSize <= 128 && "Unexpected build_vec type!");
4921 if (DstSize < 32)
4922 return false;
4923 // Check if we're building a constant vector, in which case we want to
4924 // generate a constant pool load instead of a vector insert sequence.
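  // E.g. (sketch): a G_BUILD_VECTOR whose operands are all G_CONSTANT or
  // G_FCONSTANT can usually be materialized with a single constant pool load
  // (roughly adrp + ldr) instead of one lane insert per element.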
4925 SmallVector<Constant *, 16> Csts;
4926 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
4927 // Try to find G_CONSTANT or G_FCONSTANT
4928 auto *OpMI =
4929 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
4930 if (OpMI)
4931 Csts.emplace_back(
4932 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
4933 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
4934 I.getOperand(Idx).getReg(), MRI)))
4935 Csts.emplace_back(
4936 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
4937 else
4938 return false;
4939 }
4940 Constant *CV = ConstantVector::get(Csts);
4941 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
4942 return false;
4943 I.eraseFromParent();
4944 return true;
4945 }
4946
4947 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
4948 MachineRegisterInfo &MRI) {
4949 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4950 // Until we port more of the optimized selections, for now just use a vector
4951 // insert sequence.
4952 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4953 const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
4954 unsigned EltSize = EltTy.getSizeInBits();
4955
4956 if (tryOptConstantBuildVec(I, DstTy, MRI))
4957 return true;
4958 if (EltSize < 16 || EltSize > 64)
4959 return false; // Don't support all element types yet.
4960 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
4961
4962 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4963 MachineInstr *ScalarToVec =
4964 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
4965 I.getOperand(1).getReg(), MIB);
4966 if (!ScalarToVec)
4967 return false;
4968
4969 Register DstVec = ScalarToVec->getOperand(0).getReg();
4970 unsigned DstSize = DstTy.getSizeInBits();
4971
4972 // Keep track of the last MI we inserted. Later on, we might be able to save
4973 // a copy using it.
4974 MachineInstr *PrevMI = nullptr;
4975 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
4976 // Note that if we don't do a subregister copy, we can end up making an
4977 // extra register.
4978 PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
4979 MIB);
4980 DstVec = PrevMI->getOperand(0).getReg();
4981 }
4982
4983 // If DstTy's size in bits is less than 128, then emit a subregister copy
4984 // from DstVec to the last register we've defined.
4985 if (DstSize < 128) {
4986 // Force this to be FPR using the destination vector.
4987 const TargetRegisterClass *RC =
4988 getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
4989 if (!RC)
4990 return false;
4991 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4992 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4993 return false;
4994 }
4995
4996 unsigned SubReg = 0;
4997 if (!getSubRegForClass(RC, TRI, SubReg))
4998 return false;
4999 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5000 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5001 << "\n");
5002 return false;
5003 }
5004
5005 Register Reg = MRI.createVirtualRegister(RC);
5006 Register DstReg = I.getOperand(0).getReg();
5007
5008 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5009 MachineOperand &RegOp = I.getOperand(1);
5010 RegOp.setReg(Reg);
5011 RBI.constrainGenericRegister(DstReg, *RC, MRI);
5012 } else {
5013 // We don't need a subregister copy. Save a copy by re-using the
5014 // destination register on the final insert.
5015 assert(PrevMI && "PrevMI was null?");
5016 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5017 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5018 }
5019
5020 I.eraseFromParent();
5021 return true;
5022 }
5023
5024 /// Helper function to find an intrinsic ID on a MachineInstr. Returns the
5025 /// ID if it exists, and 0 otherwise.
5026 static unsigned findIntrinsicID(MachineInstr &I) {
5027 auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
5028 return Op.isIntrinsicID();
5029 });
5030 if (IntrinOp == I.operands_end())
5031 return 0;
5032 return IntrinOp->getIntrinsicID();
5033 }
5034
5035 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5036 MachineInstr &I, MachineRegisterInfo &MRI) {
5037 // Find the intrinsic ID.
5038 unsigned IntrinID = findIntrinsicID(I);
5039 if (!IntrinID)
5040 return false;
5041
5042 // Select the instruction.
5043 switch (IntrinID) {
5044 default:
5045 return false;
5046 case Intrinsic::aarch64_ldxp:
5047 case Intrinsic::aarch64_ldaxp: {
5048 auto NewI = MIB.buildInstr(
5049 IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
5050 {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
5051 {I.getOperand(3)});
5052 NewI.cloneMemRefs(I);
5053 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
5054 break;
5055 }
5056 case Intrinsic::trap:
5057 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
5058 break;
5059 case Intrinsic::debugtrap:
5060 MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
5061 break;
5062 case Intrinsic::ubsantrap:
5063 MIB.buildInstr(AArch64::BRK, {}, {})
5064 .addImm(I.getOperand(1).getImm() | ('U' << 8));
5065 break;
5066 case Intrinsic::aarch64_neon_st2: {
5067 Register Src1 = I.getOperand(1).getReg();
5068 Register Src2 = I.getOperand(2).getReg();
5069 Register Ptr = I.getOperand(3).getReg();
5070 LLT Ty = MRI.getType(Src1);
5071 const LLT S8 = LLT::scalar(8);
5072 const LLT S16 = LLT::scalar(16);
5073 const LLT S32 = LLT::scalar(32);
5074 const LLT S64 = LLT::scalar(64);
5075 const LLT P0 = LLT::pointer(0, 64);
5076 unsigned Opc;
5077 if (Ty == LLT::fixed_vector(8, S8))
5078 Opc = AArch64::ST2Twov8b;
5079 else if (Ty == LLT::fixed_vector(16, S8))
5080 Opc = AArch64::ST2Twov16b;
5081 else if (Ty == LLT::fixed_vector(4, S16))
5082 Opc = AArch64::ST2Twov4h;
5083 else if (Ty == LLT::fixed_vector(8, S16))
5084 Opc = AArch64::ST2Twov8h;
5085 else if (Ty == LLT::fixed_vector(2, S32))
5086 Opc = AArch64::ST2Twov2s;
5087 else if (Ty == LLT::fixed_vector(4, S32))
5088 Opc = AArch64::ST2Twov4s;
5089 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5090 Opc = AArch64::ST2Twov2d;
5091 else if (Ty == S64 || Ty == P0)
5092 Opc = AArch64::ST1Twov1d;
5093 else
5094 llvm_unreachable("Unexpected type for st2!");
5095 SmallVector<Register, 2> Regs = {Src1, Src2};
5096 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5097 : createDTuple(Regs, MIB);
5098 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5099 Store.cloneMemRefs(I);
5100 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5101 break;
5102 }
5103 }
5104
5105 I.eraseFromParent();
5106 return true;
5107 }
5108
5109 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5110 MachineRegisterInfo &MRI) {
5111 unsigned IntrinID = findIntrinsicID(I);
5112 if (!IntrinID)
5113 return false;
5114
5115 switch (IntrinID) {
5116 default:
5117 break;
5118 case Intrinsic::aarch64_crypto_sha1h: {
5119 Register DstReg = I.getOperand(0).getReg();
5120 Register SrcReg = I.getOperand(2).getReg();
5121
5122 // FIXME: Should this be an assert?
5123 if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5124 MRI.getType(SrcReg).getSizeInBits() != 32)
5125 return false;
5126
5127 // The operation has to happen on FPRs. Set up some new FPR registers for
5128 // the source and destination if they are on GPRs.
5129 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5130 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5131 MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5132
5133 // Make sure the copy ends up getting constrained properly.
5134 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5135 AArch64::GPR32RegClass, MRI);
5136 }
5137
5138 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5139 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5140
5141 // Actually insert the instruction.
5142 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5143 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5144
5145 // Did we create a new register for the destination?
5146 if (DstReg != I.getOperand(0).getReg()) {
5147 // Yep. Copy the result of the instruction back into the original
5148 // destination.
5149 MIB.buildCopy({I.getOperand(0)}, {DstReg});
5150 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5151 AArch64::GPR32RegClass, MRI);
5152 }
5153
5154 I.eraseFromParent();
5155 return true;
5156 }
5157 case Intrinsic::frameaddress:
5158 case Intrinsic::returnaddress: {
5159 MachineFunction &MF = *I.getParent()->getParent();
5160 MachineFrameInfo &MFI = MF.getFrameInfo();
5161
5162 unsigned Depth = I.getOperand(2).getImm();
5163 Register DstReg = I.getOperand(0).getReg();
5164 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5165
5166 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5167 if (!MFReturnAddr) {
5168 // Insert the copy from LR/X30 into the entry block, before it can be
5169 // clobbered by anything.
5170 MFI.setReturnAddressIsTaken(true);
5171 MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
5172 AArch64::GPR64RegClass);
5173 }
5174
5175 if (STI.hasPAuth()) {
5176 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5177 } else {
5178 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5179 MIB.buildInstr(AArch64::XPACLRI);
5180 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5181 }
5182
5183 I.eraseFromParent();
5184 return true;
5185 }
5186
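    // For a nonzero depth we walk the chain of frame records below: each
    // iteration loads the caller's FP from [fp, #0], and the corresponding
    // return address is then read from [fp, #8] (a sketch of the usual AArch64
    // frame record layout, assuming frame records are present).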
5187 MFI.setFrameAddressIsTaken(true);
5188 Register FrameAddr(AArch64::FP);
5189 while (Depth--) {
5190 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5191 auto Ldr =
5192 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5193 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5194 FrameAddr = NextFrame;
5195 }
5196
5197 if (IntrinID == Intrinsic::frameaddress)
5198 MIB.buildCopy({DstReg}, {FrameAddr});
5199 else {
5200 MFI.setReturnAddressIsTaken(true);
5201
5202 if (STI.hasPAuth()) {
5203 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5204 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5205 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5206 } else {
5207 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5208 .addImm(1);
5209 MIB.buildInstr(AArch64::XPACLRI);
5210 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5211 }
5212 }
5213
5214 I.eraseFromParent();
5215 return true;
5216 }
5217 case Intrinsic::swift_async_context_addr:
5218 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5219 {Register(AArch64::FP)})
5220 .addImm(8)
5221 .addImm(0);
5222 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5223
5224 MF->getFrameInfo().setFrameAddressIsTaken(true);
5225 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5226 I.eraseFromParent();
5227 return true;
5228 }
5229 return false;
5230 }
5231
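/// The ShiftA/ShiftB renderers below encode constant shifts for the imported
/// bitfield-move patterns. As a hedged sketch: a 32-bit left shift by #imm is
/// expressible as a UBFM with immr = (32 - imm) & 0x1f (the "A" value) and
/// imms = 31 - imm (the "B" value); the 64-bit variants are analogous.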
5232 InstructionSelector::ComplexRendererFns
5233 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5234 auto MaybeImmed = getImmedFromMO(Root);
5235 if (MaybeImmed == None || *MaybeImmed > 31)
5236 return None;
5237 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5238 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5239 }
5240
5241 InstructionSelector::ComplexRendererFns
5242 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5243 auto MaybeImmed = getImmedFromMO(Root);
5244 if (MaybeImmed == None || *MaybeImmed > 31)
5245 return None;
5246 uint64_t Enc = 31 - *MaybeImmed;
5247 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5248 }
5249
5250 InstructionSelector::ComplexRendererFns
5251 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5252 auto MaybeImmed = getImmedFromMO(Root);
5253 if (MaybeImmed == None || *MaybeImmed > 63)
5254 return None;
5255 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5256 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5257 }
5258
5259 InstructionSelector::ComplexRendererFns
5260 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5261 auto MaybeImmed = getImmedFromMO(Root);
5262 if (MaybeImmed == None || *MaybeImmed > 63)
5263 return None;
5264 uint64_t Enc = 63 - *MaybeImmed;
5265 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5266 }
5267
5268 /// Helper to select an immediate value that can be represented as a 12-bit
5269 /// value shifted left by either 0 or 12. If it is possible to do so, return
5270 /// the immediate and shift value. If not, return None.
5271 ///
5272 /// Used by selectArithImmed and selectNegArithImmed.
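/// E.g. (sketch): 0xabc is returned as (0xabc, lsl #0) and 0xabc000 as
/// (0xabc, lsl #12), while 0xabc001 is rejected.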
5273 InstructionSelector::ComplexRendererFns
5274 AArch64InstructionSelector::select12BitValueWithLeftShift(
5275 uint64_t Immed) const {
5276 unsigned ShiftAmt;
5277 if (Immed >> 12 == 0) {
5278 ShiftAmt = 0;
5279 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5280 ShiftAmt = 12;
5281 Immed = Immed >> 12;
5282 } else
5283 return None;
5284
5285 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5286 return {{
5287 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5288 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5289 }};
5290 }
5291
5292 /// SelectArithImmed - Select an immediate value that can be represented as
5293 /// a 12-bit value shifted left by either 0 or 12. If so, return true with
5294 /// Val set to the 12-bit value and Shift set to the shifter operand.
5295 InstructionSelector::ComplexRendererFns
5296 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5297 // This function is called from the addsub_shifted_imm ComplexPattern,
5298   // which lists [imm] as the list of opcodes it's interested in; however,
5299 // we still need to check whether the operand is actually an immediate
5300 // here because the ComplexPattern opcode list is only used in
5301 // root-level opcode matching.
5302 auto MaybeImmed = getImmedFromMO(Root);
5303 if (MaybeImmed == None)
5304 return None;
5305 return select12BitValueWithLeftShift(*MaybeImmed);
5306 }
5307
5308 /// SelectNegArithImmed - As above, but negates the value before trying to
5309 /// select it.
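/// E.g. (sketch): a compare against -24 can then be selected as "cmn w0, #24",
/// since the negated value fits the 12-bit immediate form.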
5310 InstructionSelector::ComplexRendererFns
5311 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5312 // We need a register here, because we need to know if we have a 64 or 32
5313 // bit immediate.
5314 if (!Root.isReg())
5315 return None;
5316 auto MaybeImmed = getImmedFromMO(Root);
5317 if (MaybeImmed == None)
5318 return None;
5319 uint64_t Immed = *MaybeImmed;
5320
5321 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5322 // have the opposite effect on the C flag, so this pattern mustn't match under
5323 // those circumstances.
5324 if (Immed == 0)
5325 return None;
5326
5327 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5328 // the root.
5329 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5330 if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5331 Immed = ~((uint32_t)Immed) + 1;
5332 else
5333 Immed = ~Immed + 1ULL;
5334
5335 if (Immed & 0xFFFFFFFFFF000000ULL)
5336 return None;
5337
5338 Immed &= 0xFFFFFFULL;
5339 return select12BitValueWithLeftShift(Immed);
5340 }
5341
5342 /// Return true if it is worth folding MI into an extended register. That is,
5343 /// if it's safe to pull it into the addressing mode of a load or store as a
5344 /// shift.
5345 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
5346 MachineInstr &MI, const MachineRegisterInfo &MRI) const {
5347 // Always fold if there is one use, or if we're optimizing for size.
5348 Register DefReg = MI.getOperand(0).getReg();
5349 if (MRI.hasOneNonDBGUse(DefReg) ||
5350 MI.getParent()->getParent()->getFunction().hasOptSize())
5351 return true;
5352
5353 // It's better to avoid folding and recomputing shifts when we don't have a
5354 // fastpath.
5355 if (!STI.hasLSLFast())
5356 return false;
5357
5358 // We have a fastpath, so folding a shift in and potentially computing it
5359 // many times may be beneficial. Check if this is only used in memory ops.
5360 // If it is, then we should fold.
5361 return all_of(MRI.use_nodbg_instructions(DefReg),
5362 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
5363 }
5364
5365 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
5366 switch (Type) {
5367 case AArch64_AM::SXTB:
5368 case AArch64_AM::SXTH:
5369 case AArch64_AM::SXTW:
5370 return true;
5371 default:
5372 return false;
5373 }
5374 }
5375
5376 InstructionSelector::ComplexRendererFns
5377 AArch64InstructionSelector::selectExtendedSHL(
5378 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
5379 unsigned SizeInBytes, bool WantsExt) const {
5380 assert(Base.isReg() && "Expected base to be a register operand");
5381 assert(Offset.isReg() && "Expected offset to be a register operand");
5382
5383 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5384 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
5385 if (!OffsetInst)
5386 return None;
5387
5388 unsigned OffsetOpc = OffsetInst->getOpcode();
5389 bool LookedThroughZExt = false;
5390 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
5391 // Try to look through a ZEXT.
5392 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
5393 return None;
5394
5395 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
5396 OffsetOpc = OffsetInst->getOpcode();
5397 LookedThroughZExt = true;
5398
5399 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
5400 return None;
5401 }
5402 // Make sure that the memory op is a valid size.
5403 int64_t LegalShiftVal = Log2_32(SizeInBytes);
5404 if (LegalShiftVal == 0)
5405 return None;
5406 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5407 return None;
5408
5409 // Now, try to find the specific G_CONSTANT. Start by assuming that the
5410 // register we will offset is the LHS, and the register containing the
5411 // constant is the RHS.
5412 Register OffsetReg = OffsetInst->getOperand(1).getReg();
5413 Register ConstantReg = OffsetInst->getOperand(2).getReg();
5414 auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5415 if (!ValAndVReg) {
5416 // We didn't get a constant on the RHS. If the opcode is a shift, then
5417 // we're done.
5418 if (OffsetOpc == TargetOpcode::G_SHL)
5419 return None;
5420
5421 // If we have a G_MUL, we can use either register. Try looking at the RHS.
5422 std::swap(OffsetReg, ConstantReg);
5423 ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5424 if (!ValAndVReg)
5425 return None;
5426 }
5427
5428 // The value must fit into 3 bits, and must be positive. Make sure that is
5429 // true.
5430 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
5431
5432 // Since we're going to pull this into a shift, the constant value must be
5433 // a power of 2. If we got a multiply, then we need to check this.
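  // E.g. (sketch): an offset computed as "x * 8" is treated as "x << 3" here.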
5434 if (OffsetOpc == TargetOpcode::G_MUL) {
5435 if (!isPowerOf2_32(ImmVal))
5436 return None;
5437
5438 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
5439 ImmVal = Log2_32(ImmVal);
5440 }
5441
5442 if ((ImmVal & 0x7) != ImmVal)
5443 return None;
5444
5445 // We are only allowed to shift by LegalShiftVal. This shift value is built
5446 // into the instruction, so we can't just use whatever we want.
5447 if (ImmVal != LegalShiftVal)
5448 return None;
5449
5450 unsigned SignExtend = 0;
5451 if (WantsExt) {
5452 // Check if the offset is defined by an extend, unless we looked through a
5453 // G_ZEXT earlier.
5454 if (!LookedThroughZExt) {
5455 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
5456 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
5457 if (Ext == AArch64_AM::InvalidShiftExtend)
5458 return None;
5459
5460 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
5461 // We only support SXTW for signed extension here.
5462 if (SignExtend && Ext != AArch64_AM::SXTW)
5463 return None;
5464 OffsetReg = ExtInst->getOperand(1).getReg();
5465 }
5466
5467 // Need a 32-bit wide register here.
5468 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
5469 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
5470 }
5471
5472 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
5473 // offset. Signify that we are shifting by setting the shift flag to 1.
5474 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
5475 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
5476 [=](MachineInstrBuilder &MIB) {
5477 // Need to add both immediates here to make sure that they are both
5478 // added to the instruction.
5479 MIB.addImm(SignExtend);
5480 MIB.addImm(1);
5481 }}};
5482 }
5483
5484 /// This is used for computing addresses like this:
5485 ///
5486 /// ldr x1, [x2, x3, lsl #3]
5487 ///
5488 /// Where x2 is the base register, and x3 is an offset register. The shift-left
5489 /// is a constant value specific to this load instruction. That is, we'll never
5490 /// see anything other than a 3 here (which corresponds to the log2 of the
5491 /// size of the element being loaded).
5492 InstructionSelector::ComplexRendererFns
5493 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
5494 MachineOperand &Root, unsigned SizeInBytes) const {
5495 if (!Root.isReg())
5496 return None;
5497 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5498
5499 // We want to find something like this:
5500 //
5501 // val = G_CONSTANT LegalShiftVal
5502 // shift = G_SHL off_reg val
5503 // ptr = G_PTR_ADD base_reg shift
5504 // x = G_LOAD ptr
5505 //
5506 // And fold it into this addressing mode:
5507 //
5508 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
5509
5510 // Check if we can find the G_PTR_ADD.
5511 MachineInstr *PtrAdd =
5512 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5513 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5514 return None;
5515
5516 // Now, try to match an opcode which will match our specific offset.
5517 // We want a G_SHL or a G_MUL.
5518 MachineInstr *OffsetInst =
5519 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
5520 return selectExtendedSHL(Root, PtrAdd->getOperand(1),
5521 OffsetInst->getOperand(0), SizeInBytes,
5522 /*WantsExt=*/false);
5523 }
5524
5525 /// This is used for computing addresses like this:
5526 ///
5527 /// ldr x1, [x2, x3]
5528 ///
5529 /// Where x2 is the base register, and x3 is an offset register.
5530 ///
5531 /// When it is possible (or profitable) to fold a G_PTR_ADD into the address
5532 /// calculation, this will do so. Otherwise, it will return None.
5533 InstructionSelector::ComplexRendererFns
5534 AArch64InstructionSelector::selectAddrModeRegisterOffset(
5535 MachineOperand &Root) const {
5536 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5537
5538 // We need a GEP.
5539 MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
5540 if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
5541 return None;
5542
5543 // If this is used more than once, let's not bother folding.
5544 // TODO: Check if they are memory ops. If they are, then we can still fold
5545 // without having to recompute anything.
5546 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
5547 return None;
5548
5549 // Base is the GEP's LHS, offset is its RHS.
5550 return {{[=](MachineInstrBuilder &MIB) {
5551 MIB.addUse(Gep->getOperand(1).getReg());
5552 },
5553 [=](MachineInstrBuilder &MIB) {
5554 MIB.addUse(Gep->getOperand(2).getReg());
5555 },
5556 [=](MachineInstrBuilder &MIB) {
5557 // Need to add both immediates here to make sure that they are both
5558 // added to the instruction.
5559 MIB.addImm(0);
5560 MIB.addImm(0);
5561 }}};
5562 }
5563
5564 /// This is intended to be equivalent to selectAddrModeXRO in
5565 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
5566 InstructionSelector::ComplexRendererFns
5567 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
5568 unsigned SizeInBytes) const {
5569 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5570 if (!Root.isReg())
5571 return None;
5572 MachineInstr *PtrAdd =
5573 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5574 if (!PtrAdd)
5575 return None;
5576
5577   // Check for immediates which cannot be encoded in the [base + imm]
5578 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
5579 // end up with code like:
5580 //
5581 // mov x0, wide
5582 // add x1 base, x0
5583 // ldr x2, [x1, x0]
5584 //
5585 // In this situation, we can use the [base, xreg] addressing mode to save an
5586 // add/sub:
5587 //
5588 // mov x0, wide
5589 // ldr x2, [base, x0]
5590 auto ValAndVReg =
5591 getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
5592 if (ValAndVReg) {
5593 unsigned Scale = Log2_32(SizeInBytes);
5594 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
5595
5596     // Skip immediates that can be selected in the load/store addressing
5597 // mode.
5598 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
5599 ImmOff < (0x1000 << Scale))
5600 return None;
5601
5602 // Helper lambda to decide whether or not it is preferable to emit an add.
5603 auto isPreferredADD = [](int64_t ImmOff) {
5604 // Constants in [0x0, 0xfff] can be encoded in an add.
5605 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
5606 return true;
5607
5608 // Can it be encoded in an add lsl #12?
5609 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
5610 return false;
5611
5612 // It can be encoded in an add lsl #12, but we may not want to. If it is
5613 // possible to select this as a single movz, then prefer that. A single
5614 // movz is faster than an add with a shift.
5615 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
5616 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
5617 };
5618
5619 // If the immediate can be encoded in a single add/sub, then bail out.
5620 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
5621 return None;
5622 }
5623
5624 // Try to fold shifts into the addressing mode.
5625 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
5626 if (AddrModeFns)
5627 return AddrModeFns;
5628
5629 // If that doesn't work, see if it's possible to fold in registers from
5630 // a GEP.
5631 return selectAddrModeRegisterOffset(Root);
5632 }
5633
5634 /// This is used for computing addresses like this:
5635 ///
5636 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
5637 ///
5638 /// Where we have a 64-bit base register, a 32-bit offset register, and an
5639 /// extend (which may or may not be signed).
5640 InstructionSelector::ComplexRendererFns
5641 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
5642 unsigned SizeInBytes) const {
5643 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5644
5645 MachineInstr *PtrAdd =
5646 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5647 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5648 return None;
5649
5650 MachineOperand &LHS = PtrAdd->getOperand(1);
5651 MachineOperand &RHS = PtrAdd->getOperand(2);
5652 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
5653
5654 // The first case is the same as selectAddrModeXRO, except we need an extend.
5655 // In this case, we try to find a shift and extend, and fold them into the
5656 // addressing mode.
5657 //
5658 // E.g.
5659 //
5660 // off_reg = G_Z/S/ANYEXT ext_reg
5661 // val = G_CONSTANT LegalShiftVal
5662 // shift = G_SHL off_reg val
5663 // ptr = G_PTR_ADD base_reg shift
5664 // x = G_LOAD ptr
5665 //
5666 // In this case we can get a load like this:
5667 //
5668 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
5669 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
5670 SizeInBytes, /*WantsExt=*/true);
5671 if (ExtendedShl)
5672 return ExtendedShl;
5673
5674 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
5675 //
5676 // e.g.
5677 // ldr something, [base_reg, ext_reg, sxtw]
5678 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5679 return None;
5680
5681 // Check if this is an extend. We'll get an extend type if it is.
5682 AArch64_AM::ShiftExtendType Ext =
5683 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
5684 if (Ext == AArch64_AM::InvalidShiftExtend)
5685 return None;
5686
5687 // Need a 32-bit wide register.
5688 MachineIRBuilder MIB(*PtrAdd);
5689 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
5690 AArch64::GPR32RegClass, MIB);
5691 unsigned SignExtend = Ext == AArch64_AM::SXTW;
5692
5693 // Base is LHS, offset is ExtReg.
5694 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
5695 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
5696 [=](MachineInstrBuilder &MIB) {
5697 MIB.addImm(SignExtend);
5698 MIB.addImm(0);
5699 }}};
5700 }
5701
5702 /// Select a "register plus unscaled signed 9-bit immediate" address. This
5703 /// should only match when there is an offset that is not valid for a scaled
5704 /// immediate addressing mode. The "Size" argument is the size in bytes of the
5705 /// memory reference, which is needed here to know what is valid for a scaled
5706 /// immediate.
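/// E.g. (sketch): a 4-byte load at offset #-17 cannot use the scaled immediate
/// form, but does fit the signed 9-bit unscaled ("ldur"-style) form matched
/// here.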
5707 InstructionSelector::ComplexRendererFns
5708 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
5709 unsigned Size) const {
5710 MachineRegisterInfo &MRI =
5711 Root.getParent()->getParent()->getParent()->getRegInfo();
5712
5713 if (!Root.isReg())
5714 return None;
5715
5716 if (!isBaseWithConstantOffset(Root, MRI))
5717 return None;
5718
5719 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5720 if (!RootDef)
5721 return None;
5722
5723 MachineOperand &OffImm = RootDef->getOperand(2);
5724 if (!OffImm.isReg())
5725 return None;
5726 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
5727 if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
5728 return None;
5729 int64_t RHSC;
5730 MachineOperand &RHSOp1 = RHS->getOperand(1);
5731 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
5732 return None;
5733 RHSC = RHSOp1.getCImm()->getSExtValue();
5734
5735 // If the offset is valid as a scaled immediate, don't match here.
5736 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
5737 return None;
5738 if (RHSC >= -256 && RHSC < 256) {
5739 MachineOperand &Base = RootDef->getOperand(1);
5740 return {{
5741 [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
5742 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
5743 }};
5744 }
5745 return None;
5746 }
5747
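/// Try to fold the :lo12: part of an ADRP + G_ADD_LOW global address into the
/// immediate operand of a load/store, so that (sketch, assuming a suitably
/// aligned global "var"):
///   adrp x8, var
///   ldr  w0, [x8, :lo12:var]
/// can be formed instead of materializing the address with a separate add.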
5748 InstructionSelector::ComplexRendererFns
5749 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
5750 unsigned Size,
5751 MachineRegisterInfo &MRI) const {
5752 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
5753 return None;
5754 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
5755 if (Adrp.getOpcode() != AArch64::ADRP)
5756 return None;
5757
5758 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
5759 auto Offset = Adrp.getOperand(1).getOffset();
5760 if (Offset % Size != 0)
5761 return None;
5762
5763 auto GV = Adrp.getOperand(1).getGlobal();
5764 if (GV->isThreadLocal())
5765 return None;
5766
5767 auto &MF = *RootDef.getParent()->getParent();
5768 if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
5769 return None;
5770
5771 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
5772 MachineIRBuilder MIRBuilder(RootDef);
5773 Register AdrpReg = Adrp.getOperand(0).getReg();
5774 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
5775 [=](MachineInstrBuilder &MIB) {
5776 MIB.addGlobalAddress(GV, Offset,
5777 OpFlags | AArch64II::MO_PAGEOFF |
5778 AArch64II::MO_NC);
5779 }}};
5780 }
5781
5782 /// Select a "register plus scaled unsigned 12-bit immediate" address. The
5783 /// "Size" argument is the size in bytes of the memory reference, which
5784 /// determines the scale.
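/// E.g. (sketch): for a 4-byte access, an address like [x0, #16] is encoded
/// with base x0 and an immediate of 4, i.e. 16 >> Log2_32(4).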
5785 InstructionSelector::ComplexRendererFns
5786 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
5787 unsigned Size) const {
5788 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
5789 MachineRegisterInfo &MRI = MF.getRegInfo();
5790
5791 if (!Root.isReg())
5792 return None;
5793
5794 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5795 if (!RootDef)
5796 return None;
5797
5798 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
5799 return {{
5800 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
5801 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5802 }};
5803 }
5804
5805 CodeModel::Model CM = MF.getTarget().getCodeModel();
5806 // Check if we can fold in the ADD of small code model ADRP + ADD address.
5807 if (CM == CodeModel::Small) {
5808 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
5809 if (OpFns)
5810 return OpFns;
5811 }
5812
5813 if (isBaseWithConstantOffset(Root, MRI)) {
5814 MachineOperand &LHS = RootDef->getOperand(1);
5815 MachineOperand &RHS = RootDef->getOperand(2);
5816 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
5817 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
5818 if (LHSDef && RHSDef) {
5819 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
5820 unsigned Scale = Log2_32(Size);
5821 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
5822 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
5823 return {{
5824 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
5825 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5826 }};
5827
5828 return {{
5829 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
5830 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5831 }};
5832 }
5833 }
5834 }
5835
5836 // Before falling back to our general case, check if the unscaled
5837 // instructions can handle this. If so, that's preferable.
5838 if (selectAddrModeUnscaled(Root, Size).hasValue())
5839 return None;
5840
5841 return {{
5842 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
5843 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5844 }};
5845 }
5846
5847 /// Given a shift instruction, return the correct shift type for that
5848 /// instruction.
5849 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
5850 // TODO: Handle AArch64_AM::ROR
5851 switch (MI.getOpcode()) {
5852 default:
5853 return AArch64_AM::InvalidShiftExtend;
5854 case TargetOpcode::G_SHL:
5855 return AArch64_AM::LSL;
5856 case TargetOpcode::G_LSHR:
5857 return AArch64_AM::LSR;
5858 case TargetOpcode::G_ASHR:
5859 return AArch64_AM::ASR;
5860 }
5861 }
5862
/// Select a "shifted register" operand. If the value is not shifted, set the
/// shift operand to a default value of "lsl 0".
///
/// TODO: Allow shifted register to be rotated in logical instructions.
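///
/// For example, if Root is defined by (G_SHL %x, 3), the operand is rendered
/// as %x plus the shifter immediate for "lsl #3".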
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // Check if the operand is defined by an instruction which corresponds to
  // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
  //
  // TODO: Handle AArch64_AM::ROR for logical instructions.
  MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
  if (!ShiftInst)
    return None;
  AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
  if (ShType == AArch64_AM::InvalidShiftExtend)
    return None;
  if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
    return None;

  // Need an immediate on the RHS.
  MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
  auto Immed = getImmedFromMO(ShiftRHS);
  if (!Immed)
    return None;

  // We have something that we can fold. Fold in the shift's LHS and RHS into
  // the instruction.
  MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
  Register ShiftReg = ShiftLHS.getReg();

  unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
  unsigned Val = *Immed & (NumBits - 1);
  unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}

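/// Return the extend type implied by \p MI (e.g. SXTB for a G_SEXT from 8
/// bits), or InvalidShiftExtend if \p MI is not a recognised extend pattern.
/// When \p IsLoadStore is true, only the 32-bit extends (UXTW/SXTW) are
/// accepted, since narrower extends cannot be folded into load/store
/// addressing modes.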
AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
    MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
  unsigned Opc = MI.getOpcode();

  // Handle explicit extend instructions first.
  if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
    unsigned Size;
    if (Opc == TargetOpcode::G_SEXT)
      Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    else
      Size = MI.getOperand(2).getImm();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
    case 32:
      return AArch64_AM::SXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    assert(Size != 64 && "Extend from 64 bits?");
    switch (Size) {
    case 8:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
    case 16:
      return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
    case 32:
      return AArch64_AM::UXTW;
    default:
      return AArch64_AM::InvalidShiftExtend;
    }
  }

  // Don't have an explicit extend. Try to handle a G_AND with a constant mask
  // on the RHS.
  if (Opc != TargetOpcode::G_AND)
    return AArch64_AM::InvalidShiftExtend;

  Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
  if (!MaybeAndMask)
    return AArch64_AM::InvalidShiftExtend;
  uint64_t AndMask = *MaybeAndMask;
  switch (AndMask) {
  default:
    return AArch64_AM::InvalidShiftExtend;
  case 0xFF:
    return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
  case 0xFFFF:
    return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
  case 0xFFFFFFFF:
    return AArch64_AM::UXTW;
  }
}

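/// Move \p Reg into the register class \p RC if its type size differs from
/// the size of \p RC, by emitting and selecting a COPY. Returns the register
/// to use afterwards (either \p Reg itself or the COPY's destination).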
Register AArch64InstructionSelector::moveScalarRegClass(
    Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Ty = MRI.getType(Reg);
  assert(!Ty.isVector() && "Expected scalars only!");
  if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
    return Reg;

  // Create a copy and immediately select it.
  // FIXME: We should have an emitCopy function?
  auto Copy = MIB.buildCopy({&RC}, {Reg});
  selectCopy(*Copy, TII, MRI, TRI, RBI);
  return Copy.getReg(0);
}

/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
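///
/// For example, if Root is defined by (G_SHL (G_SEXT %x from s8), 2), the
/// operand is rendered as %x together with the arithmetic extend immediate
/// for "sxtb #2".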
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithExtendedRegister(
    MachineOperand &Root) const {
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  uint64_t ShiftVal = 0;
  Register ExtReg;
  AArch64_AM::ShiftExtendType Ext;
  MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
  if (!RootDef)
    return None;

  if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
    return None;

  // Check if we can fold a shift and an extend.
  if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
    // Look for a constant on the RHS of the shift.
    MachineOperand &RHS = RootDef->getOperand(2);
    Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
    if (!MaybeShiftVal)
      return None;
    ShiftVal = *MaybeShiftVal;
    if (ShiftVal > 4)
      return None;
    // Look for a valid extend instruction on the LHS of the shift.
    MachineOperand &LHS = RootDef->getOperand(1);
    MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
    if (!ExtDef)
      return None;
    Ext = getExtendTypeForInst(*ExtDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return None;
    ExtReg = ExtDef->getOperand(1).getReg();
  } else {
    // Didn't get a shift. Try just folding an extend.
    Ext = getExtendTypeForInst(*RootDef, MRI);
    if (Ext == AArch64_AM::InvalidShiftExtend)
      return None;
    ExtReg = RootDef->getOperand(1).getReg();

    // If we have a 32 bit instruction which zeroes out the high half of a
    // register, we get an implicit zero extend for free. Check if we have one.
    // FIXME: We actually emit the extend right now even though we don't have
    // to.
    if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
      MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
      if (ExtInst && isDef32(*ExtInst))
        return None;
    }
  }

  // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
  // copy.
  MachineIRBuilder MIB(*RootDef);
  ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);

  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
           [=](MachineInstrBuilder &MIB) {
             MIB.addImm(getArithExtendImm(Ext, ShiftVal));
           }}};
}

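/// Custom renderer: add the value of a G_CONSTANT as a plain immediate
/// operand, using its sign-extended 64-bit value.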
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  Optional<int64_t> CstVal =
      getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
  assert(CstVal && "Expected constant value");
  MIB.addImm(CstVal.getValue());
}

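// Custom renderers: add a G_CONSTANT as an encoded AArch64 logical immediate
// (the form used by AND/ORR/EOR), in 32-bit and 64-bit variants.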
void AArch64InstructionSelector::renderLogicalImm32(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
  MIB.addImm(Enc);
}

void AArch64InstructionSelector::renderLogicalImm64(
    MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
  MIB.addImm(Enc);
}

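// Custom renderers: add a G_FCONSTANT as the 8-bit floating-point immediate
// encoding used by FMOV, for f16, f32 and f64 respectively.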
void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
                                               const MachineInstr &MI,
                                               int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
         "Expected G_FCONSTANT");
  MIB.addImm(
      AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
}

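/// Return true if \p MI is a load or store whose single memory operand
/// accesses exactly \p NumBytes bytes.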
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
    const MachineInstr &MI, unsigned NumBytes) const {
  if (!MI.mayLoadOrStore())
    return false;
  assert(MI.hasOneMemOperand() &&
         "Expected load/store to have only one mem op!");
  return (*MI.memoperands_begin())->getSize() == NumBytes;
}

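/// Return true if \p MI defines a 32-bit value which is known to implicitly
/// zero the upper 32 bits of the corresponding 64-bit register, i.e. it is
/// not a copy-like instruction that may be lowered to a subregister copy.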
bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
    return false;

  // Only return true if we know the operation will zero-out the high half of
  // the 64-bit register. Truncates can be subregister copies, which don't
  // zero out the high bits. Copies and other copy-like instructions can be
  // fed by truncates, or could be lowered as subregister copies.
  switch (MI.getOpcode()) {
  default:
    return true;
  case TargetOpcode::COPY:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_PHI:
    return false;
  }
}

// Perform fixups on the given PHI instruction's operands to force them all
// to be the same as the destination regbank.
static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64RegisterBankInfo &RBI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
  assert(DstRB && "Expected PHI dst to have regbank assigned");
  MachineIRBuilder MIB(MI);

  // Go through each operand and ensure it has the same regbank.
  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
    MachineOperand &MO = MI.getOperand(OpIdx);
    if (!MO.isReg())
      continue;
    Register OpReg = MO.getReg();
    const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
    if (RB != DstRB) {
      // Insert a cross-bank copy.
      auto *OpDef = MRI.getVRegDef(OpReg);
      const LLT &Ty = MRI.getType(OpReg);
      MachineBasicBlock &OpDefBB = *OpDef->getParent();

      // Any instruction we insert must appear after all PHIs in the block
      // for the block to be valid MIR.
      MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
      if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
        InsertPt = OpDefBB.getFirstNonPHI();
      MIB.setInsertPt(*OpDef->getParent(), InsertPt);
      auto Copy = MIB.buildCopy(Ty, OpReg);
      MRI.setRegBank(Copy.getReg(0), *DstRB);
      MO.setReg(Copy.getReg(0));
    }
  }
}

void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
  // We're looking for PHIs, build a list so we don't invalidate iterators.
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<MachineInstr *, 32> Phis;
  for (auto &BB : MF) {
    for (auto &MI : BB) {
      if (MI.getOpcode() == TargetOpcode::G_PHI)
        Phis.emplace_back(&MI);
    }
  }

  for (auto *MI : Phis) {
    // We need to do some work here if the operand types are < 16 bit and they
    // are split across fpr/gpr banks. Since all types <32b on gpr
    // end up being assigned gpr32 regclasses, we can end up with PHIs here
    // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
    // be selecting heterogeneous regbanks for operands if possible, but we
    // still need to be able to deal with it here.
    //
    // To fix this, if we have a gpr-bank operand < 32b in size and at least
    // one other operand is on the fpr bank, then we add cross-bank copies
    // to homogenize the operand banks. For simplicity the bank that we choose
    // to settle on is whatever bank the def operand has. For example:
    //
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
    // =>
    // %bb2:
    //   ...
    //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
    //   ...
    // %endbb:
    //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
    bool HasGPROp = false, HasFPROp = false;
    for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
      const auto &MO = MI->getOperand(OpIdx);
      if (!MO.isReg())
        continue;
      const LLT &Ty = MRI.getType(MO.getReg());
      if (!Ty.isValid() || !Ty.isScalar())
        break;
      if (Ty.getSizeInBits() >= 32)
        break;
      const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
      // If for some reason we don't have a regbank yet, don't try anything.
      if (!RB)
        break;

      if (RB->getID() == AArch64::GPRRegBankID)
        HasGPROp = true;
      else
        HasFPROp = true;
    }
    // We have heterogeneous regbanks, so we need to fix them up.
    if (HasGPROp && HasFPROp)
      fixupPHIOpBanks(*MI, MRI, RBI);
  }
}

namespace llvm {
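/// Entry point used by the AArch64 target machine to construct the GlobalISel
/// instruction selector.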
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  return new AArch64InstructionSelector(TM, Subtarget, RBI);
}
}