//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET


class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
  /// in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// None when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
//
/// Given a register bank, and a type, return the smallest register class that
/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    switch (Ty.getSizeInBits()) {
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
                      bool GetAllRegSet = false) {
  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of REG_SEQUENCE instruction that
/// was created, or the 0th element of \p Regs if \p Regs contains a single
/// element.
static Register createTuple(ArrayRef<Register> Regs,
                            const unsigned RegClassIDs[],
                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
  unsigned NumRegs = Regs.size();
  if (NumRegs == 1)
    return Regs[0];
  assert(NumRegs >= 2 && NumRegs <= 4 &&
         "Only support between two and 4 registers in a tuple!");
  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
  auto RegSequence =
      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
    RegSequence.addUse(Regs[I]);
    RegSequence.addImm(SubRegs[I]);
  }
  return RegSequence.getReg(0);
}

/// Create a tuple of D-registers using the registers in \p Regs.
static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
                                     AArch64::dsub2, AArch64::dsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

/// Create a tuple of Q-registers using the registers in \p Regs.
static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
                                     AArch64::qsub2, AArch64::qsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}
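
// Illustrative example (editor's sketch, not part of the selector logic):
// passing two Q-registers to createQTuple picks AArch64::QQRegClassID
// (index NumRegs - 2 == 0) and emits roughly
//   %tuple:qq = REG_SEQUENCE %q0, %subreg.qsub0, %q1, %subreg.qsub1
// while a single register is returned unchanged, with no REG_SEQUENCE.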

static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MI = *Root.getParent();
  auto &MBB = *MI.getParent();
  auto &MF = *MBB.getParent();
  auto &MRI = MF.getRegInfo();
  uint64_t Immed;
  if (Root.isImm())
    Immed = Root.getImm();
  else if (Root.isCImm())
    Immed = Root.getCImm()->getZExtValue();
  else if (Root.isReg()) {
    auto ValAndVReg =
        getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
    if (!ValAndVReg)
      return std::nullopt;
    Immed = ValAndVReg->Value.getSExtValue();
  } else
    return std::nullopt;
  return Immed;
}

/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - all operands are not in the same bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the aarch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
                             const AArch64RegisterBankInfo &RBI,
                             const MachineRegisterInfo &MRI,
                             const AArch64RegisterInfo &TRI) {
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  if (!Ty.isValid()) {
    LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
    return true;
  }

  const RegisterBank *PrevOpBank = nullptr;
  for (auto &MO : I.operands()) {
    // FIXME: Support non-register operands.
    if (!MO.isReg()) {
      LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
      return true;
    }

    // FIXME: Can generic operations have physical registers operands? If
    // so, this will need to be taught about that, and we'll need to get the
    // bank out of the minimal class for the register.
    // Either way, this needs to be documented (and possibly verified).
    if (!MO.getReg().isVirtual()) {
      LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
      return true;
    }

    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
    if (!OpBank) {
      LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
      return true;
    }

    if (PrevOpBank && OpBank != PrevOpBank) {
      LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
      return true;
    }
    PrevOpBank = OpBank;
  }
  return false;
}

/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize. This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
    case 16:
      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
    case 32:
      return isStore ? AArch64::STRWui : AArch64::LDRWui;
    case 64:
      return isStore ? AArch64::STRXui : AArch64::LDRXui;
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBui : AArch64::LDRBui;
    case 16:
      return isStore ? AArch64::STRHui : AArch64::LDRHui;
    case 32:
      return isStore ? AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    case 128:
      return isStore ? AArch64::STRQui : AArch64::LDRQui;
    }
    break;
  }
  return GenericOpc;
}
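
// Illustrative example (editor's sketch): a 64-bit G_LOAD whose result lives
// on the GPR bank maps to LDRXui, i.e. the base + scaled unsigned 12-bit
// immediate form, while the same access size on the FPR bank maps to LDRDui.
// Unsupported bank/size combinations fall through and return GenericOpc.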

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!I.getOperand(0).getReg().isPhysical())
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
    SrcSize = DstSize = 32;

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
    if (!RC) {
      const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, *RC, MRI);
  }

  return true;
}

static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination
  // registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return true;
}

static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  default:
    return GenericOpc;
  };
  return GenericOpc;
}
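
// Illustrative example (editor's sketch): G_SITOFP from a 64-bit integer to a
// 32-bit float (DstSize == 32, SrcSize == 64) maps to AArch64::SCVTFUXSri in
// the table above; vector types bail out early and keep the generic opcode.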

MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}
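
// Illustrative example (editor's sketch): for
//   %sel:gpr(s32) = G_SELECT %cond, 0, 1
// TryOptSelectCst above rewrites the opcode to CSINCWr with both sources tied
// to WZR, giving roughly "csinc w0, wzr, wzr, cc" (cc ? 0 : 0 + 1), so neither
// constant needs to be materialized.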

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case CmpInst::FCMP_ONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case CmpInst::FCMP_UEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    std::optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(1).getReg();
      Register ConstantReg = MI->getOperand(2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND commutes, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(ConstantReg, TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      }
      if (VRegAndVal) {
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      TestReg = MI->getOperand(1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
      // the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}
1522
1523 MachineInstr *AArch64InstructionSelector::emitTestBit(
1524 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1525 MachineIRBuilder &MIB) const {
1526 assert(TestReg.isValid());
1527 assert(ProduceNonFlagSettingCondBr &&
1528 "Cannot emit TB(N)Z with speculation tracking!");
1529 MachineRegisterInfo &MRI = *MIB.getMRI();
1530
1531 // Attempt to optimize the test bit by walking over instructions.
1532 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1533 LLT Ty = MRI.getType(TestReg);
1534 unsigned Size = Ty.getSizeInBits();
1535 assert(!Ty.isVector() && "Expected a scalar!");
1536 assert(Bit < 64 && "Bit is too large!");
1537
1538 // TB(N)ZW can only test bits 0-31, so choose the W or X form based on the
1539 // bit number and narrow or widen the test register to match.
1540 bool UseWReg = Bit < 32;
1541 unsigned NecessarySize = UseWReg ? 32 : 64;
1542 if (Size != NecessarySize)
1543 TestReg = moveScalarRegClass(
1544 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1545 MIB);
1546
1547 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1548 {AArch64::TBZW, AArch64::TBNZW}};
1549 unsigned Opc = OpcTable[UseWReg][IsNegative];
1550 auto TestBitMI =
1551 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1552 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1553 return &*TestBitMI;
1554 }
1555
1556 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1557 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1558 MachineIRBuilder &MIB) const {
1559 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1560 // Given something like this:
1561 //
1562 // %x = ...Something...
1563 // %one = G_CONSTANT i64 1
1564 // %zero = G_CONSTANT i64 0
1565 // %and = G_AND %x, %one
1566 // %cmp = G_ICMP intpred(ne), %and, %zero
1567 // %cmp_trunc = G_TRUNC %cmp
1568 // G_BRCOND %cmp_trunc, %bb.3
1569 //
1570 // We want to try and fold the AND into the G_BRCOND and produce either a
1571 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1572 //
1573 // In this case, we'd get
1574 //
1575 // TBNZ %x %bb.3
1576 //
1577
1578 // Check if the AND has a constant on its RHS which we can use as a mask.
1579 // If it's a power of 2, then it's the same as checking a specific bit.
1580 // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1581 auto MaybeBit = getIConstantVRegValWithLookThrough(
1582 AndInst.getOperand(2).getReg(), *MIB.getMRI());
1583 if (!MaybeBit)
1584 return false;
1585
1586 int32_t Bit = MaybeBit->Value.exactLogBase2();
1587 if (Bit < 0)
1588 return false;
1589
1590 Register TestReg = AndInst.getOperand(1).getReg();
1591
1592 // Emit a TB(N)Z.
1593 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1594 return true;
1595 }
1596
1597 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1598 bool IsNegative,
1599 MachineBasicBlock *DestMBB,
1600 MachineIRBuilder &MIB) const {
1601 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1602 MachineRegisterInfo &MRI = *MIB.getMRI();
1603 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1604 AArch64::GPRRegBankID &&
1605 "Expected GPRs only?");
1606 auto Ty = MRI.getType(CompareReg);
1607 unsigned Width = Ty.getSizeInBits();
1608 assert(!Ty.isVector() && "Expected scalar only?");
1609 assert(Width <= 64 && "Expected width to be at most 64?");
1610 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1611 {AArch64::CBNZW, AArch64::CBNZX}};
1612 unsigned Opc = OpcTable[IsNegative][Width == 64];
1613 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1614 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1615 return &*BranchMI;
1616 }
1617
1618 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1619 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1620 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1621 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1622 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1623 // totally clean. Some of them require two branches to implement.
1624 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1625 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1626 Pred);
1627 AArch64CC::CondCode CC1, CC2;
1628 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1629 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1630 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1631 if (CC2 != AArch64CC::AL)
1632 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1633 I.eraseFromParent();
1634 return true;
1635 }
1636
1637 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1638 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1639 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1640 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1641 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1642 //
1643 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1644 // instructions will not be produced, as they are conditional branch
1645 // instructions that do not set flags.
1646 if (!ProduceNonFlagSettingCondBr)
1647 return false;
1648
1649 MachineRegisterInfo &MRI = *MIB.getMRI();
1650 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1651 auto Pred =
1652 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1653 Register LHS = ICmp.getOperand(2).getReg();
1654 Register RHS = ICmp.getOperand(3).getReg();
1655
1656 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1657 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1658 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1659
1660 // When we can emit a TB(N)Z, prefer that.
1661 //
1662 // Handle non-commutative condition codes first.
1663 // Note that we don't want to do this when we have a G_AND because it can
1664 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1665 if (VRegAndVal && !AndInst) {
1666 int64_t C = VRegAndVal->Value.getSExtValue();
1667
1668 // For a signed greater-than comparison against -1 (x > -1, i.e. x >= 0),
1669 // we can just test that the msb is zero.
1670 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1671 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1672 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1673 I.eraseFromParent();
1674 return true;
1675 }
1676
1677 // For a signed less-than comparison against zero (x < 0), we can just test
1678 // that the msb is not zero.
1679 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1680 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1681 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1682 I.eraseFromParent();
1683 return true;
1684 }
1685
1686 // Conversely, if we have a signed greater-than-or-equal comparison to zero,
1687 // we can test if the msb is zero.
1688 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1689 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1690 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1691 I.eraseFromParent();
1692 return true;
1693 }
1694 }
1695
1696 // Attempt to handle commutative condition codes. Right now, that's only
1697 // eq/ne.
1698 if (ICmpInst::isEquality(Pred)) {
1699 if (!VRegAndVal) {
1700 std::swap(RHS, LHS);
1701 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1702 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1703 }
1704
1705 if (VRegAndVal && VRegAndVal->Value == 0) {
1706 // If there's a G_AND feeding into this branch, try to fold it away by
1707 // emitting a TB(N)Z instead.
1708 //
1709 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1710 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1711 // would be redundant.
1712 if (AndInst &&
1713 tryOptAndIntoCompareBranch(
1714 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1715 I.eraseFromParent();
1716 return true;
1717 }
1718
1719 // Otherwise, try to emit a CB(N)Z instead.
1720 auto LHSTy = MRI.getType(LHS);
1721 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1722 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1723 I.eraseFromParent();
1724 return true;
1725 }
1726 }
1727 }
1728
1729 return false;
1730 }
1731
1732 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1733 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1734 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1735 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1736 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1737 return true;
1738
1739 // Couldn't optimize. Emit a compare + a Bcc.
1740 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1741 auto PredOp = ICmp.getOperand(1);
1742 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1743 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1744 static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1745 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1746 I.eraseFromParent();
1747 return true;
1748 }
1749
1750 bool AArch64InstructionSelector::selectCompareBranch(
1751 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1752 Register CondReg = I.getOperand(0).getReg();
1753 MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1754 // Try to select the G_BRCOND using whatever is feeding the condition if
1755 // possible.
1756 unsigned CCMIOpc = CCMI->getOpcode();
1757 if (CCMIOpc == TargetOpcode::G_FCMP)
1758 return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1759 if (CCMIOpc == TargetOpcode::G_ICMP)
1760 return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1761
1762 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1763 // instructions will not be produced, as they are conditional branch
1764 // instructions that do not set flags.
1765 if (ProduceNonFlagSettingCondBr) {
1766 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1767 I.getOperand(1).getMBB(), MIB);
1768 I.eraseFromParent();
1769 return true;
1770 }
1771
1772 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1773 auto TstMI =
1774 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1775 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1776 auto Bcc = MIB.buildInstr(AArch64::Bcc)
1777 .addImm(AArch64CC::EQ)
1778 .addMBB(I.getOperand(1).getMBB());
1779 I.eraseFromParent();
1780 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1781 }
1782
1783 /// Returns the element immediate value of a vector shift operand if found.
1784 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1785 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1786 MachineRegisterInfo &MRI) {
1787 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1788 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1789 return getAArch64VectorSplatScalar(*OpMI, MRI);
1790 }
1791
1792 /// Matches and returns the shift immediate value for a SHL instruction given
1793 /// a shift operand.
1794 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1795 MachineRegisterInfo &MRI) {
1796 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1797 if (!ShiftImm)
1798 return std::nullopt;
1799 // Check the immediate is in range for a SHL.
1800 int64_t Imm = *ShiftImm;
1801 if (Imm < 0)
1802 return std::nullopt;
1803 switch (SrcTy.getElementType().getSizeInBits()) {
1804 default:
1805 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1806 return std::nullopt;
1807 case 8:
1808 if (Imm > 7)
1809 return std::nullopt;
1810 break;
1811 case 16:
1812 if (Imm > 15)
1813 return std::nullopt;
1814 break;
1815 case 32:
1816 if (Imm > 31)
1817 return std::nullopt;
1818 break;
1819 case 64:
1820 if (Imm > 63)
1821 return std::nullopt;
1822 break;
1823 }
1824 return Imm;
1825 }
1826
1827 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1828 MachineRegisterInfo &MRI) {
1829 assert(I.getOpcode() == TargetOpcode::G_SHL);
1830 Register DstReg = I.getOperand(0).getReg();
1831 const LLT Ty = MRI.getType(DstReg);
1832 Register Src1Reg = I.getOperand(1).getReg();
1833 Register Src2Reg = I.getOperand(2).getReg();
1834
1835 if (!Ty.isVector())
1836 return false;
1837
1838 // Check if we have a vector of constants on RHS that we can select as the
1839 // immediate form.
1840 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1841
1842 unsigned Opc = 0;
1843 if (Ty == LLT::fixed_vector(2, 64)) {
1844 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1845 } else if (Ty == LLT::fixed_vector(4, 32)) {
1846 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1847 } else if (Ty == LLT::fixed_vector(2, 32)) {
1848 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1849 } else if (Ty == LLT::fixed_vector(4, 16)) {
1850 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1851 } else if (Ty == LLT::fixed_vector(8, 16)) {
1852 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1853 } else if (Ty == LLT::fixed_vector(16, 8)) {
1854 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1855 } else if (Ty == LLT::fixed_vector(8, 8)) {
1856 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1857 } else {
1858 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1859 return false;
1860 }
1861
1862 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1863 if (ImmVal)
1864 Shl.addImm(*ImmVal);
1865 else
1866 Shl.addUse(Src2Reg);
1867 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1868 I.eraseFromParent();
1869 return true;
1870 }
1871
1872 bool AArch64InstructionSelector::selectVectorAshrLshr(
1873 MachineInstr &I, MachineRegisterInfo &MRI) {
1874 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1875 I.getOpcode() == TargetOpcode::G_LSHR);
1876 Register DstReg = I.getOperand(0).getReg();
1877 const LLT Ty = MRI.getType(DstReg);
1878 Register Src1Reg = I.getOperand(1).getReg();
1879 Register Src2Reg = I.getOperand(2).getReg();
1880
1881 if (!Ty.isVector())
1882 return false;
1883
1884 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1885
1886 // We expect the immediate case to be lowered in the PostLegalizerCombiner to
1887 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1888
1889 // There is no shift-right-by-register instruction, but the shift-left-by-
1890 // register instruction takes a signed shift amount, where negative amounts
1891 // specify a right shift.
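// For example (illustrative), a USHL whose per-lane shift amount is -3
// performs a logical right shift by 3 in that lane, which is why we negate
// the shift operand below and use SSHL/USHL.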
1892
1893 unsigned Opc = 0;
1894 unsigned NegOpc = 0;
1895 const TargetRegisterClass *RC =
1896 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1897 if (Ty == LLT::fixed_vector(2, 64)) {
1898 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1899 NegOpc = AArch64::NEGv2i64;
1900 } else if (Ty == LLT::fixed_vector(4, 32)) {
1901 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1902 NegOpc = AArch64::NEGv4i32;
1903 } else if (Ty == LLT::fixed_vector(2, 32)) {
1904 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1905 NegOpc = AArch64::NEGv2i32;
1906 } else if (Ty == LLT::fixed_vector(4, 16)) {
1907 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1908 NegOpc = AArch64::NEGv4i16;
1909 } else if (Ty == LLT::fixed_vector(8, 16)) {
1910 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1911 NegOpc = AArch64::NEGv8i16;
1912 } else if (Ty == LLT::fixed_vector(16, 8)) {
1913 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1914 NegOpc = AArch64::NEGv16i8;
1915 } else if (Ty == LLT::fixed_vector(8, 8)) {
1916 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1917 NegOpc = AArch64::NEGv8i8;
1918 } else {
1919 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1920 return false;
1921 }
1922
1923 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1924 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1925 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1926 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1927 I.eraseFromParent();
1928 return true;
1929 }
1930
1931 bool AArch64InstructionSelector::selectVaStartAAPCS(
1932 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1933 return false;
1934 }
1935
1936 bool AArch64InstructionSelector::selectVaStartDarwin(
1937 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1938 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1939 Register ListReg = I.getOperand(0).getReg();
1940
1941 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1942
1943 auto MIB =
1944 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1945 .addDef(ArgsAddrReg)
1946 .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1947 .addImm(0)
1948 .addImm(0);
1949
1950 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1951
1952 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1953 .addUse(ArgsAddrReg)
1954 .addUse(ListReg)
1955 .addImm(0)
1956 .addMemOperand(*I.memoperands_begin());
1957
1958 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1959 I.eraseFromParent();
1960 return true;
1961 }
1962
1963 void AArch64InstructionSelector::materializeLargeCMVal(
1964 MachineInstr &I, const Value *V, unsigned OpFlags) {
1965 MachineBasicBlock &MBB = *I.getParent();
1966 MachineFunction &MF = *MBB.getParent();
1967 MachineRegisterInfo &MRI = MF.getRegInfo();
1968
1969 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1970 MovZ->addOperand(MF, I.getOperand(1));
1971 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1972 AArch64II::MO_NC);
1973 MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1974 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1975
1976 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1977 Register ForceDstReg) {
1978 Register DstReg = ForceDstReg
1979 ? ForceDstReg
1980 : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1981 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1982 if (auto *GV = dyn_cast<GlobalValue>(V)) {
1983 MovI->addOperand(MF, MachineOperand::CreateGA(
1984 GV, MovZ->getOperand(1).getOffset(), Flags));
1985 } else {
1986 MovI->addOperand(
1987 MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1988 MovZ->getOperand(1).getOffset(), Flags));
1989 }
1990 MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1991 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1992 return DstReg;
1993 };
1994 Register DstReg = BuildMovK(MovZ.getReg(0),
1995 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1996 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1997 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1998 }
1999
2000 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2001 MachineBasicBlock &MBB = *I.getParent();
2002 MachineFunction &MF = *MBB.getParent();
2003 MachineRegisterInfo &MRI = MF.getRegInfo();
2004
2005 switch (I.getOpcode()) {
2006 case TargetOpcode::G_STORE: {
2007 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2008 MachineOperand &SrcOp = I.getOperand(0);
2009 if (MRI.getType(SrcOp.getReg()).isPointer()) {
2010 // Allow matching with imported patterns for stores of pointers. Unlike
2011 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2012 // and constrain.
2013 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2014 Register NewSrc = Copy.getReg(0);
2015 SrcOp.setReg(NewSrc);
2016 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2017 Changed = true;
2018 }
2019 return Changed;
2020 }
2021 case TargetOpcode::G_PTR_ADD:
2022 return convertPtrAddToAdd(I, MRI);
2023 case TargetOpcode::G_LOAD: {
2024 // For scalar loads of pointers, we try to convert the dest type from p0
2025 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2026 // conversion, this should be ok because all users should have been
2027 // selected already, so the type doesn't matter for them.
2028 Register DstReg = I.getOperand(0).getReg();
2029 const LLT DstTy = MRI.getType(DstReg);
2030 if (!DstTy.isPointer())
2031 return false;
2032 MRI.setType(DstReg, LLT::scalar(64));
2033 return true;
2034 }
2035 case AArch64::G_DUP: {
2036 // Convert the type from p0 to s64 to help selection.
2037 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2038 if (!DstTy.getElementType().isPointer())
2039 return false;
2040 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2041 MRI.setType(I.getOperand(0).getReg(),
2042 DstTy.changeElementType(LLT::scalar(64)));
2043 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2044 I.getOperand(1).setReg(NewSrc.getReg(0));
2045 return true;
2046 }
2047 case TargetOpcode::G_UITOFP:
2048 case TargetOpcode::G_SITOFP: {
2049 // If both source and destination regbanks are FPR, then convert the opcode
2050 // to G_SITOF/G_UITOF so that the importer can select it to an fpr variant.
2051 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2052 // copy.
2053 Register SrcReg = I.getOperand(1).getReg();
2054 LLT SrcTy = MRI.getType(SrcReg);
2055 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2056 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2057 return false;
2058
2059 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2060 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2061 I.setDesc(TII.get(AArch64::G_SITOF));
2062 else
2063 I.setDesc(TII.get(AArch64::G_UITOF));
2064 return true;
2065 }
2066 return false;
2067 }
2068 default:
2069 return false;
2070 }
2071 }
2072
2073 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2074 /// them to a standard G_ADD with a COPY on the source.
2075 ///
2076 /// The motivation behind this is to expose the add semantics to the imported
2077 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2078 /// because the selector works bottom up, uses before defs. By the time we
2079 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2080 /// fold this into addressing modes and were therefore unsuccessful.
2081 bool AArch64InstructionSelector::convertPtrAddToAdd(
2082 MachineInstr &I, MachineRegisterInfo &MRI) {
2083 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2084 Register DstReg = I.getOperand(0).getReg();
2085 Register AddOp1Reg = I.getOperand(1).getReg();
2086 const LLT PtrTy = MRI.getType(DstReg);
2087 if (PtrTy.getAddressSpace() != 0)
2088 return false;
2089
2090 const LLT CastPtrTy =
2091 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2092 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2093 // Set regbanks on the registers.
2094 if (PtrTy.isVector())
2095 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2096 else
2097 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2098
2099 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2100 // %dst(intty) = G_ADD %intbase, off
2101 I.setDesc(TII.get(TargetOpcode::G_ADD));
2102 MRI.setType(DstReg, CastPtrTy);
2103 I.getOperand(1).setReg(PtrToInt.getReg(0));
2104 if (!select(*PtrToInt)) {
2105 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2106 return false;
2107 }
2108
2109 // Also take the opportunity here to try to do some optimization.
2110 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
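// For example, if the offset is %off = G_SUB 0, %x, the (already converted)
// G_ADD can instead be selected as %dst = G_SUB %intbase, %x.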
2111 Register NegatedReg;
2112 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2113 return true;
2114 I.getOperand(2).setReg(NegatedReg);
2115 I.setDesc(TII.get(TargetOpcode::G_SUB));
2116 return true;
2117 }
2118
2119 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2120 MachineRegisterInfo &MRI) {
2121 // We try to match the immediate variant of LSL, which is actually an alias
2122 // for a special case of UBFM. Otherwise, we fall back to the imported
2123 // selector which will match the register variant.
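// Illustrative example (assuming the usual UBFM alias encoding): a 64-bit
// "lsl x0, x1, #3" is "ubfm x0, x1, #61, #60", i.e. immr = 64 - shift and
// imms = 63 - shift.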
2124 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2125 const auto &MO = I.getOperand(2);
2126 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2127 if (!VRegAndVal)
2128 return false;
2129
2130 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2131 if (DstTy.isVector())
2132 return false;
2133 bool Is64Bit = DstTy.getSizeInBits() == 64;
2134 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2135 auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2136
2137 if (!Imm1Fn || !Imm2Fn)
2138 return false;
2139
2140 auto NewI =
2141 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2142 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2143
2144 for (auto &RenderFn : *Imm1Fn)
2145 RenderFn(NewI);
2146 for (auto &RenderFn : *Imm2Fn)
2147 RenderFn(NewI);
2148
2149 I.eraseFromParent();
2150 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2151 }
2152
2153 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2154 MachineInstr &I, MachineRegisterInfo &MRI) {
2155 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2156 // If we're storing a scalar, it doesn't matter what register bank that
2157 // scalar is on. All that matters is the size.
2158 //
2159 // So, if we see something like this (with a 32-bit scalar as an example):
2160 //
2161 // %x:gpr(s32) = ... something ...
2162 // %y:fpr(s32) = COPY %x:gpr(s32)
2163 // G_STORE %y:fpr(s32)
2164 //
2165 // We can fix this up into something like this:
2166 //
2167 // G_STORE %x:gpr(s32)
2168 //
2169 // And then continue the selection process normally.
2170 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2171 if (!DefDstReg.isValid())
2172 return false;
2173 LLT DefDstTy = MRI.getType(DefDstReg);
2174 Register StoreSrcReg = I.getOperand(0).getReg();
2175 LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2176
2177 // If we get something strange like a physical register, then we shouldn't
2178 // go any further.
2179 if (!DefDstTy.isValid())
2180 return false;
2181
2182 // Are the source and dst types the same size?
2183 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2184 return false;
2185
2186 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2187 RBI.getRegBank(DefDstReg, MRI, TRI))
2188 return false;
2189
2190 // We have a cross-bank copy, which is entering a store. Let's fold it.
2191 I.getOperand(0).setReg(DefDstReg);
2192 return true;
2193 }
2194
2195 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2196 assert(I.getParent() && "Instruction should be in a basic block!");
2197 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2198
2199 MachineBasicBlock &MBB = *I.getParent();
2200 MachineFunction &MF = *MBB.getParent();
2201 MachineRegisterInfo &MRI = MF.getRegInfo();
2202
2203 switch (I.getOpcode()) {
2204 case AArch64::G_DUP: {
2205 // Before selecting a DUP instruction, check if it is better selected as a
2206 // MOV or load from a constant pool.
2207 Register Src = I.getOperand(1).getReg();
2208 auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
2209 if (!ValAndVReg)
2210 return false;
2211 LLVMContext &Ctx = MF.getFunction().getContext();
2212 Register Dst = I.getOperand(0).getReg();
2213 auto *CV = ConstantDataVector::getSplat(
2214 MRI.getType(Dst).getNumElements(),
2215 ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2216 ValAndVReg->Value));
2217 if (!emitConstantVector(Dst, CV, MIB, MRI))
2218 return false;
2219 I.eraseFromParent();
2220 return true;
2221 }
2222 case TargetOpcode::G_SEXT:
2223 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2224 // over a normal extend.
2225 if (selectUSMovFromExtend(I, MRI))
2226 return true;
2227 return false;
2228 case TargetOpcode::G_BR:
2229 return false;
2230 case TargetOpcode::G_SHL:
2231 return earlySelectSHL(I, MRI);
2232 case TargetOpcode::G_CONSTANT: {
2233 bool IsZero = false;
2234 if (I.getOperand(1).isCImm())
2235 IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2236 else if (I.getOperand(1).isImm())
2237 IsZero = I.getOperand(1).getImm() == 0;
2238
2239 if (!IsZero)
2240 return false;
2241
2242 Register DefReg = I.getOperand(0).getReg();
2243 LLT Ty = MRI.getType(DefReg);
2244 if (Ty.getSizeInBits() == 64) {
2245 I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2246 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2247 } else if (Ty.getSizeInBits() == 32) {
2248 I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2249 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2250 } else
2251 return false;
2252
2253 I.setDesc(TII.get(TargetOpcode::COPY));
2254 return true;
2255 }
2256
2257 case TargetOpcode::G_ADD: {
2258 // Check if this is being fed by a G_ICMP on either side.
2259 //
2260 // (cmp pred, x, y) + z
2261 //
2262 // In the above case, when the cmp is true, we increment z by 1. So, we can
2263 // fold the add into the cset for the cmp by using cinc.
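// Illustrative example: "%add = G_ADD %z, (G_ICMP eq, %a, %b)" can become a
// compare followed by "cinc %add, %z, eq", i.e. a CSINC on the inverted
// condition, which is what emitCSINC below produces.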
2264 //
2265 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2266 Register AddDst = I.getOperand(0).getReg();
2267 Register AddLHS = I.getOperand(1).getReg();
2268 Register AddRHS = I.getOperand(2).getReg();
2269 // Only handle scalars.
2270 LLT Ty = MRI.getType(AddLHS);
2271 if (Ty.isVector())
2272 return false;
2273 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2274 // bits.
2275 unsigned Size = Ty.getSizeInBits();
2276 if (Size != 32 && Size != 64)
2277 return false;
2278 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2279 if (!MRI.hasOneNonDBGUse(Reg))
2280 return nullptr;
2281 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2282 // compare.
2283 if (Size == 32)
2284 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2285 // We model scalar compares using 32-bit destinations right now.
2286 // If it's a 64-bit compare, it'll have 64-bit sources.
2287 Register ZExt;
2288 if (!mi_match(Reg, MRI,
2289 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2290 return nullptr;
2291 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2292 if (!Cmp ||
2293 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2294 return nullptr;
2295 return Cmp;
2296 };
2297 // Try to match
2298 // z + (cmp pred, x, y)
2299 MachineInstr *Cmp = MatchCmp(AddRHS);
2300 if (!Cmp) {
2301 // (cmp pred, x, y) + z
2302 std::swap(AddLHS, AddRHS);
2303 Cmp = MatchCmp(AddRHS);
2304 if (!Cmp)
2305 return false;
2306 }
2307 auto &PredOp = Cmp->getOperand(1);
2308 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2309 const AArch64CC::CondCode InvCC =
2310 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2311 MIB.setInstrAndDebugLoc(I);
2312 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2313 /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2314 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2315 I.eraseFromParent();
2316 return true;
2317 }
2318 case TargetOpcode::G_OR: {
2319 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2320 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2321 // shifting and masking that we can replace with a BFI (encoded as a BFM).
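// Rough 32-bit example: Dst = (ShiftSrc << 8) | (MaskSrc & 0xff) becomes
// BFMWri Dst, MaskSrc, ShiftSrc, #24, #23, i.e. "bfi Dst, ShiftSrc, #8, #24".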
2322 Register Dst = I.getOperand(0).getReg();
2323 LLT Ty = MRI.getType(Dst);
2324
2325 if (!Ty.isScalar())
2326 return false;
2327
2328 unsigned Size = Ty.getSizeInBits();
2329 if (Size != 32 && Size != 64)
2330 return false;
2331
2332 Register ShiftSrc;
2333 int64_t ShiftImm;
2334 Register MaskSrc;
2335 int64_t MaskImm;
2336 if (!mi_match(
2337 Dst, MRI,
2338 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2339 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2340 return false;
2341
2342 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2343 return false;
2344
2345 int64_t Immr = Size - ShiftImm;
2346 int64_t Imms = Size - ShiftImm - 1;
2347 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2348 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2349 I.eraseFromParent();
2350 return true;
2351 }
2352 case TargetOpcode::G_FENCE: {
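// G_FENCE carries the atomic ordering in operand 0 and the sync scope in
// operand 1 (roughly: scope 0 is singlethread, ordering 4 is acquire). A
// singlethread fence only needs a compiler barrier, while an acquire-only
// fence can use the weaker "dmb ishld" (0x9) rather than "dmb ish" (0xb).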
2353 if (I.getOperand(1).getImm() == 0)
2354 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2355 else
2356 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2357 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2358 I.eraseFromParent();
2359 return true;
2360 }
2361 default:
2362 return false;
2363 }
2364 }
2365
2366 bool AArch64InstructionSelector::select(MachineInstr &I) {
2367 assert(I.getParent() && "Instruction should be in a basic block!");
2368 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2369
2370 MachineBasicBlock &MBB = *I.getParent();
2371 MachineFunction &MF = *MBB.getParent();
2372 MachineRegisterInfo &MRI = MF.getRegInfo();
2373
2374 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2375 if (Subtarget->requiresStrictAlign()) {
2376 // We don't support this feature yet.
2377 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2378 return false;
2379 }
2380
2381 MIB.setInstrAndDebugLoc(I);
2382
2383 unsigned Opcode = I.getOpcode();
2384 // G_PHI requires same handling as PHI
2385 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2386 // Certain non-generic instructions also need some special handling.
2387
2388 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2389 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2390
2391 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2392 const Register DefReg = I.getOperand(0).getReg();
2393 const LLT DefTy = MRI.getType(DefReg);
2394
2395 const RegClassOrRegBank &RegClassOrBank =
2396 MRI.getRegClassOrRegBank(DefReg);
2397
2398 const TargetRegisterClass *DefRC
2399 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2400 if (!DefRC) {
2401 if (!DefTy.isValid()) {
2402 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2403 return false;
2404 }
2405 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2406 DefRC = getRegClassForTypeOnBank(DefTy, RB);
2407 if (!DefRC) {
2408 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2409 return false;
2410 }
2411 }
2412
2413 I.setDesc(TII.get(TargetOpcode::PHI));
2414
2415 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2416 }
2417
2418 if (I.isCopy())
2419 return selectCopy(I, TII, MRI, TRI, RBI);
2420
2421 if (I.isDebugInstr())
2422 return selectDebugInstr(I, MRI, RBI);
2423
2424 return true;
2425 }
2426
2427
2428 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2429 LLVM_DEBUG(
2430 dbgs() << "Generic instruction has unexpected implicit operands\n");
2431 return false;
2432 }
2433
2434 // Try to do some lowering before we start instruction selecting. These
2435 // lowerings are purely transformations on the input G_MIR and so selection
2436 // must continue after any modification of the instruction.
2437 if (preISelLower(I)) {
2438 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2439 }
2440
2441 // There may be patterns where the importer can't deal with them optimally,
2442 // but does select it to a suboptimal sequence so our custom C++ selection
2443 // code later never has a chance to work on it. Therefore, we have an early
2444 // selection attempt here to give priority to certain selection routines
2445 // over the imported ones.
2446 if (earlySelect(I))
2447 return true;
2448
2449 if (selectImpl(I, *CoverageInfo))
2450 return true;
2451
2452 LLT Ty =
2453 I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2454
2455 switch (Opcode) {
2456 case TargetOpcode::G_SBFX:
2457 case TargetOpcode::G_UBFX: {
2458 static const unsigned OpcTable[2][2] = {
2459 {AArch64::UBFMWri, AArch64::UBFMXri},
2460 {AArch64::SBFMWri, AArch64::SBFMXri}};
2461 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2462 unsigned Size = Ty.getSizeInBits();
2463 unsigned Opc = OpcTable[IsSigned][Size == 64];
2464 auto Cst1 =
2465 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2466 assert(Cst1 && "Should have gotten a constant for src 1?");
2467 auto Cst2 =
2468 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2469 assert(Cst2 && "Should have gotten a constant for src 2?");
2470 auto LSB = Cst1->Value.getZExtValue();
2471 auto Width = Cst2->Value.getZExtValue();
2472 auto BitfieldInst =
2473 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2474 .addImm(LSB)
2475 .addImm(LSB + Width - 1);
2476 I.eraseFromParent();
2477 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2478 }
2479 case TargetOpcode::G_BRCOND:
2480 return selectCompareBranch(I, MF, MRI);
2481
2482 case TargetOpcode::G_BRINDIRECT: {
2483 I.setDesc(TII.get(AArch64::BR));
2484 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2485 }
2486
2487 case TargetOpcode::G_BRJT:
2488 return selectBrJT(I, MRI);
2489
2490 case AArch64::G_ADD_LOW: {
2491 // This op may have been separated from its ADRP companion by the localizer
2492 // or some other code motion pass. Given that many CPUs will try to
2493 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2494 // which will later be expanded into an ADRP+ADD pair after scheduling.
2495 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2496 if (BaseMI->getOpcode() != AArch64::ADRP) {
2497 I.setDesc(TII.get(AArch64::ADDXri));
2498 I.addOperand(MachineOperand::CreateImm(0));
2499 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2500 }
2501 assert(TM.getCodeModel() == CodeModel::Small &&
2502 "Expected small code model");
2503 auto Op1 = BaseMI->getOperand(1);
2504 auto Op2 = I.getOperand(2);
2505 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2506 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2507 Op1.getTargetFlags())
2508 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2509 Op2.getTargetFlags());
2510 I.eraseFromParent();
2511 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2512 }
2513
2514 case TargetOpcode::G_BSWAP: {
2515 // Handle vector types for G_BSWAP directly.
2516 Register DstReg = I.getOperand(0).getReg();
2517 LLT DstTy = MRI.getType(DstReg);
2518
2519 // We should only get vector types here; everything else is handled by the
2520 // importer right now.
2521 if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2522 LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2523 return false;
2524 }
2525
2526 // Only handle 4 and 2 element vectors for now.
2527 // TODO: 16-bit elements.
2528 unsigned NumElts = DstTy.getNumElements();
2529 if (NumElts != 4 && NumElts != 2) {
2530 LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2531 return false;
2532 }
2533
2534 // Choose the correct opcode for the supported types. Right now, that's
2535 // v2s32, v4s32, and v2s64.
2536 unsigned Opc = 0;
2537 unsigned EltSize = DstTy.getElementType().getSizeInBits();
2538 if (EltSize == 32)
2539 Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2540 : AArch64::REV32v16i8;
2541 else if (EltSize == 64)
2542 Opc = AArch64::REV64v16i8;
2543
2544 // We should always get something by the time we get here...
2545 assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2546
2547 I.setDesc(TII.get(Opc));
2548 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2549 }
2550
2551 case TargetOpcode::G_FCONSTANT:
2552 case TargetOpcode::G_CONSTANT: {
2553 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2554
2555 const LLT s8 = LLT::scalar(8);
2556 const LLT s16 = LLT::scalar(16);
2557 const LLT s32 = LLT::scalar(32);
2558 const LLT s64 = LLT::scalar(64);
2559 const LLT s128 = LLT::scalar(128);
2560 const LLT p0 = LLT::pointer(0, 64);
2561
2562 const Register DefReg = I.getOperand(0).getReg();
2563 const LLT DefTy = MRI.getType(DefReg);
2564 const unsigned DefSize = DefTy.getSizeInBits();
2565 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2566
2567 // FIXME: Redundant check, but even less readable when factored out.
2568 if (isFP) {
2569 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2570 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2571 << " constant, expected: " << s16 << " or " << s32
2572 << " or " << s64 << " or " << s128 << '\n');
2573 return false;
2574 }
2575
2576 if (RB.getID() != AArch64::FPRRegBankID) {
2577 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2578 << " constant on bank: " << RB
2579 << ", expected: FPR\n");
2580 return false;
2581 }
2582
2583 // The case when we have 0.0 is covered by tablegen. Reject it here so we
2584 // can be sure tablegen works correctly and isn't rescued by this code.
2585 // 0.0 is not covered by tablegen for FP128. So we will handle this
2586 // scenario in the code here.
2587 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2588 return false;
2589 } else {
2590 // s32 and s64 are covered by tablegen.
2591 if (Ty != p0 && Ty != s8 && Ty != s16) {
2592 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2593 << " constant, expected: " << s32 << ", " << s64
2594 << ", or " << p0 << '\n');
2595 return false;
2596 }
2597
2598 if (RB.getID() != AArch64::GPRRegBankID) {
2599 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2600 << " constant on bank: " << RB
2601 << ", expected: GPR\n");
2602 return false;
2603 }
2604 }
2605
2606 if (isFP) {
2607 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2608 // For 16, 64, and 128b values, emit a constant pool load.
2609 switch (DefSize) {
2610 default:
2611 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2612 case 32:
2613 // For s32, use a cp load if we have optsize/minsize.
2614 if (!shouldOptForSize(&MF))
2615 break;
2616 [[fallthrough]];
2617 case 16:
2618 case 64:
2619 case 128: {
2620 auto *FPImm = I.getOperand(1).getFPImm();
2621 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2622 if (!LoadMI) {
2623 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2624 return false;
2625 }
2626 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2627 I.eraseFromParent();
2628 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2629 }
2630 }
2631
2632 // Otherwise, emit a normal mov into a GPR plus a copy to the FPR (an FMOV).
2633 assert(DefSize == 32 &&
2634 "Expected constant pool loads for all sizes other than 32!");
2635 const Register DefGPRReg =
2636 MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2637 MachineOperand &RegOp = I.getOperand(0);
2638 RegOp.setReg(DefGPRReg);
2639 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2640 MIB.buildCopy({DefReg}, {DefGPRReg});
2641
2642 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2643 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2644 return false;
2645 }
2646
2647 MachineOperand &ImmOp = I.getOperand(1);
2648 // FIXME: Is going through int64_t always correct?
2649 ImmOp.ChangeToImmediate(
2650 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2651 } else if (I.getOperand(1).isCImm()) {
2652 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2653 I.getOperand(1).ChangeToImmediate(Val);
2654 } else if (I.getOperand(1).isImm()) {
2655 uint64_t Val = I.getOperand(1).getImm();
2656 I.getOperand(1).ChangeToImmediate(Val);
2657 }
2658
2659 const unsigned MovOpc =
2660 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2661 I.setDesc(TII.get(MovOpc));
2662 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2663 return true;
2664 }
2665 case TargetOpcode::G_EXTRACT: {
2666 Register DstReg = I.getOperand(0).getReg();
2667 Register SrcReg = I.getOperand(1).getReg();
2668 LLT SrcTy = MRI.getType(SrcReg);
2669 LLT DstTy = MRI.getType(DstReg);
2670 (void)DstTy;
2671 unsigned SrcSize = SrcTy.getSizeInBits();
2672
2673 if (SrcTy.getSizeInBits() > 64) {
2674 // This should be an extract of an s128, which is like a vector extract.
2675 if (SrcTy.getSizeInBits() != 128)
2676 return false;
2677 // Only support extracting 64 bits from an s128 at the moment.
2678 if (DstTy.getSizeInBits() != 64)
2679 return false;
2680
2681 unsigned Offset = I.getOperand(2).getImm();
2682 if (Offset % 64 != 0)
2683 return false;
2684
2685 // Check we have the right regbank always.
2686 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2687 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2688 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2689
2690 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2691 auto NewI =
2692 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2693 .addUse(SrcReg, 0,
2694 Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2695 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2696 AArch64::GPR64RegClass, NewI->getOperand(0));
2697 I.eraseFromParent();
2698 return true;
2699 }
2700
2701 // Emit the same code as a vector extract.
2702 // Offset must be a multiple of 64.
2703 unsigned LaneIdx = Offset / 64;
2704 MachineInstr *Extract = emitExtractVectorElt(
2705 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2706 if (!Extract)
2707 return false;
2708 I.eraseFromParent();
2709 return true;
2710 }
2711
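// The scalar case below forms a UBFM: ubfm dst, src, #Offset,
// #(Offset + Width - 1), which is the "ubfx dst, src, #Offset, #Width" alias
// (e.g. extracting 16 bits at offset 8 from an s32 gives #8, #23).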
2712 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2713 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2714 Ty.getSizeInBits() - 1);
2715
2716 if (SrcSize < 64) {
2717 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2718 "unexpected G_EXTRACT types");
2719 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2720 }
2721
2722 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2723 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2724 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2725 .addReg(DstReg, 0, AArch64::sub_32);
2726 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2727 AArch64::GPR32RegClass, MRI);
2728 I.getOperand(0).setReg(DstReg);
2729
2730 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2731 }
2732
2733 case TargetOpcode::G_INSERT: {
2734 LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2735 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2736 unsigned DstSize = DstTy.getSizeInBits();
2737 // Larger inserts are vectors, same-size ones should be something else by
2738 // now (split up or turned into COPYs).
2739 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2740 return false;
2741
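// The scalar case below forms a BFM whose immediates encode the
// "bfi dst, src, #LSB, #Width" alias: immr = (DstSize - LSB) % DstSize and
// imms = Width - 1 (e.g. inserting an s16 at bit 8 of an s32 gives #24, #15).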
2742 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2743 unsigned LSB = I.getOperand(3).getImm();
2744 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2745 I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2746 MachineInstrBuilder(MF, I).addImm(Width - 1);
2747
2748 if (DstSize < 64) {
2749 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2750 "unexpected G_INSERT types");
2751 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2752 }
2753
2754 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2755 BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2756 TII.get(AArch64::SUBREG_TO_REG))
2757 .addDef(SrcReg)
2758 .addImm(0)
2759 .addUse(I.getOperand(2).getReg())
2760 .addImm(AArch64::sub_32);
2761 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2762 AArch64::GPR32RegClass, MRI);
2763 I.getOperand(2).setReg(SrcReg);
2764
2765 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2766 }
2767 case TargetOpcode::G_FRAME_INDEX: {
2768 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2769 if (Ty != LLT::pointer(0, 64)) {
2770 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2771 << ", expected: " << LLT::pointer(0, 64) << '\n');
2772 return false;
2773 }
2774 I.setDesc(TII.get(AArch64::ADDXri));
2775
2776 // MOs for a #0 shifted immediate.
2777 I.addOperand(MachineOperand::CreateImm(0));
2778 I.addOperand(MachineOperand::CreateImm(0));
2779
2780 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2781 }
2782
2783 case TargetOpcode::G_GLOBAL_VALUE: {
2784 auto GV = I.getOperand(1).getGlobal();
2785 if (GV->isThreadLocal())
2786 return selectTLSGlobalValue(I, MRI);
2787
2788 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2789 if (OpFlags & AArch64II::MO_GOT) {
2790 I.setDesc(TII.get(AArch64::LOADgot));
2791 I.getOperand(1).setTargetFlags(OpFlags);
2792 } else if (TM.getCodeModel() == CodeModel::Large) {
2793 // Materialize the global using movz/movk instructions.
2794 materializeLargeCMVal(I, GV, OpFlags);
2795 I.eraseFromParent();
2796 return true;
2797 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2798 I.setDesc(TII.get(AArch64::ADR));
2799 I.getOperand(1).setTargetFlags(OpFlags);
2800 } else {
2801 I.setDesc(TII.get(AArch64::MOVaddr));
2802 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2803 MachineInstrBuilder MIB(MF, I);
2804 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2805 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2806 }
2807 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2808 }
2809
2810 case TargetOpcode::G_ZEXTLOAD:
2811 case TargetOpcode::G_LOAD:
2812 case TargetOpcode::G_STORE: {
2813 GLoadStore &LdSt = cast<GLoadStore>(I);
2814 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2815 LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2816
2817 if (PtrTy != LLT::pointer(0, 64)) {
2818 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2819 << ", expected: " << LLT::pointer(0, 64) << '\n');
2820 return false;
2821 }
2822
2823 uint64_t MemSizeInBytes = LdSt.getMemSize();
2824 unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2825 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2826
2827 // Need special instructions for atomics that affect ordering.
2828 if (Order != AtomicOrdering::NotAtomic &&
2829 Order != AtomicOrdering::Unordered &&
2830 Order != AtomicOrdering::Monotonic) {
2831 assert(!isa<GZExtLoad>(LdSt));
2832 if (MemSizeInBytes > 64)
2833 return false;
2834
2835 if (isa<GLoad>(LdSt)) {
2836 static constexpr unsigned LDAPROpcodes[] = {
2837 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2838 static constexpr unsigned LDAROpcodes[] = {
2839 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2840 ArrayRef<unsigned> Opcodes =
2841 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2842 ? LDAPROpcodes
2843 : LDAROpcodes;
2844 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2845 } else {
2846 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2847 AArch64::STLRW, AArch64::STLRX};
2848 Register ValReg = LdSt.getReg(0);
2849 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2850 // Emit a subreg copy of 32 bits.
2851 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2852 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2853 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2854 I.getOperand(0).setReg(NewVal);
2855 }
2856 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2857 }
2858 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2859 return true;
2860 }
2861
2862 #ifndef NDEBUG
2863 const Register PtrReg = LdSt.getPointerReg();
2864 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2865 // Check that the pointer register is valid.
2866 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2867 "Load/Store pointer operand isn't a GPR");
2868 assert(MRI.getType(PtrReg).isPointer() &&
2869 "Load/Store pointer operand isn't a pointer");
2870 #endif
2871
2872 const Register ValReg = LdSt.getReg(0);
2873 const LLT ValTy = MRI.getType(ValReg);
2874 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2875
2876 // The code below doesn't support truncating stores, so we need to split it
2877 // again.
2878 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2879 unsigned SubReg;
2880 LLT MemTy = LdSt.getMMO().getMemoryType();
2881 auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2882 if (!getSubRegForClass(RC, TRI, SubReg))
2883 return false;
2884
2885 // Generate a subreg copy.
2886 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2887 .addReg(ValReg, 0, SubReg)
2888 .getReg(0);
2889 RBI.constrainGenericRegister(Copy, *RC, MRI);
2890 LdSt.getOperand(0).setReg(Copy);
2891 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2892 // If this is an any-extending load from the FPR bank, split it into a regular
2893 // load + extend.
2894 if (RB.getID() == AArch64::FPRRegBankID) {
2895 unsigned SubReg;
2896 LLT MemTy = LdSt.getMMO().getMemoryType();
2897 auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2898 if (!getSubRegForClass(RC, TRI, SubReg))
2899 return false;
2900 Register OldDst = LdSt.getReg(0);
2901 Register NewDst =
2902 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2903 LdSt.getOperand(0).setReg(NewDst);
2904 MRI.setRegBank(NewDst, RB);
2905 // Generate a SUBREG_TO_REG to extend it.
2906 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2907 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2908 .addImm(0)
2909 .addUse(NewDst)
2910 .addImm(SubReg);
2911 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2912 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2913 MIB.setInstr(LdSt);
2914 }
2915 }
2916
2917 // Helper lambda for partially selecting I. Either returns the original
2918 // instruction with an updated opcode, or a new instruction.
2919 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2920 bool IsStore = isa<GStore>(I);
2921 const unsigned NewOpc =
2922 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2923 if (NewOpc == I.getOpcode())
2924 return nullptr;
2925 // Check if we can fold anything into the addressing mode.
2926 auto AddrModeFns =
2927 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2928 if (!AddrModeFns) {
2929 // Can't fold anything. Use the original instruction.
2930 I.setDesc(TII.get(NewOpc));
2931 I.addOperand(MachineOperand::CreateImm(0));
2932 return &I;
2933 }
2934
2935 // Folded something. Create a new instruction and return it.
2936 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2937 Register CurValReg = I.getOperand(0).getReg();
2938 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2939 NewInst.cloneMemRefs(I);
2940 for (auto &Fn : *AddrModeFns)
2941 Fn(NewInst);
2942 I.eraseFromParent();
2943 return &*NewInst;
2944 };
2945
2946 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2947 if (!LoadStore)
2948 return false;
2949
2950 // If we're storing a 0, use WZR/XZR.
2951 if (Opcode == TargetOpcode::G_STORE) {
2952 auto CVal = getIConstantVRegValWithLookThrough(
2953 LoadStore->getOperand(0).getReg(), MRI);
2954 if (CVal && CVal->Value == 0) {
2955 switch (LoadStore->getOpcode()) {
2956 case AArch64::STRWui:
2957 case AArch64::STRHHui:
2958 case AArch64::STRBBui:
2959 LoadStore->getOperand(0).setReg(AArch64::WZR);
2960 break;
2961 case AArch64::STRXui:
2962 LoadStore->getOperand(0).setReg(AArch64::XZR);
2963 break;
2964 }
2965 }
2966 }
2967
2968 if (IsZExtLoad) {
2969 // The zextload from a smaller type to i32 should be handled by the
2970 // importer.
2971 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2972 return false;
2973 // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2974 // and zero_extend with SUBREG_TO_REG.
2975 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2976 Register DstReg = LoadStore->getOperand(0).getReg();
2977 LoadStore->getOperand(0).setReg(LdReg);
2978
2979 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2980 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2981 .addImm(0)
2982 .addUse(LdReg)
2983 .addImm(AArch64::sub_32);
2984 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2985 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2986 MRI);
2987 }
2988 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2989 }
2990
2991 case TargetOpcode::G_SMULH:
2992 case TargetOpcode::G_UMULH: {
2993 // Reject the various things we don't support yet.
2994 if (unsupportedBinOp(I, RBI, MRI, TRI))
2995 return false;
2996
2997 const Register DefReg = I.getOperand(0).getReg();
2998 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2999
3000 if (RB.getID() != AArch64::GPRRegBankID) {
3001 LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
3002 return false;
3003 }
3004
3005 if (Ty != LLT::scalar(64)) {
3006 LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
3007 << ", expected: " << LLT::scalar(64) << '\n');
3008 return false;
3009 }
3010
3011 unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
3012 : AArch64::UMULHrr;
3013 I.setDesc(TII.get(NewOpc));
3014
3015 // Now that we selected an opcode, we need to constrain the register
3016 // operands to use appropriate classes.
3017 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3018 }
3019 case TargetOpcode::G_LSHR:
3020 case TargetOpcode::G_ASHR:
3021 if (MRI.getType(I.getOperand(0).getReg()).isVector())
3022 return selectVectorAshrLshr(I, MRI);
3023 [[fallthrough]];
3024 case TargetOpcode::G_SHL:
3025 if (Opcode == TargetOpcode::G_SHL &&
3026 MRI.getType(I.getOperand(0).getReg()).isVector())
3027 return selectVectorSHL(I, MRI);
3028
3029 // These shifts were legalized to have 64-bit shift amounts because we
3030 // want to take advantage of the selection patterns that assume the
3031 // immediates are s64s. However, selectBinaryOp will assume both operands
3032 // have the same bit size.
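// For illustration (vreg names invented), a 32-bit shift with an s64 amount
//   %d:gpr(s32) = G_SHL %x:gpr(s32), %amt:gpr(s64)
// has its amount narrowed with a sub_32 subregister copy before selection:
//   %amt32:gpr(s32) = COPY %amt.sub_32
//   %d:gpr(s32) = G_SHL %x:gpr(s32), %amt32:gpr(s32)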
3033 {
3034 Register SrcReg = I.getOperand(1).getReg();
3035 Register ShiftReg = I.getOperand(2).getReg();
3036 const LLT ShiftTy = MRI.getType(ShiftReg);
3037 const LLT SrcTy = MRI.getType(SrcReg);
3038 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3039 ShiftTy.getSizeInBits() == 64) {
3040 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3041 // Insert a subregister copy to implement a 64->32 trunc
3042 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3043 .addReg(ShiftReg, 0, AArch64::sub_32);
3044 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3045 I.getOperand(2).setReg(Trunc.getReg(0));
3046 }
3047 }
3048 [[fallthrough]];
3049 case TargetOpcode::G_OR: {
3050 // Reject the various things we don't support yet.
3051 if (unsupportedBinOp(I, RBI, MRI, TRI))
3052 return false;
3053
3054 const unsigned OpSize = Ty.getSizeInBits();
3055
3056 const Register DefReg = I.getOperand(0).getReg();
3057 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3058
3059 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3060 if (NewOpc == I.getOpcode())
3061 return false;
3062
3063 I.setDesc(TII.get(NewOpc));
3064 // FIXME: Should the type be always reset in setDesc?
3065
3066 // Now that we selected an opcode, we need to constrain the register
3067 // operands to use appropriate classes.
3068 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3069 }
3070
3071 case TargetOpcode::G_PTR_ADD: {
3072 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3073 I.eraseFromParent();
3074 return true;
3075 }
3076 case TargetOpcode::G_SADDO:
3077 case TargetOpcode::G_UADDO:
3078 case TargetOpcode::G_SSUBO:
3079 case TargetOpcode::G_USUBO: {
3080 // Emit the operation and get the correct condition code.
3081 auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
3082 I.getOperand(2), I.getOperand(3), MIB);
3083
3084 // Now, put the overflow result in the register given by the first operand
3085 // to the overflow op. CSINC increments the result when the predicate is
3086 // false, so to get the increment when it's true, we need to use the
3087 // inverse. In this case, we want to increment when carry is set.
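// For illustration (vreg names invented), G_UADDO is emitted roughly as
//   %res:gpr(s32), %carry:gpr(s32) = G_UADDO %a, %b
// =>
//   %res = ADDSWrr %a, %b                        ; sets NZCV
//   %carry = CSINCWr wzr, wzr, <inverted cond>   ; 1 exactly when the add carries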
3088 Register ZReg = AArch64::WZR;
3089 emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg,
3090 getInvertedCondCode(OpAndCC.second), MIB);
3091 I.eraseFromParent();
3092 return true;
3093 }
3094
3095 case TargetOpcode::G_PTRMASK: {
3096 Register MaskReg = I.getOperand(2).getReg();
3097 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3098 // TODO: Implement arbitrary cases
3099 if (!MaskVal || !isShiftedMask_64(*MaskVal))
3100 return false;
3101
3102 uint64_t Mask = *MaskVal;
3103 I.setDesc(TII.get(AArch64::ANDXri));
3104 I.getOperand(2).ChangeToImmediate(
3105 AArch64_AM::encodeLogicalImmediate(Mask, 64));
3106
3107 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3108 }
3109 case TargetOpcode::G_PTRTOINT:
3110 case TargetOpcode::G_TRUNC: {
3111 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3112 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3113
3114 const Register DstReg = I.getOperand(0).getReg();
3115 const Register SrcReg = I.getOperand(1).getReg();
3116
3117 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3118 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3119
3120 if (DstRB.getID() != SrcRB.getID()) {
3121 LLVM_DEBUG(
3122 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3123 return false;
3124 }
3125
3126 if (DstRB.getID() == AArch64::GPRRegBankID) {
3127 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3128 if (!DstRC)
3129 return false;
3130
3131 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3132 if (!SrcRC)
3133 return false;
3134
3135 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3136 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3137 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3138 return false;
3139 }
3140
3141 if (DstRC == SrcRC) {
3142 // Nothing to be done
3143 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3144 SrcTy == LLT::scalar(64)) {
3145 llvm_unreachable("TableGen can import this case");
3146 return false;
3147 } else if (DstRC == &AArch64::GPR32RegClass &&
3148 SrcRC == &AArch64::GPR64RegClass) {
3149 I.getOperand(1).setSubReg(AArch64::sub_32);
3150 } else {
3151 LLVM_DEBUG(
3152 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3153 return false;
3154 }
3155
3156 I.setDesc(TII.get(TargetOpcode::COPY));
3157 return true;
3158 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3159 if (DstTy == LLT::fixed_vector(4, 16) &&
3160 SrcTy == LLT::fixed_vector(4, 32)) {
3161 I.setDesc(TII.get(AArch64::XTNv4i16));
3162 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3163 return true;
3164 }
3165
3166 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3167 MachineInstr *Extract = emitExtractVectorElt(
3168 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3169 if (!Extract)
3170 return false;
3171 I.eraseFromParent();
3172 return true;
3173 }
3174
3175 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3176 if (Opcode == TargetOpcode::G_PTRTOINT) {
3177 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3178 I.setDesc(TII.get(TargetOpcode::COPY));
3179 return selectCopy(I, TII, MRI, TRI, RBI);
3180 }
3181 }
3182
3183 return false;
3184 }
3185
3186 case TargetOpcode::G_ANYEXT: {
3187 if (selectUSMovFromExtend(I, MRI))
3188 return true;
3189
3190 const Register DstReg = I.getOperand(0).getReg();
3191 const Register SrcReg = I.getOperand(1).getReg();
3192
3193 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3194 if (RBDst.getID() != AArch64::GPRRegBankID) {
3195 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3196 << ", expected: GPR\n");
3197 return false;
3198 }
3199
3200 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3201 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3202 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3203 << ", expected: GPR\n");
3204 return false;
3205 }
3206
3207 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3208
3209 if (DstSize == 0) {
3210 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3211 return false;
3212 }
3213
3214 if (DstSize != 64 && DstSize > 32) {
3215 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3216 << ", expected: 32 or 64\n");
3217 return false;
3218 }
3219 // At this point G_ANYEXT is just like a plain COPY, but we need
3220 // to explicitly form the 64-bit value if any.
3221 if (DstSize > 32) {
3222 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3223 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3224 .addDef(ExtSrc)
3225 .addImm(0)
3226 .addUse(SrcReg)
3227 .addImm(AArch64::sub_32);
3228 I.getOperand(1).setReg(ExtSrc);
3229 }
3230 return selectCopy(I, TII, MRI, TRI, RBI);
3231 }
3232
3233 case TargetOpcode::G_ZEXT:
3234 case TargetOpcode::G_SEXT_INREG:
3235 case TargetOpcode::G_SEXT: {
3236 if (selectUSMovFromExtend(I, MRI))
3237 return true;
3238
3239 unsigned Opcode = I.getOpcode();
3240 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3241 const Register DefReg = I.getOperand(0).getReg();
3242 Register SrcReg = I.getOperand(1).getReg();
3243 const LLT DstTy = MRI.getType(DefReg);
3244 const LLT SrcTy = MRI.getType(SrcReg);
3245 unsigned DstSize = DstTy.getSizeInBits();
3246 unsigned SrcSize = SrcTy.getSizeInBits();
3247
3248 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3249 // extended is encoded in the imm.
3250 if (Opcode == TargetOpcode::G_SEXT_INREG)
3251 SrcSize = I.getOperand(2).getImm();
3252
3253 if (DstTy.isVector())
3254 return false; // Should be handled by imported patterns.
3255
3256 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3257 AArch64::GPRRegBankID &&
3258 "Unexpected ext regbank");
3259
3260 MachineInstr *ExtI;
3261
3262 // First check whether we're extending the result of a load whose dest type is
3263 // smaller than 32 bits; if so, this zext is redundant. GPR32 is the smallest
3264 // GPR register on AArch64, and all smaller loads automatically
3265 // zero-extend the upper bits. E.g.
3266 // %v(s8) = G_LOAD %p, :: (load 1)
3267 // %v2(s32) = G_ZEXT %v(s8)
3268 if (!IsSigned) {
3269 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3270 bool IsGPR =
3271 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3272 if (LoadMI && IsGPR) {
3273 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3274 unsigned BytesLoaded = MemOp->getSize();
3275 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3276 return selectCopy(I, TII, MRI, TRI, RBI);
3277 }
3278
3279 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3280 // + SUBREG_TO_REG.
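// For illustration (vreg names invented), roughly:
//   %t:gpr32 = ORRWrs wzr, %src, 0               ; 32-bit mov; writing a W register
//                                                ; zeroes the upper 32 bits
//   %dst:gpr64 = SUBREG_TO_REG 0, %t, %subreg.sub_32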
3281 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3282 Register SubregToRegSrc =
3283 MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3284 const Register ZReg = AArch64::WZR;
3285 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3286 .addImm(0);
3287
3288 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3289 .addImm(0)
3290 .addUse(SubregToRegSrc)
3291 .addImm(AArch64::sub_32);
3292
3293 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3294 MRI)) {
3295 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3296 return false;
3297 }
3298
3299 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3300 MRI)) {
3301 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3302 return false;
3303 }
3304
3305 I.eraseFromParent();
3306 return true;
3307 }
3308 }
3309
3310 if (DstSize == 64) {
3311 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3312 // FIXME: Can we avoid manually doing this?
3313 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3314 MRI)) {
3315 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3316 << " operand\n");
3317 return false;
3318 }
3319 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3320 {&AArch64::GPR64RegClass}, {})
3321 .addImm(0)
3322 .addUse(SrcReg)
3323 .addImm(AArch64::sub_32)
3324 .getReg(0);
3325 }
3326
3327 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3328 {DefReg}, {SrcReg})
3329 .addImm(0)
3330 .addImm(SrcSize - 1);
3331 } else if (DstSize <= 32) {
3332 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3333 {DefReg}, {SrcReg})
3334 .addImm(0)
3335 .addImm(SrcSize - 1);
3336 } else {
3337 return false;
3338 }
3339
3340 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3341 I.eraseFromParent();
3342 return true;
3343 }
3344
3345 case TargetOpcode::G_SITOFP:
3346 case TargetOpcode::G_UITOFP:
3347 case TargetOpcode::G_FPTOSI:
3348 case TargetOpcode::G_FPTOUI: {
3349 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3350 SrcTy = MRI.getType(I.getOperand(1).getReg());
3351 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3352 if (NewOpc == Opcode)
3353 return false;
3354
3355 I.setDesc(TII.get(NewOpc));
3356 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3357 I.setFlags(MachineInstr::NoFPExcept);
3358
3359 return true;
3360 }
3361
3362 case TargetOpcode::G_FREEZE:
3363 return selectCopy(I, TII, MRI, TRI, RBI);
3364
3365 case TargetOpcode::G_INTTOPTR:
3366 // The importer is currently unable to import pointer types since they
3367 // didn't exist in SelectionDAG.
3368 return selectCopy(I, TII, MRI, TRI, RBI);
3369
3370 case TargetOpcode::G_BITCAST:
3371 // Imported SelectionDAG rules can handle every bitcast except those that
3372 // bitcast from a type to the same type. Ideally, these shouldn't occur
3373 // but we might not run an optimizer that deletes them. The other exception
3374 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3375 // of them.
3376 return selectCopy(I, TII, MRI, TRI, RBI);
3377
3378 case TargetOpcode::G_SELECT: {
3379 auto &Sel = cast<GSelect>(I);
3380 const Register CondReg = Sel.getCondReg();
3381 const Register TReg = Sel.getTrueReg();
3382 const Register FReg = Sel.getFalseReg();
3383
3384 if (tryOptSelect(Sel))
3385 return true;
3386
3387 // Make sure to use an unused vreg instead of wzr, so that the peephole
3388 // optimizations will be able to optimize these.
3389 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3390 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3391 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3392 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3393 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3394 return false;
3395 Sel.eraseFromParent();
3396 return true;
3397 }
3398 case TargetOpcode::G_ICMP: {
3399 if (Ty.isVector())
3400 return selectVectorICmp(I, MRI);
3401
3402 if (Ty != LLT::scalar(32)) {
3403 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3404 << ", expected: " << LLT::scalar(32) << '\n');
3405 return false;
3406 }
3407
3408 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3409 const AArch64CC::CondCode InvCC =
3410 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3411 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3412 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3413 /*Src2=*/AArch64::WZR, InvCC, MIB);
3414 I.eraseFromParent();
3415 return true;
3416 }
3417
3418 case TargetOpcode::G_FCMP: {
3419 CmpInst::Predicate Pred =
3420 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3421 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3422 Pred) ||
3423 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3424 return false;
3425 I.eraseFromParent();
3426 return true;
3427 }
3428 case TargetOpcode::G_VASTART:
3429 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3430 : selectVaStartAAPCS(I, MF, MRI);
3431 case TargetOpcode::G_INTRINSIC:
3432 return selectIntrinsic(I, MRI);
3433 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3434 return selectIntrinsicWithSideEffects(I, MRI);
3435 case TargetOpcode::G_IMPLICIT_DEF: {
3436 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3437 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3438 const Register DstReg = I.getOperand(0).getReg();
3439 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3440 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3441 RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3442 return true;
3443 }
3444 case TargetOpcode::G_BLOCK_ADDR: {
3445 if (TM.getCodeModel() == CodeModel::Large) {
3446 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3447 I.eraseFromParent();
3448 return true;
3449 } else {
3450 I.setDesc(TII.get(AArch64::MOVaddrBA));
3451 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3452 I.getOperand(0).getReg())
3453 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3454 /* Offset */ 0, AArch64II::MO_PAGE)
3455 .addBlockAddress(
3456 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3457 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3458 I.eraseFromParent();
3459 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3460 }
3461 }
3462 case AArch64::G_DUP: {
3463 // When the scalar of G_DUP is an s8/s16 GPR, it can't be selected by the
3464 // imported patterns, so do it manually here. Avoiding the s16 GPR in the
3465 // first place is difficult because at RBS we may end up pessimizing the FPR
3466 // case if we decide to add an anyextend to fix it. Manual selection is the
3467 // most robust solution for now.
3468 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3469 AArch64::GPRRegBankID)
3470 return false; // We expect the fpr regbank case to be imported.
3471 LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3472 if (VecTy == LLT::fixed_vector(8, 8))
3473 I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3474 else if (VecTy == LLT::fixed_vector(16, 8))
3475 I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3476 else if (VecTy == LLT::fixed_vector(4, 16))
3477 I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3478 else if (VecTy == LLT::fixed_vector(8, 16))
3479 I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3480 else
3481 return false;
3482 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3483 }
3484 case TargetOpcode::G_INTRINSIC_TRUNC:
3485 return selectIntrinsicTrunc(I, MRI);
3486 case TargetOpcode::G_INTRINSIC_ROUND:
3487 return selectIntrinsicRound(I, MRI);
3488 case TargetOpcode::G_BUILD_VECTOR:
3489 return selectBuildVector(I, MRI);
3490 case TargetOpcode::G_MERGE_VALUES:
3491 return selectMergeValues(I, MRI);
3492 case TargetOpcode::G_UNMERGE_VALUES:
3493 return selectUnmergeValues(I, MRI);
3494 case TargetOpcode::G_SHUFFLE_VECTOR:
3495 return selectShuffleVector(I, MRI);
3496 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3497 return selectExtractElt(I, MRI);
3498 case TargetOpcode::G_INSERT_VECTOR_ELT:
3499 return selectInsertElt(I, MRI);
3500 case TargetOpcode::G_CONCAT_VECTORS:
3501 return selectConcatVectors(I, MRI);
3502 case TargetOpcode::G_JUMP_TABLE:
3503 return selectJumpTable(I, MRI);
3504 case TargetOpcode::G_VECREDUCE_FADD:
3505 case TargetOpcode::G_VECREDUCE_ADD:
3506 return selectReduction(I, MRI);
3507 case TargetOpcode::G_MEMCPY:
3508 case TargetOpcode::G_MEMCPY_INLINE:
3509 case TargetOpcode::G_MEMMOVE:
3510 case TargetOpcode::G_MEMSET:
3511 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3512 return selectMOPS(I, MRI);
3513 }
3514
3515 return false;
3516 }
3517
3518 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3519 MachineRegisterInfo &MRI) {
3520 Register VecReg = I.getOperand(1).getReg();
3521 LLT VecTy = MRI.getType(VecReg);
3522 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3523 // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3524 // a subregister copy afterwards.
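// For illustration (vreg names invented), roughly:
//   %sum:fpr64 = ADDPv2i32 %vec, %vec            ; pairwise add, result in a D register
//   %dst:fpr32 = COPY %sum.ssub                  ; take the low 32-bit lane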
3525 if (VecTy == LLT::fixed_vector(2, 32)) {
3526 Register DstReg = I.getOperand(0).getReg();
3527 auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3528 {VecReg, VecReg});
3529 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3530 .addReg(AddP.getReg(0), 0, AArch64::ssub)
3531 .getReg(0);
3532 RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3533 I.eraseFromParent();
3534 return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3535 }
3536
3537 unsigned Opc = 0;
3538 if (VecTy == LLT::fixed_vector(16, 8))
3539 Opc = AArch64::ADDVv16i8v;
3540 else if (VecTy == LLT::fixed_vector(8, 16))
3541 Opc = AArch64::ADDVv8i16v;
3542 else if (VecTy == LLT::fixed_vector(4, 32))
3543 Opc = AArch64::ADDVv4i32v;
3544 else if (VecTy == LLT::fixed_vector(2, 64))
3545 Opc = AArch64::ADDPv2i64p;
3546 else {
3547 LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3548 return false;
3549 }
3550 I.setDesc(TII.get(Opc));
3551 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3552 }
3553
3554 if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3555 unsigned Opc = 0;
3556 if (VecTy == LLT::fixed_vector(2, 32))
3557 Opc = AArch64::FADDPv2i32p;
3558 else if (VecTy == LLT::fixed_vector(2, 64))
3559 Opc = AArch64::FADDPv2i64p;
3560 else {
3561 LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3562 return false;
3563 }
3564 I.setDesc(TII.get(Opc));
3565 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3566 }
3567 return false;
3568 }
3569
3570 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3571 MachineRegisterInfo &MRI) {
3572 unsigned Mopcode;
3573 switch (GI.getOpcode()) {
3574 case TargetOpcode::G_MEMCPY:
3575 case TargetOpcode::G_MEMCPY_INLINE:
3576 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3577 break;
3578 case TargetOpcode::G_MEMMOVE:
3579 Mopcode = AArch64::MOPSMemoryMovePseudo;
3580 break;
3581 case TargetOpcode::G_MEMSET:
3582 // For tagged memset see llvm.aarch64.mops.memset.tag
3583 Mopcode = AArch64::MOPSMemorySetPseudo;
3584 break;
3585 }
3586
3587 auto &DstPtr = GI.getOperand(0);
3588 auto &SrcOrVal = GI.getOperand(1);
3589 auto &Size = GI.getOperand(2);
3590
3591 // Create copies of the registers that can be clobbered.
3592 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3593 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3594 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3595
3596 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3597 const auto &SrcValRegClass =
3598 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3599
3600 // Constrain to specific registers
3601 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3602 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3603 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3604
3605 MIB.buildCopy(DstPtrCopy, DstPtr);
3606 MIB.buildCopy(SrcValCopy, SrcOrVal);
3607 MIB.buildCopy(SizeCopy, Size);
3608
3609 // New instruction uses the copied registers because it must update them.
3610 // The defs are not used since they don't exist in G_MEM*. They are still
3611 // tied.
3612 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3613 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3614 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3615 if (IsSet) {
3616 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3617 {DstPtrCopy, SizeCopy, SrcValCopy});
3618 } else {
3619 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3620 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3621 {DstPtrCopy, SrcValCopy, SizeCopy});
3622 }
3623
3624 GI.eraseFromParent();
3625 return true;
3626 }
3627
3628 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3629 MachineRegisterInfo &MRI) {
3630 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3631 Register JTAddr = I.getOperand(0).getReg();
3632 unsigned JTI = I.getOperand(1).getIndex();
3633 Register Index = I.getOperand(2).getReg();
3634
3635 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3636 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3637
3638 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3639 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3640 {TargetReg, ScratchReg}, {JTAddr, Index})
3641 .addJumpTableIndex(JTI);
3642 // Build the indirect branch.
3643 MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3644 I.eraseFromParent();
3645 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3646 }
3647
3648 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3649 MachineRegisterInfo &MRI) {
3650 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3651 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3652
3653 Register DstReg = I.getOperand(0).getReg();
3654 unsigned JTI = I.getOperand(1).getIndex();
3655 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
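// For illustration, that later expansion is roughly (label name invented):
//   adrp xN, .LJTI0_0
//   add  xN, xN, :lo12:.LJTI0_0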
3656 auto MovMI =
3657 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3658 .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3659 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3660 I.eraseFromParent();
3661 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3662 }
3663
3664 bool AArch64InstructionSelector::selectTLSGlobalValue(
3665 MachineInstr &I, MachineRegisterInfo &MRI) {
3666 if (!STI.isTargetMachO())
3667 return false;
3668 MachineFunction &MF = *I.getParent()->getParent();
3669 MF.getFrameInfo().setAdjustsStack(true);
3670
3671 const auto &GlobalOp = I.getOperand(1);
3672 assert(GlobalOp.getOffset() == 0 &&
3673 "Shouldn't have an offset on TLS globals!");
3674 const GlobalValue &GV = *GlobalOp.getGlobal();
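// For illustration only (symbol and scratch register invented), the sequence
// built below is roughly the usual Darwin TLV access:
//   adrp x0, _var@TLVPPAGE
//   ldr  x0, [x0, _var@TLVPPAGEOFF]   ; LOADgot of the TLV descriptor
//   ldr  xN, [x0]                     ; load the descriptor's thunk pointer
//   blr  xN                           ; result comes back in x0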
3675
3676 auto LoadGOT =
3677 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3678 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3679
3680 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3681 {LoadGOT.getReg(0)})
3682 .addImm(0);
3683
3684 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3685 // TLS calls preserve all registers except those that absolutely must be
3686 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3687 // silly).
3688 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3689 .addUse(AArch64::X0, RegState::Implicit)
3690 .addDef(AArch64::X0, RegState::Implicit)
3691 .addRegMask(TRI.getTLSCallPreservedMask());
3692
3693 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3694 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3695 MRI);
3696 I.eraseFromParent();
3697 return true;
3698 }
3699
3700 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3701 MachineInstr &I, MachineRegisterInfo &MRI) const {
3702 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3703
3704 // Select the correct opcode.
3705 unsigned Opc = 0;
3706 if (!SrcTy.isVector()) {
3707 switch (SrcTy.getSizeInBits()) {
3708 default:
3709 case 16:
3710 Opc = AArch64::FRINTZHr;
3711 break;
3712 case 32:
3713 Opc = AArch64::FRINTZSr;
3714 break;
3715 case 64:
3716 Opc = AArch64::FRINTZDr;
3717 break;
3718 }
3719 } else {
3720 unsigned NumElts = SrcTy.getNumElements();
3721 switch (SrcTy.getElementType().getSizeInBits()) {
3722 default:
3723 break;
3724 case 16:
3725 if (NumElts == 4)
3726 Opc = AArch64::FRINTZv4f16;
3727 else if (NumElts == 8)
3728 Opc = AArch64::FRINTZv8f16;
3729 break;
3730 case 32:
3731 if (NumElts == 2)
3732 Opc = AArch64::FRINTZv2f32;
3733 else if (NumElts == 4)
3734 Opc = AArch64::FRINTZv4f32;
3735 break;
3736 case 64:
3737 if (NumElts == 2)
3738 Opc = AArch64::FRINTZv2f64;
3739 break;
3740 }
3741 }
3742
3743 if (!Opc) {
3744 // Didn't get an opcode above, bail.
3745 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3746 return false;
3747 }
3748
3749 // Legalization would have set us up perfectly for this; we just need to
3750 // set the opcode and move on.
3751 I.setDesc(TII.get(Opc));
3752 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3753 }
3754
3755 bool AArch64InstructionSelector::selectIntrinsicRound(
3756 MachineInstr &I, MachineRegisterInfo &MRI) const {
3757 const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3758
3759 // Select the correct opcode.
3760 unsigned Opc = 0;
3761 if (!SrcTy.isVector()) {
3762 switch (SrcTy.getSizeInBits()) {
3763 default:
3764 case 16:
3765 Opc = AArch64::FRINTAHr;
3766 break;
3767 case 32:
3768 Opc = AArch64::FRINTASr;
3769 break;
3770 case 64:
3771 Opc = AArch64::FRINTADr;
3772 break;
3773 }
3774 } else {
3775 unsigned NumElts = SrcTy.getNumElements();
3776 switch (SrcTy.getElementType().getSizeInBits()) {
3777 default:
3778 break;
3779 case 16:
3780 if (NumElts == 4)
3781 Opc = AArch64::FRINTAv4f16;
3782 else if (NumElts == 8)
3783 Opc = AArch64::FRINTAv8f16;
3784 break;
3785 case 32:
3786 if (NumElts == 2)
3787 Opc = AArch64::FRINTAv2f32;
3788 else if (NumElts == 4)
3789 Opc = AArch64::FRINTAv4f32;
3790 break;
3791 case 64:
3792 if (NumElts == 2)
3793 Opc = AArch64::FRINTAv2f64;
3794 break;
3795 }
3796 }
3797
3798 if (!Opc) {
3799 // Didn't get an opcode above, bail.
3800 LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3801 return false;
3802 }
3803
3804 // Legalization would have set us up perfectly for this; we just need to
3805 // set the opcode and move on.
3806 I.setDesc(TII.get(Opc));
3807 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3808 }
3809
3810 bool AArch64InstructionSelector::selectVectorICmp(
3811 MachineInstr &I, MachineRegisterInfo &MRI) {
3812 Register DstReg = I.getOperand(0).getReg();
3813 LLT DstTy = MRI.getType(DstReg);
3814 Register SrcReg = I.getOperand(2).getReg();
3815 Register Src2Reg = I.getOperand(3).getReg();
3816 LLT SrcTy = MRI.getType(SrcReg);
3817
3818 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3819 unsigned NumElts = DstTy.getNumElements();
3820
3821 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3822 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3823 // Third index is cc opcode:
3824 // 0 == eq
3825 // 1 == ugt
3826 // 2 == uge
3827 // 3 == ult
3828 // 4 == ule
3829 // 5 == sgt
3830 // 6 == sge
3831 // 7 == slt
3832 // 8 == sle
3833 // ne is done by negating 'eq' result.
3834
3835 // The table below assumes that for some comparisons the operands will be
3836 // commuted.
3837 // ult op == commute + ugt op
3838 // ule op == commute + uge op
3839 // slt op == commute + sgt op
3840 // sle op == commute + sge op
3841 unsigned PredIdx = 0;
3842 bool SwapOperands = false;
3843 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3844 switch (Pred) {
3845 case CmpInst::ICMP_NE:
3846 case CmpInst::ICMP_EQ:
3847 PredIdx = 0;
3848 break;
3849 case CmpInst::ICMP_UGT:
3850 PredIdx = 1;
3851 break;
3852 case CmpInst::ICMP_UGE:
3853 PredIdx = 2;
3854 break;
3855 case CmpInst::ICMP_ULT:
3856 PredIdx = 3;
3857 SwapOperands = true;
3858 break;
3859 case CmpInst::ICMP_ULE:
3860 PredIdx = 4;
3861 SwapOperands = true;
3862 break;
3863 case CmpInst::ICMP_SGT:
3864 PredIdx = 5;
3865 break;
3866 case CmpInst::ICMP_SGE:
3867 PredIdx = 6;
3868 break;
3869 case CmpInst::ICMP_SLT:
3870 PredIdx = 7;
3871 SwapOperands = true;
3872 break;
3873 case CmpInst::ICMP_SLE:
3874 PredIdx = 8;
3875 SwapOperands = true;
3876 break;
3877 default:
3878 llvm_unreachable("Unhandled icmp predicate");
3879 return false;
3880 }
3881
3882 // This table obviously should be tablegen'd when we have our GISel native
3883 // tablegen selector.
3884
3885 static const unsigned OpcTable[4][4][9] = {
3886 {
3887 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3888 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3889 0 /* invalid */},
3890 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3891 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3892 0 /* invalid */},
3893 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3894 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3895 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3896 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3897 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3898 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3899 },
3900 {
3901 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3902 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3903 0 /* invalid */},
3904 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3905 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3906 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3907 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3908 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3909 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3910 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3911 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3912 0 /* invalid */}
3913 },
3914 {
3915 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3916 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3917 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3918 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3919 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3920 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3921 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3922 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3923 0 /* invalid */},
3924 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3925 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3926 0 /* invalid */}
3927 },
3928 {
3929 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3930 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3931 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3932 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3933 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3934 0 /* invalid */},
3935 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3936 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3937 0 /* invalid */},
3938 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3939 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3940 0 /* invalid */}
3941 },
3942 };
3943 unsigned EltIdx = Log2_32(SrcEltSize / 8);
3944 unsigned NumEltsIdx = Log2_32(NumElts / 2);
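// For example, a v4i32 compare maps to EltIdx = 2 (32 / 8 = 4, log2 = 2) and
// NumEltsIdx = 1 (4 / 2 = 2, log2 = 1), i.e. the CM*v4i32 row above.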
3945 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3946 if (!Opc) {
3947 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3948 return false;
3949 }
3950
3951 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3952 const TargetRegisterClass *SrcRC =
3953 getRegClassForTypeOnBank(SrcTy, VecRB, true);
3954 if (!SrcRC) {
3955 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3956 return false;
3957 }
3958
3959 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3960 if (SrcTy.getSizeInBits() == 128)
3961 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3962
3963 if (SwapOperands)
3964 std::swap(SrcReg, Src2Reg);
3965
3966 auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3967 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3968
3969 // Invert if we had a 'ne' cc.
3970 if (NotOpc) {
3971 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3972 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3973 } else {
3974 MIB.buildCopy(DstReg, Cmp.getReg(0));
3975 }
3976 RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3977 I.eraseFromParent();
3978 return true;
3979 }
3980
3981 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3982 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3983 MachineIRBuilder &MIRBuilder) const {
3984 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3985
3986 auto BuildFn = [&](unsigned SubregIndex) {
3987 auto Ins =
3988 MIRBuilder
3989 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3990 .addImm(SubregIndex);
3991 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3992 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3993 return &*Ins;
3994 };
3995
3996 switch (EltSize) {
3997 case 16:
3998 return BuildFn(AArch64::hsub);
3999 case 32:
4000 return BuildFn(AArch64::ssub);
4001 case 64:
4002 return BuildFn(AArch64::dsub);
4003 default:
4004 return nullptr;
4005 }
4006 }
4007
4008 bool AArch64InstructionSelector::selectMergeValues(
4009 MachineInstr &I, MachineRegisterInfo &MRI) {
4010 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
4011 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4012 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
4013 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
4014 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
4015
4016 if (I.getNumOperands() != 3)
4017 return false;
4018
4019 // Merging 2 s64s into an s128.
4020 if (DstTy == LLT::scalar(128)) {
4021 if (SrcTy.getSizeInBits() != 64)
4022 return false;
4023 Register DstReg = I.getOperand(0).getReg();
4024 Register Src1Reg = I.getOperand(1).getReg();
4025 Register Src2Reg = I.getOperand(2).getReg();
4026 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
4027 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
4028 /* LaneIdx */ 0, RB, MIB);
4029 if (!InsMI)
4030 return false;
4031 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
4032 Src2Reg, /* LaneIdx */ 1, RB, MIB);
4033 if (!Ins2MI)
4034 return false;
4035 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4036 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
4037 I.eraseFromParent();
4038 return true;
4039 }
4040
4041 if (RB.getID() != AArch64::GPRRegBankID)
4042 return false;
4043
4044 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
4045 return false;
4046
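// For illustration (vreg names invented), the merge of two s32 GPRs below
// computes roughly dst = (hi << 32) | zext(lo):
//   %lo64 = SUBREG_TO_REG 0, %lo, sub_32
//   %hi64 = SUBREG_TO_REG 0, %hi, sub_32
//   %dst  = BFMXri %lo64, %hi64, 32, 31          ; insert %hi64[31:0] into bits [63:32]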
4047 auto *DstRC = &AArch64::GPR64RegClass;
4048 Register SubToRegDef = MRI.createVirtualRegister(DstRC);
4049 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4050 TII.get(TargetOpcode::SUBREG_TO_REG))
4051 .addDef(SubToRegDef)
4052 .addImm(0)
4053 .addUse(I.getOperand(1).getReg())
4054 .addImm(AArch64::sub_32);
4055 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
4056 // Need to anyext the second scalar before we can use bfm
4057 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4058 TII.get(TargetOpcode::SUBREG_TO_REG))
4059 .addDef(SubToRegDef2)
4060 .addImm(0)
4061 .addUse(I.getOperand(2).getReg())
4062 .addImm(AArch64::sub_32);
4063 MachineInstr &BFM =
4064 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
4065 .addDef(I.getOperand(0).getReg())
4066 .addUse(SubToRegDef)
4067 .addUse(SubToRegDef2)
4068 .addImm(32)
4069 .addImm(31);
4070 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
4071 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
4072 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
4073 I.eraseFromParent();
4074 return true;
4075 }
4076
4077 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
4078 const unsigned EltSize) {
4079 // Choose a lane copy opcode and subregister based off of the size of the
4080 // vector's elements.
4081 switch (EltSize) {
4082 case 8:
4083 CopyOpc = AArch64::DUPi8;
4084 ExtractSubReg = AArch64::bsub;
4085 break;
4086 case 16:
4087 CopyOpc = AArch64::DUPi16;
4088 ExtractSubReg = AArch64::hsub;
4089 break;
4090 case 32:
4091 CopyOpc = AArch64::DUPi32;
4092 ExtractSubReg = AArch64::ssub;
4093 break;
4094 case 64:
4095 CopyOpc = AArch64::DUPi64;
4096 ExtractSubReg = AArch64::dsub;
4097 break;
4098 default:
4099 // Unknown size, bail out.
4100 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4101 return false;
4102 }
4103 return true;
4104 }
4105
4106 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4107 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4108 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4109 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4110 unsigned CopyOpc = 0;
4111 unsigned ExtractSubReg = 0;
4112 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4113 LLVM_DEBUG(
4114 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4115 return nullptr;
4116 }
4117
4118 const TargetRegisterClass *DstRC =
4119 getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4120 if (!DstRC) {
4121 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4122 return nullptr;
4123 }
4124
4125 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4126 const LLT &VecTy = MRI.getType(VecReg);
4127 const TargetRegisterClass *VecRC =
4128 getRegClassForTypeOnBank(VecTy, VecRB, true);
4129 if (!VecRC) {
4130 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4131 return nullptr;
4132 }
4133
4134 // The register that we're going to copy into.
4135 Register InsertReg = VecReg;
4136 if (!DstReg)
4137 DstReg = MRI.createVirtualRegister(DstRC);
4138 // If the lane index is 0, we just use a subregister COPY.
4139 if (LaneIdx == 0) {
4140 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4141 .addReg(VecReg, 0, ExtractSubReg);
4142 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4143 return &*Copy;
4144 }
4145
4146 // Lane copies require 128-bit wide registers. If we're dealing with an
4147 // unpacked vector, then we need to move up to that width. Insert an implicit
4148 // def and a subregister insert to get us there.
4149 if (VecTy.getSizeInBits() != 128) {
4150 MachineInstr *ScalarToVector = emitScalarToVector(
4151 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4152 if (!ScalarToVector)
4153 return nullptr;
4154 InsertReg = ScalarToVector->getOperand(0).getReg();
4155 }
4156
4157 MachineInstr *LaneCopyMI =
4158 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4159 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4160
4161 // Make sure that we actually constrain the initial copy.
4162 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4163 return LaneCopyMI;
4164 }
4165
4166 bool AArch64InstructionSelector::selectExtractElt(
4167 MachineInstr &I, MachineRegisterInfo &MRI) {
4168 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4169 "unexpected opcode!");
4170 Register DstReg = I.getOperand(0).getReg();
4171 const LLT NarrowTy = MRI.getType(DstReg);
4172 const Register SrcReg = I.getOperand(1).getReg();
4173 const LLT WideTy = MRI.getType(SrcReg);
4174 (void)WideTy;
4175 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4176 "source register size too small!");
4177 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4178
4179 // Need the lane index to determine the correct copy opcode.
4180 MachineOperand &LaneIdxOp = I.getOperand(2);
4181 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4182
4183 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4184 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4185 return false;
4186 }
4187
4188 // Find the index to extract from.
4189 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4190 if (!VRegAndVal)
4191 return false;
4192 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4193
4194
4195 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4196 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4197 LaneIdx, MIB);
4198 if (!Extract)
4199 return false;
4200
4201 I.eraseFromParent();
4202 return true;
4203 }
4204
4205 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4206 MachineInstr &I, MachineRegisterInfo &MRI) {
4207 unsigned NumElts = I.getNumOperands() - 1;
4208 Register SrcReg = I.getOperand(NumElts).getReg();
4209 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4210 const LLT SrcTy = MRI.getType(SrcReg);
4211
4212 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4213 if (SrcTy.getSizeInBits() > 128) {
4214 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4215 return false;
4216 }
4217
4218 // We implement a split vector operation by treating the sub-vectors as
4219 // scalars and extracting them.
4220 const RegisterBank &DstRB =
4221 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4222 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4223 Register Dst = I.getOperand(OpIdx).getReg();
4224 MachineInstr *Extract =
4225 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4226 if (!Extract)
4227 return false;
4228 }
4229 I.eraseFromParent();
4230 return true;
4231 }
4232
4233 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4234 MachineRegisterInfo &MRI) {
4235 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4236 "unexpected opcode");
4237
4238 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4239 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4240 AArch64::FPRRegBankID ||
4241 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4242 AArch64::FPRRegBankID) {
4243 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4244 "currently unsupported.\n");
4245 return false;
4246 }
4247
4248 // The last operand is the vector source register, and every other operand is
4249 // a register to unpack into.
4250 unsigned NumElts = I.getNumOperands() - 1;
4251 Register SrcReg = I.getOperand(NumElts).getReg();
4252 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4253 const LLT WideTy = MRI.getType(SrcReg);
4254 (void)WideTy;
4255 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4256 "can only unmerge from vector or s128 types!");
4257 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4258 "source register size too small!");
4259
4260 if (!NarrowTy.isScalar())
4261 return selectSplitVectorUnmerge(I, MRI);
4262
4263 // Choose a lane copy opcode and subregister based off of the size of the
4264 // vector's elements.
4265 unsigned CopyOpc = 0;
4266 unsigned ExtractSubReg = 0;
4267 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4268 return false;
4269
4270 // Set up for the lane copies.
4271 MachineBasicBlock &MBB = *I.getParent();
4272
4273 // Stores the registers we'll be copying from.
4274 SmallVector<Register, 4> InsertRegs;
4275
4276 // We'll use the first register twice, so we only need NumElts-1 registers.
4277 unsigned NumInsertRegs = NumElts - 1;
4278
4279 // If our elements fit into exactly 128 bits, then we can copy from the source
4280 // directly. Otherwise, we need to do a bit of setup with some subregister
4281 // inserts.
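// For illustration (vreg names invented), unmerging from a 64-bit source first
// widens it, roughly:
//   %undef:fpr128 = IMPLICIT_DEF
//   %wide:fpr128  = INSERT_SUBREG %undef, %src, %subreg.dsub
// and the lane copies below are then taken from %wide.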
4282 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4283 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4284 } else {
4285 // No. We have to perform subregister inserts. For each insert, create an
4286 // implicit def and a subregister insert, and save the register we create.
4287 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4288 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4289 *RBI.getRegBank(SrcReg, MRI, TRI));
4290 unsigned SubReg = 0;
4291 bool Found = getSubRegForClass(RC, TRI, SubReg);
4292 (void)Found;
4293 assert(Found && "expected to find last operand's subreg idx");
4294 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4295 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4296 MachineInstr &ImpDefMI =
4297 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4298 ImpDefReg);
4299
4300 // Now, create the subregister insert from SrcReg.
4301 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4302 MachineInstr &InsMI =
4303 *BuildMI(MBB, I, I.getDebugLoc(),
4304 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4305 .addUse(ImpDefReg)
4306 .addUse(SrcReg)
4307 .addImm(SubReg);
4308
4309 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4310 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4311
4312 // Save the register so that we can copy from it after.
4313 InsertRegs.push_back(InsertReg);
4314 }
4315 }
4316
4317 // Now that we've created any necessary subregister inserts, we can
4318 // create the copies.
4319 //
4320 // Perform the first copy separately as a subregister copy.
4321 Register CopyTo = I.getOperand(0).getReg();
4322 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4323 .addReg(InsertRegs[0], 0, ExtractSubReg);
4324 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4325
4326 // Now, perform the remaining copies as vector lane copies.
4327 unsigned LaneIdx = 1;
4328 for (Register InsReg : InsertRegs) {
4329 Register CopyTo = I.getOperand(LaneIdx).getReg();
4330 MachineInstr &CopyInst =
4331 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4332 .addUse(InsReg)
4333 .addImm(LaneIdx);
4334 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4335 ++LaneIdx;
4336 }
4337
4338 // Separately constrain the first copy's destination. Because of the
4339 // limitation in constrainOperandRegClass, we can't guarantee that this will
4340 // actually be constrained. So, do it ourselves using the second operand.
4341 const TargetRegisterClass *RC =
4342 MRI.getRegClassOrNull(I.getOperand(1).getReg());
4343 if (!RC) {
4344 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4345 return false;
4346 }
4347
4348 RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4349 I.eraseFromParent();
4350 return true;
4351 }
4352
4353 bool AArch64InstructionSelector::selectConcatVectors(
4354 MachineInstr &I, MachineRegisterInfo &MRI) {
4355 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4356 "Unexpected opcode");
4357 Register Dst = I.getOperand(0).getReg();
4358 Register Op1 = I.getOperand(1).getReg();
4359 Register Op2 = I.getOperand(2).getReg();
4360 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4361 if (!ConcatMI)
4362 return false;
4363 I.eraseFromParent();
4364 return true;
4365 }
4366
4367 unsigned
4368 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4369 MachineFunction &MF) const {
4370 Type *CPTy = CPVal->getType();
4371 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4372
4373 MachineConstantPool *MCP = MF.getConstantPool();
4374 return MCP->getConstantPoolIndex(CPVal, Alignment);
4375 }
4376
4377 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4378 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4379 auto &MF = MIRBuilder.getMF();
4380 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4381
4382 auto Adrp =
4383 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4384 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4385
4386 MachineInstr *LoadMI = nullptr;
4387 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4388 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4389 switch (Size) {
4390 case 16:
4391 LoadMI =
4392 &*MIRBuilder
4393 .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4394 .addConstantPoolIndex(CPIdx, 0,
4395 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4396 break;
4397 case 8:
4398 LoadMI =
4399 &*MIRBuilder
4400 .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4401 .addConstantPoolIndex(CPIdx, 0,
4402 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4403 break;
4404 case 4:
4405 LoadMI =
4406 &*MIRBuilder
4407 .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4408 .addConstantPoolIndex(CPIdx, 0,
4409 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4410 break;
4411 case 2:
4412 LoadMI =
4413 &*MIRBuilder
4414 .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
4415 .addConstantPoolIndex(CPIdx, 0,
4416 AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4417 break;
4418 default:
4419 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4420 << *CPVal->getType());
4421 return nullptr;
4422 }
4423 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4424 MachineMemOperand::MOLoad,
4425 Size, Align(Size)));
4426 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4427 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4428 return LoadMI;
4429 }
4430
4431 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4432 /// size and RB.
4433 static std::pair<unsigned, unsigned>
4434 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4435 unsigned Opc, SubregIdx;
4436 if (RB.getID() == AArch64::GPRRegBankID) {
4437 if (EltSize == 16) {
4438 Opc = AArch64::INSvi16gpr;
4439 SubregIdx = AArch64::ssub;
4440 } else if (EltSize == 32) {
4441 Opc = AArch64::INSvi32gpr;
4442 SubregIdx = AArch64::ssub;
4443 } else if (EltSize == 64) {
4444 Opc = AArch64::INSvi64gpr;
4445 SubregIdx = AArch64::dsub;
4446 } else {
4447 llvm_unreachable("invalid elt size!");
4448 }
4449 } else {
4450 if (EltSize == 8) {
4451 Opc = AArch64::INSvi8lane;
4452 SubregIdx = AArch64::bsub;
4453 } else if (EltSize == 16) {
4454 Opc = AArch64::INSvi16lane;
4455 SubregIdx = AArch64::hsub;
4456 } else if (EltSize == 32) {
4457 Opc = AArch64::INSvi32lane;
4458 SubregIdx = AArch64::ssub;
4459 } else if (EltSize == 64) {
4460 Opc = AArch64::INSvi64lane;
4461 SubregIdx = AArch64::dsub;
4462 } else {
4463 llvm_unreachable("invalid elt size!");
4464 }
4465 }
4466 return std::make_pair(Opc, SubregIdx);
4467 }
4468
4469 MachineInstr *AArch64InstructionSelector::emitInstr(
4470 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4471 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4472 const ComplexRendererFns &RenderFns) const {
4473 assert(Opcode && "Expected an opcode?");
4474 assert(!isPreISelGenericOpcode(Opcode) &&
4475 "Function should only be used to produce selected instructions!");
4476 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4477 if (RenderFns)
4478 for (auto &Fn : *RenderFns)
4479 Fn(MI);
4480 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4481 return &*MI;
4482 }
4483
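// Shared helper for emitADD/emitADDS/emitSUBS below. The opcode table is
// indexed as AddrModeAndSizeToOpcode[AddrMode][Is32Bit], where the rows are:
//   [0] immediate (ri), [1] shifted register (rs), [2] register (rr),
//   [3] the negated-immediate alternative (ri form of the opposite op),
//   [4] extended register (rx);
// column 0 holds the 64-bit opcode and column 1 the 32-bit one.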
MachineInstr *AArch64InstructionSelector::emitAddSub(
    const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
    Register Dst, MachineOperand &LHS, MachineOperand &RHS,
    MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  auto Ty = MRI.getType(LHS.getReg());
  assert(!Ty.isVector() && "Expected a scalar or pointer?");
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
  bool Is32Bit = Size == 32;

  // INSTRri form with positive arithmetic immediate.
  if (auto Fns = selectArithImmed(RHS))
    return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
                     MIRBuilder, Fns);

  // INSTRri form with negative arithmetic immediate.
  if (auto Fns = selectNegArithImmed(RHS))
    return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
                     MIRBuilder, Fns);

  // INSTRrx form.
  if (auto Fns = selectArithExtendedRegister(RHS))
    return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
                     MIRBuilder, Fns);

  // INSTRrs form.
  if (auto Fns = selectShiftedRegister(RHS))
    return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
                     MIRBuilder, Fns);
  return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
                   MIRBuilder);
}

MachineInstr *
AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
                                    MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  const std::array<std::array<unsigned, 2>, 5> OpcTable{
      {{AArch64::ADDXri, AArch64::ADDWri},
       {AArch64::ADDXrs, AArch64::ADDWrs},
       {AArch64::ADDXrr, AArch64::ADDWrr},
       {AArch64::SUBXri, AArch64::SUBWri},
       {AArch64::ADDXrx, AArch64::ADDWrx}}};
  return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
}

MachineInstr *
AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
                                     MachineOperand &RHS,
                                     MachineIRBuilder &MIRBuilder) const {
  const std::array<std::array<unsigned, 2>, 5> OpcTable{
      {{AArch64::ADDSXri, AArch64::ADDSWri},
       {AArch64::ADDSXrs, AArch64::ADDSWrs},
       {AArch64::ADDSXrr, AArch64::ADDSWrr},
       {AArch64::SUBSXri, AArch64::SUBSWri},
       {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
  return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
}

MachineInstr *
AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
                                     MachineOperand &RHS,
                                     MachineIRBuilder &MIRBuilder) const {
  const std::array<std::array<unsigned, 2>, 5> OpcTable{
      {{AArch64::SUBSXri, AArch64::SUBSWri},
       {AArch64::SUBSXrs, AArch64::SUBSWrs},
       {AArch64::SUBSXrr, AArch64::SUBSWrr},
       {AArch64::ADDSXri, AArch64::ADDSWri},
       {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
  return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
}

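// CMN is ADDS with the result discarded: emitCMN adds into a scratch virtual
// register, so only the NZCV flags produced by the ADDS are consumed.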
MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
  auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
  return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
}

MachineInstr *
AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  LLT Ty = MRI.getType(LHS.getReg());
  unsigned RegSize = Ty.getSizeInBits();
  bool Is32Bit = (RegSize == 32);
  const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
                                   {AArch64::ANDSXrs, AArch64::ANDSWrs},
                                   {AArch64::ANDSXrr, AArch64::ANDSWrr}};
  // ANDS needs a logical immediate for its immediate form. Check if we can
  // fold one in.
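  // For example, 0xff is encodable (logical immediates are rotated runs of
  // set bits, possibly replicated across the register), whereas most arbitrary
  // constants such as 0x123456 are not.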
  if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
    int64_t Imm = ValAndVReg->Value.getSExtValue();

    if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
      auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
      TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
      constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
      return &*TstMI;
    }
  }

  if (auto Fns = selectLogicalShiftedRegister(RHS))
    return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
  return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
}

MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
  assert(Predicate.isPredicate() && "Expected predicate?");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  LLT CmpTy = MRI.getType(LHS.getReg());
  assert(!CmpTy.isVector() && "Expected scalar or pointer");
  unsigned Size = CmpTy.getSizeInBits();
  (void)Size;
  assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
  // Fold the compare into a cmn or tst if possible.
  if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
    return FoldCmp;
  auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
  return emitSUBS(Dst, LHS, RHS, MIRBuilder);
}

MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
    Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
#ifndef NDEBUG
  LLT Ty = MRI.getType(Dst);
  assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
         "Expected a 32-bit scalar register?");
#endif
  const Register ZReg = AArch64::WZR;
  AArch64CC::CondCode CC1, CC2;
  changeFCMPPredToAArch64CC(Pred, CC1, CC2);
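  // CSINC Wd, WZR, WZR, <invcc> is the canonical "cset Wd, <cc>" idiom: it
  // yields 1 when <cc> holds and 0 otherwise. Predicates that map to a single
  // AArch64 condition code need just one of these; otherwise one CSET per
  // condition code is emitted and the results are ORed together below.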
  auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
  if (CC2 == AArch64CC::AL)
    return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
                     MIRBuilder);
  const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
  Register Def1Reg = MRI.createVirtualRegister(RC);
  Register Def2Reg = MRI.createVirtualRegister(RC);
  auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
  emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
  emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
  auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
  constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
  return &*OrMI;
}

MachineInstr *AArch64InstructionSelector::emitFPCompare(
    Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
    std::optional<CmpInst::Predicate> Pred) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(LHS);
  if (Ty.isVector())
    return nullptr;
  unsigned OpSize = Ty.getSizeInBits();
  if (OpSize != 32 && OpSize != 64)
    return nullptr;

  // If this is a compare against +0.0, then we don't have
  // to explicitly materialize a constant.
  const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
  bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());

  auto IsEqualityPred = [](CmpInst::Predicate P) {
    return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
           P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
  };
  if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
    // Try commutating the operands.
    const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
    if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
      ShouldUseImm = true;
      std::swap(LHS, RHS);
    }
  }
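  // CmpOpcTbl[UseImm][Is64Bit]: row 0 holds the register-register FCMP forms,
  // row 1 the forms that compare against an implicit +0.0; the column selects
  // single vs. double precision.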
  unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
                              {AArch64::FCMPSri, AArch64::FCMPDri}};
  unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];

  // Partially build the compare. Decide if we need to add a use for the
  // second source operand based on whether or not we're comparing against 0.0.
  auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
  CmpMI.setMIFlags(MachineInstr::NoFPExcept);
  if (!ShouldUseImm)
    CmpMI.addUse(RHS);
  constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
  return &*CmpMI;
}

MachineInstr *AArch64InstructionSelector::emitVectorConcat(
    std::optional<Register> Dst, Register Op1, Register Op2,
    MachineIRBuilder &MIRBuilder) const {
  // We implement a vector concat by:
  // 1. Use scalar_to_vector to insert the lower vector into the larger dest
  // 2. Insert the upper vector into the destination's upper element
  // TODO: some of this code is common with G_BUILD_VECTOR handling.
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  const LLT Op1Ty = MRI.getType(Op1);
  const LLT Op2Ty = MRI.getType(Op2);

  if (Op1Ty != Op2Ty) {
    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
    return nullptr;
  }
  assert(Op1Ty.isVector() && "Expected a vector for vector concat");

  if (Op1Ty.getSizeInBits() >= 128) {
    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
    return nullptr;
  }

  // At the moment we just support 64-bit vector concats.
  if (Op1Ty.getSizeInBits() != 64) {
    LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
    return nullptr;
  }

  const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
  const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
  const TargetRegisterClass *DstRC =
      getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);

  MachineInstr *WidenedOp1 =
      emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
  MachineInstr *WidenedOp2 =
      emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
  if (!WidenedOp1 || !WidenedOp2) {
    LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
    return nullptr;
  }

  // Now do the insert of the upper element.
  unsigned InsertOpc, InsSubRegIdx;
  std::tie(InsertOpc, InsSubRegIdx) =
      getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());

  if (!Dst)
    Dst = MRI.createVirtualRegister(DstRC);
  auto InsElt =
      MIRBuilder
          .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
          .addImm(1) /* Lane index */
          .addUse(WidenedOp2->getOperand(0).getReg())
          .addImm(0);
  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return &*InsElt;
}

MachineInstr *
AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
                                      Register Src2, AArch64CC::CondCode Pred,
                                      MachineIRBuilder &MIRBuilder) const {
  auto &MRI = *MIRBuilder.getMRI();
  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
  // If we used a register class, then this won't necessarily have an LLT.
  // Compute the size based off whether or not we have a class or bank.
  unsigned Size;
  if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
    Size = TRI.getRegSizeInBits(*RC);
  else
    Size = MRI.getType(Dst).getSizeInBits();
  // Some opcodes use s1.
  assert(Size <= 64 && "Expected 64 bits or less only!");
  static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
  unsigned Opc = OpcTable[Size == 64];
  auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
  constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
  return &*CSINC;
}

std::pair<MachineInstr *, AArch64CC::CondCode>
AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
                                           MachineOperand &LHS,
                                           MachineOperand &RHS,
                                           MachineIRBuilder &MIRBuilder) const {
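  // Each overflow op maps to the flag-setting add/sub plus the condition code
  // that detects the overflow: VS (signed overflow) for G_SADDO/G_SSUBO, HS
  // (carry set) for unsigned add overflow, and LO (carry clear, i.e. borrow)
  // for unsigned subtract overflow.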
  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
  case TargetOpcode::G_UADDO:
    return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
  case TargetOpcode::G_SSUBO:
    return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
  case TargetOpcode::G_USUBO:
    return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
  }
}

/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
/// expressed as a conjunction.
/// \param CanNegate Set to true if we can negate the whole sub-tree just by
///                  changing the conditions on the CMP tests.
///                  (this means we can call emitConjunctionRec() with
///                   Negate==true on this sub-tree)
/// \param MustBeFirst Set to true if this subtree needs to be negated and we
///                    cannot do the negation naturally. We are required to
///                    emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
///                   subexpression must be negated. This happens when the
///                   outer expression is an OR. We can use this fact to know
///                   that we have a double negation (or (or ...) ...) that
///                   can be implemented for free.
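///
/// For example, a tree like
///   %a = G_ICMP eq, ...
///   %b = G_FCMP olt, ...
///   %c = G_AND %a, %b
/// (with single uses throughout) qualifies, and is typically emitted as one
/// ordinary compare followed by a conditional compare instead of materializing
/// both boolean values.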
static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
                               bool WillNegate, MachineRegisterInfo &MRI,
                               unsigned Depth = 0) {
  if (!MRI.hasOneNonDBGUse(Val))
    return false;
  MachineInstr *ValDef = MRI.getVRegDef(Val);
  unsigned Opcode = ValDef->getOpcode();
  if (isa<GAnyCmp>(ValDef)) {
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
    bool IsOR = Opcode == TargetOpcode::G_OR;
    Register O0 = ValDef->getOperand(1).getReg();
    Register O1 = ValDef->getOperand(2).getReg();
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
      return false;

    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}

MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
    Register LHS, Register RHS, CmpInst::Predicate CC,
    AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
    MachineIRBuilder &MIB) const {
  // TODO: emit CMN as an optimization.
  auto &MRI = *MIB.getMRI();
  LLT OpTy = MRI.getType(LHS);
  assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
  unsigned CCmpOpc;
  std::optional<ValueAndVReg> C;
  if (CmpInst::isIntPredicate(CC)) {
    C = getIConstantVRegValWithLookThrough(RHS, MRI);
    if (C && C->Value.ult(32))
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
    else
      CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
  } else {
    switch (OpTy.getSizeInBits()) {
    case 16:
      CCmpOpc = AArch64::FCCMPHrr;
      break;
    case 32:
      CCmpOpc = AArch64::FCCMPSrr;
      break;
    case 64:
      CCmpOpc = AArch64::FCCMPDrr;
      break;
    default:
      return nullptr;
    }
  }
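  // CCMP/FCCMP only performs the compare when Predicate holds on the incoming
  // flags; otherwise it writes its NZCV immediate directly. That immediate is
  // chosen to satisfy the *inverted* OutCC, so a failed predicate makes the
  // overall conjunction come out false.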
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  auto CCmp = MIB.buildInstr(CCmpOpc, {}, {LHS});
  if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
    CCmp.addImm(C->Value.getZExtValue());
  else
    CCmp.addReg(RHS);
  CCmp.addImm(NZCV).addImm(Predicate);
  constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
  return &*CCmp;
}

MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
    Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
    AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
  // We're at a tree leaf, produce a conditional comparison operation.
  auto &MRI = *MIB.getMRI();
  MachineInstr *ValDef = MRI.getVRegDef(Val);
  unsigned Opcode = ValDef->getOpcode();
  if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
    Register LHS = Cmp->getLHSReg();
    Register RHS = Cmp->getRHSReg();
    CmpInst::Predicate CC = Cmp->getCond();
    if (Negate)
      CC = CmpInst::getInversePredicate(CC);
    if (isa<GICmp>(Cmp)) {
      OutCC = changeICMPPredToAArch64CC(CC);
    } else {
      // Handle special FP cases.
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        MachineInstr *ExtraCmp;
        if (!CCOp)
          ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
        else
          ExtraCmp =
              emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
        CCOp = ExtraCmp->getOperand(0).getReg();
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp) {
      auto Dst = MRI.cloneVirtualRegister(LHS);
      if (isa<GICmp>(Cmp))
        return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
      return emitFPCompare(Cmp->getOperand(2).getReg(),
                           Cmp->getOperand(3).getReg(), MIB);
    }
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
  }
  assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == TargetOpcode::G_OR;

  Register LHS = ValDef->getOperand(1).getReg();
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  Register RHS = ValDef->getOperand(2).getReg();
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == TargetOpcode::G_OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == TargetOpcode::G_AND &&
           "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees.
  AArch64CC::CondCode RHSCC;
  MachineInstr *CmpR =
      emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  MachineInstr *CmpL = emitConjunctionRec(
      LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}

MachineInstr *AArch64InstructionSelector::emitConjunction(
    Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
  bool DummyCanNegate;
  bool DummyMustBeFirst;
  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
                          *MIB.getMRI()))
    return nullptr;
  return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
}

bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
                                                         MachineInstr &CondMI) {
  AArch64CC::CondCode AArch64CC;
  MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
  if (!ConjMI)
    return false;

  emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC,
             MIB);
  SelI.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  // We want to recognize this pattern:
  //
  // $z = G_FCMP pred, $x, $y
  // ...
  // $w = G_SELECT $z, $a, $b
  //
  // Where the value of $z is *only* ever used by the G_SELECT (possibly with
  // some copies/truncs in between.)
  //
  // If we see this, then we can emit something like this:
  //
  // fcmp $x, $y
  // fcsel $w, $a, $b, pred
  //
  // Rather than emitting both of the rather long sequences in the standard
  // G_FCMP/G_SELECT select methods.

  // First, check if the condition is defined by a compare.
  MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());

  // We can only fold if all of the defs have one use.
  Register CondDefReg = CondDef->getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDefReg)) {
    // Unless it's another select.
    for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
      if (CondDef == &UI)
        continue;
      if (UI.getOpcode() != TargetOpcode::G_SELECT)
        return false;
    }
  }

  // Is the condition defined by a compare?
  unsigned CondOpc = CondDef->getOpcode();
  if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
    if (tryOptSelectConjunction(I, *CondDef))
      return true;
    return false;
  }

  AArch64CC::CondCode CondCode;
  if (CondOpc == TargetOpcode::G_ICMP) {
    auto Pred =
        static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
    CondCode = changeICMPPredToAArch64CC(Pred);
    emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
                       CondDef->getOperand(1), MIB);
  } else {
    // Get the condition code for the select.
    auto Pred =
        static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
    AArch64CC::CondCode CondCode2;
    changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);

    // changeFCMPPredToAArch64CC sets CondCode2 to something other than AL when
    // we require two instructions to emit the comparison.
    // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
    // unnecessary.
    if (CondCode2 != AArch64CC::AL)
      return false;

    if (!emitFPCompare(CondDef->getOperand(2).getReg(),
                       CondDef->getOperand(3).getReg(), MIB)) {
      LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
      return false;
    }
  }

  // Emit the select.
  emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
             I.getOperand(3).getReg(), CondCode, MIB);
  I.eraseFromParent();
  return true;
}

MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
         "Unexpected MachineOperand");
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // We want to find this sort of thing:
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
  // e.g:
  //
  // cmn z, y

  // Check if the RHS or LHS of the G_ICMP is defined by a SUB
  MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
  MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
  auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP x, z
  //
  // Produce this:
  //
  // cmn y, z
  if (isCMN(LHSDef, P, MRI))
    return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);

  // Same idea here, but with the RHS of the compare instead:
  //
  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // Produce this:
  //
  // cmn z, y
  if (isCMN(RHSDef, P, MRI))
    return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);

  // Given this:
  //
  // z = G_AND x, y
  // G_ICMP z, 0
  //
  // Produce this if the compare is signed:
  //
  // tst x, y
  if (!CmpInst::isUnsigned(P) && LHSDef &&
      LHSDef->getOpcode() == TargetOpcode::G_AND) {
    // Make sure that the RHS is 0.
    auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
    if (!ValAndVReg || ValAndVReg->Value != 0)
      return nullptr;

    return emitTST(LHSDef->getOperand(1),
                   LHSDef->getOperand(2), MIRBuilder);
  }

  return nullptr;
}

bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  Register Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
  Register Src2Reg = I.getOperand(2).getReg();
  const LLT Src2Ty = MRI.getType(Src2Reg);
  ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  LLVMContext &Ctx = MF.getFunction().getContext();

  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
  // they originated from a <1 x T> type. Those should have been lowered into
  // G_BUILD_VECTOR earlier.
  if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
    return false;
  }

  unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;

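  // Expand the shuffle mask into per-byte indices for TBL. For instance, with
  // 4-byte elements a mask entry of 1 contributes the bytes {4, 5, 6, 7} and
  // an entry of 3 contributes {12, 13, 14, 15}.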
  SmallVector<Constant *, 64> CstIdxs;
  for (int Val : Mask) {
    // For now, we'll just assume that any undef indexes are 0. This should be
    // optimized in the future, e.g. to select DUP etc.
    Val = Val < 0 ? 0 : Val;
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
    }
  }

  // Use a constant pool to load the index vector for TBL.
  Constant *CPVal = ConstantVector::get(CstIdxs);
  MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
  if (!IndexLoad) {
    LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
    return false;
  }

  if (DstTy.getSizeInBits() != 128) {
    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
    // This case can be done with TBL1.
    MachineInstr *Concat =
        emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
    if (!Concat) {
      LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
      return false;
    }

    // The constant pool load will be 64 bits, so we need to convert it to an
    // FPR128 reg.
    IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
                                   IndexLoad->getOperand(0).getReg(), MIB);

    auto TBL1 = MIB.buildInstr(
        AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
        {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
    constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);

    auto Copy =
        MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
            .addReg(TBL1.getReg(0), 0, AArch64::dsub);
    RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }

  // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
  // Q registers for regalloc.
  SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
  auto RegSeq = createQTuple(Regs, MIB);
  auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
                             {RegSeq, IndexLoad->getOperand(0)});
  constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

MachineInstr *AArch64InstructionSelector::emitLaneInsert(
    std::optional<Register> DstReg, Register SrcReg, Register EltReg,
    unsigned LaneIdx, const RegisterBank &RB,
    MachineIRBuilder &MIRBuilder) const {
  MachineInstr *InsElt = nullptr;
  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // Create a register to define with the insert if one wasn't passed in.
  if (!DstReg)
    DstReg = MRI.createVirtualRegister(DstRC);

  unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
  unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;

  if (RB.getID() == AArch64::FPRRegBankID) {
    auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(InsSub->getOperand(0).getReg())
                 .addImm(0);
  } else {
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(EltReg);
  }

  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return InsElt;
}

bool AArch64InstructionSelector::selectUSMovFromExtend(
    MachineInstr &MI, MachineRegisterInfo &MRI) {
  if (MI.getOpcode() != TargetOpcode::G_SEXT &&
      MI.getOpcode() != TargetOpcode::G_ZEXT &&
      MI.getOpcode() != TargetOpcode::G_ANYEXT)
    return false;
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
  const Register DefReg = MI.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(DefReg);
  unsigned DstSize = DstTy.getSizeInBits();

  if (DstSize != 32 && DstSize != 64)
    return false;

  MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
                                       MI.getOperand(1).getReg(), MRI);
  int64_t Lane;
  if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
    return false;
  Register Src0 = Extract->getOperand(1).getReg();

  const LLT &VecTy = MRI.getType(Src0);

  if (VecTy.getSizeInBits() != 128) {
    const MachineInstr *ScalarToVector = emitScalarToVector(
        VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
    assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
    Src0 = ScalarToVector->getOperand(0).getReg();
  }

  unsigned Opcode;
  if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
    Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
  else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
    Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
  else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
    Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
  else
    llvm_unreachable("Unexpected type combo for S/UMov!");

  // We may need to generate one of these, depending on the type and sign of
  // the input:
  //  DstReg = SMOV Src0, Lane;
  //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
  MachineInstr *ExtI = nullptr;
  if (DstSize == 64 && !IsSigned) {
    Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
    MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
    ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
               .addImm(0)
               .addUse(NewReg)
               .addImm(AArch64::sub_32);
    RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
  } else
    ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);

  constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);

  // Get information on the destination.
  Register DstReg = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  unsigned VecSize = DstTy.getSizeInBits();

  // Get information on the element we want to insert into the destination.
  Register EltReg = I.getOperand(2).getReg();
  const LLT EltTy = MRI.getType(EltReg);
  unsigned EltSize = EltTy.getSizeInBits();
  if (EltSize < 16 || EltSize > 64)
    return false; // Don't support all element types yet.

  // Find the definition of the index. Bail out if it's not defined by a
  // G_CONSTANT.
  Register IdxReg = I.getOperand(3).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
  if (!VRegAndVal)
    return false;
  unsigned LaneIdx = VRegAndVal->Value.getSExtValue();

  // Perform the lane insert.
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);

  if (VecSize < 128) {
    // If the vector we're inserting into is smaller than 128 bits, widen it
    // to 128 to do the insert.
    MachineInstr *ScalarToVec =
        emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
    if (!ScalarToVec)
      return false;
    SrcReg = ScalarToVec->getOperand(0).getReg();
  }

  // Create an insert into a new FPR128 register.
  // Note that if our vector is already 128 bits, we end up emitting an extra
  // register.
  MachineInstr *InsMI =
      emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);

  if (VecSize < 128) {
    // If we had to widen to perform the insert, then we have to demote back to
    // the original size to get the result we want.
    Register DemoteVec = InsMI->getOperand(0).getReg();
    const TargetRegisterClass *RC =
        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }
    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
                        << "\n");
      return false;
    }
    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
        .addReg(DemoteVec, 0, SubReg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // No widening needed.
    InsMI->getOperand(0).setReg(DstReg);
    constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}

MachineInstr *
AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
                                               MachineIRBuilder &MIRBuilder,
                                               MachineRegisterInfo &MRI) {
  LLT DstTy = MRI.getType(Dst);
  unsigned DstSize = DstTy.getSizeInBits();
  if (CV->isNullValue()) {
    if (DstSize == 128) {
      auto Mov =
          MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
      constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
      return &*Mov;
    }

    if (DstSize == 64) {
      auto Mov =
          MIRBuilder
              .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
              .addImm(0);
      auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
                      .addReg(Mov.getReg(0), 0, AArch64::dsub);
      RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
      return &*Copy;
    }
  }

  auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
  if (!CPLoad) {
    LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
    return nullptr;
  }

  auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
  RBI.constrainGenericRegister(
      Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
  return &*Copy;
}

bool AArch64InstructionSelector::tryOptConstantBuildVec(
    MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  unsigned DstSize = DstTy.getSizeInBits();
  assert(DstSize <= 128 && "Unexpected build_vec type!");
  if (DstSize < 32)
    return false;
  // Check if we're building a constant vector, in which case we want to
  // generate a constant pool load instead of a vector insert sequence.
  SmallVector<Constant *, 16> Csts;
  for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
    // Try to find G_CONSTANT or G_FCONSTANT
    auto *OpMI =
        getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
    if (OpMI)
      Csts.emplace_back(
          const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
    else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
                                  I.getOperand(Idx).getReg(), MRI)))
      Csts.emplace_back(
          const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
    else
      return false;
  }
  Constant *CV = ConstantVector::get(Csts);
  if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
    return false;
  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  // Given:
  // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
  //
  // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
  Register Dst = I.getOperand(0).getReg();
  Register EltReg = I.getOperand(1).getReg();
  LLT EltTy = MRI.getType(EltReg);
  // If the destination vector isn't on the same bank as its elements, then
  // this can't be a SUBREG_TO_REG.
  const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
  const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
  if (EltRB != DstRB)
    return false;
  if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
             [&MRI](const MachineOperand &Op) {
               return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
                                    MRI);
             }))
    return false;
  unsigned SubReg;
  const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
  if (!EltRC)
    return false;
  const TargetRegisterClass *DstRC =
      getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
  if (!DstRC)
    return false;
  if (!getSubRegForClass(EltRC, TRI, SubReg))
    return false;
  auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
                         .addImm(0)
                         .addUse(EltReg)
                         .addImm(SubReg);
  I.eraseFromParent();
  constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
  return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
}

bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
                                                   MachineRegisterInfo &MRI) {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  // Until we port more of the optimized selections, for now just use a vector
  // insert sequence.
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
  unsigned EltSize = EltTy.getSizeInBits();

  if (tryOptConstantBuildVec(I, DstTy, MRI))
    return true;
  if (tryOptBuildVecToSubregToReg(I, MRI))
    return true;

  if (EltSize < 16 || EltSize > 64)
    return false; // Don't support all element types yet.
  const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);

  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineInstr *ScalarToVec =
      emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
                         I.getOperand(1).getReg(), MIB);
  if (!ScalarToVec)
    return false;

  Register DstVec = ScalarToVec->getOperand(0).getReg();
  unsigned DstSize = DstTy.getSizeInBits();

  // Keep track of the last MI we inserted. Later on, we might be able to save
  // a copy using it.
  MachineInstr *PrevMI = nullptr;
  for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
    // Note that if we don't do a subregister copy, we can end up making an
    // extra register.
    PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
                              i - 1, RB, MIB);
    DstVec = PrevMI->getOperand(0).getReg();
  }

  // If DstTy's size in bits is less than 128, then emit a subregister copy
  // from DstVec to the last register we've defined.
  if (DstSize < 128) {
    // Force this to be FPR using the destination vector.
    const TargetRegisterClass *RC =
        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
    if (!RC)
      return false;
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }

    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
                        << "\n");
      return false;
    }

    Register Reg = MRI.createVirtualRegister(RC);
    Register DstReg = I.getOperand(0).getReg();

    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
    MachineOperand &RegOp = I.getOperand(1);
    RegOp.setReg(Reg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // We don't need a subregister copy. Save a copy by re-using the
    // destination register on the final insert.
    assert(PrevMI && "PrevMI was null?");
    PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
    constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
                                                           unsigned NumVecs,
                                                           MachineInstr &I) {
  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  assert(Opc && "Expected an opcode?");
  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
  auto &MRI = *MIB.getMRI();
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 64 || Size == 128) &&
         "Destination must be 64 bits or 128 bits?");
  unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
  auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
  assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
  auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
  Load.cloneMemRefs(I);
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  Register SelectedLoadDst = Load->getOperand(0).getReg();
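  // The LDn instruction defines a D- or Q-register tuple; peel the individual
  // vectors out of it with one subregister copy per result (dsub0 + Idx or
  // qsub0 + Idx).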
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
    // Emit the subreg copies and immediately select them.
    // FIXME: We should refactor our copy code into an emitCopy helper and
    // clean up uses of this pattern elsewhere in the selector.
    selectCopy(*Vec, TII, MRI, TRI, RBI);
  }
  return true;
}

bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  // Find the intrinsic ID.
  unsigned IntrinID = I.getIntrinsicID();

  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT P0 = LLT::pointer(0, 64);
  // Select the instruction.
  switch (IntrinID) {
  default:
    return false;
  case Intrinsic::aarch64_ldxp:
  case Intrinsic::aarch64_ldaxp: {
    auto NewI = MIB.buildInstr(
        IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
        {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
        {I.getOperand(3)});
    NewI.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
    break;
  }
  case Intrinsic::trap:
    MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
    break;
  case Intrinsic::debugtrap:
    MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
    break;
  case Intrinsic::ubsantrap:
    MIB.buildInstr(AArch64::BRK, {}, {})
        .addImm(I.getOperand(1).getImm() | ('U' << 8));
    break;
  case Intrinsic::aarch64_neon_ld2: {
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD2Twov8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD2Twov16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD2Twov4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD2Twov8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD2Twov2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD2Twov4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD2Twov2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Twov1d;
    else
      llvm_unreachable("Unexpected type for ld2!");
    selectVectorLoadIntrinsic(Opc, 2, I);
    break;
  }
  case Intrinsic::aarch64_neon_ld4: {
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD4Fourv8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD4Fourv16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD4Fourv4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD4Fourv8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD4Fourv2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD4Fourv4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD4Fourv2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Fourv1d;
    else
      llvm_unreachable("Unexpected type for ld4!");
    selectVectorLoadIntrinsic(Opc, 4, I);
    break;
  }
  case Intrinsic::aarch64_neon_st2: {
    Register Src1 = I.getOperand(1).getReg();
    Register Src2 = I.getOperand(2).getReg();
    Register Ptr = I.getOperand(3).getReg();
    LLT Ty = MRI.getType(Src1);
    unsigned Opc;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::ST2Twov8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::ST2Twov16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::ST2Twov4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::ST2Twov8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::ST2Twov2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::ST2Twov4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::ST2Twov2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::ST1Twov1d;
    else
      llvm_unreachable("Unexpected type for st2!");
    SmallVector<Register, 2> Regs = {Src1, Src2};
    Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
                                               : createDTuple(Regs, MIB);
    auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
    Store.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
    break;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //    %dst:gpr(p0) = \
    //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
    //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //    %Rd:GPR64common, %Rn:GPR64) = \
    //      MOPSMemorySetTaggingPseudo \
    //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands is swapped.

    Register DstDef = I.getOperand(0).getReg();
    // I.getOperand(1) is the intrinsic function
    Register DstUse = I.getOperand(2).getReg();
    Register ValUse = I.getOperand(3).getReg();
    Register SizeUse = I.getOperand(4).getReg();

    // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
    Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));

    auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
                                 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
    Memset.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
    break;
  }
  }

  I.eraseFromParent();
  return true;
}

selectIntrinsic(MachineInstr & I,MachineRegisterInfo & MRI)5791 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5792 MachineRegisterInfo &MRI) {
5793 unsigned IntrinID = I.getIntrinsicID();
5794
5795 switch (IntrinID) {
5796 default:
5797 break;
5798 case Intrinsic::aarch64_crypto_sha1h: {
5799 Register DstReg = I.getOperand(0).getReg();
5800 Register SrcReg = I.getOperand(2).getReg();
5801
5802 // FIXME: Should this be an assert?
5803 if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5804 MRI.getType(SrcReg).getSizeInBits() != 32)
5805 return false;
5806
5807 // The operation has to happen on FPRs. Set up some new FPR registers for
5808 // the source and destination if they are on GPRs.
5809 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5810 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5811 MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5812
5813 // Make sure the copy ends up getting constrained properly.
5814 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5815 AArch64::GPR32RegClass, MRI);
5816 }
5817
5818 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5819 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5820
5821 // Actually insert the instruction.
5822 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5823 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5824
5825 // Did we create a new register for the destination?
5826 if (DstReg != I.getOperand(0).getReg()) {
5827 // Yep. Copy the result of the instruction back into the original
5828 // destination.
5829 MIB.buildCopy({I.getOperand(0)}, {DstReg});
5830 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5831 AArch64::GPR32RegClass, MRI);
5832 }
5833
5834 I.eraseFromParent();
5835 return true;
5836 }
5837 case Intrinsic::ptrauth_sign: {
5838 Register DstReg = I.getOperand(0).getReg();
5839 Register ValReg = I.getOperand(2).getReg();
5840 uint64_t Key = I.getOperand(3).getImm();
5841 Register DiscReg = I.getOperand(4).getReg();
5842 auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
5843 bool IsDiscZero = DiscVal && DiscVal->isNullValue();
5844
5845 if (Key > AArch64PACKey::LAST)
5846 return false;
5847
5848 unsigned Opcodes[][4] = {
5849 {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB},
5850 {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}};
5851 unsigned Opcode = Opcodes[IsDiscZero][Key];
5852
5853 auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg});
5854
5855 if (!IsDiscZero) {
5856 PAC.addUse(DiscReg);
5857 RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI);
5858 }
5859
5860 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5861 I.eraseFromParent();
5862 return true;
5863 }
5864 case Intrinsic::ptrauth_strip: {
5865 Register DstReg = I.getOperand(0).getReg();
5866 Register ValReg = I.getOperand(2).getReg();
5867 uint64_t Key = I.getOperand(3).getImm();
5868
5869 if (Key > AArch64PACKey::LAST)
5870 return false;
5871 unsigned Opcode = getXPACOpcodeForKey((AArch64PACKey::ID)Key);
5872
5873 MIB.buildInstr(Opcode, {DstReg}, {ValReg});
5874
5875 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5876 RBI.constrainGenericRegister(ValReg, AArch64::GPR64RegClass, MRI);
5877 I.eraseFromParent();
5878 return true;
5879 }
5880 case Intrinsic::frameaddress:
5881 case Intrinsic::returnaddress: {
5882 MachineFunction &MF = *I.getParent()->getParent();
5883 MachineFrameInfo &MFI = MF.getFrameInfo();
5884
5885 unsigned Depth = I.getOperand(2).getImm();
5886 Register DstReg = I.getOperand(0).getReg();
5887 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5888
5889 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5890 if (!MFReturnAddr) {
5891 // Insert the copy from LR/X30 into the entry block, before it can be
5892 // clobbered by anything.
5893 MFI.setReturnAddressIsTaken(true);
5894 MFReturnAddr = getFunctionLiveInPhysReg(
5895 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
5896 }
5897
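      // Strip any authentication code from the return address before handing
      // it back. A sketch of the reasoning: with PAuth available, XPACI can
      // strip an arbitrary GPR; otherwise only the hint-space XPACLRI is
      // available, and it operates solely on LR, hence the copies around it.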
5898 if (STI.hasPAuth()) {
5899 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5900 } else {
5901 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5902 MIB.buildInstr(AArch64::XPACLRI);
5903 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5904 }
5905
5906 I.eraseFromParent();
5907 return true;
5908 }
5909
5910 MFI.setFrameAddressIsTaken(true);
5911 Register FrameAddr(AArch64::FP);
5912 while (Depth--) {
5913 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5914 auto Ldr =
5915 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5916 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5917 FrameAddr = NextFrame;
5918 }
5919
5920 if (IntrinID == Intrinsic::frameaddress)
5921 MIB.buildCopy({DstReg}, {FrameAddr});
5922 else {
5923 MFI.setReturnAddressIsTaken(true);
5924
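      // A sketch of the layout relied on here: an AArch64 frame record is
      // <previous FP, return address>, so the return address sits at
      // [FrameAddr, #8]; LDRXui scales its immediate by 8, hence addImm(1).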
5925 if (STI.hasPAuth()) {
5926 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5927 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5928 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5929 } else {
5930 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5931 .addImm(1);
5932 MIB.buildInstr(AArch64::XPACLRI);
5933 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5934 }
5935 }
5936
5937 I.eraseFromParent();
5938 return true;
5939 }
5940 case Intrinsic::swift_async_context_addr:
5941 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5942 {Register(AArch64::FP)})
5943 .addImm(8)
5944 .addImm(0);
5945 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5946
5947 MF->getFrameInfo().setFrameAddressIsTaken(true);
5948 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5949 I.eraseFromParent();
5950 return true;
5951 }
5952 return false;
5953 }
5954
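/// Renderers for constant shift amounts folded into bitfield-move style
/// instructions. A worked example, assuming the A/B pair corresponds to the
/// (immr, imms) fields of a UBFM/SBFM-encoded left shift: "lsl w0, w1, #3"
/// uses A = (32 - 3) & 0x1f = 29 and B = 31 - 3 = 28.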
5955 InstructionSelector::ComplexRendererFns
5956 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5957 auto MaybeImmed = getImmedFromMO(Root);
5958 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
5959 return std::nullopt;
5960 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5961 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5962 }
5963
5964 InstructionSelector::ComplexRendererFns
5965 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5966 auto MaybeImmed = getImmedFromMO(Root);
5967 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
5968 return std::nullopt;
5969 uint64_t Enc = 31 - *MaybeImmed;
5970 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5971 }
5972
5973 InstructionSelector::ComplexRendererFns
5974 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5975 auto MaybeImmed = getImmedFromMO(Root);
5976 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
5977 return std::nullopt;
5978 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5979 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5980 }
5981
5982 InstructionSelector::ComplexRendererFns
5983 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5984 auto MaybeImmed = getImmedFromMO(Root);
5985 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
5986 return std::nullopt;
5987 uint64_t Enc = 63 - *MaybeImmed;
5988 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5989 }
5990
5991 /// Helper to select an immediate value that can be represented as a 12-bit
5992 /// value shifted left by either 0 or 12. If it is possible to do so, return
5993 /// the immediate and shift value. If not, return std::nullopt.
5994 ///
5995 /// Used by selectArithImmed and selectNegArithImmed.
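/// A few worked examples of the check below: 0xABC fits directly (LSL #0);
/// 0xAB000 has clear low 12 bits and fits in 24 bits, so it is returned as
/// 0xAB with an LSL #12 shifter; 0x1234567 matches neither form and yields
/// std::nullopt.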
5996 InstructionSelector::ComplexRendererFns
5997 AArch64InstructionSelector::select12BitValueWithLeftShift(
5998 uint64_t Immed) const {
5999 unsigned ShiftAmt;
6000 if (Immed >> 12 == 0) {
6001 ShiftAmt = 0;
6002 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6003 ShiftAmt = 12;
6004 Immed = Immed >> 12;
6005 } else
6006 return std::nullopt;
6007
6008 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
6009 return {{
6010 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
6011 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
6012 }};
6013 }
6014
6015 /// SelectArithImmed - Select an immediate value that can be represented as
6016 /// a 12-bit value shifted left by either 0 or 12. If so, return renderers
6017 /// for the 12-bit immediate and the shifter operand.
6018 InstructionSelector::ComplexRendererFns
6019 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6020 // This function is called from the addsub_shifted_imm ComplexPattern,
6021   // which lists [imm] as the list of opcodes it's interested in; however,
6022 // we still need to check whether the operand is actually an immediate
6023 // here because the ComplexPattern opcode list is only used in
6024 // root-level opcode matching.
6025 auto MaybeImmed = getImmedFromMO(Root);
6026 if (MaybeImmed == std::nullopt)
6027 return std::nullopt;
6028 return select12BitValueWithLeftShift(*MaybeImmed);
6029 }
6030
6031 /// SelectNegArithImmed - As above, but negates the value before trying to
6032 /// select it.
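/// For example, a compare against -5 can instead be selected as "cmn w0, #5":
/// negating the immediate yields 5, which select12BitValueWithLeftShift can
/// encode directly.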
6033 InstructionSelector::ComplexRendererFns
6034 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6035 // We need a register here, because we need to know if we have a 64 or 32
6036 // bit immediate.
6037 if (!Root.isReg())
6038 return std::nullopt;
6039 auto MaybeImmed = getImmedFromMO(Root);
6040 if (MaybeImmed == std::nullopt)
6041 return std::nullopt;
6042 uint64_t Immed = *MaybeImmed;
6043
6044 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6045 // have the opposite effect on the C flag, so this pattern mustn't match under
6046 // those circumstances.
6047 if (Immed == 0)
6048 return std::nullopt;
6049
6050 // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
6051 // the root.
6052 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6053 if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
6054 Immed = ~((uint32_t)Immed) + 1;
6055 else
6056 Immed = ~Immed + 1ULL;
6057
6058 if (Immed & 0xFFFFFFFFFF000000ULL)
6059 return std::nullopt;
6060
6061 Immed &= 0xFFFFFFULL;
6062 return select12BitValueWithLeftShift(Immed);
6063 }
6064
6065 /// Return true if it is worth folding MI into an extended register. That is,
6066 /// if it's safe to pull it into the addressing mode of a load or store as a
6067 /// shift.
6068 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6069 MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6070 // Always fold if there is one use, or if we're optimizing for size.
6071 Register DefReg = MI.getOperand(0).getReg();
6072 if (MRI.hasOneNonDBGUse(DefReg) ||
6073 MI.getParent()->getParent()->getFunction().hasOptSize())
6074 return true;
6075
6076 // It's better to avoid folding and recomputing shifts when we don't have a
6077 // fastpath.
6078 if (!STI.hasLSLFast())
6079 return false;
6080
6081 // We have a fastpath, so folding a shift in and potentially computing it
6082 // many times may be beneficial. Check if this is only used in memory ops.
6083 // If it is, then we should fold.
6084 return all_of(MRI.use_nodbg_instructions(DefReg),
6085 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6086 }
6087
6088 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6089 switch (Type) {
6090 case AArch64_AM::SXTB:
6091 case AArch64_AM::SXTH:
6092 case AArch64_AM::SXTW:
6093 return true;
6094 default:
6095 return false;
6096 }
6097 }
6098
6099 InstructionSelector::ComplexRendererFns
6100 AArch64InstructionSelector::selectExtendedSHL(
6101 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6102 unsigned SizeInBytes, bool WantsExt) const {
6103 assert(Base.isReg() && "Expected base to be a register operand");
6104 assert(Offset.isReg() && "Expected offset to be a register operand");
6105
6106 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6107 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
6108
6109 unsigned OffsetOpc = OffsetInst->getOpcode();
6110 bool LookedThroughZExt = false;
6111 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6112 // Try to look through a ZEXT.
6113 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6114 return std::nullopt;
6115
6116 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
6117 OffsetOpc = OffsetInst->getOpcode();
6118 LookedThroughZExt = true;
6119
6120 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6121 return std::nullopt;
6122 }
6123 // Make sure that the memory op is a valid size.
6124 int64_t LegalShiftVal = Log2_32(SizeInBytes);
6125 if (LegalShiftVal == 0)
6126 return std::nullopt;
6127 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6128 return std::nullopt;
6129
6130 // Now, try to find the specific G_CONSTANT. Start by assuming that the
6131 // register we will offset is the LHS, and the register containing the
6132 // constant is the RHS.
6133 Register OffsetReg = OffsetInst->getOperand(1).getReg();
6134 Register ConstantReg = OffsetInst->getOperand(2).getReg();
6135 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6136 if (!ValAndVReg) {
6137 // We didn't get a constant on the RHS. If the opcode is a shift, then
6138 // we're done.
6139 if (OffsetOpc == TargetOpcode::G_SHL)
6140 return std::nullopt;
6141
6142 // If we have a G_MUL, we can use either register. Try looking at the RHS.
6143 std::swap(OffsetReg, ConstantReg);
6144 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6145 if (!ValAndVReg)
6146 return std::nullopt;
6147 }
6148
6149 // The value must fit into 3 bits, and must be positive. Make sure that is
6150 // true.
6151 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6152
6153 // Since we're going to pull this into a shift, the constant value must be
6154 // a power of 2. If we got a multiply, then we need to check this.
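  // E.g. a G_MUL of the index by 8 feeding an 8-byte access behaves like
  // "lsl #3", matching LegalShiftVal = Log2_32(8) = 3 below.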
6155 if (OffsetOpc == TargetOpcode::G_MUL) {
6156 if (!isPowerOf2_32(ImmVal))
6157 return std::nullopt;
6158
6159 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6160 ImmVal = Log2_32(ImmVal);
6161 }
6162
6163 if ((ImmVal & 0x7) != ImmVal)
6164 return std::nullopt;
6165
6166 // We are only allowed to shift by LegalShiftVal. This shift value is built
6167 // into the instruction, so we can't just use whatever we want.
6168 if (ImmVal != LegalShiftVal)
6169 return std::nullopt;
6170
6171 unsigned SignExtend = 0;
6172 if (WantsExt) {
6173 // Check if the offset is defined by an extend, unless we looked through a
6174 // G_ZEXT earlier.
6175 if (!LookedThroughZExt) {
6176 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
6177 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
6178 if (Ext == AArch64_AM::InvalidShiftExtend)
6179 return std::nullopt;
6180
6181 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
6182 // We only support SXTW for signed extension here.
6183 if (SignExtend && Ext != AArch64_AM::SXTW)
6184 return std::nullopt;
6185 OffsetReg = ExtInst->getOperand(1).getReg();
6186 }
6187
6188 // Need a 32-bit wide register here.
6189 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
6190 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
6191 }
6192
6193 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
6194 // offset. Signify that we are shifting by setting the shift flag to 1.
6195 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
6196 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
6197 [=](MachineInstrBuilder &MIB) {
6198 // Need to add both immediates here to make sure that they are both
6199 // added to the instruction.
6200 MIB.addImm(SignExtend);
6201 MIB.addImm(1);
6202 }}};
6203 }
6204
6205 /// This is used for computing addresses like this:
6206 ///
6207 /// ldr x1, [x2, x3, lsl #3]
6208 ///
6209 /// Where x2 is the base register, and x3 is an offset register. The shift-left
6210 /// is a constant value specific to this load instruction. That is, we'll never
6211 /// see anything other than a 3 here (which corresponds to the size of the
6212 /// element being loaded.)
6213 InstructionSelector::ComplexRendererFns
6214 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6215 MachineOperand &Root, unsigned SizeInBytes) const {
6216 if (!Root.isReg())
6217 return std::nullopt;
6218 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6219
6220 // We want to find something like this:
6221 //
6222 // val = G_CONSTANT LegalShiftVal
6223 // shift = G_SHL off_reg val
6224 // ptr = G_PTR_ADD base_reg shift
6225 // x = G_LOAD ptr
6226 //
6227 // And fold it into this addressing mode:
6228 //
6229 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
6230
6231 // Check if we can find the G_PTR_ADD.
6232 MachineInstr *PtrAdd =
6233 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6234 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6235 return std::nullopt;
6236
6237 // Now, try to match an opcode which will match our specific offset.
6238 // We want a G_SHL or a G_MUL.
6239 MachineInstr *OffsetInst =
6240 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
6241 return selectExtendedSHL(Root, PtrAdd->getOperand(1),
6242 OffsetInst->getOperand(0), SizeInBytes,
6243 /*WantsExt=*/false);
6244 }
6245
6246 /// This is used for computing addresses like this:
6247 ///
6248 /// ldr x1, [x2, x3]
6249 ///
6250 /// Where x2 is the base register, and x3 is an offset register.
6251 ///
6252 /// When possible (or profitable) to fold a G_PTR_ADD into the address
6253 /// calculation, this will do so. Otherwise, it will return std::nullopt.
6254 InstructionSelector::ComplexRendererFns
6255 AArch64InstructionSelector::selectAddrModeRegisterOffset(
6256 MachineOperand &Root) const {
6257 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6258
6259 // We need a GEP.
6260 MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
6261 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
6262 return std::nullopt;
6263
6264 // If this is used more than once, let's not bother folding.
6265 // TODO: Check if they are memory ops. If they are, then we can still fold
6266 // without having to recompute anything.
6267 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
6268 return std::nullopt;
6269
6270 // Base is the GEP's LHS, offset is its RHS.
6271 return {{[=](MachineInstrBuilder &MIB) {
6272 MIB.addUse(Gep->getOperand(1).getReg());
6273 },
6274 [=](MachineInstrBuilder &MIB) {
6275 MIB.addUse(Gep->getOperand(2).getReg());
6276 },
6277 [=](MachineInstrBuilder &MIB) {
6278 // Need to add both immediates here to make sure that they are both
6279 // added to the instruction.
6280 MIB.addImm(0);
6281 MIB.addImm(0);
6282 }}};
6283 }
6284
6285 /// This is intended to be equivalent to selectAddrModeXRO in
6286 /// AArch64ISelDAGToDAG. It's used for selecting X register offset loads.
6287 InstructionSelector::ComplexRendererFns
6288 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
6289 unsigned SizeInBytes) const {
6290 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6291 if (!Root.isReg())
6292 return std::nullopt;
6293 MachineInstr *PtrAdd =
6294 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6295 if (!PtrAdd)
6296 return std::nullopt;
6297
6298   // Check for an immediate which cannot be encoded in the [base + imm]
6299 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
6300 // end up with code like:
6301 //
6302 // mov x0, wide
6303   //   add x1, base, x0
6304 // ldr x2, [x1, x0]
6305 //
6306 // In this situation, we can use the [base, xreg] addressing mode to save an
6307 // add/sub:
6308 //
6309 // mov x0, wide
6310 // ldr x2, [base, x0]
6311 auto ValAndVReg =
6312 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
6313 if (ValAndVReg) {
6314 unsigned Scale = Log2_32(SizeInBytes);
6315 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
6316
6317     // Skip immediates that can be selected in the load/store addressing
6318 // mode.
6319 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
6320 ImmOff < (0x1000 << Scale))
6321 return std::nullopt;
6322
6323 // Helper lambda to decide whether or not it is preferable to emit an add.
6324 auto isPreferredADD = [](int64_t ImmOff) {
6325 // Constants in [0x0, 0xfff] can be encoded in an add.
6326 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
6327 return true;
6328
6329 // Can it be encoded in an add lsl #12?
6330 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
6331 return false;
6332
6333 // It can be encoded in an add lsl #12, but we may not want to. If it is
6334 // possible to select this as a single movz, then prefer that. A single
6335 // movz is faster than an add with a shift.
6336 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
6337 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
6338 };
6339
6340 // If the immediate can be encoded in a single add/sub, then bail out.
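    // E.g. 0x30000 can be materialized with a single movz, so neither it nor
    // its negation counts as a preferred add and the [base, xreg] form is
    // kept, whereas something like 0x456000 is cheaper as "add ..., lsl #12"
    // and we bail out.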
6341 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
6342 return std::nullopt;
6343 }
6344
6345 // Try to fold shifts into the addressing mode.
6346 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
6347 if (AddrModeFns)
6348 return AddrModeFns;
6349
6350 // If that doesn't work, see if it's possible to fold in registers from
6351 // a GEP.
6352 return selectAddrModeRegisterOffset(Root);
6353 }
6354
6355 /// This is used for computing addresses like this:
6356 ///
6357 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
6358 ///
6359 /// Where we have a 64-bit base register, a 32-bit offset register, and an
6360 /// extend (which may or may not be signed).
6361 InstructionSelector::ComplexRendererFns
6362 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
6363 unsigned SizeInBytes) const {
6364 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6365
6366 MachineInstr *PtrAdd =
6367 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6368 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6369 return std::nullopt;
6370
6371 MachineOperand &LHS = PtrAdd->getOperand(1);
6372 MachineOperand &RHS = PtrAdd->getOperand(2);
6373 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
6374
6375 // The first case is the same as selectAddrModeXRO, except we need an extend.
6376 // In this case, we try to find a shift and extend, and fold them into the
6377 // addressing mode.
6378 //
6379 // E.g.
6380 //
6381 // off_reg = G_Z/S/ANYEXT ext_reg
6382 // val = G_CONSTANT LegalShiftVal
6383 // shift = G_SHL off_reg val
6384 // ptr = G_PTR_ADD base_reg shift
6385 // x = G_LOAD ptr
6386 //
6387 // In this case we can get a load like this:
6388 //
6389 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
6390 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
6391 SizeInBytes, /*WantsExt=*/true);
6392 if (ExtendedShl)
6393 return ExtendedShl;
6394
6395 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
6396 //
6397 // e.g.
6398 // ldr something, [base_reg, ext_reg, sxtw]
6399 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6400 return std::nullopt;
6401
6402 // Check if this is an extend. We'll get an extend type if it is.
6403 AArch64_AM::ShiftExtendType Ext =
6404 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
6405 if (Ext == AArch64_AM::InvalidShiftExtend)
6406 return std::nullopt;
6407
6408 // Need a 32-bit wide register.
6409 MachineIRBuilder MIB(*PtrAdd);
6410 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
6411 AArch64::GPR32RegClass, MIB);
6412 unsigned SignExtend = Ext == AArch64_AM::SXTW;
6413
6414 // Base is LHS, offset is ExtReg.
6415 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
6416 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6417 [=](MachineInstrBuilder &MIB) {
6418 MIB.addImm(SignExtend);
6419 MIB.addImm(0);
6420 }}};
6421 }
6422
6423 /// Select a "register plus unscaled signed 9-bit immediate" address. This
6424 /// should only match when there is an offset that is not valid for a scaled
6425 /// immediate addressing mode. The "Size" argument is the size in bytes of the
6426 /// memory reference, which is needed here to know what is valid for a scaled
6427 /// immediate.
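/// For example, an 8-byte load at offset #12 cannot use the scaled form
/// (12 is not a multiple of 8), but 12 lies within the signed 9-bit range
/// [-256, 255], so the unscaled (ldur-style) form below applies.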
6428 InstructionSelector::ComplexRendererFns
6429 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
6430 unsigned Size) const {
6431 MachineRegisterInfo &MRI =
6432 Root.getParent()->getParent()->getParent()->getRegInfo();
6433
6434 if (!Root.isReg())
6435 return std::nullopt;
6436
6437 if (!isBaseWithConstantOffset(Root, MRI))
6438 return std::nullopt;
6439
6440 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6441
6442 MachineOperand &OffImm = RootDef->getOperand(2);
6443 if (!OffImm.isReg())
6444 return std::nullopt;
6445 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
6446 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
6447 return std::nullopt;
6448 int64_t RHSC;
6449 MachineOperand &RHSOp1 = RHS->getOperand(1);
6450 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
6451 return std::nullopt;
6452 RHSC = RHSOp1.getCImm()->getSExtValue();
6453
6454 // If the offset is valid as a scaled immediate, don't match here.
6455 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
6456 return std::nullopt;
6457 if (RHSC >= -256 && RHSC < 256) {
6458 MachineOperand &Base = RootDef->getOperand(1);
6459 return {{
6460 [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
6461 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
6462 }};
6463 }
6464 return std::nullopt;
6465 }
6466
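/// Try to fold a G_ADD_LOW fed by an ADRP into the addressing-mode immediate,
/// giving the usual small-code-model pair, roughly:
///
///   adrp x8, symbol
///   ldr  x0, [x8, :lo12:symbol]
///
/// The MO_PAGEOFF | MO_NC flags added below select the :lo12: relocation.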
6467 InstructionSelector::ComplexRendererFns
6468 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
6469 unsigned Size,
6470 MachineRegisterInfo &MRI) const {
6471 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
6472 return std::nullopt;
6473 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
6474 if (Adrp.getOpcode() != AArch64::ADRP)
6475 return std::nullopt;
6476
6477 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
6478 auto Offset = Adrp.getOperand(1).getOffset();
6479 if (Offset % Size != 0)
6480 return std::nullopt;
6481
6482 auto GV = Adrp.getOperand(1).getGlobal();
6483 if (GV->isThreadLocal())
6484 return std::nullopt;
6485
6486 auto &MF = *RootDef.getParent()->getParent();
6487 if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
6488 return std::nullopt;
6489
6490 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
6491 MachineIRBuilder MIRBuilder(RootDef);
6492 Register AdrpReg = Adrp.getOperand(0).getReg();
6493 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
6494 [=](MachineInstrBuilder &MIB) {
6495 MIB.addGlobalAddress(GV, Offset,
6496 OpFlags | AArch64II::MO_PAGEOFF |
6497 AArch64II::MO_NC);
6498 }}};
6499 }
6500
6501 /// Select a "register plus scaled unsigned 12-bit immediate" address. The
6502 /// "Size" argument is the size in bytes of the memory reference, which
6503 /// determines the scale.
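/// For example, with Size == 4, a G_PTR_ADD of base + 20 renders the base
/// register and an immediate of 5 (20 >> Log2_32(4)), which prints as
/// "ldr w0, [xN, #20]".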
6504 InstructionSelector::ComplexRendererFns
6505 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
6506 unsigned Size) const {
6507 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
6508 MachineRegisterInfo &MRI = MF.getRegInfo();
6509
6510 if (!Root.isReg())
6511 return std::nullopt;
6512
6513 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6514 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
6515 return {{
6516 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
6517 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6518 }};
6519 }
6520
6521 CodeModel::Model CM = MF.getTarget().getCodeModel();
6522 // Check if we can fold in the ADD of small code model ADRP + ADD address.
6523 if (CM == CodeModel::Small) {
6524 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
6525 if (OpFns)
6526 return OpFns;
6527 }
6528
6529 if (isBaseWithConstantOffset(Root, MRI)) {
6530 MachineOperand &LHS = RootDef->getOperand(1);
6531 MachineOperand &RHS = RootDef->getOperand(2);
6532 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
6533 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
6534
6535 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
6536 unsigned Scale = Log2_32(Size);
6537 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
6538 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
6539 return {{
6540 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
6541 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6542 }};
6543
6544 return {{
6545 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
6546 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6547 }};
6548 }
6549 }
6550
6551 // Before falling back to our general case, check if the unscaled
6552 // instructions can handle this. If so, that's preferable.
6553 if (selectAddrModeUnscaled(Root, Size))
6554 return std::nullopt;
6555
6556 return {{
6557 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
6558 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6559 }};
6560 }
6561
6562 /// Given a shift instruction, return the correct shift type for that
6563 /// instruction.
6564 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
6565 switch (MI.getOpcode()) {
6566 default:
6567 return AArch64_AM::InvalidShiftExtend;
6568 case TargetOpcode::G_SHL:
6569 return AArch64_AM::LSL;
6570 case TargetOpcode::G_LSHR:
6571 return AArch64_AM::LSR;
6572 case TargetOpcode::G_ASHR:
6573 return AArch64_AM::ASR;
6574 case TargetOpcode::G_ROTR:
6575 return AArch64_AM::ROR;
6576 }
6577 }
6578
6579 /// Select a "shifted register" operand. If the value is not shifted, set the
6580 /// shift operand to a default value of "lsl 0".
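/// For example, feeding "%r = G_SHL %x, 3" into a G_ADD can produce
/// "add x0, x1, x2, lsl #3"; the rendered immediate packs the shift kind and
/// amount via AArch64_AM::getShifterImm.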
6581 InstructionSelector::ComplexRendererFns
6582 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
6583 bool AllowROR) const {
6584 if (!Root.isReg())
6585 return std::nullopt;
6586 MachineRegisterInfo &MRI =
6587 Root.getParent()->getParent()->getParent()->getRegInfo();
6588
6589 // Check if the operand is defined by an instruction which corresponds to
6590 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
6591 MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
6592 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
6593 if (ShType == AArch64_AM::InvalidShiftExtend)
6594 return std::nullopt;
6595 if (ShType == AArch64_AM::ROR && !AllowROR)
6596 return std::nullopt;
6597 if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
6598 return std::nullopt;
6599
6600 // Need an immediate on the RHS.
6601 MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
6602 auto Immed = getImmedFromMO(ShiftRHS);
6603 if (!Immed)
6604 return std::nullopt;
6605
6606 // We have something that we can fold. Fold in the shift's LHS and RHS into
6607 // the instruction.
6608 MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
6609 Register ShiftReg = ShiftLHS.getReg();
6610
6611 unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
6612 unsigned Val = *Immed & (NumBits - 1);
6613 unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
6614
6615 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
6616 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
6617 }
6618
6619 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
6620 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
6621 unsigned Opc = MI.getOpcode();
6622
6623 // Handle explicit extend instructions first.
6624 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
6625 unsigned Size;
6626 if (Opc == TargetOpcode::G_SEXT)
6627 Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6628 else
6629 Size = MI.getOperand(2).getImm();
6630 assert(Size != 64 && "Extend from 64 bits?");
6631 switch (Size) {
6632 case 8:
6633 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
6634 case 16:
6635 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
6636 case 32:
6637 return AArch64_AM::SXTW;
6638 default:
6639 return AArch64_AM::InvalidShiftExtend;
6640 }
6641 }
6642
6643 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
6644 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6645 assert(Size != 64 && "Extend from 64 bits?");
6646 switch (Size) {
6647 case 8:
6648 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
6649 case 16:
6650 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
6651 case 32:
6652 return AArch64_AM::UXTW;
6653 default:
6654 return AArch64_AM::InvalidShiftExtend;
6655 }
6656 }
6657
6658 // Don't have an explicit extend. Try to handle a G_AND with a constant mask
6659 // on the RHS.
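  // E.g. a mask of 0xFF acts like uxtb, 0xFFFF like uxth, and 0xFFFFFFFF like
  // uxtw; for loads/stores only the uxtw form is accepted below.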
6660 if (Opc != TargetOpcode::G_AND)
6661 return AArch64_AM::InvalidShiftExtend;
6662
6663 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
6664 if (!MaybeAndMask)
6665 return AArch64_AM::InvalidShiftExtend;
6666 uint64_t AndMask = *MaybeAndMask;
6667 switch (AndMask) {
6668 default:
6669 return AArch64_AM::InvalidShiftExtend;
6670 case 0xFF:
6671 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
6672 case 0xFFFF:
6673 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
6674 case 0xFFFFFFFF:
6675 return AArch64_AM::UXTW;
6676 }
6677 }
6678
6679 Register AArch64InstructionSelector::moveScalarRegClass(
6680 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
6681 MachineRegisterInfo &MRI = *MIB.getMRI();
6682 auto Ty = MRI.getType(Reg);
6683 assert(!Ty.isVector() && "Expected scalars only!");
6684 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
6685 return Reg;
6686
6687 // Create a copy and immediately select it.
6688 // FIXME: We should have an emitCopy function?
6689 auto Copy = MIB.buildCopy({&RC}, {Reg});
6690 selectCopy(*Copy, TII, MRI, TRI, RBI);
6691 return Copy.getReg(0);
6692 }
6693
6694 /// Select an "extended register" operand. This operand folds in an extend
6695 /// followed by an optional left shift.
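/// For example, "%s = G_SEXT %w(s32); %r = G_SHL %s, 2" feeding an add can
/// select to "add x0, x1, w2, sxtw #2"; shift amounts above 4 are rejected
/// below because this extended-register form cannot encode them.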
6696 InstructionSelector::ComplexRendererFns
6697 AArch64InstructionSelector::selectArithExtendedRegister(
6698 MachineOperand &Root) const {
6699 if (!Root.isReg())
6700 return std::nullopt;
6701 MachineRegisterInfo &MRI =
6702 Root.getParent()->getParent()->getParent()->getRegInfo();
6703
6704 uint64_t ShiftVal = 0;
6705 Register ExtReg;
6706 AArch64_AM::ShiftExtendType Ext;
6707 MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
6708 if (!RootDef)
6709 return std::nullopt;
6710
6711 if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
6712 return std::nullopt;
6713
6714 // Check if we can fold a shift and an extend.
6715 if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6716 // Look for a constant on the RHS of the shift.
6717 MachineOperand &RHS = RootDef->getOperand(2);
6718 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6719 if (!MaybeShiftVal)
6720 return std::nullopt;
6721 ShiftVal = *MaybeShiftVal;
6722 if (ShiftVal > 4)
6723 return std::nullopt;
6724 // Look for a valid extend instruction on the LHS of the shift.
6725 MachineOperand &LHS = RootDef->getOperand(1);
6726 MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6727 if (!ExtDef)
6728 return std::nullopt;
6729 Ext = getExtendTypeForInst(*ExtDef, MRI);
6730 if (Ext == AArch64_AM::InvalidShiftExtend)
6731 return std::nullopt;
6732 ExtReg = ExtDef->getOperand(1).getReg();
6733 } else {
6734 // Didn't get a shift. Try just folding an extend.
6735 Ext = getExtendTypeForInst(*RootDef, MRI);
6736 if (Ext == AArch64_AM::InvalidShiftExtend)
6737 return std::nullopt;
6738 ExtReg = RootDef->getOperand(1).getReg();
6739
6740 // If we have a 32 bit instruction which zeroes out the high half of a
6741 // register, we get an implicit zero extend for free. Check if we have one.
6742 // FIXME: We actually emit the extend right now even though we don't have
6743 // to.
6744 if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6745 MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6746 if (isDef32(*ExtInst))
6747 return std::nullopt;
6748 }
6749 }
6750
6751 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6752 // copy.
6753 MachineIRBuilder MIB(*RootDef);
6754 ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6755
6756 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6757 [=](MachineInstrBuilder &MIB) {
6758 MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6759 }}};
6760 }
6761
6762 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6763 const MachineInstr &MI,
6764 int OpIdx) const {
6765 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6766 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6767 "Expected G_CONSTANT");
6768 std::optional<int64_t> CstVal =
6769 getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6770 assert(CstVal && "Expected constant value");
6771 MIB.addImm(*CstVal);
6772 }
6773
6774 void AArch64InstructionSelector::renderLogicalImm32(
6775 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6776 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6777 "Expected G_CONSTANT");
6778 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6779 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6780 MIB.addImm(Enc);
6781 }
6782
6783 void AArch64InstructionSelector::renderLogicalImm64(
6784 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6785 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6786 "Expected G_CONSTANT");
6787 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6788 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6789 MIB.addImm(Enc);
6790 }
6791
6792 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6793 const MachineInstr &MI,
6794 int OpIdx) const {
6795 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6796 "Expected G_FCONSTANT");
6797 MIB.addImm(
6798 AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6799 }
6800
6801 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6802 const MachineInstr &MI,
6803 int OpIdx) const {
6804 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6805 "Expected G_FCONSTANT");
6806 MIB.addImm(
6807 AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6808 }
6809
6810 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6811 const MachineInstr &MI,
6812 int OpIdx) const {
6813 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6814 "Expected G_FCONSTANT");
6815 MIB.addImm(
6816 AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6817 }
6818
6819 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
6820 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6821 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6822 "Expected G_FCONSTANT");
6823 MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
6824 .getFPImm()
6825 ->getValueAPF()
6826 .bitcastToAPInt()
6827 .getZExtValue()));
6828 }
6829
6830 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6831 const MachineInstr &MI, unsigned NumBytes) const {
6832 if (!MI.mayLoadOrStore())
6833 return false;
6834 assert(MI.hasOneMemOperand() &&
6835 "Expected load/store to have only one mem op!");
6836 return (*MI.memoperands_begin())->getSize() == NumBytes;
6837 }
6838
6839 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6840 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6841 if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6842 return false;
6843
6844 // Only return true if we know the operation will zero-out the high half of
6845 // the 64-bit register. Truncates can be subregister copies, which don't
6846 // zero out the high bits. Copies and other copy-like instructions can be
6847 // fed by truncates, or could be lowered as subregister copies.
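  // E.g. a 32-bit G_ADD selects to a W-register write, which implicitly
  // zeroes bits [63:32], whereas a COPY or G_TRUNC may lower to a plain
  // subregister copy that leaves the high half untouched.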
6848 switch (MI.getOpcode()) {
6849 default:
6850 return true;
6851 case TargetOpcode::COPY:
6852 case TargetOpcode::G_BITCAST:
6853 case TargetOpcode::G_TRUNC:
6854 case TargetOpcode::G_PHI:
6855 return false;
6856 }
6857 }
6858
6859
6860 // Perform fixups on the given PHI instruction's operands to force them all
6861 // to be the same as the destination regbank.
6862 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
6863 const AArch64RegisterBankInfo &RBI) {
6864 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
6865 Register DstReg = MI.getOperand(0).getReg();
6866 const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
6867 assert(DstRB && "Expected PHI dst to have regbank assigned");
6868 MachineIRBuilder MIB(MI);
6869
6870 // Go through each operand and ensure it has the same regbank.
6871 for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
6872 if (!MO.isReg())
6873 continue;
6874 Register OpReg = MO.getReg();
6875 const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
6876 if (RB != DstRB) {
6877 // Insert a cross-bank copy.
6878 auto *OpDef = MRI.getVRegDef(OpReg);
6879 const LLT &Ty = MRI.getType(OpReg);
6880 MachineBasicBlock &OpDefBB = *OpDef->getParent();
6881
6882 // Any instruction we insert must appear after all PHIs in the block
6883 // for the block to be valid MIR.
6884 MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
6885 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
6886 InsertPt = OpDefBB.getFirstNonPHI();
6887 MIB.setInsertPt(*OpDef->getParent(), InsertPt);
6888 auto Copy = MIB.buildCopy(Ty, OpReg);
6889 MRI.setRegBank(Copy.getReg(0), *DstRB);
6890 MO.setReg(Copy.getReg(0));
6891 }
6892 }
6893 }
6894
6895 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6896 // We're looking for PHIs, build a list so we don't invalidate iterators.
6897 MachineRegisterInfo &MRI = MF.getRegInfo();
6898 SmallVector<MachineInstr *, 32> Phis;
6899 for (auto &BB : MF) {
6900 for (auto &MI : BB) {
6901 if (MI.getOpcode() == TargetOpcode::G_PHI)
6902 Phis.emplace_back(&MI);
6903 }
6904 }
6905
6906 for (auto *MI : Phis) {
6907 // We need to do some work here if the operand types are < 16 bit and they
6908 // are split across fpr/gpr banks. Since all types <32b on gpr
6909 // end up being assigned gpr32 regclasses, we can end up with PHIs here
6910 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6911     // be selecting heterogeneous regbanks for operands if possible, but we
6912 // still need to be able to deal with it here.
6913 //
6914 // To fix this, if we have a gpr-bank operand < 32b in size and at least
6915 // one other operand is on the fpr bank, then we add cross-bank copies
6916 // to homogenize the operand banks. For simplicity the bank that we choose
6917 // to settle on is whatever bank the def operand has. For example:
6918 //
6919 // %endbb:
6920 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6921 // =>
6922 // %bb2:
6923 // ...
6924 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6925 // ...
6926 // %endbb:
6927 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6928 bool HasGPROp = false, HasFPROp = false;
6929 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
6930 if (!MO.isReg())
6931 continue;
6932 const LLT &Ty = MRI.getType(MO.getReg());
6933 if (!Ty.isValid() || !Ty.isScalar())
6934 break;
6935 if (Ty.getSizeInBits() >= 32)
6936 break;
6937 const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6938       // If for some reason we don't have a regbank yet, don't try anything.
6939 if (!RB)
6940 break;
6941
6942 if (RB->getID() == AArch64::GPRRegBankID)
6943 HasGPROp = true;
6944 else
6945 HasFPROp = true;
6946 }
6947     // We have heterogeneous regbanks, so fix them up.
6948 if (HasGPROp && HasFPROp)
6949 fixupPHIOpBanks(*MI, MRI, RBI);
6950 }
6951 }
6952
6953 namespace llvm {
6954 InstructionSelector *
6955 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6956 AArch64Subtarget &Subtarget,
6957 AArch64RegisterBankInfo &RBI) {
6958 return new AArch64InstructionSelector(TM, Subtarget, RBI);
6959 }
6960 }
6961