1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/ObjCARCUtil.h"
33 #include "llvm/Analysis/VectorUtils.h"
34 #include "llvm/CodeGen/Analysis.h"
35 #include "llvm/CodeGen/CallingConvLower.h"
36 #include "llvm/CodeGen/MachineBasicBlock.h"
37 #include "llvm/CodeGen/MachineFrameInfo.h"
38 #include "llvm/CodeGen/MachineFunction.h"
39 #include "llvm/CodeGen/MachineInstr.h"
40 #include "llvm/CodeGen/MachineInstrBuilder.h"
41 #include "llvm/CodeGen/MachineMemOperand.h"
42 #include "llvm/CodeGen/MachineRegisterInfo.h"
43 #include "llvm/CodeGen/RuntimeLibcalls.h"
44 #include "llvm/CodeGen/SelectionDAG.h"
45 #include "llvm/CodeGen/SelectionDAGNodes.h"
46 #include "llvm/CodeGen/TargetCallingConv.h"
47 #include "llvm/CodeGen/TargetInstrInfo.h"
48 #include "llvm/CodeGen/ValueTypes.h"
49 #include "llvm/IR/Attributes.h"
50 #include "llvm/IR/Constants.h"
51 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/DebugLoc.h"
53 #include "llvm/IR/DerivedTypes.h"
54 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GetElementPtrTypeIterator.h"
56 #include "llvm/IR/GlobalValue.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/IntrinsicInst.h"
61 #include "llvm/IR/Intrinsics.h"
62 #include "llvm/IR/IntrinsicsAArch64.h"
63 #include "llvm/IR/Module.h"
64 #include "llvm/IR/OperandTraits.h"
65 #include "llvm/IR/PatternMatch.h"
66 #include "llvm/IR/Type.h"
67 #include "llvm/IR/Use.h"
68 #include "llvm/IR/Value.h"
69 #include "llvm/MC/MCRegisterInfo.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
72 #include "llvm/Support/CommandLine.h"
73 #include "llvm/Support/Compiler.h"
74 #include "llvm/Support/Debug.h"
75 #include "llvm/Support/ErrorHandling.h"
76 #include "llvm/Support/KnownBits.h"
77 #include "llvm/Support/MachineValueType.h"
78 #include "llvm/Support/MathExtras.h"
79 #include "llvm/Support/raw_ostream.h"
80 #include "llvm/Target/TargetMachine.h"
81 #include "llvm/Target/TargetOptions.h"
82 #include <algorithm>
83 #include <bitset>
84 #include <cassert>
85 #include <cctype>
86 #include <cstdint>
87 #include <cstdlib>
88 #include <iterator>
89 #include <limits>
90 #include <tuple>
91 #include <utility>
92 #include <vector>
93 
94 using namespace llvm;
95 using namespace llvm::PatternMatch;
96 
97 #define DEBUG_TYPE "aarch64-lower"
98 
99 STATISTIC(NumTailCalls, "Number of tail calls");
100 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
101 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
102 
103 // FIXME: The necessary dtprel relocations don't seem to be supported
104 // well in the GNU bfd and gold linkers at the moment. Therefore, by
105 // default, for now, fall back to GeneralDynamic code generation.
106 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
107     "aarch64-elf-ldtls-generation", cl::Hidden,
108     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
109     cl::init(false));
110 
111 static cl::opt<bool>
112 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
113                          cl::desc("Enable AArch64 logical imm instruction "
114                                   "optimization"),
115                          cl::init(true));
116 
117 // Temporary option added for the purpose of testing functionality added
118 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
119 // in future when both implementations will be based off MGATHER rather
120 // than the GLD1 nodes added for the SVE gather load intrinsics.
121 static cl::opt<bool>
122 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
123                                 cl::desc("Combine extends of AArch64 masked "
124                                          "gather intrinsics"),
125                                 cl::init(true));
126 
127 /// Value type used for condition codes.
128 static const MVT MVT_CC = MVT::i32;
129 
130 static inline EVT getPackedSVEVectorVT(EVT VT) {
131   switch (VT.getSimpleVT().SimpleTy) {
132   default:
133     llvm_unreachable("unexpected element type for vector");
134   case MVT::i8:
135     return MVT::nxv16i8;
136   case MVT::i16:
137     return MVT::nxv8i16;
138   case MVT::i32:
139     return MVT::nxv4i32;
140   case MVT::i64:
141     return MVT::nxv2i64;
142   case MVT::f16:
143     return MVT::nxv8f16;
144   case MVT::f32:
145     return MVT::nxv4f32;
146   case MVT::f64:
147     return MVT::nxv2f64;
148   case MVT::bf16:
149     return MVT::nxv8bf16;
150   }
151 }
152 
153 // NOTE: Currently there's only a need to return integer vector types. If this
154 // changes then just add an extra "type" parameter.
155 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
156   switch (EC.getKnownMinValue()) {
157   default:
158     llvm_unreachable("unexpected element count for vector");
159   case 16:
160     return MVT::nxv16i8;
161   case 8:
162     return MVT::nxv8i16;
163   case 4:
164     return MVT::nxv4i32;
165   case 2:
166     return MVT::nxv2i64;
167   }
168 }
169 
170 static inline EVT getPromotedVTForPredicate(EVT VT) {
171   assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
172          "Expected scalable predicate vector type!");
173   switch (VT.getVectorMinNumElements()) {
174   default:
175     llvm_unreachable("unexpected element count for vector");
176   case 2:
177     return MVT::nxv2i64;
178   case 4:
179     return MVT::nxv4i32;
180   case 8:
181     return MVT::nxv8i16;
182   case 16:
183     return MVT::nxv16i8;
184   }
185 }
186 
187 /// Returns true if VT's elements occupy the lowest bit positions of its
188 /// associated register class without any intervening space.
189 ///
190 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
191 /// same register class, but only nxv8f16 can be treated as a packed vector.
192 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
193   assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
194          "Expected legal vector type!");
195   return VT.isFixedLengthVector() ||
196          VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
197 }
198 
199 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
200 // predicate and end with a passthru value matching the result type.
201 static bool isMergePassthruOpcode(unsigned Opc) {
202   switch (Opc) {
203   default:
204     return false;
205   case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
206   case AArch64ISD::BSWAP_MERGE_PASSTHRU:
207   case AArch64ISD::CTLZ_MERGE_PASSTHRU:
208   case AArch64ISD::CTPOP_MERGE_PASSTHRU:
209   case AArch64ISD::DUP_MERGE_PASSTHRU:
210   case AArch64ISD::ABS_MERGE_PASSTHRU:
211   case AArch64ISD::NEG_MERGE_PASSTHRU:
212   case AArch64ISD::FNEG_MERGE_PASSTHRU:
213   case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
214   case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
215   case AArch64ISD::FCEIL_MERGE_PASSTHRU:
216   case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
217   case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
218   case AArch64ISD::FRINT_MERGE_PASSTHRU:
219   case AArch64ISD::FROUND_MERGE_PASSTHRU:
220   case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
221   case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
222   case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
223   case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
224   case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
225   case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
226   case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
227   case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
228   case AArch64ISD::FSQRT_MERGE_PASSTHRU:
229   case AArch64ISD::FRECPX_MERGE_PASSTHRU:
230   case AArch64ISD::FABS_MERGE_PASSTHRU:
231     return true;
232   }
233 }
234 
235 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
236                                              const AArch64Subtarget &STI)
237     : TargetLowering(TM), Subtarget(&STI) {
238   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
239   // we have to make something up. Arbitrarily, choose ZeroOrOne.
240   setBooleanContents(ZeroOrOneBooleanContent);
241   // When comparing vectors the result sets the different elements in the
242   // vector to all-one or all-zero.
243   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
244 
245   // Set up the register classes.
246   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
247   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
248 
249   if (Subtarget->hasLS64()) {
250     addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
251     setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
252     setOperationAction(ISD::STORE, MVT::i64x8, Custom);
253   }
254 
255   if (Subtarget->hasFPARMv8()) {
256     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
257     addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
258     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
259     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
260     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
261   }
262 
263   if (Subtarget->hasNEON()) {
264     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
265     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
266     // Someone set us up the NEON.
267     addDRTypeForNEON(MVT::v2f32);
268     addDRTypeForNEON(MVT::v8i8);
269     addDRTypeForNEON(MVT::v4i16);
270     addDRTypeForNEON(MVT::v2i32);
271     addDRTypeForNEON(MVT::v1i64);
272     addDRTypeForNEON(MVT::v1f64);
273     addDRTypeForNEON(MVT::v4f16);
274     if (Subtarget->hasBF16())
275       addDRTypeForNEON(MVT::v4bf16);
276 
277     addQRTypeForNEON(MVT::v4f32);
278     addQRTypeForNEON(MVT::v2f64);
279     addQRTypeForNEON(MVT::v16i8);
280     addQRTypeForNEON(MVT::v8i16);
281     addQRTypeForNEON(MVT::v4i32);
282     addQRTypeForNEON(MVT::v2i64);
283     addQRTypeForNEON(MVT::v8f16);
284     if (Subtarget->hasBF16())
285       addQRTypeForNEON(MVT::v8bf16);
286   }
287 
288   if (Subtarget->hasSVE()) {
289     // Add legal sve predicate types
290     addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
291     addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
292     addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
293     addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
294 
295     // Add legal sve data types
296     addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
297     addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
298     addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
299     addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
300 
301     addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
302     addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
303     addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
304     addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
305     addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
306     addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
307 
308     if (Subtarget->hasBF16()) {
309       addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
310       addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
311       addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
312     }
313 
314     if (Subtarget->useSVEForFixedLengthVectors()) {
315       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
316         if (useSVEForFixedLengthVectorVT(VT))
317           addRegisterClass(VT, &AArch64::ZPRRegClass);
318 
319       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
320         if (useSVEForFixedLengthVectorVT(VT))
321           addRegisterClass(VT, &AArch64::ZPRRegClass);
322     }
323 
324     for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
325       setOperationAction(ISD::SADDSAT, VT, Legal);
326       setOperationAction(ISD::UADDSAT, VT, Legal);
327       setOperationAction(ISD::SSUBSAT, VT, Legal);
328       setOperationAction(ISD::USUBSAT, VT, Legal);
329       setOperationAction(ISD::UREM, VT, Expand);
330       setOperationAction(ISD::SREM, VT, Expand);
331       setOperationAction(ISD::SDIVREM, VT, Expand);
332       setOperationAction(ISD::UDIVREM, VT, Expand);
333     }
334 
335     for (auto VT :
336          { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
337            MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
338       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
339 
340     for (auto VT :
341          { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
342            MVT::nxv2f64 }) {
343       setCondCodeAction(ISD::SETO, VT, Expand);
344       setCondCodeAction(ISD::SETOLT, VT, Expand);
345       setCondCodeAction(ISD::SETLT, VT, Expand);
346       setCondCodeAction(ISD::SETOLE, VT, Expand);
347       setCondCodeAction(ISD::SETLE, VT, Expand);
348       setCondCodeAction(ISD::SETULT, VT, Expand);
349       setCondCodeAction(ISD::SETULE, VT, Expand);
350       setCondCodeAction(ISD::SETUGE, VT, Expand);
351       setCondCodeAction(ISD::SETUGT, VT, Expand);
352       setCondCodeAction(ISD::SETUEQ, VT, Expand);
353       setCondCodeAction(ISD::SETUNE, VT, Expand);
354 
355       setOperationAction(ISD::FREM, VT, Expand);
356       setOperationAction(ISD::FPOW, VT, Expand);
357       setOperationAction(ISD::FPOWI, VT, Expand);
358       setOperationAction(ISD::FCOS, VT, Expand);
359       setOperationAction(ISD::FSIN, VT, Expand);
360       setOperationAction(ISD::FSINCOS, VT, Expand);
361       setOperationAction(ISD::FEXP, VT, Expand);
362       setOperationAction(ISD::FEXP2, VT, Expand);
363       setOperationAction(ISD::FLOG, VT, Expand);
364       setOperationAction(ISD::FLOG2, VT, Expand);
365       setOperationAction(ISD::FLOG10, VT, Expand);
366     }
367   }
368 
369   // Compute derived properties from the register classes
370   computeRegisterProperties(Subtarget->getRegisterInfo());
371 
372   // Provide all sorts of operation actions
373   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
374   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
375   setOperationAction(ISD::SETCC, MVT::i32, Custom);
376   setOperationAction(ISD::SETCC, MVT::i64, Custom);
377   setOperationAction(ISD::SETCC, MVT::f16, Custom);
378   setOperationAction(ISD::SETCC, MVT::f32, Custom);
379   setOperationAction(ISD::SETCC, MVT::f64, Custom);
380   setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
381   setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
382   setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
383   setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
384   setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
385   setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
386   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
387   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
388   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
389   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
390   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
391   setOperationAction(ISD::BR_CC, MVT::f16, Custom);
392   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
393   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
394   setOperationAction(ISD::SELECT, MVT::i32, Custom);
395   setOperationAction(ISD::SELECT, MVT::i64, Custom);
396   setOperationAction(ISD::SELECT, MVT::f16, Custom);
397   setOperationAction(ISD::SELECT, MVT::f32, Custom);
398   setOperationAction(ISD::SELECT, MVT::f64, Custom);
399   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
400   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
401   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
402   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
403   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
404   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
405   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
406 
407   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
408   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
409   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
410 
411   setOperationAction(ISD::FREM, MVT::f32, Expand);
412   setOperationAction(ISD::FREM, MVT::f64, Expand);
413   setOperationAction(ISD::FREM, MVT::f80, Expand);
414 
415   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
416 
417   // Custom lowering hooks are needed for XOR
418   // to fold it into CSINC/CSINV.
419   setOperationAction(ISD::XOR, MVT::i32, Custom);
420   setOperationAction(ISD::XOR, MVT::i64, Custom);
421 
422   // Virtually no operation on f128 is legal, but LLVM can't expand them when
423   // there's a valid register class, so we need custom operations in most cases.
424   setOperationAction(ISD::FABS, MVT::f128, Expand);
425   setOperationAction(ISD::FADD, MVT::f128, LibCall);
426   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
427   setOperationAction(ISD::FCOS, MVT::f128, Expand);
428   setOperationAction(ISD::FDIV, MVT::f128, LibCall);
429   setOperationAction(ISD::FMA, MVT::f128, Expand);
430   setOperationAction(ISD::FMUL, MVT::f128, LibCall);
431   setOperationAction(ISD::FNEG, MVT::f128, Expand);
432   setOperationAction(ISD::FPOW, MVT::f128, Expand);
433   setOperationAction(ISD::FREM, MVT::f128, Expand);
434   setOperationAction(ISD::FRINT, MVT::f128, Expand);
435   setOperationAction(ISD::FSIN, MVT::f128, Expand);
436   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
437   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
438   setOperationAction(ISD::FSUB, MVT::f128, LibCall);
439   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
440   setOperationAction(ISD::SETCC, MVT::f128, Custom);
441   setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
442   setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
443   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
444   setOperationAction(ISD::SELECT, MVT::f128, Custom);
445   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
446   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
447 
448   // Lowering for many of the conversions is actually specified by the non-f128
449   // type. The LowerXXX function will be trivial when f128 isn't involved.
450   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
451   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
452   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
453   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
454   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
455   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
456   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
457   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
458   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
459   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
460   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
461   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
462   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
463   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
464   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
465   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
466   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
467   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
468   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
469   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
470   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
471   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
472   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
473   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
474   setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
475   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
476   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
477   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
478   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
479   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
480 
481   setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
482   setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
483   setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
484   setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
485 
486   // Variable arguments.
487   setOperationAction(ISD::VASTART, MVT::Other, Custom);
488   setOperationAction(ISD::VAARG, MVT::Other, Custom);
489   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
490   setOperationAction(ISD::VAEND, MVT::Other, Expand);
491 
492   // Variable-sized objects.
493   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
494   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
495 
496   if (Subtarget->isTargetWindows())
497     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
498   else
499     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
500 
501   // Constant pool entries
502   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
503 
504   // BlockAddress
505   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
506 
507   // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
508   setOperationAction(ISD::ADDC, MVT::i32, Custom);
509   setOperationAction(ISD::ADDE, MVT::i32, Custom);
510   setOperationAction(ISD::SUBC, MVT::i32, Custom);
511   setOperationAction(ISD::SUBE, MVT::i32, Custom);
512   setOperationAction(ISD::ADDC, MVT::i64, Custom);
513   setOperationAction(ISD::ADDE, MVT::i64, Custom);
514   setOperationAction(ISD::SUBC, MVT::i64, Custom);
515   setOperationAction(ISD::SUBE, MVT::i64, Custom);
516 
517   // AArch64 lacks both left-rotate and popcount instructions.
518   setOperationAction(ISD::ROTL, MVT::i32, Expand);
519   setOperationAction(ISD::ROTL, MVT::i64, Expand);
520   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
521     setOperationAction(ISD::ROTL, VT, Expand);
522     setOperationAction(ISD::ROTR, VT, Expand);
523   }
524 
525   // AArch64 doesn't have i32 MULH{S|U}.
526   setOperationAction(ISD::MULHU, MVT::i32, Expand);
527   setOperationAction(ISD::MULHS, MVT::i32, Expand);
528 
529   // AArch64 doesn't have {U|S}MUL_LOHI.
530   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
531   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
532 
533   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
534   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
535   setOperationAction(ISD::CTPOP, MVT::i128, Custom);
536 
537   setOperationAction(ISD::ABS, MVT::i32, Custom);
538   setOperationAction(ISD::ABS, MVT::i64, Custom);
539 
540   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
541   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
542   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
543     setOperationAction(ISD::SDIVREM, VT, Expand);
544     setOperationAction(ISD::UDIVREM, VT, Expand);
545   }
546   setOperationAction(ISD::SREM, MVT::i32, Expand);
547   setOperationAction(ISD::SREM, MVT::i64, Expand);
548   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
549   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
550   setOperationAction(ISD::UREM, MVT::i32, Expand);
551   setOperationAction(ISD::UREM, MVT::i64, Expand);
552 
553   // Custom lower Add/Sub/Mul with overflow.
554   setOperationAction(ISD::SADDO, MVT::i32, Custom);
555   setOperationAction(ISD::SADDO, MVT::i64, Custom);
556   setOperationAction(ISD::UADDO, MVT::i32, Custom);
557   setOperationAction(ISD::UADDO, MVT::i64, Custom);
558   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
559   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
560   setOperationAction(ISD::USUBO, MVT::i32, Custom);
561   setOperationAction(ISD::USUBO, MVT::i64, Custom);
562   setOperationAction(ISD::SMULO, MVT::i32, Custom);
563   setOperationAction(ISD::SMULO, MVT::i64, Custom);
564   setOperationAction(ISD::UMULO, MVT::i32, Custom);
565   setOperationAction(ISD::UMULO, MVT::i64, Custom);
566 
567   setOperationAction(ISD::FSIN, MVT::f32, Expand);
568   setOperationAction(ISD::FSIN, MVT::f64, Expand);
569   setOperationAction(ISD::FCOS, MVT::f32, Expand);
570   setOperationAction(ISD::FCOS, MVT::f64, Expand);
571   setOperationAction(ISD::FPOW, MVT::f32, Expand);
572   setOperationAction(ISD::FPOW, MVT::f64, Expand);
573   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
574   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
575   if (Subtarget->hasFullFP16())
576     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
577   else
578     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
579 
580   setOperationAction(ISD::FREM,    MVT::f16,   Promote);
581   setOperationAction(ISD::FREM,    MVT::v4f16, Expand);
582   setOperationAction(ISD::FREM,    MVT::v8f16, Expand);
583   setOperationAction(ISD::FPOW,    MVT::f16,   Promote);
584   setOperationAction(ISD::FPOW,    MVT::v4f16, Expand);
585   setOperationAction(ISD::FPOW,    MVT::v8f16, Expand);
586   setOperationAction(ISD::FPOWI,   MVT::f16,   Promote);
587   setOperationAction(ISD::FPOWI,   MVT::v4f16, Expand);
588   setOperationAction(ISD::FPOWI,   MVT::v8f16, Expand);
589   setOperationAction(ISD::FCOS,    MVT::f16,   Promote);
590   setOperationAction(ISD::FCOS,    MVT::v4f16, Expand);
591   setOperationAction(ISD::FCOS,    MVT::v8f16, Expand);
592   setOperationAction(ISD::FSIN,    MVT::f16,   Promote);
593   setOperationAction(ISD::FSIN,    MVT::v4f16, Expand);
594   setOperationAction(ISD::FSIN,    MVT::v8f16, Expand);
595   setOperationAction(ISD::FSINCOS, MVT::f16,   Promote);
596   setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
597   setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
598   setOperationAction(ISD::FEXP,    MVT::f16,   Promote);
599   setOperationAction(ISD::FEXP,    MVT::v4f16, Expand);
600   setOperationAction(ISD::FEXP,    MVT::v8f16, Expand);
601   setOperationAction(ISD::FEXP2,   MVT::f16,   Promote);
602   setOperationAction(ISD::FEXP2,   MVT::v4f16, Expand);
603   setOperationAction(ISD::FEXP2,   MVT::v8f16, Expand);
604   setOperationAction(ISD::FLOG,    MVT::f16,   Promote);
605   setOperationAction(ISD::FLOG,    MVT::v4f16, Expand);
606   setOperationAction(ISD::FLOG,    MVT::v8f16, Expand);
607   setOperationAction(ISD::FLOG2,   MVT::f16,   Promote);
608   setOperationAction(ISD::FLOG2,   MVT::v4f16, Expand);
609   setOperationAction(ISD::FLOG2,   MVT::v8f16, Expand);
610   setOperationAction(ISD::FLOG10,  MVT::f16,   Promote);
611   setOperationAction(ISD::FLOG10,  MVT::v4f16, Expand);
612   setOperationAction(ISD::FLOG10,  MVT::v8f16, Expand);
613 
614   if (!Subtarget->hasFullFP16()) {
615     setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
616     setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
617     setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
618     setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
619     setOperationAction(ISD::FADD,        MVT::f16,  Promote);
620     setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
621     setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
622     setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
623     setOperationAction(ISD::FMA,         MVT::f16,  Promote);
624     setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
625     setOperationAction(ISD::FABS,        MVT::f16,  Promote);
626     setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
627     setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
628     setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
629     setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
630     setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
631     setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
632     setOperationAction(ISD::FROUNDEVEN,  MVT::f16,  Promote);
633     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
634     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
635     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
636     setOperationAction(ISD::FMINIMUM,    MVT::f16,  Promote);
637     setOperationAction(ISD::FMAXIMUM,    MVT::f16,  Promote);
638 
639     // promote v4f16 to v4f32 when that is known to be safe.
640     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
641     setOperationAction(ISD::FSUB,        MVT::v4f16, Promote);
642     setOperationAction(ISD::FMUL,        MVT::v4f16, Promote);
643     setOperationAction(ISD::FDIV,        MVT::v4f16, Promote);
644     AddPromotedToType(ISD::FADD,         MVT::v4f16, MVT::v4f32);
645     AddPromotedToType(ISD::FSUB,         MVT::v4f16, MVT::v4f32);
646     AddPromotedToType(ISD::FMUL,         MVT::v4f16, MVT::v4f32);
647     AddPromotedToType(ISD::FDIV,         MVT::v4f16, MVT::v4f32);
648 
649     setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
650     setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
651     setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
652     setOperationAction(ISD::FROUNDEVEN,  MVT::v4f16, Expand);
653     setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
654     setOperationAction(ISD::SETCC,       MVT::v4f16, Expand);
655     setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
656     setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
657     setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
658     setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
659     setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
660     setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
661     setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
662     setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
663     setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
664     setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);
665 
666     setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
667     setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
668     setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
669     setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
670     setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
671     setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
672     setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
673     setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
674     setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
675     setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
676     setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
677     setOperationAction(ISD::FROUNDEVEN,  MVT::v8f16, Expand);
678     setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
679     setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
680     setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
681     setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
682     setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
683     setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
684     setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
685     setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
686     setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
687   }
688 
689   // AArch64 has implementations of a lot of rounding-like FP operations.
690   for (MVT Ty : {MVT::f32, MVT::f64}) {
691     setOperationAction(ISD::FFLOOR, Ty, Legal);
692     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
693     setOperationAction(ISD::FCEIL, Ty, Legal);
694     setOperationAction(ISD::FRINT, Ty, Legal);
695     setOperationAction(ISD::FTRUNC, Ty, Legal);
696     setOperationAction(ISD::FROUND, Ty, Legal);
697     setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
698     setOperationAction(ISD::FMINNUM, Ty, Legal);
699     setOperationAction(ISD::FMAXNUM, Ty, Legal);
700     setOperationAction(ISD::FMINIMUM, Ty, Legal);
701     setOperationAction(ISD::FMAXIMUM, Ty, Legal);
702     setOperationAction(ISD::LROUND, Ty, Legal);
703     setOperationAction(ISD::LLROUND, Ty, Legal);
704     setOperationAction(ISD::LRINT, Ty, Legal);
705     setOperationAction(ISD::LLRINT, Ty, Legal);
706   }
707 
708   if (Subtarget->hasFullFP16()) {
709     setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
710     setOperationAction(ISD::FFLOOR,  MVT::f16, Legal);
711     setOperationAction(ISD::FCEIL,   MVT::f16, Legal);
712     setOperationAction(ISD::FRINT,   MVT::f16, Legal);
713     setOperationAction(ISD::FTRUNC,  MVT::f16, Legal);
714     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
715     setOperationAction(ISD::FROUNDEVEN,  MVT::f16, Legal);
716     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
717     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
718     setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
719     setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
720   }
721 
722   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
723 
724   setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
725   setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
726 
727   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
728   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
729   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
730   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
731   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
732 
733   // Generate outline atomics library calls only if LSE was not specified for
734   // subtarget
735   if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
736     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
737     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
738     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
739     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
740     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
741     setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
742     setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
743     setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
744     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
745     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
746     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
747     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
748     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
749     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
750     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
751     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
752     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
753     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
754     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
755     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
756     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
757     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
758     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
759     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
760     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
761 #define LCALLNAMES(A, B, N)                                                    \
762   setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
763   setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
764   setLibcallName(A##N##_REL, #B #N "_rel");                                    \
765   setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
766 #define LCALLNAME4(A, B)                                                       \
767   LCALLNAMES(A, B, 1)                                                          \
768   LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
769 #define LCALLNAME5(A, B)                                                       \
770   LCALLNAMES(A, B, 1)                                                          \
771   LCALLNAMES(A, B, 2)                                                          \
772   LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
773     LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
774     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
775     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
776     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
777     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
778     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
779 #undef LCALLNAMES
780 #undef LCALLNAME4
781 #undef LCALLNAME5
782   }
783 
784   // 128-bit loads and stores can be done without expanding
785   setOperationAction(ISD::LOAD, MVT::i128, Custom);
786   setOperationAction(ISD::STORE, MVT::i128, Custom);
787 
788   // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
789   // custom lowering, as there are no un-paired non-temporal stores and
790   // legalization will break up 256 bit inputs.
791   setOperationAction(ISD::STORE, MVT::v32i8, Custom);
792   setOperationAction(ISD::STORE, MVT::v16i16, Custom);
793   setOperationAction(ISD::STORE, MVT::v16f16, Custom);
794   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
795   setOperationAction(ISD::STORE, MVT::v8f32, Custom);
796   setOperationAction(ISD::STORE, MVT::v4f64, Custom);
797   setOperationAction(ISD::STORE, MVT::v4i64, Custom);
798 
799   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
800   // This requires the Performance Monitors extension.
801   if (Subtarget->hasPerfMon())
802     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
803 
804   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
805       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
806     // Issue __sincos_stret if available.
807     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
808     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
809   } else {
810     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
811     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
812   }
813 
814   if (Subtarget->getTargetTriple().isOSMSVCRT()) {
815     // MSVCRT doesn't have powi; fall back to pow
816     setLibcallName(RTLIB::POWI_F32, nullptr);
817     setLibcallName(RTLIB::POWI_F64, nullptr);
818   }
819 
820   // Make floating-point constants legal for the large code model, so they don't
821   // become loads from the constant pool.
822   if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
823     setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
824     setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
825   }
826 
827   // AArch64 does not have floating-point extending loads, i1 sign-extending
828   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
829   for (MVT VT : MVT::fp_valuetypes()) {
830     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
831     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
832     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
833     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
834   }
835   for (MVT VT : MVT::integer_valuetypes())
836     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
837 
838   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
839   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
840   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
841   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
842   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
843   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
844   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
845 
846   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
847   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
848   setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
849 
850   // Indexed loads and stores are supported.
851   for (unsigned im = (unsigned)ISD::PRE_INC;
852        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
853     setIndexedLoadAction(im, MVT::i8, Legal);
854     setIndexedLoadAction(im, MVT::i16, Legal);
855     setIndexedLoadAction(im, MVT::i32, Legal);
856     setIndexedLoadAction(im, MVT::i64, Legal);
857     setIndexedLoadAction(im, MVT::f64, Legal);
858     setIndexedLoadAction(im, MVT::f32, Legal);
859     setIndexedLoadAction(im, MVT::f16, Legal);
860     setIndexedLoadAction(im, MVT::bf16, Legal);
861     setIndexedStoreAction(im, MVT::i8, Legal);
862     setIndexedStoreAction(im, MVT::i16, Legal);
863     setIndexedStoreAction(im, MVT::i32, Legal);
864     setIndexedStoreAction(im, MVT::i64, Legal);
865     setIndexedStoreAction(im, MVT::f64, Legal);
866     setIndexedStoreAction(im, MVT::f32, Legal);
867     setIndexedStoreAction(im, MVT::f16, Legal);
868     setIndexedStoreAction(im, MVT::bf16, Legal);
869   }
870 
871   // Trap.
872   setOperationAction(ISD::TRAP, MVT::Other, Legal);
873   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
874   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
875 
876   // We combine OR nodes for bitfield operations.
877   setTargetDAGCombine(ISD::OR);
878   // Try to create BICs for vector ANDs.
879   setTargetDAGCombine(ISD::AND);
880 
881   // Vector add and sub nodes may conceal a high-half opportunity.
882   // Also, try to fold ADD into CSINC/CSINV..
883   setTargetDAGCombine(ISD::ADD);
884   setTargetDAGCombine(ISD::ABS);
885   setTargetDAGCombine(ISD::SUB);
886   setTargetDAGCombine(ISD::SRL);
887   setTargetDAGCombine(ISD::XOR);
888   setTargetDAGCombine(ISD::SINT_TO_FP);
889   setTargetDAGCombine(ISD::UINT_TO_FP);
890 
891   // TODO: Do the same for FP_TO_*INT_SAT.
892   setTargetDAGCombine(ISD::FP_TO_SINT);
893   setTargetDAGCombine(ISD::FP_TO_UINT);
894   setTargetDAGCombine(ISD::FDIV);
895 
896   // Try and combine setcc with csel
897   setTargetDAGCombine(ISD::SETCC);
898 
899   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
900 
901   setTargetDAGCombine(ISD::ANY_EXTEND);
902   setTargetDAGCombine(ISD::ZERO_EXTEND);
903   setTargetDAGCombine(ISD::SIGN_EXTEND);
904   setTargetDAGCombine(ISD::VECTOR_SPLICE);
905   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
906   setTargetDAGCombine(ISD::TRUNCATE);
907   setTargetDAGCombine(ISD::CONCAT_VECTORS);
908   setTargetDAGCombine(ISD::STORE);
909   if (Subtarget->supportsAddressTopByteIgnored())
910     setTargetDAGCombine(ISD::LOAD);
911 
912   setTargetDAGCombine(ISD::MUL);
913 
914   setTargetDAGCombine(ISD::SELECT);
915   setTargetDAGCombine(ISD::VSELECT);
916 
917   setTargetDAGCombine(ISD::INTRINSIC_VOID);
918   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
919   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
920   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
921   setTargetDAGCombine(ISD::VECREDUCE_ADD);
922   setTargetDAGCombine(ISD::STEP_VECTOR);
923 
924   setTargetDAGCombine(ISD::GlobalAddress);
925 
926   // In case of strict alignment, avoid an excessive number of byte wide stores.
927   MaxStoresPerMemsetOptSize = 8;
928   MaxStoresPerMemset = Subtarget->requiresStrictAlign()
929                        ? MaxStoresPerMemsetOptSize : 32;
930 
931   MaxGluedStoresPerMemcpy = 4;
932   MaxStoresPerMemcpyOptSize = 4;
933   MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
934                        ? MaxStoresPerMemcpyOptSize : 16;
935 
936   MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
937 
938   MaxLoadsPerMemcmpOptSize = 4;
939   MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
940                       ? MaxLoadsPerMemcmpOptSize : 8;
941 
942   setStackPointerRegisterToSaveRestore(AArch64::SP);
943 
944   setSchedulingPreference(Sched::Hybrid);
945 
946   EnableExtLdPromotion = true;
947 
948   // Set required alignment.
949   setMinFunctionAlignment(Align(4));
950   // Set preferred alignments.
951   setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
952   setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
953 
954   // Only change the limit for entries in a jump table if specified by
955   // the sub target, but not at the command line.
956   unsigned MaxJT = STI.getMaximumJumpTableSize();
957   if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
958     setMaximumJumpTableSize(MaxJT);
959 
960   setHasExtractBitsInsn(true);
961 
962   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
963 
964   if (Subtarget->hasNEON()) {
965     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
966     // silliness like this:
967     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
968     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
969     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
970     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
971     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
972     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
973     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
974     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
975     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
976     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
977     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
978     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
979     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
980     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
981     setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand);
982     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
983     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
984     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
985     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
986     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
987     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
988     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
989     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
990     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
991     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
992     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
993 
994     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
995     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
996     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
997     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
998     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
999 
1000     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
1001 
1002     // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1003     // elements smaller than i32, so promote the input to i32 first.
1004     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1005     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1006     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1007     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1008     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1009     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1010 
1011     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1012     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1013     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1014     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
1015     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
1016     // Or, direct i32 -> f16 vector conversion.  Set it so custom, so the
1017     // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1018     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1019     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1020 
1021     if (Subtarget->hasFullFP16()) {
1022       setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1023       setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1024       setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1025       setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1026     } else {
1027       // when AArch64 doesn't have fullfp16 support, promote the input
1028       // to i32 first.
1029       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1030       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1031       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1032       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1033     }
1034 
1035     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
1036     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
1037     setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1038     setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1039     setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1040     setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1041     setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1042     setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1043 
1044     // AArch64 doesn't have MUL.2d:
1045     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
1046     // Custom handling for some quad-vector types to detect MULL.
1047     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1048     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1049     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1050 
1051     // Saturates
1052     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1053                     MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1054       setOperationAction(ISD::SADDSAT, VT, Legal);
1055       setOperationAction(ISD::UADDSAT, VT, Legal);
1056       setOperationAction(ISD::SSUBSAT, VT, Legal);
1057       setOperationAction(ISD::USUBSAT, VT, Legal);
1058     }
1059 
1060     for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1061                    MVT::v4i32}) {
1062       setOperationAction(ISD::ABDS, VT, Legal);
1063       setOperationAction(ISD::ABDU, VT, Legal);
1064     }
1065 
1066     // Vector reductions
1067     for (MVT VT : { MVT::v4f16, MVT::v2f32,
1068                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1069       if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1070         setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1071         setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1072 
1073         setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1074       }
1075     }
1076     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1077                     MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1078       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1079       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1080       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1081       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1082       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1083     }
1084     setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1085 
1086     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1087     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1088     // Likewise, narrowing and extending vector loads/stores aren't handled
1089     // directly.
1090     for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1091       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1092 
1093       if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1094         setOperationAction(ISD::MULHS, VT, Legal);
1095         setOperationAction(ISD::MULHU, VT, Legal);
1096       } else {
1097         setOperationAction(ISD::MULHS, VT, Expand);
1098         setOperationAction(ISD::MULHU, VT, Expand);
1099       }
1100       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1101       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1102 
1103       setOperationAction(ISD::BSWAP, VT, Expand);
1104       setOperationAction(ISD::CTTZ, VT, Expand);
1105 
1106       for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1107         setTruncStoreAction(VT, InnerVT, Expand);
1108         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1109         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1110         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1111       }
1112     }
1113 
1114     // AArch64 has implementations of a lot of rounding-like FP operations.
1115     for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1116       setOperationAction(ISD::FFLOOR, Ty, Legal);
1117       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
1118       setOperationAction(ISD::FCEIL, Ty, Legal);
1119       setOperationAction(ISD::FRINT, Ty, Legal);
1120       setOperationAction(ISD::FTRUNC, Ty, Legal);
1121       setOperationAction(ISD::FROUND, Ty, Legal);
1122       setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
1123     }
1124 
1125     if (Subtarget->hasFullFP16()) {
1126       for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1127         setOperationAction(ISD::FFLOOR, Ty, Legal);
1128         setOperationAction(ISD::FNEARBYINT, Ty, Legal);
1129         setOperationAction(ISD::FCEIL, Ty, Legal);
1130         setOperationAction(ISD::FRINT, Ty, Legal);
1131         setOperationAction(ISD::FTRUNC, Ty, Legal);
1132         setOperationAction(ISD::FROUND, Ty, Legal);
1133         setOperationAction(ISD::FROUNDEVEN, Ty, Legal);
1134       }
1135     }
1136 
1137     if (Subtarget->hasSVE())
1138       setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1139 
1140     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1141 
1142     setLoadExtAction(ISD::EXTLOAD,  MVT::v4i16, MVT::v4i8, Custom);
1143     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1144     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1145     setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
1146     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1147     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1148   }
1149 
1150   if (Subtarget->hasSVE()) {
1151     for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1152       setOperationAction(ISD::BITREVERSE, VT, Custom);
1153       setOperationAction(ISD::BSWAP, VT, Custom);
1154       setOperationAction(ISD::CTLZ, VT, Custom);
1155       setOperationAction(ISD::CTPOP, VT, Custom);
1156       setOperationAction(ISD::CTTZ, VT, Custom);
1157       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1158       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1159       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1160       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1161       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1162       setOperationAction(ISD::MGATHER, VT, Custom);
1163       setOperationAction(ISD::MSCATTER, VT, Custom);
1164       setOperationAction(ISD::MLOAD, VT, Custom);
1165       setOperationAction(ISD::MUL, VT, Custom);
1166       setOperationAction(ISD::MULHS, VT, Custom);
1167       setOperationAction(ISD::MULHU, VT, Custom);
1168       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1169       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1170       setOperationAction(ISD::SELECT, VT, Custom);
1171       setOperationAction(ISD::SETCC, VT, Custom);
1172       setOperationAction(ISD::SDIV, VT, Custom);
1173       setOperationAction(ISD::UDIV, VT, Custom);
1174       setOperationAction(ISD::SMIN, VT, Custom);
1175       setOperationAction(ISD::UMIN, VT, Custom);
1176       setOperationAction(ISD::SMAX, VT, Custom);
1177       setOperationAction(ISD::UMAX, VT, Custom);
1178       setOperationAction(ISD::SHL, VT, Custom);
1179       setOperationAction(ISD::SRL, VT, Custom);
1180       setOperationAction(ISD::SRA, VT, Custom);
1181       setOperationAction(ISD::ABS, VT, Custom);
1182       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1183       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1184       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1185       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1186       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1187       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1188       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1189       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1190 
1191       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1192       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1193       setOperationAction(ISD::SELECT_CC, VT, Expand);
1194       setOperationAction(ISD::ROTL, VT, Expand);
1195       setOperationAction(ISD::ROTR, VT, Expand);
1196     }
1197 
1198     // Illegal unpacked integer vector types.
1199     for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1200       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1201       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1202     }
1203 
1204     // Legalize unpacked bitcasts to REINTERPRET_CAST.
1205     for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1206                     MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1207       setOperationAction(ISD::BITCAST, VT, Custom);
1208 
1209     for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1210       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1211       setOperationAction(ISD::SELECT, VT, Custom);
1212       setOperationAction(ISD::SETCC, VT, Custom);
1213       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1214       setOperationAction(ISD::TRUNCATE, VT, Custom);
1215       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1216       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1217       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1218 
1219       setOperationAction(ISD::SELECT_CC, VT, Expand);
1220       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1221       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1222 
1223       // There are no legal MVT::nxv16f## based types.
1224       if (VT != MVT::nxv16i1) {
1225         setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1226         setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1227       }
1228     }
1229 
1230     // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1231     for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1232                     MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1233                     MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1234       setOperationAction(ISD::MLOAD, VT, Custom);
1235       setOperationAction(ISD::MSTORE, VT, Custom);
1236       setOperationAction(ISD::MGATHER, VT, Custom);
1237       setOperationAction(ISD::MSCATTER, VT, Custom);
1238     }
1239 
1240     for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
1241       for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
1242         // Avoid marking truncating FP stores as legal to prevent the
1243         // DAGCombiner from creating unsupported truncating stores.
1244         setTruncStoreAction(VT, InnerVT, Expand);
1245         // SVE does not have floating-point extending loads.
1246         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1247         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1248         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1249       }
1250     }
1251 
1252     // SVE supports truncating stores of 64 and 128-bit vectors
1253     setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1254     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1255     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1256     setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1257     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1258 
1259     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1260                     MVT::nxv4f32, MVT::nxv2f64}) {
1261       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1262       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1263       setOperationAction(ISD::MGATHER, VT, Custom);
1264       setOperationAction(ISD::MSCATTER, VT, Custom);
1265       setOperationAction(ISD::MLOAD, VT, Custom);
1266       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1267       setOperationAction(ISD::SELECT, VT, Custom);
1268       setOperationAction(ISD::FADD, VT, Custom);
1269       setOperationAction(ISD::FDIV, VT, Custom);
1270       setOperationAction(ISD::FMA, VT, Custom);
1271       setOperationAction(ISD::FMAXIMUM, VT, Custom);
1272       setOperationAction(ISD::FMAXNUM, VT, Custom);
1273       setOperationAction(ISD::FMINIMUM, VT, Custom);
1274       setOperationAction(ISD::FMINNUM, VT, Custom);
1275       setOperationAction(ISD::FMUL, VT, Custom);
1276       setOperationAction(ISD::FNEG, VT, Custom);
1277       setOperationAction(ISD::FSUB, VT, Custom);
1278       setOperationAction(ISD::FCEIL, VT, Custom);
1279       setOperationAction(ISD::FFLOOR, VT, Custom);
1280       setOperationAction(ISD::FNEARBYINT, VT, Custom);
1281       setOperationAction(ISD::FRINT, VT, Custom);
1282       setOperationAction(ISD::FROUND, VT, Custom);
1283       setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1284       setOperationAction(ISD::FTRUNC, VT, Custom);
1285       setOperationAction(ISD::FSQRT, VT, Custom);
1286       setOperationAction(ISD::FABS, VT, Custom);
1287       setOperationAction(ISD::FP_EXTEND, VT, Custom);
1288       setOperationAction(ISD::FP_ROUND, VT, Custom);
1289       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1290       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1291       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1292       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1293       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1294 
1295       setOperationAction(ISD::SELECT_CC, VT, Expand);
1296     }
1297 
1298     for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1299       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1300       setOperationAction(ISD::MGATHER, VT, Custom);
1301       setOperationAction(ISD::MSCATTER, VT, Custom);
1302       setOperationAction(ISD::MLOAD, VT, Custom);
1303     }
1304 
1305     setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
1306 
1307     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1308     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1309 
1310     // NOTE: Currently this has to happen after computeRegisterProperties rather
1311     // than the preferred option of combining it with the addRegisterClass call.
1312     if (Subtarget->useSVEForFixedLengthVectors()) {
1313       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1314         if (useSVEForFixedLengthVectorVT(VT))
1315           addTypeForFixedLengthSVE(VT);
1316       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1317         if (useSVEForFixedLengthVectorVT(VT))
1318           addTypeForFixedLengthSVE(VT);
1319 
      // 64-bit results can come from an input that is wider than NEON supports.
1321       for (auto VT : {MVT::v8i8, MVT::v4i16})
1322         setOperationAction(ISD::TRUNCATE, VT, Custom);
1323       setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1324 
      // 128-bit results imply an input that is wider than NEON supports.
1326       for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1327         setOperationAction(ISD::TRUNCATE, VT, Custom);
1328       for (auto VT : {MVT::v8f16, MVT::v4f32})
1329         setOperationAction(ISD::FP_ROUND, VT, Custom);
1330 
1331       // These operations are not supported on NEON but SVE can do them.
1332       setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1333       setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1334       setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1335       setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1336       setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1337       setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1338       setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1339       setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1340       setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1341       setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1342       setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
1343       setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
1344       setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
1345       setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
1346       setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
1347       setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
1348       setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
1349       setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
1350       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1351       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1352       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1353       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1354       setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
1355       setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
1356       setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
1357       setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
1358       setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
1359       setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
1360       setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
1361       setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
1362       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1363       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1364       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1365       setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1366       setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1367       setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1368       setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1369       setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1370 
1371       // Int operations with no NEON support.
1372       for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1373                       MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1374         setOperationAction(ISD::BITREVERSE, VT, Custom);
1375         setOperationAction(ISD::CTTZ, VT, Custom);
1376         setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1377         setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1378         setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1379       }
1380 
1381       // FP operations with no NEON support.
1382       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1383                       MVT::v1f64, MVT::v2f64})
1384         setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1385 
1386       // Use SVE for vectors with more than 2 elements.
1387       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1388         setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1389     }
1390 
1391     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1392     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1393     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1394     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1395   }
1396 
1397   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1398 }
1399 
1400 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1401   assert(VT.isVector() && "VT should be a vector type");
1402 
1403   if (VT.isFloatingPoint()) {
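    // Loads and stores of FP vectors are promoted to the equally sized integer
    // vector type; the bit pattern is unchanged, so only the integer
    // load/store handling is needed for these types.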
1404     MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1405     setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1406     setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1407   }
1408 
1409   // Mark vector float intrinsics as expand.
1410   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1411     setOperationAction(ISD::FSIN, VT, Expand);
1412     setOperationAction(ISD::FCOS, VT, Expand);
1413     setOperationAction(ISD::FPOW, VT, Expand);
1414     setOperationAction(ISD::FLOG, VT, Expand);
1415     setOperationAction(ISD::FLOG2, VT, Expand);
1416     setOperationAction(ISD::FLOG10, VT, Expand);
1417     setOperationAction(ISD::FEXP, VT, Expand);
1418     setOperationAction(ISD::FEXP2, VT, Expand);
1419   }
1420 
1421   // But we do support custom-lowering for FCOPYSIGN.
1422   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1423       ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1424     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1425 
1426   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1427   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1428   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1429   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1430   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1431   setOperationAction(ISD::SRA, VT, Custom);
1432   setOperationAction(ISD::SRL, VT, Custom);
1433   setOperationAction(ISD::SHL, VT, Custom);
1434   setOperationAction(ISD::OR, VT, Custom);
1435   setOperationAction(ISD::SETCC, VT, Custom);
1436   setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1437 
1438   setOperationAction(ISD::SELECT, VT, Expand);
1439   setOperationAction(ISD::SELECT_CC, VT, Expand);
1440   setOperationAction(ISD::VSELECT, VT, Expand);
1441   for (MVT InnerVT : MVT::all_valuetypes())
1442     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1443 
  // CNT only supports the B element size, so for wider element types we apply
  // CNT to the bytes and then use UADDLP to widen the result.
1445   if (VT != MVT::v8i8 && VT != MVT::v16i8)
1446     setOperationAction(ISD::CTPOP, VT, Custom);
1447 
1448   setOperationAction(ISD::UDIV, VT, Expand);
1449   setOperationAction(ISD::SDIV, VT, Expand);
1450   setOperationAction(ISD::UREM, VT, Expand);
1451   setOperationAction(ISD::SREM, VT, Expand);
1452   setOperationAction(ISD::FREM, VT, Expand);
1453 
1454   setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1455   setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1456 
1457   if (!VT.isFloatingPoint())
1458     setOperationAction(ISD::ABS, VT, Legal);
1459 
1460   // [SU][MIN|MAX] are available for all NEON types apart from i64.
1461   if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1462     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1463       setOperationAction(Opcode, VT, Legal);
1464 
1465   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1466   if (VT.isFloatingPoint() &&
1467       VT.getVectorElementType() != MVT::bf16 &&
1468       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1469     for (unsigned Opcode :
1470          {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
1471       setOperationAction(Opcode, VT, Legal);
1472 
1473   if (Subtarget->isLittleEndian()) {
1474     for (unsigned im = (unsigned)ISD::PRE_INC;
1475          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1476       setIndexedLoadAction(im, VT, Legal);
1477       setIndexedStoreAction(im, VT, Legal);
1478     }
1479   }
1480 }
1481 
1482 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1483   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1484 
1485   // By default everything must be expanded.
1486   for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1487     setOperationAction(Op, VT, Expand);
1488 
1489   // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1490   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1491 
1492   if (VT.isFloatingPoint()) {
1493     setCondCodeAction(ISD::SETO, VT, Expand);
1494     setCondCodeAction(ISD::SETOLT, VT, Expand);
1495     setCondCodeAction(ISD::SETLT, VT, Expand);
1496     setCondCodeAction(ISD::SETOLE, VT, Expand);
1497     setCondCodeAction(ISD::SETLE, VT, Expand);
1498     setCondCodeAction(ISD::SETULT, VT, Expand);
1499     setCondCodeAction(ISD::SETULE, VT, Expand);
1500     setCondCodeAction(ISD::SETUGE, VT, Expand);
1501     setCondCodeAction(ISD::SETUGT, VT, Expand);
1502     setCondCodeAction(ISD::SETUEQ, VT, Expand);
1503     setCondCodeAction(ISD::SETUNE, VT, Expand);
1504   }
1505 
1506   // Mark integer truncating stores as having custom lowering
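  // (e.g. for VT == MVT::v8i32 the loop below marks the v8i32 -> v8i8 and
  // v8i32 -> v8i16 truncating stores as Custom).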
1507   if (VT.isInteger()) {
1508     MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1509     while (InnerVT != VT) {
1510       setTruncStoreAction(VT, InnerVT, Custom);
1511       InnerVT = InnerVT.changeVectorElementType(
1512           MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1513     }
1514   }
1515 
1516   // Lower fixed length vector operations to scalable equivalents.
1517   setOperationAction(ISD::ABS, VT, Custom);
1518   setOperationAction(ISD::ADD, VT, Custom);
1519   setOperationAction(ISD::AND, VT, Custom);
1520   setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1521   setOperationAction(ISD::BITCAST, VT, Custom);
1522   setOperationAction(ISD::BITREVERSE, VT, Custom);
1523   setOperationAction(ISD::BSWAP, VT, Custom);
1524   setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1525   setOperationAction(ISD::CTLZ, VT, Custom);
1526   setOperationAction(ISD::CTPOP, VT, Custom);
1527   setOperationAction(ISD::CTTZ, VT, Custom);
1528   setOperationAction(ISD::FABS, VT, Custom);
1529   setOperationAction(ISD::FADD, VT, Custom);
1530   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1531   setOperationAction(ISD::FCEIL, VT, Custom);
1532   setOperationAction(ISD::FDIV, VT, Custom);
1533   setOperationAction(ISD::FFLOOR, VT, Custom);
1534   setOperationAction(ISD::FMA, VT, Custom);
1535   setOperationAction(ISD::FMAXIMUM, VT, Custom);
1536   setOperationAction(ISD::FMAXNUM, VT, Custom);
1537   setOperationAction(ISD::FMINIMUM, VT, Custom);
1538   setOperationAction(ISD::FMINNUM, VT, Custom);
1539   setOperationAction(ISD::FMUL, VT, Custom);
1540   setOperationAction(ISD::FNEARBYINT, VT, Custom);
1541   setOperationAction(ISD::FNEG, VT, Custom);
1542   setOperationAction(ISD::FP_EXTEND, VT, Custom);
1543   setOperationAction(ISD::FP_ROUND, VT, Custom);
1544   setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1545   setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1546   setOperationAction(ISD::FRINT, VT, Custom);
1547   setOperationAction(ISD::FROUND, VT, Custom);
1548   setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1549   setOperationAction(ISD::FSQRT, VT, Custom);
1550   setOperationAction(ISD::FSUB, VT, Custom);
1551   setOperationAction(ISD::FTRUNC, VT, Custom);
1552   setOperationAction(ISD::LOAD, VT, Custom);
1553   setOperationAction(ISD::MGATHER, VT, Custom);
1554   setOperationAction(ISD::MLOAD, VT, Custom);
1555   setOperationAction(ISD::MSCATTER, VT, Custom);
1556   setOperationAction(ISD::MSTORE, VT, Custom);
1557   setOperationAction(ISD::MUL, VT, Custom);
1558   setOperationAction(ISD::MULHS, VT, Custom);
1559   setOperationAction(ISD::MULHU, VT, Custom);
1560   setOperationAction(ISD::OR, VT, Custom);
1561   setOperationAction(ISD::SDIV, VT, Custom);
1562   setOperationAction(ISD::SELECT, VT, Custom);
1563   setOperationAction(ISD::SETCC, VT, Custom);
1564   setOperationAction(ISD::SHL, VT, Custom);
1565   setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1566   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1567   setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1568   setOperationAction(ISD::SMAX, VT, Custom);
1569   setOperationAction(ISD::SMIN, VT, Custom);
1570   setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1571   setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1572   setOperationAction(ISD::SRA, VT, Custom);
1573   setOperationAction(ISD::SRL, VT, Custom);
1574   setOperationAction(ISD::STORE, VT, Custom);
1575   setOperationAction(ISD::SUB, VT, Custom);
1576   setOperationAction(ISD::TRUNCATE, VT, Custom);
1577   setOperationAction(ISD::UDIV, VT, Custom);
1578   setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1579   setOperationAction(ISD::UMAX, VT, Custom);
1580   setOperationAction(ISD::UMIN, VT, Custom);
1581   setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1582   setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1583   setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1584   setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1585   setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1586   setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1587   setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1588   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1589   setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1590   setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1591   setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1592   setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1593   setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1594   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1595   setOperationAction(ISD::VSELECT, VT, Custom);
1596   setOperationAction(ISD::XOR, VT, Custom);
1597   setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1598 }
1599 
1600 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1601   addRegisterClass(VT, &AArch64::FPR64RegClass);
1602   addTypeForNEON(VT);
1603 }
1604 
1605 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1606   addRegisterClass(VT, &AArch64::FPR128RegClass);
1607   addTypeForNEON(VT);
1608 }
1609 
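// Scalar comparisons produce an i32 result. Scalable-vector comparisons
// produce an i1 predicate vector, while fixed-width vectors use an integer
// vector with the same shape as the operands.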
1610 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1611                                               LLVMContext &C, EVT VT) const {
1612   if (!VT.isVector())
1613     return MVT::i32;
1614   if (VT.isScalableVector())
1615     return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1616   return VT.changeVectorElementTypeToInteger();
1617 }
1618 
1619 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1620                                const APInt &Demanded,
1621                                TargetLowering::TargetLoweringOpt &TLO,
1622                                unsigned NewOpc) {
1623   uint64_t OldImm = Imm, NewImm, Enc;
1624   uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1625 
1626   // Return if the immediate is already all zeros, all ones, a bimm32 or a
1627   // bimm64.
1628   if (Imm == 0 || Imm == Mask ||
1629       AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1630     return false;
1631 
1632   unsigned EltSize = Size;
1633   uint64_t DemandedBits = Demanded.getZExtValue();
1634 
1635   // Clear bits that are not demanded.
1636   Imm &= DemandedBits;
1637 
1638   while (true) {
1639     // The goal here is to set the non-demanded bits in a way that minimizes
1640     // the number of switching between 0 and 1. In order to achieve this goal,
1641     // we set the non-demanded bits to the value of the preceding demanded bits.
1642     // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1643     // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1644     // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1645     // The final result is 0b11000011.
1646     uint64_t NonDemandedBits = ~DemandedBits;
1647     uint64_t InvertedImm = ~Imm & DemandedBits;
1648     uint64_t RotatedImm =
1649         ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1650         NonDemandedBits;
1651     uint64_t Sum = RotatedImm + NonDemandedBits;
1652     bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1653     uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1654     NewImm = (Imm | Ones) & Mask;
1655 
1656     // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1657     // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1658     // we halve the element size and continue the search.
1659     if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1660       break;
1661 
    // We cannot shrink the element size any further if it is 2 bits.
1663     if (EltSize == 2)
1664       return false;
1665 
1666     EltSize /= 2;
1667     Mask >>= EltSize;
1668     uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1669 
    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1671     if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1672       return false;
1673 
1674     // Merge the upper and lower halves of Imm and DemandedBits.
1675     Imm |= Hi;
1676     DemandedBits |= DemandedBitsHi;
1677   }
1678 
1679   ++NumOptimizedImms;
1680 
1681   // Replicate the element across the register width.
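  // (e.g. with Size == 64 and an 8-bit element value of 0xC3, NewImm becomes
  // 0xC3C3C3C3C3C3C3C3).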
1682   while (EltSize < Size) {
1683     NewImm |= NewImm << EltSize;
1684     EltSize *= 2;
1685   }
1686 
1687   (void)OldImm;
1688   assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1689          "demanded bits should never be altered");
1690   assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1691 
1692   // Create the new constant immediate node.
1693   EVT VT = Op.getValueType();
1694   SDLoc DL(Op);
1695   SDValue New;
1696 
1697   // If the new constant immediate is all-zeros or all-ones, let the target
1698   // independent DAG combine optimize this node.
1699   if (NewImm == 0 || NewImm == OrigMask) {
1700     New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1701                           TLO.DAG.getConstant(NewImm, DL, VT));
1702   // Otherwise, create a machine node so that target independent DAG combine
1703   // doesn't undo this optimization.
1704   } else {
1705     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1706     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1707     New = SDValue(
1708         TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1709   }
1710 
1711   return TLO.CombineTo(Op, New);
1712 }
1713 
1714 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1715     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1716     TargetLoweringOpt &TLO) const {
  // Delay this optimization until as late as possible.
1718   if (!TLO.LegalOps)
1719     return false;
1720 
1721   if (!EnableOptimizeLogicalImm)
1722     return false;
1723 
1724   EVT VT = Op.getValueType();
1725   if (VT.isVector())
1726     return false;
1727 
1728   unsigned Size = VT.getSizeInBits();
1729   assert((Size == 32 || Size == 64) &&
1730          "i32 or i64 is expected after legalization.");
1731 
1732   // Exit early if we demand all bits.
1733   if (DemandedBits.countPopulation() == Size)
1734     return false;
1735 
1736   unsigned NewOpc;
1737   switch (Op.getOpcode()) {
1738   default:
1739     return false;
1740   case ISD::AND:
1741     NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1742     break;
1743   case ISD::OR:
1744     NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1745     break;
1746   case ISD::XOR:
1747     NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1748     break;
1749   }
1750   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1751   if (!C)
1752     return false;
1753   uint64_t Imm = C->getZExtValue();
1754   return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1755 }
1756 
/// computeKnownBitsForTargetNode - Determine which bits of Op are known to be
/// either zero or one and return them in Known.
1759 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1760     const SDValue Op, KnownBits &Known,
1761     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1762   switch (Op.getOpcode()) {
1763   default:
1764     break;
1765   case AArch64ISD::CSEL: {
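    // CSEL produces one of its two inputs, so only bits that are known and
    // identical in both operands remain known in the result.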
1766     KnownBits Known2;
1767     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1768     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1769     Known = KnownBits::commonBits(Known, Known2);
1770     break;
1771   }
1772   case AArch64ISD::LOADgot:
1773   case AArch64ISD::ADDlow: {
1774     if (!Subtarget->isTargetILP32())
1775       break;
1776     // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1777     Known.Zero = APInt::getHighBitsSet(64, 32);
1778     break;
1779   }
1780   case ISD::INTRINSIC_W_CHAIN: {
1781     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1782     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1783     switch (IntID) {
1784     default: return;
1785     case Intrinsic::aarch64_ldaxr:
1786     case Intrinsic::aarch64_ldxr: {
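      // These exclusive loads zero-extend from the loaded memory width, so all
      // bits above MemBits are known to be zero.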
1787       unsigned BitWidth = Known.getBitWidth();
1788       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1789       unsigned MemBits = VT.getScalarSizeInBits();
1790       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1791       return;
1792     }
1793     }
1794     break;
1795   }
1796   case ISD::INTRINSIC_WO_CHAIN:
1797   case ISD::INTRINSIC_VOID: {
1798     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1799     switch (IntNo) {
1800     default:
1801       break;
1802     case Intrinsic::aarch64_neon_umaxv:
1803     case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero-extend the result, so we can mark all bits above the element
      // width as known zero. 32-bit or larger elements don't need this, as
      // those reductions produce legal types handled by isel directly.
1808       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1809       unsigned BitWidth = Known.getBitWidth();
1810       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1811         assert(BitWidth >= 8 && "Unexpected width!");
1812         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1813         Known.Zero |= Mask;
1814       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1815         assert(BitWidth >= 16 && "Unexpected width!");
1816         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1817         Known.Zero |= Mask;
1818       }
1819       break;
    }
1821     }
1822   }
1823   }
1824 }
1825 
1826 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1827                                                   EVT) const {
1828   return MVT::i64;
1829 }
1830 
1831 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1832     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1833     bool *Fast) const {
1834   if (Subtarget->requiresStrictAlign())
1835     return false;
1836 
1837   if (Fast) {
1838     // Some CPUs are fine with unaligned stores except for 128-bit ones.
1839     *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1840             // See comments in performSTORECombine() for more details about
1841             // these conditions.
1842 
1843             // Code that uses clang vector extensions can mark that it
1844             // wants unaligned accesses to be treated as fast by
1845             // underspecifying alignment to be 1 or 2.
1846             Alignment <= 2 ||
1847 
1848             // Disregard v2i64. Memcpy lowering produces those and splitting
1849             // them regresses performance on micro-benchmarks and olden/bh.
1850             VT == MVT::v2i64;
1851   }
1852   return true;
1853 }
1854 
1855 // Same as above but handling LLTs instead.
1856 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1857     LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1858     bool *Fast) const {
1859   if (Subtarget->requiresStrictAlign())
1860     return false;
1861 
1862   if (Fast) {
1863     // Some CPUs are fine with unaligned stores except for 128-bit ones.
1864     *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1865             Ty.getSizeInBytes() != 16 ||
1866             // See comments in performSTORECombine() for more details about
1867             // these conditions.
1868 
1869             // Code that uses clang vector extensions can mark that it
1870             // wants unaligned accesses to be treated as fast by
1871             // underspecifying alignment to be 1 or 2.
1872             Alignment <= 2 ||
1873 
1874             // Disregard v2i64. Memcpy lowering produces those and splitting
1875             // them regresses performance on micro-benchmarks and olden/bh.
1876             Ty == LLT::fixed_vector(2, 64);
1877   }
1878   return true;
1879 }
1880 
1881 FastISel *
1882 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1883                                       const TargetLibraryInfo *libInfo) const {
1884   return AArch64::createFastISel(funcInfo, libInfo);
1885 }
1886 
1887 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1888 #define MAKE_CASE(V)                                                           \
1889   case V:                                                                      \
1890     return #V;
1891   switch ((AArch64ISD::NodeType)Opcode) {
1892   case AArch64ISD::FIRST_NUMBER:
1893     break;
1894     MAKE_CASE(AArch64ISD::CALL)
1895     MAKE_CASE(AArch64ISD::ADRP)
1896     MAKE_CASE(AArch64ISD::ADR)
1897     MAKE_CASE(AArch64ISD::ADDlow)
1898     MAKE_CASE(AArch64ISD::LOADgot)
1899     MAKE_CASE(AArch64ISD::RET_FLAG)
1900     MAKE_CASE(AArch64ISD::BRCOND)
1901     MAKE_CASE(AArch64ISD::CSEL)
1902     MAKE_CASE(AArch64ISD::CSINV)
1903     MAKE_CASE(AArch64ISD::CSNEG)
1904     MAKE_CASE(AArch64ISD::CSINC)
1905     MAKE_CASE(AArch64ISD::THREAD_POINTER)
1906     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
1907     MAKE_CASE(AArch64ISD::ADD_PRED)
1908     MAKE_CASE(AArch64ISD::MUL_PRED)
1909     MAKE_CASE(AArch64ISD::MULHS_PRED)
1910     MAKE_CASE(AArch64ISD::MULHU_PRED)
1911     MAKE_CASE(AArch64ISD::SDIV_PRED)
1912     MAKE_CASE(AArch64ISD::SHL_PRED)
1913     MAKE_CASE(AArch64ISD::SMAX_PRED)
1914     MAKE_CASE(AArch64ISD::SMIN_PRED)
1915     MAKE_CASE(AArch64ISD::SRA_PRED)
1916     MAKE_CASE(AArch64ISD::SRL_PRED)
1917     MAKE_CASE(AArch64ISD::SUB_PRED)
1918     MAKE_CASE(AArch64ISD::UDIV_PRED)
1919     MAKE_CASE(AArch64ISD::UMAX_PRED)
1920     MAKE_CASE(AArch64ISD::UMIN_PRED)
1921     MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
1922     MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
1923     MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
1924     MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
1925     MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
1926     MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
1927     MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
1928     MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
1929     MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
1930     MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
1931     MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
1932     MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
1933     MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
1934     MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
1935     MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
1936     MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
1937     MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
1938     MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
1939     MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
1940     MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
1941     MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
1942     MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
1943     MAKE_CASE(AArch64ISD::ADC)
1944     MAKE_CASE(AArch64ISD::SBC)
1945     MAKE_CASE(AArch64ISD::ADDS)
1946     MAKE_CASE(AArch64ISD::SUBS)
1947     MAKE_CASE(AArch64ISD::ADCS)
1948     MAKE_CASE(AArch64ISD::SBCS)
1949     MAKE_CASE(AArch64ISD::ANDS)
1950     MAKE_CASE(AArch64ISD::CCMP)
1951     MAKE_CASE(AArch64ISD::CCMN)
1952     MAKE_CASE(AArch64ISD::FCCMP)
1953     MAKE_CASE(AArch64ISD::FCMP)
1954     MAKE_CASE(AArch64ISD::STRICT_FCMP)
1955     MAKE_CASE(AArch64ISD::STRICT_FCMPE)
1956     MAKE_CASE(AArch64ISD::DUP)
1957     MAKE_CASE(AArch64ISD::DUPLANE8)
1958     MAKE_CASE(AArch64ISD::DUPLANE16)
1959     MAKE_CASE(AArch64ISD::DUPLANE32)
1960     MAKE_CASE(AArch64ISD::DUPLANE64)
1961     MAKE_CASE(AArch64ISD::MOVI)
1962     MAKE_CASE(AArch64ISD::MOVIshift)
1963     MAKE_CASE(AArch64ISD::MOVIedit)
1964     MAKE_CASE(AArch64ISD::MOVImsl)
1965     MAKE_CASE(AArch64ISD::FMOV)
1966     MAKE_CASE(AArch64ISD::MVNIshift)
1967     MAKE_CASE(AArch64ISD::MVNImsl)
1968     MAKE_CASE(AArch64ISD::BICi)
1969     MAKE_CASE(AArch64ISD::ORRi)
1970     MAKE_CASE(AArch64ISD::BSP)
1971     MAKE_CASE(AArch64ISD::EXTR)
1972     MAKE_CASE(AArch64ISD::ZIP1)
1973     MAKE_CASE(AArch64ISD::ZIP2)
1974     MAKE_CASE(AArch64ISD::UZP1)
1975     MAKE_CASE(AArch64ISD::UZP2)
1976     MAKE_CASE(AArch64ISD::TRN1)
1977     MAKE_CASE(AArch64ISD::TRN2)
1978     MAKE_CASE(AArch64ISD::REV16)
1979     MAKE_CASE(AArch64ISD::REV32)
1980     MAKE_CASE(AArch64ISD::REV64)
1981     MAKE_CASE(AArch64ISD::EXT)
1982     MAKE_CASE(AArch64ISD::SPLICE)
1983     MAKE_CASE(AArch64ISD::VSHL)
1984     MAKE_CASE(AArch64ISD::VLSHR)
1985     MAKE_CASE(AArch64ISD::VASHR)
1986     MAKE_CASE(AArch64ISD::VSLI)
1987     MAKE_CASE(AArch64ISD::VSRI)
1988     MAKE_CASE(AArch64ISD::CMEQ)
1989     MAKE_CASE(AArch64ISD::CMGE)
1990     MAKE_CASE(AArch64ISD::CMGT)
1991     MAKE_CASE(AArch64ISD::CMHI)
1992     MAKE_CASE(AArch64ISD::CMHS)
1993     MAKE_CASE(AArch64ISD::FCMEQ)
1994     MAKE_CASE(AArch64ISD::FCMGE)
1995     MAKE_CASE(AArch64ISD::FCMGT)
1996     MAKE_CASE(AArch64ISD::CMEQz)
1997     MAKE_CASE(AArch64ISD::CMGEz)
1998     MAKE_CASE(AArch64ISD::CMGTz)
1999     MAKE_CASE(AArch64ISD::CMLEz)
2000     MAKE_CASE(AArch64ISD::CMLTz)
2001     MAKE_CASE(AArch64ISD::FCMEQz)
2002     MAKE_CASE(AArch64ISD::FCMGEz)
2003     MAKE_CASE(AArch64ISD::FCMGTz)
2004     MAKE_CASE(AArch64ISD::FCMLEz)
2005     MAKE_CASE(AArch64ISD::FCMLTz)
2006     MAKE_CASE(AArch64ISD::SADDV)
2007     MAKE_CASE(AArch64ISD::UADDV)
2008     MAKE_CASE(AArch64ISD::SRHADD)
2009     MAKE_CASE(AArch64ISD::URHADD)
2010     MAKE_CASE(AArch64ISD::SHADD)
2011     MAKE_CASE(AArch64ISD::UHADD)
2012     MAKE_CASE(AArch64ISD::SDOT)
2013     MAKE_CASE(AArch64ISD::UDOT)
2014     MAKE_CASE(AArch64ISD::SMINV)
2015     MAKE_CASE(AArch64ISD::UMINV)
2016     MAKE_CASE(AArch64ISD::SMAXV)
2017     MAKE_CASE(AArch64ISD::UMAXV)
2018     MAKE_CASE(AArch64ISD::SADDV_PRED)
2019     MAKE_CASE(AArch64ISD::UADDV_PRED)
2020     MAKE_CASE(AArch64ISD::SMAXV_PRED)
2021     MAKE_CASE(AArch64ISD::UMAXV_PRED)
2022     MAKE_CASE(AArch64ISD::SMINV_PRED)
2023     MAKE_CASE(AArch64ISD::UMINV_PRED)
2024     MAKE_CASE(AArch64ISD::ORV_PRED)
2025     MAKE_CASE(AArch64ISD::EORV_PRED)
2026     MAKE_CASE(AArch64ISD::ANDV_PRED)
2027     MAKE_CASE(AArch64ISD::CLASTA_N)
2028     MAKE_CASE(AArch64ISD::CLASTB_N)
2029     MAKE_CASE(AArch64ISD::LASTA)
2030     MAKE_CASE(AArch64ISD::LASTB)
2031     MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2032     MAKE_CASE(AArch64ISD::LS64_BUILD)
2033     MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2034     MAKE_CASE(AArch64ISD::TBL)
2035     MAKE_CASE(AArch64ISD::FADD_PRED)
2036     MAKE_CASE(AArch64ISD::FADDA_PRED)
2037     MAKE_CASE(AArch64ISD::FADDV_PRED)
2038     MAKE_CASE(AArch64ISD::FDIV_PRED)
2039     MAKE_CASE(AArch64ISD::FMA_PRED)
2040     MAKE_CASE(AArch64ISD::FMAX_PRED)
2041     MAKE_CASE(AArch64ISD::FMAXV_PRED)
2042     MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2043     MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2044     MAKE_CASE(AArch64ISD::FMIN_PRED)
2045     MAKE_CASE(AArch64ISD::FMINV_PRED)
2046     MAKE_CASE(AArch64ISD::FMINNM_PRED)
2047     MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2048     MAKE_CASE(AArch64ISD::FMUL_PRED)
2049     MAKE_CASE(AArch64ISD::FSUB_PRED)
2050     MAKE_CASE(AArch64ISD::BIC)
2051     MAKE_CASE(AArch64ISD::BIT)
2052     MAKE_CASE(AArch64ISD::CBZ)
2053     MAKE_CASE(AArch64ISD::CBNZ)
2054     MAKE_CASE(AArch64ISD::TBZ)
2055     MAKE_CASE(AArch64ISD::TBNZ)
2056     MAKE_CASE(AArch64ISD::TC_RETURN)
2057     MAKE_CASE(AArch64ISD::PREFETCH)
2058     MAKE_CASE(AArch64ISD::SITOF)
2059     MAKE_CASE(AArch64ISD::UITOF)
2060     MAKE_CASE(AArch64ISD::NVCAST)
2061     MAKE_CASE(AArch64ISD::MRS)
2062     MAKE_CASE(AArch64ISD::SQSHL_I)
2063     MAKE_CASE(AArch64ISD::UQSHL_I)
2064     MAKE_CASE(AArch64ISD::SRSHR_I)
2065     MAKE_CASE(AArch64ISD::URSHR_I)
2066     MAKE_CASE(AArch64ISD::SQSHLU_I)
2067     MAKE_CASE(AArch64ISD::WrapperLarge)
2068     MAKE_CASE(AArch64ISD::LD2post)
2069     MAKE_CASE(AArch64ISD::LD3post)
2070     MAKE_CASE(AArch64ISD::LD4post)
2071     MAKE_CASE(AArch64ISD::ST2post)
2072     MAKE_CASE(AArch64ISD::ST3post)
2073     MAKE_CASE(AArch64ISD::ST4post)
2074     MAKE_CASE(AArch64ISD::LD1x2post)
2075     MAKE_CASE(AArch64ISD::LD1x3post)
2076     MAKE_CASE(AArch64ISD::LD1x4post)
2077     MAKE_CASE(AArch64ISD::ST1x2post)
2078     MAKE_CASE(AArch64ISD::ST1x3post)
2079     MAKE_CASE(AArch64ISD::ST1x4post)
2080     MAKE_CASE(AArch64ISD::LD1DUPpost)
2081     MAKE_CASE(AArch64ISD::LD2DUPpost)
2082     MAKE_CASE(AArch64ISD::LD3DUPpost)
2083     MAKE_CASE(AArch64ISD::LD4DUPpost)
2084     MAKE_CASE(AArch64ISD::LD1LANEpost)
2085     MAKE_CASE(AArch64ISD::LD2LANEpost)
2086     MAKE_CASE(AArch64ISD::LD3LANEpost)
2087     MAKE_CASE(AArch64ISD::LD4LANEpost)
2088     MAKE_CASE(AArch64ISD::ST2LANEpost)
2089     MAKE_CASE(AArch64ISD::ST3LANEpost)
2090     MAKE_CASE(AArch64ISD::ST4LANEpost)
2091     MAKE_CASE(AArch64ISD::SMULL)
2092     MAKE_CASE(AArch64ISD::UMULL)
2093     MAKE_CASE(AArch64ISD::FRECPE)
2094     MAKE_CASE(AArch64ISD::FRECPS)
2095     MAKE_CASE(AArch64ISD::FRSQRTE)
2096     MAKE_CASE(AArch64ISD::FRSQRTS)
2097     MAKE_CASE(AArch64ISD::STG)
2098     MAKE_CASE(AArch64ISD::STZG)
2099     MAKE_CASE(AArch64ISD::ST2G)
2100     MAKE_CASE(AArch64ISD::STZ2G)
2101     MAKE_CASE(AArch64ISD::SUNPKHI)
2102     MAKE_CASE(AArch64ISD::SUNPKLO)
2103     MAKE_CASE(AArch64ISD::UUNPKHI)
2104     MAKE_CASE(AArch64ISD::UUNPKLO)
2105     MAKE_CASE(AArch64ISD::INSR)
2106     MAKE_CASE(AArch64ISD::PTEST)
2107     MAKE_CASE(AArch64ISD::PTRUE)
2108     MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2109     MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2110     MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2111     MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2112     MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2113     MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2114     MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2115     MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2116     MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2117     MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2118     MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2119     MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2120     MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2121     MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2122     MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2123     MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2124     MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2125     MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2126     MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2127     MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2128     MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2129     MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2130     MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2131     MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2132     MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2133     MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2134     MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2135     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2136     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2137     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2138     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2139     MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2140     MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2141     MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2142     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2143     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2144     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2145     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2146     MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2147     MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2148     MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2149     MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2150     MAKE_CASE(AArch64ISD::ST1_PRED)
2151     MAKE_CASE(AArch64ISD::SST1_PRED)
2152     MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2153     MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2154     MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2155     MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2156     MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2157     MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2158     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2159     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2160     MAKE_CASE(AArch64ISD::LDP)
2161     MAKE_CASE(AArch64ISD::STP)
2162     MAKE_CASE(AArch64ISD::STNP)
2163     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2164     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2165     MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2166     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2167     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2168     MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2169     MAKE_CASE(AArch64ISD::UADDLP)
2170     MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2171   }
2172 #undef MAKE_CASE
2173   return nullptr;
2174 }
2175 
2176 MachineBasicBlock *
2177 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2178                                     MachineBasicBlock *MBB) const {
2179   // We materialise the F128CSEL pseudo-instruction as some control flow and a
2180   // phi node:
2181 
2182   // OrigBB:
2183   //     [... previous instrs leading to comparison ...]
2184   //     b.ne TrueBB
2185   //     b EndBB
2186   // TrueBB:
2187   //     ; Fallthrough
2188   // EndBB:
2189   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2190 
2191   MachineFunction *MF = MBB->getParent();
2192   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2193   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2194   DebugLoc DL = MI.getDebugLoc();
2195   MachineFunction::iterator It = ++MBB->getIterator();
2196 
2197   Register DestReg = MI.getOperand(0).getReg();
2198   Register IfTrueReg = MI.getOperand(1).getReg();
2199   Register IfFalseReg = MI.getOperand(2).getReg();
2200   unsigned CondCode = MI.getOperand(3).getImm();
2201   bool NZCVKilled = MI.getOperand(4).isKill();
2202 
2203   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2204   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2205   MF->insert(It, TrueBB);
2206   MF->insert(It, EndBB);
2207 
  // Transfer the rest of the current basic block to EndBB.
2209   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2210                 MBB->end());
2211   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2212 
2213   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2214   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2215   MBB->addSuccessor(TrueBB);
2216   MBB->addSuccessor(EndBB);
2217 
2218   // TrueBB falls through to the end.
2219   TrueBB->addSuccessor(EndBB);
2220 
2221   if (!NZCVKilled) {
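    // NZCV is still live after the pseudo, so the new successor blocks must
    // list it as live-in.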
2222     TrueBB->addLiveIn(AArch64::NZCV);
2223     EndBB->addLiveIn(AArch64::NZCV);
2224   }
2225 
2226   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2227       .addReg(IfTrueReg)
2228       .addMBB(TrueBB)
2229       .addReg(IfFalseReg)
2230       .addMBB(MBB);
2231 
2232   MI.eraseFromParent();
2233   return EndBB;
2234 }
2235 
2236 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2237        MachineInstr &MI, MachineBasicBlock *BB) const {
2238   assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2239              BB->getParent()->getFunction().getPersonalityFn())) &&
2240          "SEH does not use catchret!");
2241   return BB;
2242 }
2243 
2244 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2245     MachineInstr &MI, MachineBasicBlock *BB) const {
2246   switch (MI.getOpcode()) {
2247   default:
2248 #ifndef NDEBUG
2249     MI.dump();
2250 #endif
2251     llvm_unreachable("Unexpected instruction for custom inserter!");
2252 
2253   case AArch64::F128CSEL:
2254     return EmitF128CSEL(MI, BB);
2255 
2256   case TargetOpcode::STACKMAP:
2257   case TargetOpcode::PATCHPOINT:
2258   case TargetOpcode::STATEPOINT:
2259     return emitPatchPoint(MI, BB);
2260 
2261   case AArch64::CATCHRET:
2262     return EmitLoweredCatchRet(MI, BB);
2263   }
2264 }
2265 
2266 //===----------------------------------------------------------------------===//
2267 // AArch64 Lowering private implementation.
2268 //===----------------------------------------------------------------------===//
2269 
2270 //===----------------------------------------------------------------------===//
2271 // Lowering Code
2272 //===----------------------------------------------------------------------===//
2273 
2274 // Forward declarations of SVE fixed length lowering helpers
2275 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2276 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2277 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2278 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2279                                                 SelectionDAG &DAG);
2280 
2281 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2282 static bool isZerosVector(const SDNode *N) {
2283   // Look through a bit convert.
2284   while (N->getOpcode() == ISD::BITCAST)
2285     N = N->getOperand(0).getNode();
2286 
2287   if (ISD::isConstantSplatVectorAllZeros(N))
2288     return true;
2289 
2290   if (N->getOpcode() != AArch64ISD::DUP)
2291     return false;
2292 
2293   auto Opnd0 = N->getOperand(0);
2294   auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2295   auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2296   return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
2297 }
2298 
2299 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2300 /// CC
2301 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2302   switch (CC) {
2303   default:
2304     llvm_unreachable("Unknown condition code!");
2305   case ISD::SETNE:
2306     return AArch64CC::NE;
2307   case ISD::SETEQ:
2308     return AArch64CC::EQ;
2309   case ISD::SETGT:
2310     return AArch64CC::GT;
2311   case ISD::SETGE:
2312     return AArch64CC::GE;
2313   case ISD::SETLT:
2314     return AArch64CC::LT;
2315   case ISD::SETLE:
2316     return AArch64CC::LE;
2317   case ISD::SETUGT:
2318     return AArch64CC::HI;
2319   case ISD::SETUGE:
2320     return AArch64CC::HS;
2321   case ISD::SETULT:
2322     return AArch64CC::LO;
2323   case ISD::SETULE:
2324     return AArch64CC::LS;
2325   }
2326 }
2327 
2328 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2329 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2330                                   AArch64CC::CondCode &CondCode,
2331                                   AArch64CC::CondCode &CondCode2) {
2332   CondCode2 = AArch64CC::AL;
2333   switch (CC) {
2334   default:
2335     llvm_unreachable("Unknown FP condition!");
2336   case ISD::SETEQ:
2337   case ISD::SETOEQ:
2338     CondCode = AArch64CC::EQ;
2339     break;
2340   case ISD::SETGT:
2341   case ISD::SETOGT:
2342     CondCode = AArch64CC::GT;
2343     break;
2344   case ISD::SETGE:
2345   case ISD::SETOGE:
2346     CondCode = AArch64CC::GE;
2347     break;
2348   case ISD::SETOLT:
2349     CondCode = AArch64CC::MI;
2350     break;
2351   case ISD::SETOLE:
2352     CondCode = AArch64CC::LS;
2353     break;
2354   case ISD::SETONE:
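    // ONE == OLT || OGT, so two condition codes are needed: MI for the ordered
    // less-than and GT for the ordered greater-than.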
2355     CondCode = AArch64CC::MI;
2356     CondCode2 = AArch64CC::GT;
2357     break;
2358   case ISD::SETO:
2359     CondCode = AArch64CC::VC;
2360     break;
2361   case ISD::SETUO:
2362     CondCode = AArch64CC::VS;
2363     break;
2364   case ISD::SETUEQ:
2365     CondCode = AArch64CC::EQ;
2366     CondCode2 = AArch64CC::VS;
2367     break;
2368   case ISD::SETUGT:
2369     CondCode = AArch64CC::HI;
2370     break;
2371   case ISD::SETUGE:
2372     CondCode = AArch64CC::PL;
2373     break;
2374   case ISD::SETLT:
2375   case ISD::SETULT:
2376     CondCode = AArch64CC::LT;
2377     break;
2378   case ISD::SETLE:
2379   case ISD::SETULE:
2380     CondCode = AArch64CC::LE;
2381     break;
2382   case ISD::SETNE:
2383   case ISD::SETUNE:
2384     CondCode = AArch64CC::NE;
2385     break;
2386   }
2387 }
2388 
2389 /// Convert a DAG fp condition code to an AArch64 CC.
2390 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2391 /// should be AND'ed instead of OR'ed.
2392 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2393                                      AArch64CC::CondCode &CondCode,
2394                                      AArch64CC::CondCode &CondCode2) {
2395   CondCode2 = AArch64CC::AL;
2396   switch (CC) {
2397   default:
2398     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2399     assert(CondCode2 == AArch64CC::AL);
2400     break;
2401   case ISD::SETONE:
2402     // (a one b)
2403     // == ((a olt b) || (a ogt b))
2404     // == ((a ord b) && (a une b))
2405     CondCode = AArch64CC::VC;
2406     CondCode2 = AArch64CC::NE;
2407     break;
2408   case ISD::SETUEQ:
2409     // (a ueq b)
2410     // == ((a uno b) || (a oeq b))
2411     // == ((a ule b) && (a uge b))
2412     CondCode = AArch64CC::PL;
2413     CondCode2 = AArch64CC::LE;
2414     break;
2415   }
2416 }
2417 
2418 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2419 /// CC usable with the vector instructions. Fewer operations are available
2420 /// without a real NZCV register, so we have to use less efficient combinations
2421 /// to get the same effect.
2422 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2423                                         AArch64CC::CondCode &CondCode,
2424                                         AArch64CC::CondCode &CondCode2,
2425                                         bool &Invert) {
2426   Invert = false;
2427   switch (CC) {
2428   default:
2429     // Mostly the scalar mappings work fine.
2430     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2431     break;
2432   case ISD::SETUO:
2433     Invert = true;
2434     LLVM_FALLTHROUGH;
2435   case ISD::SETO:
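    // "Ordered" is exactly (a olt b) || (a oge b), so it is checked with two
    // compares; SETUO falls through to here and inverts the combined result.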
2436     CondCode = AArch64CC::MI;
2437     CondCode2 = AArch64CC::GE;
2438     break;
2439   case ISD::SETUEQ:
2440   case ISD::SETULT:
2441   case ISD::SETULE:
2442   case ISD::SETUGT:
2443   case ISD::SETUGE:
2444     // All of the compare-mask comparisons are ordered, but we can switch
2445     // between the two by a double inversion. E.g. ULE == !OGT.
2446     Invert = true;
2447     changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2448                           CondCode, CondCode2);
2449     break;
2450   }
2451 }
2452 
2453 static bool isLegalArithImmed(uint64_t C) {
2454   // Matches AArch64DAGToDAGISel::SelectArithImmed().
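  // A legal arithmetic immediate is an unsigned 12-bit value, optionally
  // shifted left by 12 (e.g. 0xFFF and 0x123000 are legal, 0x1001 is not).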
2455   bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2456   LLVM_DEBUG(dbgs() << "Is imm " << C
2457                     << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2458   return IsLegal;
2459 }
2460 
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
2467 //
2468 // So, finally, the only LLVM-native comparisons that don't mention C and V
2469 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2470 // the absence of information about op2.
2471 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2472   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2473          (CC == ISD::SETEQ || CC == ISD::SETNE);
2474 }
2475 
2476 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2477                                       SelectionDAG &DAG, SDValue Chain,
2478                                       bool IsSignaling) {
2479   EVT VT = LHS.getValueType();
2480   assert(VT != MVT::f128);
2481   assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2482   unsigned Opcode =
2483       IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2484   return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2485 }
2486 
2487 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2488                               const SDLoc &dl, SelectionDAG &DAG) {
2489   EVT VT = LHS.getValueType();
2490   const bool FullFP16 =
2491     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2492 
2493   if (VT.isFloatingPoint()) {
2494     assert(VT != MVT::f128);
2495     if (VT == MVT::f16 && !FullFP16) {
2496       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2497       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2498       VT = MVT::f32;
2499     }
2500     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2501   }
2502 
2503   // The CMP instruction is just an alias for SUBS, and representing it as
2504   // SUBS means that it's possible to get CSE with subtract operations.
2505   // A later phase can perform the optimization of setting the destination
2506   // register to WZR/XZR if it ends up being unused.
2507   unsigned Opcode = AArch64ISD::SUBS;
2508 
2509   if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2511     Opcode = AArch64ISD::ADDS;
2512     RHS = RHS.getOperand(1);
2513   } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2516     Opcode = AArch64ISD::ADDS;
2517     LHS = LHS.getOperand(1);
2518   } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2519     if (LHS.getOpcode() == ISD::AND) {
2520       // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2521       // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2522       // of the signed comparisons.
2523       const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2524                                            DAG.getVTList(VT, MVT_CC),
2525                                            LHS.getOperand(0),
2526                                            LHS.getOperand(1));
2527       // Replace all users of (and X, Y) with newly generated (ands X, Y)
2528       DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2529       return ANDSNode.getValue(1);
2530     } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2531       // Use result of ANDS
2532       return LHS.getValue(1);
2533     }
2534   }
2535 
2536   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2537       .getValue(1);
2538 }
2539 
2540 /// \defgroup AArch64CCMP CMP;CCMP matching
2541 ///
2542 /// These functions deal with the formation of CMP;CCMP;... sequences.
2543 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2544 /// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions; for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" can be
/// expressed as:
2548 ///   cmp A
2549 ///   ccmp B, inv(CB), CA
2550 ///   check for CB flags
2551 ///
2552 /// This naturally lets us implement chains of AND operations with SETCC
2553 /// operands. And we can even implement some other situations by transforming
2554 /// them:
///   - We can implement (NEG SETCC), i.e. negating a single comparison, by
///     negating the flags used in a CCMP/FCCMP operation.
2557 ///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///     by negating the flags we test for afterwards, i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
2560 ///   - Note that we can only ever negate all previously processed results.
///     What we cannot implement by flipping the flags to test is a negation
2562 ///     of two sub-trees (because the negation affects all sub-trees emitted so
2563 ///     far, so the 2nd sub-tree we emit would also affect the first).
2564 /// With those tools we can implement some OR operations:
2565 ///   - (OR (SETCC A) (SETCC B)) can be implemented via:
2566 ///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2567 ///   - After transforming OR to NEG/AND combinations we may be able to use NEG
2568 ///     elimination rules from earlier to implement the whole thing as a
2569 ///     CCMP/FCCMP chain.
2570 ///
/// As a complete example:
///     or (or (setCA (cmp A)) (setCB (cmp B)))
///        (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///     or (and (setCC (cmp C)) (setCD (cmp D)))
///        (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
2580 /// which can be implemented as:
2581 ///   cmp C
2582 ///   ccmp D, inv(CD), CC
2583 ///   ccmp A, CA, inv(CD)
2584 ///   ccmp B, CB, inv(CA)
2585 ///   check for CB flags
2586 ///
2587 /// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement one of the inner (not) operations, but not both!
2590 /// @{
2591 
/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
2593 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2594                                          ISD::CondCode CC, SDValue CCOp,
2595                                          AArch64CC::CondCode Predicate,
2596                                          AArch64CC::CondCode OutCC,
2597                                          const SDLoc &DL, SelectionDAG &DAG) {
2598   unsigned Opcode = 0;
2599   const bool FullFP16 =
2600     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2601 
2602   if (LHS.getValueType().isFloatingPoint()) {
2603     assert(LHS.getValueType() != MVT::f128);
2604     if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2605       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2606       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2607     }
2608     Opcode = AArch64ISD::FCCMP;
2609   } else if (RHS.getOpcode() == ISD::SUB) {
2610     SDValue SubOp0 = RHS.getOperand(0);
2611     if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2612       // See emitComparison() on why we can only do this for SETEQ and SETNE.
2613       Opcode = AArch64ISD::CCMN;
2614       RHS = RHS.getOperand(1);
2615     }
2616   }
2617   if (Opcode == 0)
2618     Opcode = AArch64ISD::CCMP;
2619 
2620   SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
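  // The NZCV immediate is the flag state the CCMP materializes when its
  // predicate is false; pick flags that satisfy the inverted condition so the
  // overall OutCC test fails in that case.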
2621   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2622   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2623   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2624   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2625 }
2626 
2627 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2628 /// expressed as a conjunction. See \ref AArch64CCMP.
2629 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
2630 ///                     changing the conditions on the SETCC tests.
2631 ///                     (this means we can call emitConjunctionRec() with
2632 ///                      Negate==true on this sub-tree)
2633 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
2634 ///                     cannot do the negation naturally. We are required to
2635 ///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
2637 ///                     subexpression must be negated. This happens when the
2638 ///                     outer expression is an OR. We can use this fact to know
2639 ///                     that we have a double negation (or (or ...) ...) that
2640 ///                     can be implemented for free.
2641 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2642                                bool &MustBeFirst, bool WillNegate,
2643                                unsigned Depth = 0) {
2644   if (!Val.hasOneUse())
2645     return false;
2646   unsigned Opcode = Val->getOpcode();
2647   if (Opcode == ISD::SETCC) {
2648     if (Val->getOperand(0).getValueType() == MVT::f128)
2649       return false;
2650     CanNegate = true;
2651     MustBeFirst = false;
2652     return true;
2653   }
2654   // Protect against exponential runtime and stack overflow.
2655   if (Depth > 6)
2656     return false;
2657   if (Opcode == ISD::AND || Opcode == ISD::OR) {
2658     bool IsOR = Opcode == ISD::OR;
2659     SDValue O0 = Val->getOperand(0);
2660     SDValue O1 = Val->getOperand(1);
2661     bool CanNegateL;
2662     bool MustBeFirstL;
2663     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2664       return false;
2665     bool CanNegateR;
2666     bool MustBeFirstR;
2667     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2668       return false;
2669 
2670     if (MustBeFirstL && MustBeFirstR)
2671       return false;
2672 
2673     if (IsOR) {
2674       // For an OR expression we need to be able to naturally negate at least
2675       // one side or we cannot do the transformation at all.
2676       if (!CanNegateL && !CanNegateR)
2677         return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
2680       CanNegate = WillNegate && CanNegateL && CanNegateR;
2681       // If we cannot naturally negate the whole sub-tree, then this must be
2682       // emitted first.
2683       MustBeFirst = !CanNegate;
2684     } else {
2685       assert(Opcode == ISD::AND && "Must be OR or AND");
2686       // We cannot naturally negate an AND operation.
2687       CanNegate = false;
2688       MustBeFirst = MustBeFirstL || MustBeFirstR;
2689     }
2690     return true;
2691   }
2692   return false;
2693 }
2694 
/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
/// chain of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val into a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
2703 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2704     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2705     AArch64CC::CondCode Predicate) {
2706   // We're at a tree leaf, produce a conditional comparison operation.
2707   unsigned Opcode = Val->getOpcode();
2708   if (Opcode == ISD::SETCC) {
2709     SDValue LHS = Val->getOperand(0);
2710     SDValue RHS = Val->getOperand(1);
2711     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2712     bool isInteger = LHS.getValueType().isInteger();
2713     if (Negate)
2714       CC = getSetCCInverse(CC, LHS.getValueType());
2715     SDLoc DL(Val);
2716     // Determine OutCC and handle FP special case.
2717     if (isInteger) {
2718       OutCC = changeIntCCToAArch64CC(CC);
2719     } else {
2720       assert(LHS.getValueType().isFloatingPoint());
2721       AArch64CC::CondCode ExtraCC;
2722       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2723       // Some floating point conditions can't be tested with a single condition
2724       // code. Construct an additional comparison in this case.
2725       if (ExtraCC != AArch64CC::AL) {
2726         SDValue ExtraCmp;
2727         if (!CCOp.getNode())
2728           ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2729         else
2730           ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2731                                                ExtraCC, DL, DAG);
2732         CCOp = ExtraCmp;
2733         Predicate = ExtraCC;
2734       }
2735     }
2736 
2737     // Produce a normal comparison if we are first in the chain
2738     if (!CCOp)
2739       return emitComparison(LHS, RHS, CC, DL, DAG);
2740     // Otherwise produce a ccmp.
2741     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2742                                      DAG);
2743   }
2744   assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2745 
2746   bool IsOR = Opcode == ISD::OR;
2747 
2748   SDValue LHS = Val->getOperand(0);
2749   bool CanNegateL;
2750   bool MustBeFirstL;
2751   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2752   assert(ValidL && "Valid conjunction/disjunction tree");
2753   (void)ValidL;
2754 
2755   SDValue RHS = Val->getOperand(1);
2756   bool CanNegateR;
2757   bool MustBeFirstR;
2758   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2759   assert(ValidR && "Valid conjunction/disjunction tree");
2760   (void)ValidR;
2761 
2762   // Swap sub-tree that must come first to the right side.
2763   if (MustBeFirstL) {
2764     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2765     std::swap(LHS, RHS);
2766     std::swap(CanNegateL, CanNegateR);
2767     std::swap(MustBeFirstL, MustBeFirstR);
2768   }
2769 
2770   bool NegateR;
2771   bool NegateAfterR;
2772   bool NegateL;
2773   bool NegateAfterAll;
2774   if (Opcode == ISD::OR) {
2775     // Swap the sub-tree that we can negate naturally to the left.
2776     if (!CanNegateL) {
2777       assert(CanNegateR && "at least one side must be negatable");
2778       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2779       assert(!Negate);
2780       std::swap(LHS, RHS);
2781       NegateR = false;
2782       NegateAfterR = true;
2783     } else {
2784       // Negate the left sub-tree if possible, otherwise negate the result.
2785       NegateR = CanNegateR;
2786       NegateAfterR = !CanNegateR;
2787     }
2788     NegateL = true;
2789     NegateAfterAll = !Negate;
2790   } else {
2791     assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2792     assert(!Negate && "Valid conjunction/disjunction tree");
2793 
2794     NegateL = false;
2795     NegateR = false;
2796     NegateAfterR = false;
2797     NegateAfterAll = false;
2798   }
2799 
2800   // Emit sub-trees.
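  // The right sub-tree is emitted first; the NZCV value it produces becomes
  // the CCOp/Predicate input for the left sub-tree's conditional compares.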
2801   AArch64CC::CondCode RHSCC;
2802   SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2803   if (NegateAfterR)
2804     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2805   SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2806   if (NegateAfterAll)
2807     OutCC = AArch64CC::getInvertedCondCode(OutCC);
2808   return CmpL;
2809 }
2810 
/// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
2812 /// In some cases this is even possible with OR operations in the expression.
2813 /// See \ref AArch64CCMP.
2814 /// \see emitConjunctionRec().
2815 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2816                                AArch64CC::CondCode &OutCC) {
2817   bool DummyCanNegate;
2818   bool DummyMustBeFirst;
2819   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2820     return SDValue();
2821 
2822   return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2823 }
2824 
2825 /// @}
2826 
2827 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2828 /// extension operations.
2829 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
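  // Returns 2 for a supported extend folded under a small shift, 1 for a lone
  // supported extend or an in-range shift, and 0 otherwise.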
2830   auto isSupportedExtend = [&](SDValue V) {
2831     if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2832       return true;
2833 
2834     if (V.getOpcode() == ISD::AND)
2835       if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2836         uint64_t Mask = MaskCst->getZExtValue();
2837         return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2838       }
2839 
2840     return false;
2841   };
2842 
2843   if (!Op.hasOneUse())
2844     return 0;
2845 
2846   if (isSupportedExtend(Op))
2847     return 1;
2848 
2849   unsigned Opc = Op.getOpcode();
2850   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2851     if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2852       uint64_t Shift = ShiftCst->getZExtValue();
2853       if (isSupportedExtend(Op.getOperand(0)))
2854         return (Shift <= 4) ? 2 : 1;
2855       EVT VT = Op.getValueType();
2856       if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2857         return 1;
2858     }
2859 
2860   return 0;
2861 }
2862 
2863 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2864                              SDValue &AArch64cc, SelectionDAG &DAG,
2865                              const SDLoc &dl) {
2866   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2867     EVT VT = RHS.getValueType();
2868     uint64_t C = RHSC->getZExtValue();
2869     if (!isLegalArithImmed(C)) {
2870       // Constant does not fit, try adjusting it by one?
2871       switch (CC) {
2872       default:
2873         break;
2874       case ISD::SETLT:
2875       case ISD::SETGE:
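        // (x < c) == (x <= c - 1) and (x >= c) == (x > c - 1), provided c - 1
        // does not wrap around.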
2876         if ((VT == MVT::i32 && C != 0x80000000 &&
2877              isLegalArithImmed((uint32_t)(C - 1))) ||
2878             (VT == MVT::i64 && C != 0x80000000ULL &&
2879              isLegalArithImmed(C - 1ULL))) {
2880           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2881           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2882           RHS = DAG.getConstant(C, dl, VT);
2883         }
2884         break;
2885       case ISD::SETULT:
2886       case ISD::SETUGE:
2887         if ((VT == MVT::i32 && C != 0 &&
2888              isLegalArithImmed((uint32_t)(C - 1))) ||
2889             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2890           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2891           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2892           RHS = DAG.getConstant(C, dl, VT);
2893         }
2894         break;
2895       case ISD::SETLE:
2896       case ISD::SETGT:
2897         if ((VT == MVT::i32 && C != INT32_MAX &&
2898              isLegalArithImmed((uint32_t)(C + 1))) ||
2899             (VT == MVT::i64 && C != INT64_MAX &&
2900              isLegalArithImmed(C + 1ULL))) {
2901           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2902           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2903           RHS = DAG.getConstant(C, dl, VT);
2904         }
2905         break;
2906       case ISD::SETULE:
2907       case ISD::SETUGT:
2908         if ((VT == MVT::i32 && C != UINT32_MAX &&
2909              isLegalArithImmed((uint32_t)(C + 1))) ||
2910             (VT == MVT::i64 && C != UINT64_MAX &&
2911              isLegalArithImmed(C + 1ULL))) {
2912           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2913           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2914           RHS = DAG.getConstant(C, dl, VT);
2915         }
2916         break;
2917       }
2918     }
2919   }
2920 
2921   // Comparisons are canonicalized so that the RHS operand is simpler than the
2922   // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2923   // can fold some shift+extend operations on the RHS operand, so swap the
2924   // operands if that can be done.
2925   //
2926   // For example:
2927   //    lsl     w13, w11, #1
2928   //    cmp     w13, w12
2929   // can be turned into:
2930   //    cmp     w12, w11, lsl #1
2931   if (!isa<ConstantSDNode>(RHS) ||
2932       !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2933     SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2934 
2935     if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2936       std::swap(LHS, RHS);
2937       CC = ISD::getSetCCSwappedOperands(CC);
2938     }
2939   }
2940 
2941   SDValue Cmp;
2942   AArch64CC::CondCode AArch64CC;
2943   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2944     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2945 
2946     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2947     // For the i8 operand, the largest immediate is 255, so this can be easily
2948     // encoded in the compare instruction. For the i16 operand, however, the
2949     // largest immediate cannot be encoded in the compare.
2950     // Therefore, use a sign extending load and cmn to avoid materializing the
2951     // -1 constant. For example,
2952     // movz w1, #65535
2953     // ldrh w0, [x0, #0]
2954     // cmp w0, w1
2955     // >
2956     // ldrsh w0, [x0, #0]
2957     // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2959     // if and only if (sext LHS) == (sext RHS). The checks are in place to
2960     // ensure both the LHS and RHS are truly zero extended and to make sure the
2961     // transformation is profitable.
2962     if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2963         cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2964         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2965         LHS.getNode()->hasNUsesOfValue(1, 0)) {
2966       int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2967       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2968         SDValue SExt =
2969             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2970                         DAG.getValueType(MVT::i16));
2971         Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2972                                                    RHS.getValueType()),
2973                              CC, dl, DAG);
2974         AArch64CC = changeIntCCToAArch64CC(CC);
2975       }
2976     }
2977 
2978     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2979       if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2980         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2981           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2982       }
2983     }
2984   }
2985 
2986   if (!Cmp) {
2987     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2988     AArch64CC = changeIntCCToAArch64CC(CC);
2989   }
2990   AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2991   return Cmp;
2992 }
2993 
2994 static std::pair<SDValue, SDValue>
2995 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2996   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2997          "Unsupported value type");
2998   SDValue Value, Overflow;
2999   SDLoc DL(Op);
3000   SDValue LHS = Op.getOperand(0);
3001   SDValue RHS = Op.getOperand(1);
3002   unsigned Opc = 0;
3003   switch (Op.getOpcode()) {
3004   default:
3005     llvm_unreachable("Unknown overflow instruction!");
3006   case ISD::SADDO:
3007     Opc = AArch64ISD::ADDS;
3008     CC = AArch64CC::VS;
3009     break;
3010   case ISD::UADDO:
3011     Opc = AArch64ISD::ADDS;
3012     CC = AArch64CC::HS;
3013     break;
3014   case ISD::SSUBO:
3015     Opc = AArch64ISD::SUBS;
3016     CC = AArch64CC::VS;
3017     break;
3018   case ISD::USUBO:
3019     Opc = AArch64ISD::SUBS;
3020     CC = AArch64CC::LO;
3021     break;
  // Multiply needs a little bit of extra work.
3023   case ISD::SMULO:
3024   case ISD::UMULO: {
3025     CC = AArch64CC::NE;
3026     bool IsSigned = Op.getOpcode() == ISD::SMULO;
3027     if (Op.getValueType() == MVT::i32) {
3028       // Extend to 64-bits, then perform a 64-bit multiply.
3029       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3030       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3031       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3032       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3033       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3034 
3035       // Check that the result fits into a 32-bit integer.
3036       SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3037       if (IsSigned) {
3038         // cmp xreg, wreg, sxtw
3039         SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3040         Overflow =
3041             DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3042       } else {
3043         // tst xreg, #0xffffffff00000000
3044         SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3045         Overflow =
3046             DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3047       }
3048       break;
3049     }
3050     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64-bit multiply, check for overflow using the high half of
    // the product.
3052     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3053     if (IsSigned) {
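      // Signed overflow occurs iff the top 64 bits of the 128-bit product
      // differ from the sign extension of the low 64 bits.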
3054       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3055       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3056                                       DAG.getConstant(63, DL, MVT::i64));
3057       // It is important that LowerBits is last, otherwise the arithmetic
3058       // shift will not be folded into the compare (SUBS).
3059       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3060       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3061                      .getValue(1);
3062     } else {
3063       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3064       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3065       Overflow =
3066           DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3067                       DAG.getConstant(0, DL, MVT::i64),
3068                       UpperBits).getValue(1);
3069     }
3070     break;
3071   }
3072   } // switch (...)
3073 
3074   if (Opc) {
3075     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3076 
3077     // Emit the AArch64 operation with overflow check.
3078     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3079     Overflow = Value.getValue(1);
3080   }
3081   return std::make_pair(Value, Overflow);
3082 }
3083 
3084 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3085   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3086     return LowerToScalableOp(Op, DAG);
3087 
3088   SDValue Sel = Op.getOperand(0);
3089   SDValue Other = Op.getOperand(1);
3090   SDLoc dl(Sel);
3091 
3092   // If the operand is an overflow checking operation, invert the condition
3093   // code and kill the Not operation. I.e., transform:
  // (xor overflow_op_bool, 1)
3095   //   -->
3096   // (csel 1, 0, invert(cc), overflow_op_bool)
3097   // ... which later gets transformed to just a cset instruction with an
3098   // inverted condition code, rather than a cset + eor sequence.
3099   if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3100     // Only lower legal XALUO ops.
3101     if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3102       return SDValue();
3103 
3104     SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3105     SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3106     AArch64CC::CondCode CC;
3107     SDValue Value, Overflow;
3108     std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3109     SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3110     return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3111                        CCVal, Overflow);
3112   }
3113   // If neither operand is a SELECT_CC, give up.
3114   if (Sel.getOpcode() != ISD::SELECT_CC)
3115     std::swap(Sel, Other);
3116   if (Sel.getOpcode() != ISD::SELECT_CC)
3117     return Op;
3118 
3119   // The folding we want to perform is:
3120   // (xor x, (select_cc a, b, cc, 0, -1) )
3121   //   -->
3122   // (csel x, (xor x, -1), cc ...)
3123   //
3124   // The latter will get matched to a CSINV instruction.
3125 
3126   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3127   SDValue LHS = Sel.getOperand(0);
3128   SDValue RHS = Sel.getOperand(1);
3129   SDValue TVal = Sel.getOperand(2);
3130   SDValue FVal = Sel.getOperand(3);
3131 
3132   // FIXME: This could be generalized to non-integer comparisons.
3133   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3134     return Op;
3135 
3136   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3137   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3138 
3139   // The values aren't constants, this isn't the pattern we're looking for.
3140   if (!CFVal || !CTVal)
3141     return Op;
3142 
3143   // We can commute the SELECT_CC by inverting the condition.  This
3144   // might be needed to make this fit into a CSINV pattern.
3145   if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3146     std::swap(TVal, FVal);
3147     std::swap(CTVal, CFVal);
3148     CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3149   }
3150 
3151   // If the constants line up, perform the transform!
3152   if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
3153     SDValue CCVal;
3154     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3155 
3156     FVal = Other;
3157     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3158                        DAG.getConstant(-1ULL, dl, Other.getValueType()));
3159 
3160     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3161                        CCVal, Cmp);
3162   }
3163 
3164   return Op;
3165 }
3166 
3167 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3168   EVT VT = Op.getValueType();
3169 
3170   // Let legalize expand this if it isn't a legal type yet.
3171   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3172     return SDValue();
3173 
3174   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3175 
3176   unsigned Opc;
3177   bool ExtraOp = false;
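  // ADDC/SUBC produce a carry out; ADDE/SUBE also consume an incoming carry,
  // which is passed as the extra third operand.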
3178   switch (Op.getOpcode()) {
3179   default:
3180     llvm_unreachable("Invalid code");
3181   case ISD::ADDC:
3182     Opc = AArch64ISD::ADDS;
3183     break;
3184   case ISD::SUBC:
3185     Opc = AArch64ISD::SUBS;
3186     break;
3187   case ISD::ADDE:
3188     Opc = AArch64ISD::ADCS;
3189     ExtraOp = true;
3190     break;
3191   case ISD::SUBE:
3192     Opc = AArch64ISD::SBCS;
3193     ExtraOp = true;
3194     break;
3195   }
3196 
3197   if (!ExtraOp)
3198     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3199   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3200                      Op.getOperand(2));
3201 }
3202 
3203 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3204   // Let legalize expand this if it isn't a legal type yet.
3205   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3206     return SDValue();
3207 
3208   SDLoc dl(Op);
3209   AArch64CC::CondCode CC;
3210   // The actual operation that sets the overflow or carry flag.
3211   SDValue Value, Overflow;
3212   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3213 
3214   // We use 0 and 1 as false and true values.
3215   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3216   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3217 
3218   // We use an inverted condition, because the conditional select is inverted
3219   // too. This will allow it to be selected to a single instruction:
3220   // CSINC Wd, WZR, WZR, invert(cond).
3221   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3222   Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3223                          CCVal, Overflow);
3224 
3225   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3226   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3227 }
3228 
3229 // Prefetch operands are:
3230 // 1: Address to prefetch
3231 // 2: bool isWrite
3232 // 3: int locality (0 = no locality ... 3 = extreme locality)
3233 // 4: bool isDataCache
3234 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3235   SDLoc DL(Op);
3236   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3237   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3238   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3239 
3240   bool IsStream = !Locality;
3241   // When the locality number is set
3242   if (Locality) {
3243     // The front-end should have filtered out the out-of-range values
3244     assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed: the prefetch
    // encoding starts at 0 for the closest cache level, so invert the number.
3248     Locality = 3 - Locality;
3249   }
3250 
  // Build the mask value encoding the expected behavior.
3252   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
3253                    (!IsData << 3) |     // IsDataCache bit
3254                    (Locality << 1) |    // Cache level bits
3255                    (unsigned)IsStream;  // Stream bit
3256   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3257                      DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3258 }
3259 
3260 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3261                                               SelectionDAG &DAG) const {
3262   EVT VT = Op.getValueType();
3263   if (VT.isScalableVector())
3264     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3265 
3266   if (useSVEForFixedLengthVectorVT(VT))
3267     return LowerFixedLengthFPExtendToSVE(Op, DAG);
3268 
3269   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3270   return SDValue();
3271 }
3272 
3273 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3274                                              SelectionDAG &DAG) const {
3275   if (Op.getValueType().isScalableVector())
3276     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3277 
3278   bool IsStrict = Op->isStrictFPOpcode();
3279   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3280   EVT SrcVT = SrcVal.getValueType();
3281 
3282   if (useSVEForFixedLengthVectorVT(SrcVT))
3283     return LowerFixedLengthFPRoundToSVE(Op, DAG);
3284 
3285   if (SrcVT != MVT::f128) {
3286     // Expand cases where the input is a vector bigger than NEON.
3287     if (useSVEForFixedLengthVectorVT(SrcVT))
3288       return SDValue();
3289 
3290     // It's legal except when f128 is involved
3291     return Op;
3292   }
3293 
3294   return SDValue();
3295 }
3296 
3297 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3298                                                     SelectionDAG &DAG) const {
3299   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3300   // Any additional optimization in this function should be recorded
3301   // in the cost tables.
3302   EVT InVT = Op.getOperand(0).getValueType();
3303   EVT VT = Op.getValueType();
3304 
3305   if (VT.isScalableVector()) {
3306     unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3307                           ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3308                           : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3309     return LowerToPredicatedOp(Op, DAG, Opcode);
3310   }
3311 
3312   if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3313     return LowerFixedLengthFPToIntToSVE(Op, DAG);
3314 
3315   unsigned NumElts = InVT.getVectorNumElements();
3316 
3317   // f16 conversions are promoted to f32 when full fp16 is not supported.
3318   if (InVT.getVectorElementType() == MVT::f16 &&
3319       !Subtarget->hasFullFP16()) {
3320     MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3321     SDLoc dl(Op);
3322     return DAG.getNode(
3323         Op.getOpcode(), dl, Op.getValueType(),
3324         DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3325   }
3326 
3327   uint64_t VTSize = VT.getFixedSizeInBits();
3328   uint64_t InVTSize = InVT.getFixedSizeInBits();
3329   if (VTSize < InVTSize) {
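    // Narrowing conversion: convert at the source element width, then
    // truncate the integer result down to the destination width.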
3330     SDLoc dl(Op);
3331     SDValue Cv =
3332         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3333                     Op.getOperand(0));
3334     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3335   }
3336 
3337   if (VTSize > InVTSize) {
3338     SDLoc dl(Op);
3339     MVT ExtVT =
3340         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3341                          VT.getVectorNumElements());
3342     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3343     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3344   }
3345 
3346   // Type changing conversions are illegal.
3347   return Op;
3348 }
3349 
3350 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3351                                               SelectionDAG &DAG) const {
3352   bool IsStrict = Op->isStrictFPOpcode();
3353   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3354 
3355   if (SrcVal.getValueType().isVector())
3356     return LowerVectorFP_TO_INT(Op, DAG);
3357 
3358   // f16 conversions are promoted to f32 when full fp16 is not supported.
3359   if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3360     assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3361     SDLoc dl(Op);
3362     return DAG.getNode(
3363         Op.getOpcode(), dl, Op.getValueType(),
3364         DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3365   }
3366 
3367   if (SrcVal.getValueType() != MVT::f128) {
3368     // It's legal except when f128 is involved
3369     return Op;
3370   }
3371 
3372   return SDValue();
3373 }
3374 
3375 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3376                                                   SelectionDAG &DAG) const {
3377   // AArch64 FP-to-int conversions saturate to the destination register size, so
3378   // we can lower common saturating conversions to simple instructions.
3379   SDValue SrcVal = Op.getOperand(0);
3380 
3381   EVT SrcVT = SrcVal.getValueType();
3382   EVT DstVT = Op.getValueType();
3383 
3384   EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3385   uint64_t SatWidth = SatVT.getScalarSizeInBits();
3386   uint64_t DstWidth = DstVT.getScalarSizeInBits();
3387   assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3388 
3389   // TODO: Support lowering of NEON and SVE conversions.
3390   if (SrcVT.isVector())
3391     return SDValue();
3392 
3393   // TODO: Saturate to SatWidth explicitly.
3394   if (SatWidth != DstWidth)
3395     return SDValue();
3396 
  // In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
3398   if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
3399     return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
3400                        DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
3401                        Op.getOperand(1));
3402 
3403   // Cases that we can emit directly.
3404   if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3405        (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3406       (DstVT == MVT::i64 || DstVT == MVT::i32))
3407     return Op;
3408 
3409   // For all other cases, fall back on the expanded form.
3410   return SDValue();
3411 }
3412 
3413 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3414                                                     SelectionDAG &DAG) const {
3415   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3416   // Any additional optimization in this function should be recorded
3417   // in the cost tables.
3418   EVT VT = Op.getValueType();
3419   SDLoc dl(Op);
3420   SDValue In = Op.getOperand(0);
3421   EVT InVT = In.getValueType();
3422   unsigned Opc = Op.getOpcode();
3423   bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3424 
3425   if (VT.isScalableVector()) {
3426     if (InVT.getVectorElementType() == MVT::i1) {
      // An SVE predicate can't be converted to FP directly; extend it to a
      // promoted integer vector first.
3428       unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3429       EVT CastVT = getPromotedVTForPredicate(InVT);
3430       In = DAG.getNode(CastOpc, dl, CastVT, In);
3431       return DAG.getNode(Opc, dl, VT, In);
3432     }
3433 
3434     unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3435                                : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
3436     return LowerToPredicatedOp(Op, DAG, Opcode);
3437   }
3438 
3439   if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3440     return LowerFixedLengthIntToFPToSVE(Op, DAG);
3441 
3442   uint64_t VTSize = VT.getFixedSizeInBits();
3443   uint64_t InVTSize = InVT.getFixedSizeInBits();
3444   if (VTSize < InVTSize) {
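    // Narrowing conversion: convert at the source element width, then round
    // the floating-point result down to the destination width.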
3445     MVT CastVT =
3446         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3447                          InVT.getVectorNumElements());
3448     In = DAG.getNode(Opc, dl, CastVT, In);
3449     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3450   }
3451 
3452   if (VTSize > InVTSize) {
3453     unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3454     EVT CastVT = VT.changeVectorElementTypeToInteger();
3455     In = DAG.getNode(CastOpc, dl, CastVT, In);
3456     return DAG.getNode(Opc, dl, VT, In);
3457   }
3458 
3459   return Op;
3460 }
3461 
3462 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3463                                             SelectionDAG &DAG) const {
3464   if (Op.getValueType().isVector())
3465     return LowerVectorINT_TO_FP(Op, DAG);
3466 
3467   bool IsStrict = Op->isStrictFPOpcode();
3468   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3469 
3470   // f16 conversions are promoted to f32 when full fp16 is not supported.
3471   if (Op.getValueType() == MVT::f16 &&
3472       !Subtarget->hasFullFP16()) {
3473     assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3474     SDLoc dl(Op);
3475     return DAG.getNode(
3476         ISD::FP_ROUND, dl, MVT::f16,
3477         DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3478         DAG.getIntPtrConstant(0, dl));
3479   }
3480 
3481   // i128 conversions are libcalls.
3482   if (SrcVal.getValueType() == MVT::i128)
3483     return SDValue();
3484 
3485   // Other conversions are legal, unless it's to the completely software-based
3486   // fp128.
3487   if (Op.getValueType() != MVT::f128)
3488     return Op;
3489   return SDValue();
3490 }
3491 
3492 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3493                                             SelectionDAG &DAG) const {
3494   // For iOS, we want to call an alternative entry point: __sincos_stret,
3495   // which returns the values in two S / D registers.
3496   SDLoc dl(Op);
3497   SDValue Arg = Op.getOperand(0);
3498   EVT ArgVT = Arg.getValueType();
3499   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3500 
3501   ArgListTy Args;
3502   ArgListEntry Entry;
3503 
3504   Entry.Node = Arg;
3505   Entry.Ty = ArgTy;
3506   Entry.IsSExt = false;
3507   Entry.IsZExt = false;
3508   Args.push_back(Entry);
3509 
3510   RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3511                                         : RTLIB::SINCOS_STRET_F32;
3512   const char *LibcallName = getLibcallName(LC);
3513   SDValue Callee =
3514       DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
3515 
3516   StructType *RetTy = StructType::get(ArgTy, ArgTy);
3517   TargetLowering::CallLoweringInfo CLI(DAG);
3518   CLI.setDebugLoc(dl)
3519       .setChain(DAG.getEntryNode())
3520       .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3521 
3522   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3523   return CallResult.first;
3524 }
3525 
3526 static MVT getSVEContainerType(EVT ContentTy);
3527 
3528 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
3529                                             SelectionDAG &DAG) const {
3530   EVT OpVT = Op.getValueType();
3531   EVT ArgVT = Op.getOperand(0).getValueType();
3532 
3533   if (useSVEForFixedLengthVectorVT(OpVT))
3534     return LowerFixedLengthBitcastToSVE(Op, DAG);
3535 
3536   if (OpVT.isScalableVector()) {
3537     if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
3538       assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
3539              "Expected int->fp bitcast!");
3540       SDValue ExtResult =
3541           DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
3542                       Op.getOperand(0));
3543       return getSVESafeBitCast(OpVT, ExtResult, DAG);
3544     }
3545     return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
3546   }
3547 
3548   if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3549     return SDValue();
3550 
3551   assert(ArgVT == MVT::i16);
3552   SDLoc DL(Op);
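  // A scalar i16 -> f16/bf16 bitcast is done by extending to i32, bitcasting
  // to f32 and extracting the 'hsub' subregister of the resulting FPR.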
3553 
3554   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3555   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
3556   return SDValue(
3557       DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3558                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3559       0);
3560 }
3561 
3562 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
3563   if (OrigVT.getSizeInBits() >= 64)
3564     return OrigVT;
3565 
3566   assert(OrigVT.isSimple() && "Expecting a simple value type");
3567 
3568   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3569   switch (OrigSimpleTy) {
3570   default: llvm_unreachable("Unexpected Vector Type");
3571   case MVT::v2i8:
3572   case MVT::v2i16:
3573      return MVT::v2i32;
3574   case MVT::v4i8:
3575     return  MVT::v4i16;
3576   }
3577 }
3578 
3579 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
3580                                                  const EVT &OrigTy,
3581                                                  const EVT &ExtTy,
3582                                                  unsigned ExtOpcode) {
3583   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
3584   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
3585   // 64-bits we need to insert a new extension so that it will be 64-bits.
3586   assert(ExtTy.is128BitVector() && "Unexpected extension size");
3587   if (OrigTy.getSizeInBits() >= 64)
3588     return N;
3589 
3590   // Must extend size to at least 64 bits to be used as an operand for VMULL.
3591   EVT NewVT = getExtensionTo64Bits(OrigTy);
3592 
3593   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
3594 }
3595 
3596 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
3597                                    bool isSigned) {
3598   EVT VT = N->getValueType(0);
3599 
3600   if (N->getOpcode() != ISD::BUILD_VECTOR)
3601     return false;
3602 
3603   for (const SDValue &Elt : N->op_values()) {
3604     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
3605       unsigned EltSize = VT.getScalarSizeInBits();
3606       unsigned HalfSize = EltSize / 2;
3607       if (isSigned) {
3608         if (!isIntN(HalfSize, C->getSExtValue()))
3609           return false;
3610       } else {
3611         if (!isUIntN(HalfSize, C->getZExtValue()))
3612           return false;
3613       }
3614       continue;
3615     }
3616     return false;
3617   }
3618 
3619   return true;
3620 }
3621 
3622 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
3623   if (N->getOpcode() == ISD::SIGN_EXTEND ||
3624       N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
3625     return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
3626                                              N->getOperand(0)->getValueType(0),
3627                                              N->getValueType(0),
3628                                              N->getOpcode());
3629 
3630   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3631   EVT VT = N->getValueType(0);
3632   SDLoc dl(N);
3633   unsigned EltSize = VT.getScalarSizeInBits() / 2;
3634   unsigned NumElts = VT.getVectorNumElements();
3635   MVT TruncVT = MVT::getIntegerVT(EltSize);
3636   SmallVector<SDValue, 8> Ops;
3637   for (unsigned i = 0; i != NumElts; ++i) {
3638     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3639     const APInt &CInt = C->getAPIntValue();
3640     // Element types smaller than 32 bits are not legal, so use i32 elements.
3641     // The values are implicitly truncated so sext vs. zext doesn't matter.
3642     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3643   }
3644   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3645 }
3646 
3647 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
3648   return N->getOpcode() == ISD::SIGN_EXTEND ||
3649          N->getOpcode() == ISD::ANY_EXTEND ||
3650          isExtendedBUILD_VECTOR(N, DAG, true);
3651 }
3652 
3653 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
3654   return N->getOpcode() == ISD::ZERO_EXTEND ||
3655          N->getOpcode() == ISD::ANY_EXTEND ||
3656          isExtendedBUILD_VECTOR(N, DAG, false);
3657 }
3658 
3659 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3660   unsigned Opcode = N->getOpcode();
3661   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3662     SDNode *N0 = N->getOperand(0).getNode();
3663     SDNode *N1 = N->getOperand(1).getNode();
3664     return N0->hasOneUse() && N1->hasOneUse() &&
3665       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3666   }
3667   return false;
3668 }
3669 
3670 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3671   unsigned Opcode = N->getOpcode();
3672   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3673     SDNode *N0 = N->getOperand(0).getNode();
3674     SDNode *N1 = N->getOperand(1).getNode();
3675     return N0->hasOneUse() && N1->hasOneUse() &&
3676       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3677   }
3678   return false;
3679 }
3680 
3681 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3682                                                 SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
3687   SDLoc dl(Op);
3688 
3689   SDValue Chain = Op.getOperand(0);
3690   SDValue FPCR_64 = DAG.getNode(
3691       ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
3692       {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
3693   Chain = FPCR_64.getValue(1);
3694   SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
3695   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
3696                                   DAG.getConstant(1U << 22, dl, MVT::i32));
3697   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3698                               DAG.getConstant(22, dl, MVT::i32));
3699   SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3700                             DAG.getConstant(3, dl, MVT::i32));
3701   return DAG.getMergeValues({AND, Chain}, dl);
3702 }
3703 
3704 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
3705                                                  SelectionDAG &DAG) const {
3706   SDLoc DL(Op);
3707   SDValue Chain = Op->getOperand(0);
3708   SDValue RMValue = Op->getOperand(1);
3709 
3710   // The rounding mode is in bits 23:22 of the FPCR.
3711   // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
3712   // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
3714   //
3715   // The argument of llvm.set.rounding must be within the segment [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
  // code that generates llvm.set.rounding to ensure this condition.
3718 
3719   // Calculate new value of FPCR[23:22].
3720   RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
3721                         DAG.getConstant(1, DL, MVT::i32));
3722   RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
3723                         DAG.getConstant(0x3, DL, MVT::i32));
3724   RMValue =
3725       DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
3726                   DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
3727   RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
3728 
3729   // Get current value of FPCR.
3730   SDValue Ops[] = {
3731       Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
3732   SDValue FPCR =
3733       DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
3734   Chain = FPCR.getValue(1);
3735   FPCR = FPCR.getValue(0);
3736 
  // Put the new rounding mode into FPCR[23:22].
3738   const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
3739   FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
3740                      DAG.getConstant(RMMask, DL, MVT::i64));
3741   FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
3742   SDValue Ops2[] = {
3743       Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
3744       FPCR};
3745   return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
3746 }
3747 
3748 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
3749   EVT VT = Op.getValueType();
3750 
3751   // If SVE is available then i64 vector multiplications can also be made legal.
3752   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
3753 
3754   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
3755     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
3756 
3757   // Multiplications are only custom-lowered for 128-bit vectors so that
3758   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
3759   assert(VT.is128BitVector() && VT.isInteger() &&
3760          "unexpected type for custom-lowering ISD::MUL");
3761   SDNode *N0 = Op.getOperand(0).getNode();
3762   SDNode *N1 = Op.getOperand(1).getNode();
3763   unsigned NewOpc = 0;
3764   bool isMLA = false;
3765   bool isN0SExt = isSignExtended(N0, DAG);
3766   bool isN1SExt = isSignExtended(N1, DAG);
3767   if (isN0SExt && isN1SExt)
3768     NewOpc = AArch64ISD::SMULL;
3769   else {
3770     bool isN0ZExt = isZeroExtended(N0, DAG);
3771     bool isN1ZExt = isZeroExtended(N1, DAG);
3772     if (isN0ZExt && isN1ZExt)
3773       NewOpc = AArch64ISD::UMULL;
3774     else if (isN1SExt || isN1ZExt) {
3775       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3776       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3777       if (isN1SExt && isAddSubSExt(N0, DAG)) {
3778         NewOpc = AArch64ISD::SMULL;
3779         isMLA = true;
3780       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3781         NewOpc =  AArch64ISD::UMULL;
3782         isMLA = true;
3783       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3784         std::swap(N0, N1);
3785         NewOpc =  AArch64ISD::UMULL;
3786         isMLA = true;
3787       }
3788     }
3789 
3790     if (!NewOpc) {
3791       if (VT == MVT::v2i64)
3792         // Fall through to expand this.  It is not legal.
3793         return SDValue();
3794       else
3795         // Other vector multiplications are legal.
3796         return Op;
3797     }
3798   }
3799 
3800   // Legalize to a S/UMULL instruction
3801   SDLoc DL(Op);
3802   SDValue Op0;
3803   SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
3804   if (!isMLA) {
3805     Op0 = skipExtensionForVectorMULL(N0, DAG);
3806     assert(Op0.getValueType().is64BitVector() &&
3807            Op1.getValueType().is64BitVector() &&
3808            "unexpected types for extended operands to VMULL");
3809     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3810   }
  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering, to exploit the no-stall back-to-back S/UMULL + S/UMLAL on
  // CPUs with accumulate forwarding such as Cortex-A53/A57.
3814   SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
3815   SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
3816   EVT Op1VT = Op1.getValueType();
3817   return DAG.getNode(N0->getOpcode(), DL, VT,
3818                      DAG.getNode(NewOpc, DL, VT,
3819                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3820                      DAG.getNode(NewOpc, DL, VT,
3821                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3822 }
3823 
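// Create an SVE PTRUE node of type VT with the given predicate pattern
// (e.g. AArch64SVEPredPattern::all).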
3824 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3825                                int Pattern) {
3826   return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3827                      DAG.getTargetConstant(Pattern, DL, MVT::i32));
3828 }
3829 
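// Lower aarch64_sve_convert_to_svbool.  The operand is reinterpreted as an
// svbool; unless its inactive lanes are already known to be zero (a zeroing
// compare or a ptrue), the lanes introduced by the widening are masked off.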
3830 static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
3831   SDLoc DL(Op);
3832   EVT OutVT = Op.getValueType();
3833   SDValue InOp = Op.getOperand(1);
3834   EVT InVT = InOp.getValueType();
3835 
3836   // Return the operand if the cast isn't changing type,
3837   // i.e. <n x 16 x i1> -> <n x 16 x i1>
3838   if (InVT == OutVT)
3839     return InOp;
3840 
3841   SDValue Reinterpret =
3842       DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp);
3843 
3844   // If the argument converted to an svbool is a ptrue or a comparison, the
3845   // lanes introduced by the widening are zero by construction.
3846   switch (InOp.getOpcode()) {
3847   case AArch64ISD::SETCC_MERGE_ZERO:
3848     return Reinterpret;
3849   case ISD::INTRINSIC_WO_CHAIN:
3850     if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue)
3851       return Reinterpret;
3852   }
3853 
3854   // Otherwise, zero the newly introduced lanes.
3855   SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
3856   SDValue MaskReinterpret =
3857       DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask);
3858   return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
3859 }
3860 
3861 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3862                                                      SelectionDAG &DAG) const {
3863   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3864   SDLoc dl(Op);
3865   switch (IntNo) {
3866   default: return SDValue();    // Don't custom lower most intrinsics.
3867   case Intrinsic::thread_pointer: {
3868     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3869     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3870   }
3871   case Intrinsic::aarch64_neon_abs: {
3872     EVT Ty = Op.getValueType();
3873     if (Ty == MVT::i64) {
3874       SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
3875                                    Op.getOperand(1));
3876       Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3877       return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3878     } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3879       return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3880     } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
3882     }
3883   }
3884   case Intrinsic::aarch64_neon_smax:
3885     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3886                        Op.getOperand(1), Op.getOperand(2));
3887   case Intrinsic::aarch64_neon_umax:
3888     return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3889                        Op.getOperand(1), Op.getOperand(2));
3890   case Intrinsic::aarch64_neon_smin:
3891     return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3892                        Op.getOperand(1), Op.getOperand(2));
3893   case Intrinsic::aarch64_neon_umin:
3894     return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3895                        Op.getOperand(1), Op.getOperand(2));
3896 
3897   case Intrinsic::aarch64_sve_sunpkhi:
3898     return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3899                        Op.getOperand(1));
3900   case Intrinsic::aarch64_sve_sunpklo:
3901     return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3902                        Op.getOperand(1));
3903   case Intrinsic::aarch64_sve_uunpkhi:
3904     return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3905                        Op.getOperand(1));
3906   case Intrinsic::aarch64_sve_uunpklo:
3907     return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3908                        Op.getOperand(1));
3909   case Intrinsic::aarch64_sve_clasta_n:
3910     return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3911                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3912   case Intrinsic::aarch64_sve_clastb_n:
3913     return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3914                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3915   case Intrinsic::aarch64_sve_lasta:
3916     return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3917                        Op.getOperand(1), Op.getOperand(2));
3918   case Intrinsic::aarch64_sve_lastb:
3919     return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3920                        Op.getOperand(1), Op.getOperand(2));
3921   case Intrinsic::aarch64_sve_rev:
3922     return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
3923                        Op.getOperand(1));
3924   case Intrinsic::aarch64_sve_tbl:
3925     return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3926                        Op.getOperand(1), Op.getOperand(2));
3927   case Intrinsic::aarch64_sve_trn1:
3928     return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3929                        Op.getOperand(1), Op.getOperand(2));
3930   case Intrinsic::aarch64_sve_trn2:
3931     return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3932                        Op.getOperand(1), Op.getOperand(2));
3933   case Intrinsic::aarch64_sve_uzp1:
3934     return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3935                        Op.getOperand(1), Op.getOperand(2));
3936   case Intrinsic::aarch64_sve_uzp2:
3937     return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3938                        Op.getOperand(1), Op.getOperand(2));
3939   case Intrinsic::aarch64_sve_zip1:
3940     return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3941                        Op.getOperand(1), Op.getOperand(2));
3942   case Intrinsic::aarch64_sve_zip2:
3943     return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3944                        Op.getOperand(1), Op.getOperand(2));
3945   case Intrinsic::aarch64_sve_splice:
3946     return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
3947                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3948   case Intrinsic::aarch64_sve_ptrue:
3949     return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3950                        Op.getOperand(1));
3951   case Intrinsic::aarch64_sve_clz:
3952     return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
3953                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3954   case Intrinsic::aarch64_sve_cnt: {
3955     SDValue Data = Op.getOperand(3);
3956     // CTPOP only supports integer operands.
3957     if (Data.getValueType().isFloatingPoint())
3958       Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
3959     return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
3960                        Op.getOperand(2), Data, Op.getOperand(1));
3961   }
3962   case Intrinsic::aarch64_sve_dupq_lane:
3963     return LowerDUPQLane(Op, DAG);
3964   case Intrinsic::aarch64_sve_convert_from_svbool:
3965     return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3966                        Op.getOperand(1));
3967   case Intrinsic::aarch64_sve_convert_to_svbool:
3968     return lowerConvertToSVBool(Op, DAG);
3969   case Intrinsic::aarch64_sve_fneg:
3970     return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3971                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3972   case Intrinsic::aarch64_sve_frintp:
3973     return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
3974                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3975   case Intrinsic::aarch64_sve_frintm:
3976     return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
3977                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3978   case Intrinsic::aarch64_sve_frinti:
3979     return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3980                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3981   case Intrinsic::aarch64_sve_frintx:
3982     return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3983                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3984   case Intrinsic::aarch64_sve_frinta:
3985     return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
3986                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3987   case Intrinsic::aarch64_sve_frintn:
3988     return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
3989                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3990   case Intrinsic::aarch64_sve_frintz:
3991     return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
3992                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3993   case Intrinsic::aarch64_sve_ucvtf:
3994     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
3995                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3996                        Op.getOperand(1));
3997   case Intrinsic::aarch64_sve_scvtf:
3998     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
3999                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4000                        Op.getOperand(1));
4001   case Intrinsic::aarch64_sve_fcvtzu:
4002     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
4003                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4004                        Op.getOperand(1));
4005   case Intrinsic::aarch64_sve_fcvtzs:
4006     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
4007                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4008                        Op.getOperand(1));
4009   case Intrinsic::aarch64_sve_fsqrt:
4010     return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4011                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4012   case Intrinsic::aarch64_sve_frecpx:
4013     return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4014                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4015   case Intrinsic::aarch64_sve_fabs:
4016     return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4017                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4018   case Intrinsic::aarch64_sve_abs:
4019     return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4020                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4021   case Intrinsic::aarch64_sve_neg:
4022     return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4023                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4024   case Intrinsic::aarch64_sve_insr: {
4025     SDValue Scalar = Op.getOperand(2);
4026     EVT ScalarTy = Scalar.getValueType();
4027     if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4028       Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4029 
4030     return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4031                        Op.getOperand(1), Scalar);
4032   }
4033   case Intrinsic::aarch64_sve_rbit:
4034     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
4035                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4036                        Op.getOperand(1));
4037   case Intrinsic::aarch64_sve_revb:
4038     return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4039                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4040   case Intrinsic::aarch64_sve_sxtb:
4041     return DAG.getNode(
4042         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4043         Op.getOperand(2), Op.getOperand(3),
4044         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4045         Op.getOperand(1));
4046   case Intrinsic::aarch64_sve_sxth:
4047     return DAG.getNode(
4048         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4049         Op.getOperand(2), Op.getOperand(3),
4050         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4051         Op.getOperand(1));
4052   case Intrinsic::aarch64_sve_sxtw:
4053     return DAG.getNode(
4054         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4055         Op.getOperand(2), Op.getOperand(3),
4056         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4057         Op.getOperand(1));
4058   case Intrinsic::aarch64_sve_uxtb:
4059     return DAG.getNode(
4060         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4061         Op.getOperand(2), Op.getOperand(3),
4062         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4063         Op.getOperand(1));
4064   case Intrinsic::aarch64_sve_uxth:
4065     return DAG.getNode(
4066         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4067         Op.getOperand(2), Op.getOperand(3),
4068         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4069         Op.getOperand(1));
4070   case Intrinsic::aarch64_sve_uxtw:
4071     return DAG.getNode(
4072         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4073         Op.getOperand(2), Op.getOperand(3),
4074         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4075         Op.getOperand(1));
4076 
4077   case Intrinsic::localaddress: {
4078     const auto &MF = DAG.getMachineFunction();
4079     const auto *RegInfo = Subtarget->getRegisterInfo();
4080     unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4081     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4082                               Op.getSimpleValueType());
4083   }
4084 
4085   case Intrinsic::eh_recoverfp: {
4086     // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer to
    // D53541 for more details.
4089     SDValue FnOp = Op.getOperand(1);
4090     SDValue IncomingFPOp = Op.getOperand(2);
4091     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
4092     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4093     if (!Fn)
4094       report_fatal_error(
4095           "llvm.eh.recoverfp must take a function as the first argument");
4096     return IncomingFPOp;
4097   }
4098 
4099   case Intrinsic::aarch64_neon_vsri:
4100   case Intrinsic::aarch64_neon_vsli: {
4101     EVT Ty = Op.getValueType();
4102 
4103     if (!Ty.isVector())
4104       report_fatal_error("Unexpected type for aarch64_neon_vsli");
4105 
4106     assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4107 
4108     bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4109     unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4110     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4111                        Op.getOperand(3));
4112   }
4113 
4114   case Intrinsic::aarch64_neon_srhadd:
4115   case Intrinsic::aarch64_neon_urhadd:
4116   case Intrinsic::aarch64_neon_shadd:
4117   case Intrinsic::aarch64_neon_uhadd: {
4118     bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4119                         IntNo == Intrinsic::aarch64_neon_shadd);
4120     bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4121                           IntNo == Intrinsic::aarch64_neon_urhadd);
4122     unsigned Opcode =
4123         IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
4124                     : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
4125     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4126                        Op.getOperand(2));
4127   }
4128   case Intrinsic::aarch64_neon_sabd:
4129   case Intrinsic::aarch64_neon_uabd: {
4130     unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4131                                                             : ISD::ABDS;
4132     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4133                        Op.getOperand(2));
4134   }
4135   case Intrinsic::aarch64_neon_uaddlp: {
4136     unsigned Opcode = AArch64ISD::UADDLP;
4137     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4138   }
4139   case Intrinsic::aarch64_neon_sdot:
4140   case Intrinsic::aarch64_neon_udot:
4141   case Intrinsic::aarch64_sve_sdot:
4142   case Intrinsic::aarch64_sve_udot: {
4143     unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
4144                        IntNo == Intrinsic::aarch64_sve_udot)
4145                           ? AArch64ISD::UDOT
4146                           : AArch64ISD::SDOT;
4147     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4148                        Op.getOperand(2), Op.getOperand(3));
4149   }
4150   }
4151 }
4152 
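// Request that gather/scatter indices with i8/i16 elements be extended to
// i32; wider index element types are used as-is.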
4153 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
4154   if (VT.getVectorElementType() == MVT::i8 ||
4155       VT.getVectorElementType() == MVT::i16) {
4156     EltTy = MVT::i32;
4157     return true;
4158   }
4159   return false;
4160 }
4161 
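// Allow the extend of a gather/scatter index to be folded away when the index
// is a scalable vector of at least four i32 elements.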
4162 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
4163   if (VT.getVectorElementType() == MVT::i32 &&
4164       VT.getVectorElementCount().getKnownMinValue() >= 4 &&
4165       !VT.isFixedLengthVector())
4166     return true;
4167 
4168   return false;
4169 }
4170 
4171 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
4172   return ExtVal.getValueType().isScalableVector();
4173 }
4174 
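// Select the SVE gather load opcode for the given addressing mode: scaled or
// unscaled offsets, with 32-bit indices optionally sign- or zero-extended.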
4175 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4176   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4177       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4178        AArch64ISD::GLD1_MERGE_ZERO},
4179       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4180        AArch64ISD::GLD1_UXTW_MERGE_ZERO},
4181       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4182        AArch64ISD::GLD1_MERGE_ZERO},
4183       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4184        AArch64ISD::GLD1_SXTW_MERGE_ZERO},
4185       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4186        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4187       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4188        AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
4189       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4190        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4191       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4192        AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
4193   };
4194   auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4195   return AddrModes.find(Key)->second;
4196 }
4197 
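// Select the SVE scatter store opcode for the given addressing mode; this
// mirrors getGatherVecOpcode above.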
4198 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4199   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4200       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4201        AArch64ISD::SST1_PRED},
4202       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4203        AArch64ISD::SST1_UXTW_PRED},
4204       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4205        AArch64ISD::SST1_PRED},
4206       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4207        AArch64ISD::SST1_SXTW_PRED},
4208       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4209        AArch64ISD::SST1_SCALED_PRED},
4210       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4211        AArch64ISD::SST1_UXTW_SCALED_PRED},
4212       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4213        AArch64ISD::SST1_SCALED_PRED},
4214       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4215        AArch64ISD::SST1_SXTW_SCALED_PRED},
4216   };
4217   auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4218   return AddrModes.find(Key)->second;
4219 }
4220 
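// Map a zero-extending gather load opcode to its sign-extending (GLD1S)
// equivalent.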
4221 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
4222   switch (Opcode) {
4223   default:
4224     llvm_unreachable("unimplemented opcode");
4225     return Opcode;
4226   case AArch64ISD::GLD1_MERGE_ZERO:
4227     return AArch64ISD::GLD1S_MERGE_ZERO;
4228   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
4229     return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
4230   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
4231     return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
4232   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
4233     return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
4234   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
4235     return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
4236   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
4237     return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
4238   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
4239     return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
4240   }
4241 }
4242 
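// Returns true if Index is an extension of a 32-bit index, i.e. either a
// SIGN_EXTEND_INREG node or an AND with a splatted 0xFFFFFFFF mask.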
4243 bool getGatherScatterIndexIsExtended(SDValue Index) {
4244   unsigned Opcode = Index.getOpcode();
4245   if (Opcode == ISD::SIGN_EXTEND_INREG)
4246     return true;
4247 
4248   if (Opcode == ISD::AND) {
4249     SDValue Splat = Index.getOperand(1);
4250     if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
4251       return false;
4252     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
4253     if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
4254       return false;
4255     return true;
4256   }
4257 
4258   return false;
4259 }
4260 
4261 // If the base pointer of a masked gather or scatter is null, we
4262 // may be able to swap BasePtr & Index and use the vector + register
4263 // or vector + immediate addressing mode, e.g.
4264 // VECTOR + REGISTER:
//    getelementptr nullptr, <vscale x N x T> (splat(%offset) + %indices)
// -> getelementptr %offset, <vscale x N x T> %indices
// VECTOR + IMMEDIATE:
//    getelementptr nullptr, <vscale x N x T> (splat(#x) + %indices)
4269 // -> getelementptr #x, <vscale x N x T> %indices
4270 void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
4271                                  unsigned &Opcode, bool IsGather,
4272                                  SelectionDAG &DAG) {
4273   if (!isNullConstant(BasePtr))
4274     return;
4275 
4276   // FIXME: This will not match for fixed vector type codegen as the nodes in
4277   // question will have fixed<->scalable conversions around them. This should be
  // moved to a DAG combine or complex pattern so that it executes after all of
  // the fixed vector inserts and extracts have been removed. This deficiency
4280   // will result in a sub-optimal addressing mode being used, i.e. an ADD not
4281   // being folded into the scatter/gather.
4282   ConstantSDNode *Offset = nullptr;
4283   if (Index.getOpcode() == ISD::ADD)
4284     if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
4285       if (isa<ConstantSDNode>(SplatVal))
4286         Offset = cast<ConstantSDNode>(SplatVal);
4287       else {
4288         BasePtr = SplatVal;
4289         Index = Index->getOperand(0);
4290         return;
4291       }
4292     }
4293 
4294   unsigned NewOp =
4295       IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
4296 
4297   if (!Offset) {
4298     std::swap(BasePtr, Index);
4299     Opcode = NewOp;
4300     return;
4301   }
4302 
4303   uint64_t OffsetVal = Offset->getZExtValue();
4304   unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
4305   auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
4306 
4307   if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
4308     // Index is out of range for the immediate addressing mode
4309     BasePtr = ConstOffset;
4310     Index = Index->getOperand(0);
4311     return;
4312   }
4313 
4314   // Immediate is in range
4315   Opcode = NewOp;
4316   BasePtr = Index->getOperand(0);
4317   Index = ConstOffset;
4318 }
4319 
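// Custom lower MGATHER to an SVE gather load.  The opcode is chosen from the
// index type and extension kind, a null base pointer is folded into the
// vector-plus-register or vector-plus-immediate forms, and floating-point
// data is gathered as integers and bitcast back.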
4320 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
4321                                             SelectionDAG &DAG) const {
4322   SDLoc DL(Op);
4323   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
4324   assert(MGT && "Can only custom lower gather load nodes");
4325 
4326   bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
4327 
4328   SDValue Index = MGT->getIndex();
4329   SDValue Chain = MGT->getChain();
4330   SDValue PassThru = MGT->getPassThru();
4331   SDValue Mask = MGT->getMask();
4332   SDValue BasePtr = MGT->getBasePtr();
4333   ISD::LoadExtType ExtTy = MGT->getExtensionType();
4334 
4335   ISD::MemIndexType IndexType = MGT->getIndexType();
4336   bool IsScaled =
4337       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4338   bool IsSigned =
4339       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4340   bool IdxNeedsExtend =
4341       getGatherScatterIndexIsExtended(Index) ||
4342       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4343   bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
4344 
4345   EVT VT = PassThru.getSimpleValueType();
4346   EVT IndexVT = Index.getSimpleValueType();
4347   EVT MemVT = MGT->getMemoryVT();
4348   SDValue InputVT = DAG.getValueType(MemVT);
4349 
4350   if (VT.getVectorElementType() == MVT::bf16 &&
4351       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4352     return SDValue();
4353 
4354   if (IsFixedLength) {
4355     assert(Subtarget->useSVEForFixedLengthVectors() &&
4356            "Cannot lower when not using SVE for fixed vectors");
4357     if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4358       IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
4359       MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4360     } else {
4361       MemVT = getContainerForFixedLengthVector(DAG, MemVT);
4362       IndexVT = MemVT.changeTypeToInteger();
4363     }
4364     InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4365     Mask = DAG.getNode(
4366         ISD::ZERO_EXTEND, DL,
4367         VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4368   }
4369 
4370   if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4371     PassThru = SDValue();
4372 
4373   if (VT.isFloatingPoint() && !IsFixedLength) {
4374     // Handle FP data by using an integer gather and casting the result.
4375     if (PassThru) {
4376       EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4377       PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
4378     }
4379     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4380   }
4381 
4382   SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
4383 
4384   if (getGatherScatterIndexIsExtended(Index))
4385     Index = Index.getOperand(0);
4386 
4387   unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
4388   selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4389                               /*isGather=*/true, DAG);
4390 
4391   if (ResNeedsSignExtend)
4392     Opcode = getSignExtendedGatherOpcode(Opcode);
4393 
4394   if (IsFixedLength) {
4395     if (Index.getSimpleValueType().isFixedLengthVector())
4396       Index = convertToScalableVector(DAG, IndexVT, Index);
4397     if (BasePtr.getSimpleValueType().isFixedLengthVector())
4398       BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4399     Mask = convertFixedMaskToScalableVector(Mask, DAG);
4400   }
4401 
4402   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
4403   SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
4404   Chain = Result.getValue(1);
4405 
4406   if (IsFixedLength) {
4407     Result = convertFromScalableVector(
4408         DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
4409         Result);
4410     Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
4411     Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
4412 
4413     if (PassThru)
4414       Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
4415   } else {
4416     if (PassThru)
4417       Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
4418 
4419     if (VT.isFloatingPoint())
4420       Result = getSVESafeBitCast(VT, Result, DAG);
4421   }
4422 
4423   return DAG.getMergeValues({Result, Chain}, DL);
4424 }
4425 
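// Custom lower MSCATTER to an SVE scatter store, using the same addressing
// mode selection as LowerMGATHER above.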
4426 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
4427                                              SelectionDAG &DAG) const {
4428   SDLoc DL(Op);
4429   MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
4430   assert(MSC && "Can only custom lower scatter store nodes");
4431 
4432   bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
4433 
4434   SDValue Index = MSC->getIndex();
4435   SDValue Chain = MSC->getChain();
4436   SDValue StoreVal = MSC->getValue();
4437   SDValue Mask = MSC->getMask();
4438   SDValue BasePtr = MSC->getBasePtr();
4439 
4440   ISD::MemIndexType IndexType = MSC->getIndexType();
4441   bool IsScaled =
4442       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4443   bool IsSigned =
4444       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4445   bool NeedsExtend =
4446       getGatherScatterIndexIsExtended(Index) ||
4447       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4448 
4449   EVT VT = StoreVal.getSimpleValueType();
4450   EVT IndexVT = Index.getSimpleValueType();
4451   SDVTList VTs = DAG.getVTList(MVT::Other);
4452   EVT MemVT = MSC->getMemoryVT();
4453   SDValue InputVT = DAG.getValueType(MemVT);
4454 
4455   if (VT.getVectorElementType() == MVT::bf16 &&
4456       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4457     return SDValue();
4458 
4459   if (IsFixedLength) {
4460     assert(Subtarget->useSVEForFixedLengthVectors() &&
4461            "Cannot lower when not using SVE for fixed vectors");
4462     if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) {
4463       IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
4464       MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
4465     } else {
4466       MemVT = getContainerForFixedLengthVector(DAG, MemVT);
4467       IndexVT = MemVT.changeTypeToInteger();
4468     }
4469     InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
4470 
4471     StoreVal =
4472         DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
4473     StoreVal = DAG.getNode(
4474         ISD::ANY_EXTEND, DL,
4475         VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
4476     StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
4477     Mask = DAG.getNode(
4478         ISD::ZERO_EXTEND, DL,
4479         VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
4480   } else if (VT.isFloatingPoint()) {
4481     // Handle FP data by casting the data so an integer scatter can be used.
4482     EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4483     StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4484     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4485   }
4486 
4487   if (getGatherScatterIndexIsExtended(Index))
4488     Index = Index.getOperand(0);
4489 
4490   unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4491   selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4492                               /*isGather=*/false, DAG);
4493 
4494   if (IsFixedLength) {
4495     if (Index.getSimpleValueType().isFixedLengthVector())
4496       Index = convertToScalableVector(DAG, IndexVT, Index);
4497     if (BasePtr.getSimpleValueType().isFixedLengthVector())
4498       BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
4499     Mask = convertFixedMaskToScalableVector(Mask, DAG);
4500   }
4501 
4502   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4503   return DAG.getNode(Opcode, DL, VTs, Ops);
4504 }
4505 
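// Custom lower MLOAD.  Fixed-length vectors are lowered via SVE; otherwise a
// non-trivial passthru is handled by loading into undef and selecting the
// passthru value for the inactive lanes.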
4506 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
4507   SDLoc DL(Op);
4508   MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
4509   assert(LoadNode && "Expected custom lowering of a masked load node");
4510   EVT VT = Op->getValueType(0);
4511 
4512   if (useSVEForFixedLengthVectorVT(VT, true))
4513     return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
4514 
4515   SDValue PassThru = LoadNode->getPassThru();
4516   SDValue Mask = LoadNode->getMask();
4517 
4518   if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
4519     return Op;
4520 
4521   SDValue Load = DAG.getMaskedLoad(
4522       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
4523       LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
4524       LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
4525       LoadNode->getExtensionType());
4526 
4527   SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
4528 
4529   return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
4530 }
4531 
4532 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
4533 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
4534                                         EVT VT, EVT MemVT,
4535                                         SelectionDAG &DAG) {
4536   assert(VT.isVector() && "VT should be a vector type");
4537   assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4538 
4539   SDValue Value = ST->getValue();
4540 
  // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
  // the word lane which represents the v4i8 subvector.  This optimizes the
  // store to:
4544   //
4545   //   xtn  v0.8b, v0.8h
4546   //   str  s0, [x0]
4547 
4548   SDValue Undef = DAG.getUNDEF(MVT::i16);
4549   SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
4550                                         {Undef, Undef, Undef, Undef});
4551 
4552   SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
4553                                  Value, UndefVec);
4554   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
4555 
4556   Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4557   SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
4558                                      Trunc, DAG.getConstant(0, DL, MVT::i64));
4559 
4560   return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4561                       ST->getBasePtr(), ST->getMemOperand());
4562 }
4563 
// Custom lowering for vector and scalar stores.  This currently handles
// truncating stores from v4i16 to v4i8, 256-bit non-temporal stores, volatile
// stores of i128, i64x8 stores, and fixed-length vector stores lowered to SVE.
4567 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4568                                           SelectionDAG &DAG) const {
4569   SDLoc Dl(Op);
4570   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert(StoreNode && "Can only custom lower store nodes");
4572 
4573   SDValue Value = StoreNode->getValue();
4574 
4575   EVT VT = Value.getValueType();
4576   EVT MemVT = StoreNode->getMemoryVT();
4577 
4578   if (VT.isVector()) {
4579     if (useSVEForFixedLengthVectorVT(VT, true))
4580       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4581 
4582     unsigned AS = StoreNode->getAddressSpace();
4583     Align Alignment = StoreNode->getAlign();
4584     if (Alignment < MemVT.getStoreSize() &&
4585         !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
4586                                         StoreNode->getMemOperand()->getFlags(),
4587                                         nullptr)) {
4588       return scalarizeVectorStore(StoreNode, DAG);
4589     }
4590 
4591     if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
4592         MemVT == MVT::v4i8) {
4593       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4594     }
4595     // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4596     // the custom lowering, as there are no un-paired non-temporal stores and
4597     // legalization will break up 256 bit inputs.
4598     ElementCount EC = MemVT.getVectorElementCount();
4599     if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
4600         EC.isKnownEven() &&
4601         ((MemVT.getScalarSizeInBits() == 8u ||
4602           MemVT.getScalarSizeInBits() == 16u ||
4603           MemVT.getScalarSizeInBits() == 32u ||
4604           MemVT.getScalarSizeInBits() == 64u))) {
4605       SDValue Lo =
4606           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4607                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4608                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
4609       SDValue Hi =
4610           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4611                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4612                       StoreNode->getValue(),
4613                       DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
4614       SDValue Result = DAG.getMemIntrinsicNode(
4615           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
4616           {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4617           StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4618       return Result;
4619     }
4620   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
4621     assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
4622     SDValue Lo =
4623         DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4624                     DAG.getConstant(0, Dl, MVT::i64));
4625     SDValue Hi =
4626         DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4627                     DAG.getConstant(1, Dl, MVT::i64));
4628     SDValue Result = DAG.getMemIntrinsicNode(
4629         AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
4630         {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4631         StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4632     return Result;
4633   } else if (MemVT == MVT::i64x8) {
4634     SDValue Value = StoreNode->getValue();
4635     assert(Value->getValueType(0) == MVT::i64x8);
4636     SDValue Chain = StoreNode->getChain();
4637     SDValue Base = StoreNode->getBasePtr();
4638     EVT PtrVT = Base.getValueType();
4639     for (unsigned i = 0; i < 8; i++) {
4640       SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
4641                                  Value, DAG.getConstant(i, Dl, MVT::i32));
4642       SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
4643                                 DAG.getConstant(i * 8, Dl, PtrVT));
4644       Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
4645                            StoreNode->getOriginalAlign());
4646     }
4647     return Chain;
4648   }
4649 
4650   return SDValue();
4651 }
4652 
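// Custom lower LOAD nodes: i64x8 (LS64) loads are split into eight i64 loads,
// and extending loads of v4i8 are performed as a 32-bit scalar load followed
// by a vector extend.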
4653 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
4654                                          SelectionDAG &DAG) const {
4655   SDLoc DL(Op);
4656   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
4657   assert(LoadNode && "Expected custom lowering of a load node");
4658 
4659   if (LoadNode->getMemoryVT() == MVT::i64x8) {
4660     SmallVector<SDValue, 8> Ops;
4661     SDValue Base = LoadNode->getBasePtr();
4662     SDValue Chain = LoadNode->getChain();
4663     EVT PtrVT = Base.getValueType();
4664     for (unsigned i = 0; i < 8; i++) {
4665       SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
4666                                 DAG.getConstant(i * 8, DL, PtrVT));
4667       SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
4668                                  LoadNode->getPointerInfo(),
4669                                  LoadNode->getOriginalAlign());
4670       Ops.push_back(Part);
4671       Chain = SDValue(Part.getNode(), 1);
4672     }
4673     SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
4674     return DAG.getMergeValues({Loaded, Chain}, DL);
4675   }
4676 
4677   // Custom lowering for extending v4i8 vector loads.
4678   EVT VT = Op->getValueType(0);
4679   assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
4680 
4681   if (LoadNode->getMemoryVT() != MVT::v4i8)
4682     return SDValue();
4683 
4684   unsigned ExtType;
4685   if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
4686     ExtType = ISD::SIGN_EXTEND;
4687   else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
4688            LoadNode->getExtensionType() == ISD::EXTLOAD)
4689     ExtType = ISD::ZERO_EXTEND;
4690   else
4691     return SDValue();
4692 
4693   SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
4694                              LoadNode->getBasePtr(), MachinePointerInfo());
4695   SDValue Chain = Load.getValue(1);
4696   SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
4697   SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
4698   SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
4699   Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
4700                     DAG.getConstant(0, DL, MVT::i64));
4701   if (VT == MVT::v4i32)
4702     Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
4703   return DAG.getMergeValues({Ext, Chain}, DL);
4704 }
4705 
4706 // Generate SUBS and CSEL for integer abs.
4707 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
4708   MVT VT = Op.getSimpleValueType();
4709 
4710   if (VT.isVector())
4711     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
4712 
4713   SDLoc DL(Op);
4714   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
4715                             Op.getOperand(0));
4716   // Generate SUBS & CSEL.
4717   SDValue Cmp =
4718       DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
4719                   Op.getOperand(0), DAG.getConstant(0, DL, VT));
4720   return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
4721                      DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
4722                      Cmp.getValue(1));
4723 }
4724 
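// Dispatch an operation marked as Custom to its dedicated lowering routine.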
4725 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
4726                                               SelectionDAG &DAG) const {
4727   LLVM_DEBUG(dbgs() << "Custom lowering: ");
4728   LLVM_DEBUG(Op.dump());
4729 
4730   switch (Op.getOpcode()) {
4731   default:
4732     llvm_unreachable("unimplemented operand");
4733     return SDValue();
4734   case ISD::BITCAST:
4735     return LowerBITCAST(Op, DAG);
4736   case ISD::GlobalAddress:
4737     return LowerGlobalAddress(Op, DAG);
4738   case ISD::GlobalTLSAddress:
4739     return LowerGlobalTLSAddress(Op, DAG);
4740   case ISD::SETCC:
4741   case ISD::STRICT_FSETCC:
4742   case ISD::STRICT_FSETCCS:
4743     return LowerSETCC(Op, DAG);
4744   case ISD::BR_CC:
4745     return LowerBR_CC(Op, DAG);
4746   case ISD::SELECT:
4747     return LowerSELECT(Op, DAG);
4748   case ISD::SELECT_CC:
4749     return LowerSELECT_CC(Op, DAG);
4750   case ISD::JumpTable:
4751     return LowerJumpTable(Op, DAG);
4752   case ISD::BR_JT:
4753     return LowerBR_JT(Op, DAG);
4754   case ISD::ConstantPool:
4755     return LowerConstantPool(Op, DAG);
4756   case ISD::BlockAddress:
4757     return LowerBlockAddress(Op, DAG);
4758   case ISD::VASTART:
4759     return LowerVASTART(Op, DAG);
4760   case ISD::VACOPY:
4761     return LowerVACOPY(Op, DAG);
4762   case ISD::VAARG:
4763     return LowerVAARG(Op, DAG);
4764   case ISD::ADDC:
4765   case ISD::ADDE:
4766   case ISD::SUBC:
4767   case ISD::SUBE:
4768     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
4769   case ISD::SADDO:
4770   case ISD::UADDO:
4771   case ISD::SSUBO:
4772   case ISD::USUBO:
4773   case ISD::SMULO:
4774   case ISD::UMULO:
4775     return LowerXALUO(Op, DAG);
4776   case ISD::FADD:
4777     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
4778   case ISD::FSUB:
4779     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
4780   case ISD::FMUL:
4781     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
4782   case ISD::FMA:
4783     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
4784   case ISD::FDIV:
4785     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
4786   case ISD::FNEG:
4787     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
4788   case ISD::FCEIL:
4789     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
4790   case ISD::FFLOOR:
4791     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
4792   case ISD::FNEARBYINT:
4793     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
4794   case ISD::FRINT:
4795     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
4796   case ISD::FROUND:
4797     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
4798   case ISD::FROUNDEVEN:
4799     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
4800   case ISD::FTRUNC:
4801     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
4802   case ISD::FSQRT:
4803     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
4804   case ISD::FABS:
4805     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
4806   case ISD::FP_ROUND:
4807   case ISD::STRICT_FP_ROUND:
4808     return LowerFP_ROUND(Op, DAG);
4809   case ISD::FP_EXTEND:
4810     return LowerFP_EXTEND(Op, DAG);
4811   case ISD::FRAMEADDR:
4812     return LowerFRAMEADDR(Op, DAG);
4813   case ISD::SPONENTRY:
4814     return LowerSPONENTRY(Op, DAG);
4815   case ISD::RETURNADDR:
4816     return LowerRETURNADDR(Op, DAG);
4817   case ISD::ADDROFRETURNADDR:
4818     return LowerADDROFRETURNADDR(Op, DAG);
4819   case ISD::CONCAT_VECTORS:
4820     return LowerCONCAT_VECTORS(Op, DAG);
4821   case ISD::INSERT_VECTOR_ELT:
4822     return LowerINSERT_VECTOR_ELT(Op, DAG);
4823   case ISD::EXTRACT_VECTOR_ELT:
4824     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
4825   case ISD::BUILD_VECTOR:
4826     return LowerBUILD_VECTOR(Op, DAG);
4827   case ISD::VECTOR_SHUFFLE:
4828     return LowerVECTOR_SHUFFLE(Op, DAG);
4829   case ISD::SPLAT_VECTOR:
4830     return LowerSPLAT_VECTOR(Op, DAG);
4831   case ISD::EXTRACT_SUBVECTOR:
4832     return LowerEXTRACT_SUBVECTOR(Op, DAG);
4833   case ISD::INSERT_SUBVECTOR:
4834     return LowerINSERT_SUBVECTOR(Op, DAG);
4835   case ISD::SDIV:
4836   case ISD::UDIV:
4837     return LowerDIV(Op, DAG);
4838   case ISD::SMIN:
4839     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
4840                                /*OverrideNEON=*/true);
4841   case ISD::UMIN:
4842     return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
4843                                /*OverrideNEON=*/true);
4844   case ISD::SMAX:
4845     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
4846                                /*OverrideNEON=*/true);
4847   case ISD::UMAX:
4848     return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
4849                                /*OverrideNEON=*/true);
4850   case ISD::SRA:
4851   case ISD::SRL:
4852   case ISD::SHL:
4853     return LowerVectorSRA_SRL_SHL(Op, DAG);
4854   case ISD::SHL_PARTS:
4855   case ISD::SRL_PARTS:
4856   case ISD::SRA_PARTS:
4857     return LowerShiftParts(Op, DAG);
4858   case ISD::CTPOP:
4859     return LowerCTPOP(Op, DAG);
4860   case ISD::FCOPYSIGN:
4861     return LowerFCOPYSIGN(Op, DAG);
4862   case ISD::OR:
4863     return LowerVectorOR(Op, DAG);
4864   case ISD::XOR:
4865     return LowerXOR(Op, DAG);
4866   case ISD::PREFETCH:
4867     return LowerPREFETCH(Op, DAG);
4868   case ISD::SINT_TO_FP:
4869   case ISD::UINT_TO_FP:
4870   case ISD::STRICT_SINT_TO_FP:
4871   case ISD::STRICT_UINT_TO_FP:
4872     return LowerINT_TO_FP(Op, DAG);
4873   case ISD::FP_TO_SINT:
4874   case ISD::FP_TO_UINT:
4875   case ISD::STRICT_FP_TO_SINT:
4876   case ISD::STRICT_FP_TO_UINT:
4877     return LowerFP_TO_INT(Op, DAG);
4878   case ISD::FP_TO_SINT_SAT:
4879   case ISD::FP_TO_UINT_SAT:
4880     return LowerFP_TO_INT_SAT(Op, DAG);
4881   case ISD::FSINCOS:
4882     return LowerFSINCOS(Op, DAG);
4883   case ISD::FLT_ROUNDS_:
4884     return LowerFLT_ROUNDS_(Op, DAG);
4885   case ISD::SET_ROUNDING:
4886     return LowerSET_ROUNDING(Op, DAG);
4887   case ISD::MUL:
4888     return LowerMUL(Op, DAG);
4889   case ISD::MULHS:
4890     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED,
4891                                /*OverrideNEON=*/true);
4892   case ISD::MULHU:
4893     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
4894                                /*OverrideNEON=*/true);
4895   case ISD::INTRINSIC_WO_CHAIN:
4896     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4897   case ISD::STORE:
4898     return LowerSTORE(Op, DAG);
4899   case ISD::MSTORE:
4900     return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
4901   case ISD::MGATHER:
4902     return LowerMGATHER(Op, DAG);
4903   case ISD::MSCATTER:
4904     return LowerMSCATTER(Op, DAG);
4905   case ISD::VECREDUCE_SEQ_FADD:
4906     return LowerVECREDUCE_SEQ_FADD(Op, DAG);
4907   case ISD::VECREDUCE_ADD:
4908   case ISD::VECREDUCE_AND:
4909   case ISD::VECREDUCE_OR:
4910   case ISD::VECREDUCE_XOR:
4911   case ISD::VECREDUCE_SMAX:
4912   case ISD::VECREDUCE_SMIN:
4913   case ISD::VECREDUCE_UMAX:
4914   case ISD::VECREDUCE_UMIN:
4915   case ISD::VECREDUCE_FADD:
4916   case ISD::VECREDUCE_FMAX:
4917   case ISD::VECREDUCE_FMIN:
4918     return LowerVECREDUCE(Op, DAG);
4919   case ISD::ATOMIC_LOAD_SUB:
4920     return LowerATOMIC_LOAD_SUB(Op, DAG);
4921   case ISD::ATOMIC_LOAD_AND:
4922     return LowerATOMIC_LOAD_AND(Op, DAG);
4923   case ISD::DYNAMIC_STACKALLOC:
4924     return LowerDYNAMIC_STACKALLOC(Op, DAG);
4925   case ISD::VSCALE:
4926     return LowerVSCALE(Op, DAG);
4927   case ISD::ANY_EXTEND:
4928   case ISD::SIGN_EXTEND:
4929   case ISD::ZERO_EXTEND:
4930     return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
4931   case ISD::SIGN_EXTEND_INREG: {
    // Only custom lower when ExtraVT has a legal byte-based element type.
4933     EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4934     EVT ExtraEltVT = ExtraVT.getVectorElementType();
4935     if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
4936         (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
4937       return SDValue();
4938 
4939     return LowerToPredicatedOp(Op, DAG,
4940                                AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
4941   }
4942   case ISD::TRUNCATE:
4943     return LowerTRUNCATE(Op, DAG);
4944   case ISD::MLOAD:
4945     return LowerMLOAD(Op, DAG);
4946   case ISD::LOAD:
4947     if (useSVEForFixedLengthVectorVT(Op.getValueType()))
4948       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
4949     return LowerLOAD(Op, DAG);
4950   case ISD::ADD:
4951     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
4952   case ISD::AND:
4953     return LowerToScalableOp(Op, DAG);
4954   case ISD::SUB:
4955     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
4956   case ISD::FMAXIMUM:
4957     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
4958   case ISD::FMAXNUM:
4959     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
4960   case ISD::FMINIMUM:
4961     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
4962   case ISD::FMINNUM:
4963     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
4964   case ISD::VSELECT:
4965     return LowerFixedLengthVectorSelectToSVE(Op, DAG);
4966   case ISD::ABS:
4967     return LowerABS(Op, DAG);
4968   case ISD::BITREVERSE:
4969     return LowerBitreverse(Op, DAG);
4970   case ISD::BSWAP:
4971     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
4972   case ISD::CTLZ:
4973     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
4974                                /*OverrideNEON=*/true);
4975   case ISD::CTTZ:
4976     return LowerCTTZ(Op, DAG);
4977   case ISD::VECTOR_SPLICE:
4978     return LowerVECTOR_SPLICE(Op, DAG);
4979   }
4980 }
4981 
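// Store merging after legalization is only allowed when fixed-length vectors
// are not being lowered via SVE.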
4982 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
4983   return !Subtarget->useSVEForFixedLengthVectors();
4984 }
4985 
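// Returns true when the fixed-length vector type VT should be lowered via
// SVE: the element type must be supported, the vector must be a power-of-two
// size that fits within the minimum SVE vector length, and it must be wider
// than a NEON register unless OverrideNEON is set.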
4986 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
4987     EVT VT, bool OverrideNEON) const {
4988   if (!Subtarget->useSVEForFixedLengthVectors())
4989     return false;
4990 
4991   if (!VT.isFixedLengthVector())
4992     return false;
4993 
4994   // Don't use SVE for vectors we cannot scalarize if required.
4995   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  // Fixed-length predicates should be promoted to i8.
  // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) work.
4998   case MVT::i1:
4999   default:
5000     return false;
5001   case MVT::i8:
5002   case MVT::i16:
5003   case MVT::i32:
5004   case MVT::i64:
5005   case MVT::f16:
5006   case MVT::f32:
5007   case MVT::f64:
5008     break;
5009   }
5010 
  // All SVE implementations support NEON-sized vectors.
5012   if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5013     return true;
5014 
5015   // Ensure NEON MVTs only belong to a single register class.
5016   if (VT.getFixedSizeInBits() <= 128)
5017     return false;
5018 
5019   // Don't use SVE for types that don't fit.
5020   if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5021     return false;
5022 
5023   // TODO: Perhaps an artificial restriction, but worth having whilst getting
5024   // the base fixed length SVE support in place.
5025   if (!VT.isPow2VectorType())
5026     return false;
5027 
5028   return true;
5029 }
5030 
5031 //===----------------------------------------------------------------------===//
5032 //                      Calling Convention Implementation
5033 //===----------------------------------------------------------------------===//
5034 
5035 /// Selects the correct CCAssignFn for a given CallingConvention value.
5036 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5037                                                      bool IsVarArg) const {
5038   switch (CC) {
5039   default:
5040     report_fatal_error("Unsupported calling convention.");
5041   case CallingConv::WebKit_JS:
5042     return CC_AArch64_WebKit_JS;
5043   case CallingConv::GHC:
5044     return CC_AArch64_GHC;
5045   case CallingConv::C:
5046   case CallingConv::Fast:
5047   case CallingConv::PreserveMost:
5048   case CallingConv::CXX_FAST_TLS:
5049   case CallingConv::Swift:
5050   case CallingConv::SwiftTail:
5051   case CallingConv::Tail:
5052     if (Subtarget->isTargetWindows() && IsVarArg)
5053       return CC_AArch64_Win64_VarArg;
5054     if (!Subtarget->isTargetDarwin())
5055       return CC_AArch64_AAPCS;
5056     if (!IsVarArg)
5057       return CC_AArch64_DarwinPCS;
5058     return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5059                                       : CC_AArch64_DarwinPCS_VarArg;
  case CallingConv::Win64:
    return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
  case CallingConv::CFGuard_Check:
    return CC_AArch64_Win64_CFGuard_Check;
  case CallingConv::AArch64_VectorCall:
  case CallingConv::AArch64_SVE_VectorCall:
    return CC_AArch64_AAPCS;
5067   }
5068 }
5069 
5070 CCAssignFn *
5071 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5072   return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5073                                       : RetCC_AArch64_AAPCS;
5074 }
5075 
5076 SDValue AArch64TargetLowering::LowerFormalArguments(
5077     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
5078     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5079     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5080   MachineFunction &MF = DAG.getMachineFunction();
5081   MachineFrameInfo &MFI = MF.getFrameInfo();
  bool IsWin64 =
      Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5083 
5084   // Assign locations to all of the incoming arguments.
5085   SmallVector<CCValAssign, 16> ArgLocs;
5086   DenseMap<unsigned, SDValue> CopiedRegs;
5087   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
5088                  *DAG.getContext());
5089 
5090   // At this point, Ins[].VT may already be promoted to i32. To correctly
5091   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5092   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5093   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
5094   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
5095   // LocVT.
5096   unsigned NumArgs = Ins.size();
5097   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
5098   unsigned CurArgIdx = 0;
5099   for (unsigned i = 0; i != NumArgs; ++i) {
5100     MVT ValVT = Ins[i].VT;
5101     if (Ins[i].isOrigArg()) {
5102       std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
5103       CurArgIdx = Ins[i].getOrigArgIndex();
5104 
5105       // Get type of the original argument.
5106       EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
5107                                   /*AllowUnknown*/ true);
5108       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
5109       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5110       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5111         ValVT = MVT::i8;
5112       else if (ActualMVT == MVT::i16)
5113         ValVT = MVT::i16;
5114     }
5115     bool UseVarArgCC = false;
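    // On Win64 every argument of a variadic function, fixed or not, is passed
    // in GPRs, so classify the formal arguments with the vararg CC.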
5116     if (IsWin64)
5117       UseVarArgCC = isVarArg;
5118     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5119     bool Res =
5120         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
5121     assert(!Res && "Call operand has unhandled type");
5122     (void)Res;
5123   }
5124   SmallVector<SDValue, 16> ArgValues;
5125   unsigned ExtraArgLocs = 0;
5126   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5127     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5128 
5129     if (Ins[i].Flags.isByVal()) {
5130       // Byval is used for HFAs in the PCS, but the system should work in a
5131       // non-compliant manner for larger structs.
5132       EVT PtrVT = getPointerTy(DAG.getDataLayout());
5133       int Size = Ins[i].Flags.getByValSize();
5134       unsigned NumRegs = (Size + 7) / 8;
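      // The byval data already lives in the caller-allocated argument area, so
      // expose it through a fixed frame index covering its 8-byte slots rather
      // than copying it.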
5135 
      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types.
5138       unsigned FrameIdx =
5139         MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
5140       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
5141       InVals.push_back(FrameIdxN);
5142 
5143       continue;
5144     }
5145 
5146     if (Ins[i].Flags.isSwiftAsync())
5147       MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5148 
5149     SDValue ArgValue;
5150     if (VA.isRegLoc()) {
5151       // Arguments stored in registers.
5152       EVT RegVT = VA.getLocVT();
5153       const TargetRegisterClass *RC;
5154 
5155       if (RegVT == MVT::i32)
5156         RC = &AArch64::GPR32RegClass;
5157       else if (RegVT == MVT::i64)
5158         RC = &AArch64::GPR64RegClass;
5159       else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
5160         RC = &AArch64::FPR16RegClass;
5161       else if (RegVT == MVT::f32)
5162         RC = &AArch64::FPR32RegClass;
5163       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
5164         RC = &AArch64::FPR64RegClass;
5165       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
5166         RC = &AArch64::FPR128RegClass;
5167       else if (RegVT.isScalableVector() &&
5168                RegVT.getVectorElementType() == MVT::i1)
5169         RC = &AArch64::PPRRegClass;
5170       else if (RegVT.isScalableVector())
5171         RC = &AArch64::ZPRRegClass;
5172       else
5173         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
5174 
5175       // Transform the arguments in physical registers into virtual ones.
5176       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
5177       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
5178 
5179       // If this is an 8, 16 or 32-bit value, it is really passed promoted
5180       // to 64 bits.  Insert an assert[sz]ext to capture this, then
5181       // truncate to the right size.
5182       switch (VA.getLocInfo()) {
5183       default:
5184         llvm_unreachable("Unknown loc info!");
5185       case CCValAssign::Full:
5186         break;
5187       case CCValAssign::Indirect:
5188         assert(VA.getValVT().isScalableVector() &&
5189                "Only scalable vectors can be passed indirectly");
5190         break;
5191       case CCValAssign::BCvt:
5192         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
5193         break;
5194       case CCValAssign::AExt:
5195       case CCValAssign::SExt:
5196       case CCValAssign::ZExt:
5197         break;
5198       case CCValAssign::AExtUpper:
5199         ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
5200                                DAG.getConstant(32, DL, RegVT));
5201         ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
5202         break;
5203       }
5204     } else { // VA.isRegLoc()
5205       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
5206       unsigned ArgOffset = VA.getLocMemOffset();
5207       unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
5208                               ? VA.getLocVT().getSizeInBits()
5209                               : VA.getValVT().getSizeInBits()) / 8;
5210 
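      // Small arguments occupy the least significant bytes of their 8-byte
      // slot, which sit at higher addresses on big-endian targets, so bump the
      // offset to point at the value itself.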
5211       uint32_t BEAlign = 0;
5212       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
5213           !Ins[i].Flags.isInConsecutiveRegs())
5214         BEAlign = 8 - ArgSize;
5215 
5216       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
5217 
5218       // Create load nodes to retrieve arguments from the stack.
5219       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
5220 
      // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
5222       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
5223       MVT MemVT = VA.getValVT();
5224 
5225       switch (VA.getLocInfo()) {
5226       default:
5227         break;
5228       case CCValAssign::Trunc:
5229       case CCValAssign::BCvt:
5230         MemVT = VA.getLocVT();
5231         break;
5232       case CCValAssign::Indirect:
5233         assert(VA.getValVT().isScalableVector() &&
5234                "Only scalable vectors can be passed indirectly");
5235         MemVT = VA.getLocVT();
5236         break;
5237       case CCValAssign::SExt:
5238         ExtType = ISD::SEXTLOAD;
5239         break;
5240       case CCValAssign::ZExt:
5241         ExtType = ISD::ZEXTLOAD;
5242         break;
5243       case CCValAssign::AExt:
5244         ExtType = ISD::EXTLOAD;
5245         break;
5246       }
5247 
5248       ArgValue = DAG.getExtLoad(
5249           ExtType, DL, VA.getLocVT(), Chain, FIN,
5250           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
5251           MemVT);
5252     }
5253 
5254     if (VA.getLocInfo() == CCValAssign::Indirect) {
5255       assert(VA.getValVT().isScalableVector() &&
             "Only scalable vectors can be passed indirectly");
5257 
5258       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
5259       unsigned NumParts = 1;
5260       if (Ins[i].Flags.isInConsecutiveRegs()) {
5261         assert(!Ins[i].Flags.isInConsecutiveRegsLast());
5262         while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5263           ++NumParts;
5264       }
5265 
5266       MVT PartLoad = VA.getValVT();
5267       SDValue Ptr = ArgValue;
5268 
5269       // Ensure we generate all loads for each tuple part, whilst updating the
5270       // pointer after each load correctly using vscale.
5271       while (NumParts > 0) {
5272         ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
5273         InVals.push_back(ArgValue);
5274         NumParts--;
5275         if (NumParts > 0) {
5276           SDValue BytesIncrement = DAG.getVScale(
5277               DL, Ptr.getValueType(),
5278               APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5279           SDNodeFlags Flags;
5280           Flags.setNoUnsignedWrap(true);
5281           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5282                             BytesIncrement, Flags);
5283           ExtraArgLocs++;
5284           i++;
5285         }
5286       }
5287     } else {
5288       if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
5289         ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
5290                                ArgValue, DAG.getValueType(MVT::i32));
5291       InVals.push_back(ArgValue);
5292     }
5293   }
5294   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
5295 
5296   // varargs
5297   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5298   if (isVarArg) {
5299     if (!Subtarget->isTargetDarwin() || IsWin64) {
5300       // The AAPCS variadic function ABI is identical to the non-variadic
5301       // one. As a result there may be more arguments in registers and we should
5302       // save them for future reference.
5303       // Win64 variadic functions also pass arguments in registers, but all float
5304       // arguments are passed in integer registers.
5305       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
5306     }
5307 
5308     // This will point to the next argument passed via stack.
5309     unsigned StackOffset = CCInfo.getNextStackOffset();
5310     // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
5311     StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
5312     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
5313 
5314     if (MFI.hasMustTailInVarArgFunc()) {
5315       SmallVector<MVT, 2> RegParmTypes;
5316       RegParmTypes.push_back(MVT::i64);
5317       RegParmTypes.push_back(MVT::f128);
5318       // Compute the set of forwarded registers. The rest are scratch.
5319       SmallVectorImpl<ForwardedRegister> &Forwards =
5320                                        FuncInfo->getForwardedMustTailRegParms();
5321       CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
5322                                                CC_AArch64_AAPCS);
5323 
5324       // Conservatively forward X8, since it might be used for aggregate return.
5325       if (!CCInfo.isAllocated(AArch64::X8)) {
5326         unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
5327         Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
5328       }
5329     }
5330   }
5331 
5332   // On Windows, InReg pointers must be returned, so record the pointer in a
5333   // virtual register at the start of the function so it can be returned in the
5334   // epilogue.
5335   if (IsWin64) {
5336     for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
5337       if (Ins[I].Flags.isInReg()) {
5338         assert(!FuncInfo->getSRetReturnReg());
5339 
5340         MVT PtrTy = getPointerTy(DAG.getDataLayout());
5341         Register Reg =
5342             MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
5343         FuncInfo->setSRetReturnReg(Reg);
5344 
5345         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
5346         Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
5347         break;
5348       }
5349     }
5350   }
5351 
5352   unsigned StackArgSize = CCInfo.getNextStackOffset();
5353   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5354   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
5355     // This is a non-standard ABI so by fiat I say we're allowed to make full
5356     // use of the stack area to be popped, which must be aligned to 16 bytes in
5357     // any case:
5358     StackArgSize = alignTo(StackArgSize, 16);
5359 
5360     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
5361     // a multiple of 16.
5362     FuncInfo->setArgumentStackToRestore(StackArgSize);
5363 
5364     // This realignment carries over to the available bytes below. Our own
5365     // callers will guarantee the space is free by giving an aligned value to
5366     // CALLSEQ_START.
5367   }
5368   // Even if we're not expected to free up the space, it's useful to know how
5369   // much is there while considering tail calls (because we can reuse it).
5370   FuncInfo->setBytesInStackArgArea(StackArgSize);
5371 
5372   if (Subtarget->hasCustomCallingConv())
5373     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
5374 
5375   return Chain;
5376 }
5377 
5378 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
5379                                                 SelectionDAG &DAG,
5380                                                 const SDLoc &DL,
5381                                                 SDValue &Chain) const {
5382   MachineFunction &MF = DAG.getMachineFunction();
5383   MachineFrameInfo &MFI = MF.getFrameInfo();
5384   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5385   auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 =
      Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
5387 
5388   SmallVector<SDValue, 8> MemOps;
5389 
5390   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
5391                                           AArch64::X3, AArch64::X4, AArch64::X5,
5392                                           AArch64::X6, AArch64::X7 };
5393   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
5394   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
5395 
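  // Save any unallocated GPR argument registers to the stack so that va_start
  // and va_arg can find them later.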
5396   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
5397   int GPRIdx = 0;
5398   if (GPRSaveSize != 0) {
5399     if (IsWin64) {
5400       GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
5401       if (GPRSaveSize & 15)
5402         // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
                              -(int)alignTo(GPRSaveSize, 16), false);
5404     } else
5405       GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
5406 
5407     SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
5408 
5409     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
5410       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
5411       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
5412       SDValue Store = DAG.getStore(
5413           Val.getValue(1), DL, Val, FIN,
5414           IsWin64
5415               ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
5416                                                   GPRIdx,
5417                                                   (i - FirstVariadicGPR) * 8)
5418               : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
5419       MemOps.push_back(Store);
5420       FIN =
5421           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
5422     }
5423   }
5424   FuncInfo->setVarArgsGPRIndex(GPRIdx);
5425   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
5426 
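  // Win64 varargs pass floating-point arguments in integer registers, so an
  // FPR save area is only needed for the non-Windows conventions.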
5427   if (Subtarget->hasFPARMv8() && !IsWin64) {
5428     static const MCPhysReg FPRArgRegs[] = {
5429         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
5430         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
5431     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
5432     unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
5433 
5434     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
5435     int FPRIdx = 0;
5436     if (FPRSaveSize != 0) {
5437       FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
5438 
5439       SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
5440 
5441       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
5442         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
5443         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
5444 
5445         SDValue Store = DAG.getStore(
5446             Val.getValue(1), DL, Val, FIN,
5447             MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
5448         MemOps.push_back(Store);
5449         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
5450                           DAG.getConstant(16, DL, PtrVT));
5451       }
5452     }
5453     FuncInfo->setVarArgsFPRIndex(FPRIdx);
5454     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
5455   }
5456 
5457   if (!MemOps.empty()) {
5458     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
5459   }
5460 }
5461 
5462 /// LowerCallResult - Lower the result values of a call into the
5463 /// appropriate copies out of appropriate physical registers.
5464 SDValue AArch64TargetLowering::LowerCallResult(
5465     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
5466     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5467     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
5468     SDValue ThisVal) const {
5469   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
5470   // Assign locations to each value returned by this call.
5471   SmallVector<CCValAssign, 16> RVLocs;
5472   DenseMap<unsigned, SDValue> CopiedRegs;
5473   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5474                  *DAG.getContext());
5475   CCInfo.AnalyzeCallResult(Ins, RetCC);
5476 
5477   // Copy all of the result registers out of their specified physreg.
5478   for (unsigned i = 0; i != RVLocs.size(); ++i) {
5479     CCValAssign VA = RVLocs[i];
5480 
5481     // Pass 'this' value directly from the argument to return value, to avoid
5482     // reg unit interference
5483     if (i == 0 && isThisReturn) {
5484       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
5485              "unexpected return calling convention register assignment");
5486       InVals.push_back(ThisVal);
5487       continue;
5488     }
5489 
5490     // Avoid copying a physreg twice since RegAllocFast is incompetent and only
5491     // allows one use of a physreg per block.
5492     SDValue Val = CopiedRegs.lookup(VA.getLocReg());
5493     if (!Val) {
5494       Val =
5495           DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
5496       Chain = Val.getValue(1);
5497       InFlag = Val.getValue(2);
5498       CopiedRegs[VA.getLocReg()] = Val;
5499     }
5500 
5501     switch (VA.getLocInfo()) {
5502     default:
5503       llvm_unreachable("Unknown loc info!");
5504     case CCValAssign::Full:
5505       break;
5506     case CCValAssign::BCvt:
5507       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
5508       break;
5509     case CCValAssign::AExtUpper:
5510       Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
5511                         DAG.getConstant(32, DL, VA.getLocVT()));
5512       LLVM_FALLTHROUGH;
5513     case CCValAssign::AExt:
5514       LLVM_FALLTHROUGH;
5515     case CCValAssign::ZExt:
5516       Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
5517       break;
5518     }
5519 
5520     InVals.push_back(Val);
5521   }
5522 
5523   return Chain;
5524 }
5525 
5526 /// Return true if the calling convention is one that we can guarantee TCO for.
5527 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
5528   return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
5529          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
5530 }
5531 
5532 /// Return true if we might ever do TCO for calls with this calling convention.
5533 static bool mayTailCallThisCC(CallingConv::ID CC) {
5534   switch (CC) {
5535   case CallingConv::C:
5536   case CallingConv::AArch64_SVE_VectorCall:
5537   case CallingConv::PreserveMost:
5538   case CallingConv::Swift:
5539   case CallingConv::SwiftTail:
5540   case CallingConv::Tail:
5541   case CallingConv::Fast:
5542     return true;
5543   default:
5544     return false;
5545   }
5546 }
5547 
5548 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
5549     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
5550     const SmallVectorImpl<ISD::OutputArg> &Outs,
5551     const SmallVectorImpl<SDValue> &OutVals,
5552     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5553   if (!mayTailCallThisCC(CalleeCC))
5554     return false;
5555 
5556   MachineFunction &MF = DAG.getMachineFunction();
5557   const Function &CallerF = MF.getFunction();
5558   CallingConv::ID CallerCC = CallerF.getCallingConv();
5559 
5560   // Functions using the C or Fast calling convention that have an SVE signature
5561   // preserve more registers and should assume the SVE_VectorCall CC.
5562   // The check for matching callee-saved regs will determine whether it is
5563   // eligible for TCO.
5564   if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
5565       AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
5566     CallerCC = CallingConv::AArch64_SVE_VectorCall;
5567 
5568   bool CCMatch = CallerCC == CalleeCC;
5569 
5570   // When using the Windows calling convention on a non-windows OS, we want
5571   // to back up and restore X18 in such functions; we can't do a tail call
5572   // from those functions.
5573   if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
5574       CalleeCC != CallingConv::Win64)
5575     return false;
5576 
5577   // Byval parameters hand the function a pointer directly into the stack area
5578   // we want to reuse during a tail call. Working around this *is* possible (see
5579   // X86) but less efficient and uglier in LowerCall.
5580   for (Function::const_arg_iterator i = CallerF.arg_begin(),
5581                                     e = CallerF.arg_end();
5582        i != e; ++i) {
5583     if (i->hasByValAttr())
5584       return false;
5585 
5586     // On Windows, "inreg" attributes signify non-aggregate indirect returns.
5587     // In this case, it is necessary to save/restore X0 in the callee. Tail
5588     // call opt interferes with this. So we disable tail call opt when the
5589     // caller has an argument with "inreg" attribute.
5590 
5591     // FIXME: Check whether the callee also has an "inreg" argument.
5592     if (i->hasInRegAttr())
5593       return false;
5594   }
5595 
5596   if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
5597     return CCMatch;
5598 
5599   // Externally-defined functions with weak linkage should not be
5600   // tail-called on AArch64 when the OS does not support dynamic
5601   // pre-emption of symbols, as the AAELF spec requires normal calls
5602   // to undefined weak functions to be replaced with a NOP or jump to the
5603   // next instruction. The behaviour of branch instructions in this
5604   // situation (as used for tail calls) is implementation-defined, so we
5605   // cannot rely on the linker replacing the tail call with a return.
5606   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5607     const GlobalValue *GV = G->getGlobal();
5608     const Triple &TT = getTargetMachine().getTargetTriple();
5609     if (GV->hasExternalWeakLinkage() &&
5610         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
5611       return false;
5612   }
5613 
5614   // Now we search for cases where we can use a tail call without changing the
5615   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
5616   // concept.
5617 
5618   // I want anyone implementing a new calling convention to think long and hard
5619   // about this assert.
5620   assert((!isVarArg || CalleeCC == CallingConv::C) &&
5621          "Unexpected variadic calling convention");
5622 
5623   LLVMContext &C = *DAG.getContext();
5624   if (isVarArg && !Outs.empty()) {
5625     // At least two cases here: if caller is fastcc then we can't have any
5626     // memory arguments (we'd be expected to clean up the stack afterwards). If
5627     // caller is C then we could potentially use its argument area.
5628 
5629     // FIXME: for now we take the most conservative of these in both cases:
5630     // disallow all variadic memory operands.
5631     SmallVector<CCValAssign, 16> ArgLocs;
5632     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5633 
5634     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
5635     for (const CCValAssign &ArgLoc : ArgLocs)
5636       if (!ArgLoc.isRegLoc())
5637         return false;
5638   }
5639 
5640   // Check that the call results are passed in the same way.
5641   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5642                                   CCAssignFnForCall(CalleeCC, isVarArg),
5643                                   CCAssignFnForCall(CallerCC, isVarArg)))
5644     return false;
5645   // The callee has to preserve all registers the caller needs to preserve.
5646   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5647   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5648   if (!CCMatch) {
5649     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5650     if (Subtarget->hasCustomCallingConv()) {
5651       TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5652       TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5653     }
5654     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5655       return false;
5656   }
5657 
5658   // Nothing more to check if the callee is taking no arguments
5659   if (Outs.empty())
5660     return true;
5661 
5662   SmallVector<CCValAssign, 16> ArgLocs;
5663   CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5664 
5665   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5666 
5667   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5668 
5669   // If any of the arguments is passed indirectly, it must be SVE, so the
5670   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine this explicitly
  // here: such a call cannot be a tail call.
5673   if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5674         assert((A.getLocInfo() != CCValAssign::Indirect ||
5675                 A.getValVT().isScalableVector()) &&
5676                "Expected value to be scalable");
5677         return A.getLocInfo() == CCValAssign::Indirect;
5678       }))
5679     return false;
5680 
5681   // If the stack arguments for this call do not fit into our own save area then
5682   // the call cannot be made tail.
5683   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5684     return false;
5685 
5686   const MachineRegisterInfo &MRI = MF.getRegInfo();
5687   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5688     return false;
5689 
5690   return true;
5691 }
5692 
5693 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5694                                                    SelectionDAG &DAG,
5695                                                    MachineFrameInfo &MFI,
5696                                                    int ClobberedFI) const {
5697   SmallVector<SDValue, 8> ArgChains;
5698   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5699   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5700 
5701   // Include the original chain at the beginning of the list. When this is
5702   // used by target LowerCall hooks, this helps legalize find the
5703   // CALLSEQ_BEGIN node.
5704   ArgChains.push_back(Chain);
5705 
5706   // Add a chain value for each stack argument corresponding
5707   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
5708                             UE = DAG.getEntryNode().getNode()->use_end();
5709        U != UE; ++U)
5710     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5711       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5712         if (FI->getIndex() < 0) {
5713           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5714           int64_t InLastByte = InFirstByte;
5715           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5716 
5717           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5718               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5719             ArgChains.push_back(SDValue(L, 1));
5720         }
5721 
5722   // Build a tokenfactor for all the chains.
5723   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
5724 }
5725 
5726 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
5727                                                    bool TailCallOpt) const {
5728   return (CallCC == CallingConv::Fast && TailCallOpt) ||
5729          CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
5730 }
5731 
5732 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
5733 /// and add input and output parameter nodes.
5734 SDValue
5735 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
5736                                  SmallVectorImpl<SDValue> &InVals) const {
5737   SelectionDAG &DAG = CLI.DAG;
5738   SDLoc &DL = CLI.DL;
5739   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
5740   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
5741   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
5742   SDValue Chain = CLI.Chain;
5743   SDValue Callee = CLI.Callee;
5744   bool &IsTailCall = CLI.IsTailCall;
5745   CallingConv::ID CallConv = CLI.CallConv;
5746   bool IsVarArg = CLI.IsVarArg;
5747 
5748   MachineFunction &MF = DAG.getMachineFunction();
5749   MachineFunction::CallSiteInfo CSInfo;
5750   bool IsThisReturn = false;
5751 
5752   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5753   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5754   bool IsSibCall = false;
5755   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv);
5756 
5757   // Check callee args/returns for SVE registers and set calling convention
5758   // accordingly.
5759   if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
5760     bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
5761       return Out.VT.isScalableVector();
5762     });
5763     bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
5764       return In.VT.isScalableVector();
5765     });
5766 
5767     if (CalleeInSVE || CalleeOutSVE)
5768       CallConv = CallingConv::AArch64_SVE_VectorCall;
5769   }
5770 
5771   if (IsTailCall) {
5772     // Check if it's really possible to do a tail call.
5773     IsTailCall = isEligibleForTailCallOptimization(
5774         Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
5775 
5776     // A sibling call is one where we're under the usual C ABI and not planning
5777     // to change that but can still do a tail call:
5778     if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
5779         CallConv != CallingConv::SwiftTail)
5780       IsSibCall = true;
5781 
5782     if (IsTailCall)
5783       ++NumTailCalls;
5784   }
5785 
5786   if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
5787     report_fatal_error("failed to perform tail call elimination on a call "
5788                        "site marked musttail");
5789 
5790   // Analyze operands of the call, assigning locations to each operand.
5791   SmallVector<CCValAssign, 16> ArgLocs;
5792   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
5793                  *DAG.getContext());
5794 
5795   if (IsVarArg) {
5796     // Handle fixed and variable vector arguments differently.
5797     // Variable vector arguments always go into memory.
5798     unsigned NumArgs = Outs.size();
5799 
5800     for (unsigned i = 0; i != NumArgs; ++i) {
5801       MVT ArgVT = Outs[i].VT;
5802       if (!Outs[i].IsFixed && ArgVT.isScalableVector())
5803         report_fatal_error("Passing SVE types to variadic functions is "
5804                            "currently not supported");
5805 
5806       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5807       bool UseVarArgCC = !Outs[i].IsFixed;
5808       // On Windows, the fixed arguments in a vararg call are passed in GPRs
5809       // too, so use the vararg CC to force them to integer registers.
5810       if (IsCalleeWin64)
5811         UseVarArgCC = true;
5812       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5813       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
5814       assert(!Res && "Call operand has unhandled type");
5815       (void)Res;
5816     }
5817   } else {
5818     // At this point, Outs[].VT may already be promoted to i32. To correctly
5819     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5820     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5821     // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
5822     // we use a special version of AnalyzeCallOperands to pass in ValVT and
5823     // LocVT.
5824     unsigned NumArgs = Outs.size();
5825     for (unsigned i = 0; i != NumArgs; ++i) {
5826       MVT ValVT = Outs[i].VT;
5827       // Get type of the original argument.
5828       EVT ActualVT = getValueType(DAG.getDataLayout(),
5829                                   CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
5830                                   /*AllowUnknown*/ true);
5831       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
5832       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5833       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5834       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5835         ValVT = MVT::i8;
5836       else if (ActualMVT == MVT::i16)
5837         ValVT = MVT::i16;
5838 
5839       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
5840       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
5841       assert(!Res && "Call operand has unhandled type");
5842       (void)Res;
5843     }
5844   }
5845 
5846   // Get a count of how many bytes are to be pushed on the stack.
5847   unsigned NumBytes = CCInfo.getNextStackOffset();
5848 
5849   if (IsSibCall) {
5850     // Since we're not changing the ABI to make this a tail call, the memory
5851     // operands are already available in the caller's incoming argument space.
5852     NumBytes = 0;
5853   }
5854 
5855   // FPDiff is the byte offset of the call's argument area from the callee's.
5856   // Stores to callee stack arguments will be placed in FixedStackSlots offset
5857   // by this amount for a tail call. In a sibling call it must be 0 because the
5858   // caller will deallocate the entire stack and the callee still expects its
5859   // arguments to begin at SP+0. Completely unused for non-tail calls.
5860   int FPDiff = 0;
5861 
5862   if (IsTailCall && !IsSibCall) {
5863     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
5864 
5865     // Since callee will pop argument stack as a tail call, we must keep the
5866     // popped size 16-byte aligned.
5867     NumBytes = alignTo(NumBytes, 16);
5868 
5869     // FPDiff will be negative if this tail call requires more space than we
5870     // would automatically have in our incoming argument space. Positive if we
5871     // can actually shrink the stack.
5872     FPDiff = NumReusableBytes - NumBytes;
5873 
5874     // Update the required reserved area if this is the tail call requiring the
5875     // most argument stack space.
5876     if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
5877       FuncInfo->setTailCallReservedStack(-FPDiff);
5878 
5879     // The stack pointer must be 16-byte aligned at all times it's used for a
5880     // memory operation, which in practice means at *all* times and in
5881     // particular across call boundaries. Therefore our own arguments started at
5882     // a 16-byte aligned SP and the delta applied for the tail call should
5883     // satisfy the same constraint.
5884     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
5885   }
5886 
5887   // Adjust the stack pointer for the new arguments...
5888   // These operations are automatically eliminated by the prolog/epilog pass
5889   if (!IsSibCall)
5890     Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
5891 
5892   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
5893                                         getPointerTy(DAG.getDataLayout()));
5894 
5895   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5896   SmallSet<unsigned, 8> RegsUsed;
5897   SmallVector<SDValue, 8> MemOpChains;
5898   auto PtrVT = getPointerTy(DAG.getDataLayout());
5899 
5900   if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
5901     const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
5902     for (const auto &F : Forwards) {
5903       SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
      RegsToPass.emplace_back(F.PReg, Val);
5905     }
5906   }
5907 
5908   // Walk the register/memloc assignments, inserting copies/loads.
5909   unsigned ExtraArgLocs = 0;
5910   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
5911     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5912     SDValue Arg = OutVals[i];
5913     ISD::ArgFlagsTy Flags = Outs[i].Flags;
5914 
5915     // Promote the value if needed.
5916     switch (VA.getLocInfo()) {
5917     default:
5918       llvm_unreachable("Unknown loc info!");
5919     case CCValAssign::Full:
5920       break;
5921     case CCValAssign::SExt:
5922       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
5923       break;
5924     case CCValAssign::ZExt:
5925       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5926       break;
5927     case CCValAssign::AExt:
5928       if (Outs[i].ArgVT == MVT::i1) {
5929         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
5930         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
5931         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
5932       }
5933       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5934       break;
5935     case CCValAssign::AExtUpper:
5936       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5937       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5938       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5939                         DAG.getConstant(32, DL, VA.getLocVT()));
5940       break;
5941     case CCValAssign::BCvt:
5942       Arg = DAG.getBitcast(VA.getLocVT(), Arg);
5943       break;
5944     case CCValAssign::Trunc:
5945       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5946       break;
5947     case CCValAssign::FPExt:
5948       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
5949       break;
5950     case CCValAssign::Indirect:
5951       assert(VA.getValVT().isScalableVector() &&
5952              "Only scalable vectors can be passed indirectly");
5953 
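      // Spill the scalable value to a stack slot below and pass the address of
      // that slot instead of the value itself.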
5954       uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
5955       uint64_t PartSize = StoreSize;
5956       unsigned NumParts = 1;
5957       if (Outs[i].Flags.isInConsecutiveRegs()) {
5958         assert(!Outs[i].Flags.isInConsecutiveRegsLast());
5959         while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5960           ++NumParts;
5961         StoreSize *= NumParts;
5962       }
5963 
5964       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5965       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
5966       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
5967       int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
5968       MFI.setStackID(FI, TargetStackID::ScalableVector);
5969 
5970       MachinePointerInfo MPI =
5971           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
5972       SDValue Ptr = DAG.getFrameIndex(
5973           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5974       SDValue SpillSlot = Ptr;
5975 
5976       // Ensure we generate all stores for each tuple part, whilst updating the
5977       // pointer after each store correctly using vscale.
5978       while (NumParts) {
5979         Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
5980         NumParts--;
5981         if (NumParts > 0) {
5982           SDValue BytesIncrement = DAG.getVScale(
5983               DL, Ptr.getValueType(),
5984               APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5985           SDNodeFlags Flags;
5986           Flags.setNoUnsignedWrap(true);
5987 
5988           MPI = MachinePointerInfo(MPI.getAddrSpace());
5989           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5990                             BytesIncrement, Flags);
5991           ExtraArgLocs++;
5992           i++;
5993         }
5994       }
5995 
5996       Arg = SpillSlot;
5997       break;
5998     }
5999 
6000     if (VA.isRegLoc()) {
6001       if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
6002           Outs[0].VT == MVT::i64) {
6003         assert(VA.getLocVT() == MVT::i64 &&
6004                "unexpected calling convention register assignment");
6005         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
6006                "unexpected use of 'returned'");
6007         IsThisReturn = true;
6008       }
6009       if (RegsUsed.count(VA.getLocReg())) {
6010         // If this register has already been used then we're trying to pack
6011         // parts of an [N x i32] into an X-register. The extension type will
6012         // take care of putting the two halves in the right place but we have to
6013         // combine them.
6014         SDValue &Bits =
6015             llvm::find_if(RegsToPass,
6016                           [=](const std::pair<unsigned, SDValue> &Elt) {
6017                             return Elt.first == VA.getLocReg();
6018                           })
6019                 ->second;
6020         Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6021         // Call site info is used for function's parameter entry value
6022         // tracking. For now we track only simple cases when parameter
6023         // is transferred through whole register.
6024         llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
6025           return ArgReg.Reg == VA.getLocReg();
6026         });
6027       } else {
6028         RegsToPass.emplace_back(VA.getLocReg(), Arg);
6029         RegsUsed.insert(VA.getLocReg());
6030         const TargetOptions &Options = DAG.getTarget().Options;
6031         if (Options.EmitCallSiteInfo)
6032           CSInfo.emplace_back(VA.getLocReg(), i);
6033       }
6034     } else {
6035       assert(VA.isMemLoc());
6036 
6037       SDValue DstAddr;
6038       MachinePointerInfo DstInfo;
6039 
6040       // FIXME: This works on big-endian for composite byvals, which are the
6041       // common case. It should also work for fundamental types too.
6042       uint32_t BEAlign = 0;
6043       unsigned OpSize;
6044       if (VA.getLocInfo() == CCValAssign::Indirect ||
6045           VA.getLocInfo() == CCValAssign::Trunc)
6046         OpSize = VA.getLocVT().getFixedSizeInBits();
6047       else
6048         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
6049                                  : VA.getValVT().getSizeInBits();
6050       OpSize = (OpSize + 7) / 8;
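      // As with incoming arguments, small values on big-endian targets live in
      // the high-address end of their 8-byte slot, so offset the store address.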
6051       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
6052           !Flags.isInConsecutiveRegs()) {
6053         if (OpSize < 8)
6054           BEAlign = 8 - OpSize;
6055       }
6056       unsigned LocMemOffset = VA.getLocMemOffset();
6057       int32_t Offset = LocMemOffset + BEAlign;
6058       SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6059       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6060 
6061       if (IsTailCall) {
6062         Offset = Offset + FPDiff;
6063         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
6064 
6065         DstAddr = DAG.getFrameIndex(FI, PtrVT);
6066         DstInfo =
6067             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
6068 
6069         // Make sure any stack arguments overlapping with where we're storing
6070         // are loaded before this eventual operation. Otherwise they'll be
6071         // clobbered.
6072         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
6073       } else {
6074         SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6075 
6076         DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6077         DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
6078                                                LocMemOffset);
6079       }
6080 
6081       if (Outs[i].Flags.isByVal()) {
6082         SDValue SizeNode =
6083             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
6084         SDValue Cpy = DAG.getMemcpy(
6085             Chain, DL, DstAddr, Arg, SizeNode,
6086             Outs[i].Flags.getNonZeroByValAlign(),
6087             /*isVol = */ false, /*AlwaysInline = */ false,
6088             /*isTailCall = */ false, DstInfo, MachinePointerInfo());
6089 
6090         MemOpChains.push_back(Cpy);
6091       } else {
6092         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
6093         // promoted to a legal register type i32, we should truncate Arg back to
6094         // i1/i8/i16.
6095         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
6096             VA.getValVT() == MVT::i16)
6097           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
6098 
6099         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
6100         MemOpChains.push_back(Store);
6101       }
6102     }
6103   }
6104 
6105   if (!MemOpChains.empty())
6106     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
6107 
6108   // Build a sequence of copy-to-reg nodes chained together with token chain
6109   // and flag operands which copy the outgoing args into the appropriate regs.
6110   SDValue InFlag;
6111   for (auto &RegToPass : RegsToPass) {
6112     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
6113                              RegToPass.second, InFlag);
6114     InFlag = Chain.getValue(1);
6115   }
6116 
6117   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
6118   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
6119   // node so that legalize doesn't hack it.
6120   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6121     auto GV = G->getGlobal();
6122     unsigned OpFlags =
6123         Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
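    // If the function must be referenced through the GOT, load its address
    // with LOADgot; otherwise refer to it directly.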
6124     if (OpFlags & AArch64II::MO_GOT) {
6125       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
6126       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6127     } else {
      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
6130     }
6131   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
6132     if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6133         Subtarget->isTargetMachO()) {
6134       const char *Sym = S->getSymbol();
6135       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
6136       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6137     } else {
6138       const char *Sym = S->getSymbol();
6139       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
6140     }
6141   }
6142 
6143   // We don't usually want to end the call-sequence here because we would tidy
6144   // the frame up *after* the call, however in the ABI-changing tail-call case
6145   // we've carefully laid out the parameters so that when sp is reset they'll be
6146   // in the correct location.
6147   if (IsTailCall && !IsSibCall) {
6148     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
6149                                DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
6150     InFlag = Chain.getValue(1);
6151   }
6152 
6153   std::vector<SDValue> Ops;
6154   Ops.push_back(Chain);
6155   Ops.push_back(Callee);
6156 
6157   if (IsTailCall) {
6158     // Each tail call may have to adjust the stack by a different amount, so
6159     // this information must travel along with the operation for eventual
6160     // consumption by emitEpilogue.
6161     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
6162   }
6163 
6164   // Add argument registers to the end of the list so that they are known live
6165   // into the call.
6166   for (auto &RegToPass : RegsToPass)
6167     Ops.push_back(DAG.getRegister(RegToPass.first,
6168                                   RegToPass.second.getValueType()));
6169 
6170   // Add a register mask operand representing the call-preserved registers.
6171   const uint32_t *Mask;
6172   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6173   if (IsThisReturn) {
6174     // For 'this' returns, use the X0-preserving mask if applicable
6175     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
6176     if (!Mask) {
6177       IsThisReturn = false;
6178       Mask = TRI->getCallPreservedMask(MF, CallConv);
6179     }
6180   } else
6181     Mask = TRI->getCallPreservedMask(MF, CallConv);
6182 
6183   if (Subtarget->hasCustomCallingConv())
6184     TRI->UpdateCustomCallPreservedMask(MF, &Mask);
6185 
6186   if (TRI->isAnyArgRegReserved(MF))
6187     TRI->emitReservedArgRegCallError(MF);
6188 
6189   assert(Mask && "Missing call preserved mask for calling convention");
6190   Ops.push_back(DAG.getRegisterMask(Mask));
6191 
6192   if (InFlag.getNode())
6193     Ops.push_back(InFlag);
6194 
6195   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6196 
  // If we're doing a tail call, use a TC_RETURN here rather than an
6198   // actual call instruction.
6199   if (IsTailCall) {
6200     MF.getFrameInfo().setHasTailCall();
6201     SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
6202     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
6203     return Ret;
6204   }
6205 
6206   unsigned CallOpc = AArch64ISD::CALL;
6207   // Calls with operand bundle "clang.arc.attachedcall" are special. They should
6208   // be expanded to the call, directly followed by a special marker sequence.
6209   // Use the CALL_RVMARKER to do that.
6210   if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
6211     assert(!IsTailCall &&
6212            "tail calls cannot be marked with clang.arc.attachedcall");
6213     CallOpc = AArch64ISD::CALL_RVMARKER;
6214   }
6215 
6216   // Returns a chain and a flag for retval copy to use.
6217   Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
6218   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
6219   InFlag = Chain.getValue(1);
6220   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
6221 
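  // Callee-popped conventions (fastcc under GuaranteedTailCallOpt, tail and
  // swifttail) pop the whole 16-byte aligned argument area; otherwise nothing
  // is popped here.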
6222   uint64_t CalleePopBytes =
6223       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
6224 
6225   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
6226                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
6227                              InFlag, DL);
6228   if (!Ins.empty())
6229     InFlag = Chain.getValue(1);
6230 
6231   // Handle result values, copying them out of physregs into vregs that we
6232   // return.
6233   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
6234                          InVals, IsThisReturn,
6235                          IsThisReturn ? OutVals[0] : SDValue());
6236 }
6237 
6238 bool AArch64TargetLowering::CanLowerReturn(
6239     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
6240     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
6241   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6242   SmallVector<CCValAssign, 16> RVLocs;
6243   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
6244   return CCInfo.CheckReturn(Outs, RetCC);
6245 }
6246 
6247 SDValue
6248 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
6249                                    bool isVarArg,
6250                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
6251                                    const SmallVectorImpl<SDValue> &OutVals,
6252                                    const SDLoc &DL, SelectionDAG &DAG) const {
6253   auto &MF = DAG.getMachineFunction();
6254   auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6255 
6256   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6257   SmallVector<CCValAssign, 16> RVLocs;
6258   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
6259                  *DAG.getContext());
6260   CCInfo.AnalyzeReturn(Outs, RetCC);
6261 
6262   // Copy the result values into the output registers.
6263   SDValue Flag;
6264   SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
6265   SmallSet<unsigned, 4> RegsUsed;
6266   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
6267        ++i, ++realRVLocIdx) {
6268     CCValAssign &VA = RVLocs[i];
6269     assert(VA.isRegLoc() && "Can only return in registers!");
6270     SDValue Arg = OutVals[realRVLocIdx];
6271 
6272     switch (VA.getLocInfo()) {
6273     default:
6274       llvm_unreachable("Unknown loc info!");
6275     case CCValAssign::Full:
6276       if (Outs[i].ArgVT == MVT::i1) {
6277         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
6278         // value. This is strictly redundant on Darwin (which uses "zeroext
6279         // i1"), but will be optimised out before ISel.
6280         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
6281         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
6282       }
6283       break;
6284     case CCValAssign::BCvt:
6285       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
6286       break;
6287     case CCValAssign::AExt:
6288     case CCValAssign::ZExt:
6289       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6290       break;
6291     case CCValAssign::AExtUpper:
6292       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
6293       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6294       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
6295                         DAG.getConstant(32, DL, VA.getLocVT()));
6296       break;
6297     }
6298 
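    // If this register has already been used then we're packing parts of an
    // [N x i32] into an X-register; OR in the new bits.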
6299     if (RegsUsed.count(VA.getLocReg())) {
6300       SDValue &Bits =
6301           llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
6302             return Elt.first == VA.getLocReg();
6303           })->second;
6304       Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6305     } else {
6306       RetVals.emplace_back(VA.getLocReg(), Arg);
6307       RegsUsed.insert(VA.getLocReg());
6308     }
6309   }
6310 
6311   SmallVector<SDValue, 4> RetOps(1, Chain);
6312   for (auto &RetVal : RetVals) {
6313     Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
6314     Flag = Chain.getValue(1);
6315     RetOps.push_back(
6316         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
6317   }
6318 
  // The Windows AArch64 ABI requires that when a struct is returned by value
  // via an sret argument, the sret pointer is also returned in X0.
  // We saved that argument into a virtual register in the entry block, so now
  // copy the value out of it and into X0.
6323   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
6324     SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
6325                                      getPointerTy(MF.getDataLayout()));
6326 
6327     unsigned RetValReg = AArch64::X0;
6328     Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
6329     Flag = Chain.getValue(1);
6330 
6331     RetOps.push_back(
6332       DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
6333   }
6334 
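  // If this function preserves some callee-saved registers via copies rather
  // than spills, add those registers as operands of the return so the copies
  // stay live until the function exits.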
6335   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
6338   if (I) {
6339     for (; *I; ++I) {
6340       if (AArch64::GPR64RegClass.contains(*I))
6341         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
6342       else if (AArch64::FPR64RegClass.contains(*I))
6343         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
6344       else
6345         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
6346     }
6347   }
6348 
6349   RetOps[0] = Chain; // Update chain.
6350 
6351   // Add the flag if we have it.
6352   if (Flag.getNode())
6353     RetOps.push_back(Flag);
6354 
6355   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
6356 }
6357 
6358 //===----------------------------------------------------------------------===//
6359 //  Other Lowering Code
6360 //===----------------------------------------------------------------------===//
6361 
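// The getTargetNode overloads below create the target-specific node for each
// kind of symbolic operand; the getGOT/getAddr* helpers that follow are
// templated over them, so the same addressing sequences work for globals,
// jump tables, constant pools and block addresses.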
6362 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
6363                                              SelectionDAG &DAG,
6364                                              unsigned Flag) const {
6365   return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
6366                                     N->getOffset(), Flag);
6367 }
6368 
6369 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
6370                                              SelectionDAG &DAG,
6371                                              unsigned Flag) const {
6372   return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
6373 }
6374 
6375 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
6376                                              SelectionDAG &DAG,
6377                                              unsigned Flag) const {
6378   return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
6379                                    N->getOffset(), Flag);
6380 }
6381 
6382 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
6383                                              SelectionDAG &DAG,
6384                                              unsigned Flag) const {
6385   return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
6386 }
6387 
6388 // (loadGOT sym)
6389 template <class NodeTy>
6390 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
6391                                       unsigned Flags) const {
6392   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
6393   SDLoc DL(N);
6394   EVT Ty = getPointerTy(DAG.getDataLayout());
6395   SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
6396   // FIXME: Once remat is capable of dealing with instructions with register
6397   // operands, expand this into two nodes instead of using a wrapper node.
6398   return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
6399 }
6400 
6401 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
6402 template <class NodeTy>
6403 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
6404                                             unsigned Flags) const {
6405   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
6406   SDLoc DL(N);
6407   EVT Ty = getPointerTy(DAG.getDataLayout());
6408   const unsigned char MO_NC = AArch64II::MO_NC;
6409   return DAG.getNode(
6410       AArch64ISD::WrapperLarge, DL, Ty,
6411       getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
6412       getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
6413       getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
6414       getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
6415 }
6416 
6417 // (addlow (adrp %hi(sym)) %lo(sym))
6418 template <class NodeTy>
6419 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
6420                                        unsigned Flags) const {
6421   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
6422   SDLoc DL(N);
6423   EVT Ty = getPointerTy(DAG.getDataLayout());
6424   SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
6425   SDValue Lo = getTargetNode(N, Ty, DAG,
6426                              AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
6427   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
6428   return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
6429 }
6430 
6431 // (adr sym)
6432 template <class NodeTy>
6433 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
6434                                            unsigned Flags) const {
6435   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
6436   SDLoc DL(N);
6437   EVT Ty = getPointerTy(DAG.getDataLayout());
6438   SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
6439   return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
6440 }
6441 
6442 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
6443                                                   SelectionDAG &DAG) const {
6444   GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
6445   const GlobalValue *GV = GN->getGlobal();
6446   unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
6447 
  if (OpFlags != AArch64II::MO_NO_FLAG)
    assert(GN->getOffset() == 0 && "unexpected offset in global node");
6451 
  // This also catches the large code model case for Darwin, and the tiny code
  // model with GOT relocations.
6454   if ((OpFlags & AArch64II::MO_GOT) != 0) {
6455     return getGOT(GN, DAG, OpFlags);
6456   }
6457 
6458   SDValue Result;
6459   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
6460     Result = getAddrLarge(GN, DAG, OpFlags);
6461   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6462     Result = getAddrTiny(GN, DAG, OpFlags);
6463   } else {
6464     Result = getAddr(GN, DAG, OpFlags);
6465   }
6466   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6467   SDLoc DL(GN);
6468   if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
6469     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
6470                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
6471   return Result;
6472 }
6473 
6474 /// Convert a TLS address reference into the correct sequence of loads
6475 /// and calls to compute the variable's address (for Darwin, currently) and
6476 /// return an SDValue containing the final node.
6477 
6478 /// Darwin only has one TLS scheme which must be capable of dealing with the
6479 /// fully general situation, in the worst case. This means:
6480 ///     + "extern __thread" declaration.
6481 ///     + Defined in a possibly unknown dynamic library.
6482 ///
6483 /// The general system is that each __thread variable has a [3 x i64] descriptor
6484 /// which contains information used by the runtime to calculate the address. The
6485 /// only part of this the compiler needs to know about is the first xword, which
6486 /// contains a function pointer that must be called with the address of the
6487 /// entire descriptor in "x0".
6488 ///
6489 /// Since this descriptor may be in a different unit, in general even the
6490 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
6491 /// is:
6492 ///     adrp x0, _var@TLVPPAGE
6493 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
6494 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
6495 ///                                      ; the function pointer
6496 ///     blr x1                           ; Uses descriptor address in x0
6497 ///     ; Address of _var is now in x0.
6498 ///
6499 /// If the address of _var's descriptor *is* known to the linker, then it can
6500 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
6501 /// a slight efficiency gain.
6502 SDValue
6503 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
6504                                                    SelectionDAG &DAG) const {
6505   assert(Subtarget->isTargetDarwin() &&
6506          "This function expects a Darwin target");
6507 
6508   SDLoc DL(Op);
6509   MVT PtrVT = getPointerTy(DAG.getDataLayout());
6510   MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
6511   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
6512 
6513   SDValue TLVPAddr =
6514       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6515   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
6516 
6517   // The first entry in the descriptor is a function pointer that we must call
6518   // to obtain the address of the variable.
6519   SDValue Chain = DAG.getEntryNode();
6520   SDValue FuncTLVGet = DAG.getLoad(
6521       PtrMemVT, DL, Chain, DescAddr,
6522       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
6523       Align(PtrMemVT.getSizeInBits() / 8),
6524       MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
6525   Chain = FuncTLVGet.getValue(1);
6526 
  // Extend the loaded pointer if necessary (i.e. under ILP32) to the DAG
  // pointer type.
6528   FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
6529 
6530   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6531   MFI.setAdjustsStack(true);
6532 
6533   // TLS calls preserve all registers except those that absolutely must be
6534   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
6535   // silly).
6536   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6537   const uint32_t *Mask = TRI->getTLSCallPreservedMask();
6538   if (Subtarget->hasCustomCallingConv())
6539     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
6540 
6541   // Finally, we can make the call. This is just a degenerate version of a
6542   // normal AArch64 call node: x0 takes the address of the descriptor, and
6543   // returns the address of the variable in this thread.
6544   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
6545   Chain =
6546       DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6547                   Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
6548                   DAG.getRegisterMask(Mask), Chain.getValue(1));
6549   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
6550 }
6551 
6552 /// Convert a thread-local variable reference into a sequence of instructions to
6553 /// compute the variable's address for the local exec TLS model of ELF targets.
6554 /// The sequence depends on the maximum TLS area size.
6555 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
6556                                                     SDValue ThreadBase,
6557                                                     const SDLoc &DL,
6558                                                     SelectionDAG &DAG) const {
6559   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6560   SDValue TPOff, Addr;
6561 
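  // TLSSize is the maximum size of the TLS area in address bits; it bounds the
  // offset of any thread-local from TPIDR_EL0 and therefore selects how many
  // relocated instructions are needed to materialize that offset.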
6562   switch (DAG.getTarget().Options.TLSSize) {
6563   default:
6564     llvm_unreachable("Unexpected TLS size");
6565 
6566   case 12: {
6567     // mrs   x0, TPIDR_EL0
6568     // add   x0, x0, :tprel_lo12:a
6569     SDValue Var = DAG.getTargetGlobalAddress(
6570         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
6571     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6572                                       Var,
6573                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6574                    0);
6575   }
6576 
6577   case 24: {
6578     // mrs   x0, TPIDR_EL0
6579     // add   x0, x0, :tprel_hi12:a
6580     // add   x0, x0, :tprel_lo12_nc:a
6581     SDValue HiVar = DAG.getTargetGlobalAddress(
6582         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6583     SDValue LoVar = DAG.getTargetGlobalAddress(
6584         GV, DL, PtrVT, 0,
6585         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6586     Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6587                                       HiVar,
6588                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6589                    0);
6590     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
6591                                       LoVar,
6592                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6593                    0);
6594   }
6595 
6596   case 32: {
6597     // mrs   x1, TPIDR_EL0
6598     // movz  x0, #:tprel_g1:a
6599     // movk  x0, #:tprel_g0_nc:a
6600     // add   x0, x1, x0
6601     SDValue HiVar = DAG.getTargetGlobalAddress(
6602         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
6603     SDValue LoVar = DAG.getTargetGlobalAddress(
6604         GV, DL, PtrVT, 0,
6605         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
6606     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6607                                        DAG.getTargetConstant(16, DL, MVT::i32)),
6608                     0);
6609     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6610                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6611                     0);
6612     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6613   }
6614 
6615   case 48: {
6616     // mrs   x1, TPIDR_EL0
6617     // movz  x0, #:tprel_g2:a
6618     // movk  x0, #:tprel_g1_nc:a
6619     // movk  x0, #:tprel_g0_nc:a
6620     // add   x0, x1, x0
6621     SDValue HiVar = DAG.getTargetGlobalAddress(
6622         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
6623     SDValue MiVar = DAG.getTargetGlobalAddress(
6624         GV, DL, PtrVT, 0,
6625         AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
6626     SDValue LoVar = DAG.getTargetGlobalAddress(
6627         GV, DL, PtrVT, 0,
6628         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
6629     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6630                                        DAG.getTargetConstant(32, DL, MVT::i32)),
6631                     0);
6632     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
6633                                        DAG.getTargetConstant(16, DL, MVT::i32)),
6634                     0);
6635     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6636                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6637                     0);
6638     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6639   }
6640   }
6641 }
6642 
6643 /// When accessing thread-local variables under either the general-dynamic or
6644 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
6645 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
6646 /// is a function pointer to carry out the resolution.
6647 ///
6648 /// The sequence is:
6649 ///    adrp  x0, :tlsdesc:var
6650 ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
6651 ///    add   x0, x0, #:tlsdesc_lo12:var
6652 ///    .tlsdesccall var
6653 ///    blr   x1
6654 ///    (TPIDR_EL0 offset now in x0)
6655 ///
///  The above sequence must be produced unscheduled so that the linker can
///  optimize/relax it. A pseudo-instruction (TLSDESC_CALLSEQ) therefore
///  represents the whole sequence and is expanded very late in the compilation
///  flow, ensuring the instructions are emitted exactly as above.
6661 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
6662                                                       const SDLoc &DL,
6663                                                       SelectionDAG &DAG) const {
6664   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6665 
6666   SDValue Chain = DAG.getEntryNode();
6667   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6668 
6669   Chain =
6670       DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
6671   SDValue Glue = Chain.getValue(1);
6672 
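  // The TLSDESC resolver returns the variable's offset from TPIDR_EL0 in X0.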
6673   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
6674 }
6675 
6676 SDValue
6677 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
6678                                                 SelectionDAG &DAG) const {
6679   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
6680 
6681   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6682 
6683   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
6684 
6685   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
6686     if (Model == TLSModel::LocalDynamic)
6687       Model = TLSModel::GeneralDynamic;
6688   }
6689 
6690   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6691       Model != TLSModel::LocalExec)
6692     report_fatal_error("ELF TLS only supported in small memory model or "
6693                        "in local exec TLS model");
6694   // Different choices can be made for the maximum size of the TLS area for a
6695   // module. For the small address model, the default TLS size is 16MiB and the
6696   // maximum TLS size is 4GiB.
6697   // FIXME: add tiny and large code model support for TLS access models other
6698   // than local exec. We currently generate the same code as small for tiny,
6699   // which may be larger than needed.
6700 
6701   SDValue TPOff;
6702   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6703   SDLoc DL(Op);
6704   const GlobalValue *GV = GA->getGlobal();
6705 
6706   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6707 
6708   if (Model == TLSModel::LocalExec) {
6709     return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
6710   } else if (Model == TLSModel::InitialExec) {
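    // Initial-exec: the offset of the variable from the thread pointer is
    // fixed at load time, so it can simply be loaded from the GOT.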
6711     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6712     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
6713   } else if (Model == TLSModel::LocalDynamic) {
6714     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
6715     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
6716     // the beginning of the module's TLS region, followed by a DTPREL offset
6717     // calculation.
6718 
6719     // These accesses will need deduplicating if there's more than one.
6720     AArch64FunctionInfo *MFI =
6721         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6722     MFI->incNumLocalDynamicTLSAccesses();
6723 
6724     // The call needs a relocation too for linker relaxation. It doesn't make
6725     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6726     // the address.
6727     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
6728                                                   AArch64II::MO_TLS);
6729 
6730     // Now we can calculate the offset from TPIDR_EL0 to this module's
6731     // thread-local area.
6732     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6733 
6734     // Now use :dtprel_whatever: operations to calculate this variable's offset
6735     // in its thread-storage area.
6736     SDValue HiVar = DAG.getTargetGlobalAddress(
6737         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6738     SDValue LoVar = DAG.getTargetGlobalAddress(
6739         GV, DL, MVT::i64, 0,
6740         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6741 
6742     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
6743                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6744                     0);
6745     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
6746                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6747                     0);
6748   } else if (Model == TLSModel::GeneralDynamic) {
6749     // The call needs a relocation too for linker relaxation. It doesn't make
6750     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6751     // the address.
6752     SDValue SymAddr =
6753         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6754 
6755     // Finally we can make a call to calculate the offset from tpidr_el0.
6756     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6757   } else
6758     llvm_unreachable("Unsupported ELF TLS access model");
6759 
6760   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6761 }
6762 
6763 SDValue
6764 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
6765                                                     SelectionDAG &DAG) const {
6766   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
6767 
6768   SDValue Chain = DAG.getEntryNode();
6769   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6770   SDLoc DL(Op);
6771 
6772   SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
6773 
6774   // Load the ThreadLocalStoragePointer from the TEB
6775   // A pointer to the TLS array is located at offset 0x58 from the TEB.
6776   SDValue TLSArray =
6777       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
6778   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
6779   Chain = TLSArray.getValue(1);
6780 
  // Load the TLS index from the C runtime.
6782   // This does the same as getAddr(), but without having a GlobalAddressSDNode.
6783   // This also does the same as LOADgot, but using a generic i32 load,
6784   // while LOADgot only loads i64.
6785   SDValue TLSIndexHi =
6786       DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
6787   SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
6788       "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6789   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
6790   SDValue TLSIndex =
6791       DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
6792   TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
6793   Chain = TLSIndex.getValue(1);
6794 
  // The pointer to the thread's TLS data area lives in the TLS array at an
  // offset of (TLS index * 8) bytes.
6797   TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
6798   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
6799                              DAG.getConstant(3, DL, PtrVT));
6800   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
6801                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
6802                             MachinePointerInfo());
6803   Chain = TLS.getValue(1);
6804 
6805   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6806   const GlobalValue *GV = GA->getGlobal();
6807   SDValue TGAHi = DAG.getTargetGlobalAddress(
6808       GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6809   SDValue TGALo = DAG.getTargetGlobalAddress(
6810       GV, DL, PtrVT, 0,
6811       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6812 
6813   // Add the offset from the start of the .tls section (section base).
6814   SDValue Addr =
6815       SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
6816                                  DAG.getTargetConstant(0, DL, MVT::i32)),
6817               0);
6818   Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
6819   return Addr;
6820 }
6821 
6822 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
6823                                                      SelectionDAG &DAG) const {
6824   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6825   if (DAG.getTarget().useEmulatedTLS())
6826     return LowerToTLSEmulatedModel(GA, DAG);
6827 
6828   if (Subtarget->isTargetDarwin())
6829     return LowerDarwinGlobalTLSAddress(Op, DAG);
6830   if (Subtarget->isTargetELF())
6831     return LowerELFGlobalTLSAddress(Op, DAG);
6832   if (Subtarget->isTargetWindows())
6833     return LowerWindowsGlobalTLSAddress(Op, DAG);
6834 
6835   llvm_unreachable("Unexpected platform trying to use TLS");
6836 }
6837 
// Looks through \p Val to determine the bit that can be used to check the
// sign of the value. It returns the unextended value and the sign bit
// position.
static std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
6842   if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
6843     return {Val.getOperand(0),
6844             cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
6845                 1};
6846 
6847   if (Val.getOpcode() == ISD::SIGN_EXTEND)
6848     return {Val.getOperand(0),
6849             Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
6850 
6851   return {Val, Val.getValueSizeInBits() - 1};
6852 }
6853 
6854 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
6855   SDValue Chain = Op.getOperand(0);
6856   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
6857   SDValue LHS = Op.getOperand(2);
6858   SDValue RHS = Op.getOperand(3);
6859   SDValue Dest = Op.getOperand(4);
6860   SDLoc dl(Op);
6861 
6862   MachineFunction &MF = DAG.getMachineFunction();
6863   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
6864   // will not be produced, as they are conditional branch instructions that do
6865   // not set flags.
6866   bool ProduceNonFlagSettingCondBr =
6867       !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
6868 
6869   // Handle f128 first, since lowering it will result in comparing the return
6870   // value of a libcall against zero, which is just what the rest of LowerBR_CC
6871   // is expecting to deal with.
6872   if (LHS.getValueType() == MVT::f128) {
6873     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6874 
6875     // If softenSetCCOperands returned a scalar, we need to compare the result
6876     // against zero to select between true and false values.
6877     if (!RHS.getNode()) {
6878       RHS = DAG.getConstant(0, dl, LHS.getValueType());
6879       CC = ISD::SETNE;
6880     }
6881   }
6882 
6883   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
6884   // instruction.
6885   if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
6886       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6887     // Only lower legal XALUO ops.
6888     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
6889       return SDValue();
6890 
6891     // The actual operation with overflow check.
6892     AArch64CC::CondCode OFCC;
6893     SDValue Value, Overflow;
6894     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
6895 
6896     if (CC == ISD::SETNE)
6897       OFCC = getInvertedCondCode(OFCC);
6898     SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
6899 
6900     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6901                        Overflow);
6902   }
6903 
6904   if (LHS.getValueType().isInteger()) {
6905     assert((LHS.getValueType() == RHS.getValueType()) &&
6906            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6907 
6908     // If the RHS of the comparison is zero, we can potentially fold this
6909     // to a specialized branch.
6910     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
6911     if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
6912       if (CC == ISD::SETEQ) {
6913         // See if we can use a TBZ to fold in an AND as well.
6914         // TBZ has a smaller branch displacement than CBZ.  If the offset is
6915         // out of bounds, a late MI-layer pass rewrites branches.
6916         // 403.gcc is an example that hits this case.
6917         if (LHS.getOpcode() == ISD::AND &&
6918             isa<ConstantSDNode>(LHS.getOperand(1)) &&
6919             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6920           SDValue Test = LHS.getOperand(0);
6921           uint64_t Mask = LHS.getConstantOperandVal(1);
6922           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
6923                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6924                              Dest);
6925         }
6926 
6927         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
6928       } else if (CC == ISD::SETNE) {
6929         // See if we can use a TBZ to fold in an AND as well.
6930         // TBZ has a smaller branch displacement than CBZ.  If the offset is
6931         // out of bounds, a late MI-layer pass rewrites branches.
6932         // 403.gcc is an example that hits this case.
6933         if (LHS.getOpcode() == ISD::AND &&
6934             isa<ConstantSDNode>(LHS.getOperand(1)) &&
6935             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6936           SDValue Test = LHS.getOperand(0);
6937           uint64_t Mask = LHS.getConstantOperandVal(1);
6938           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
6939                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6940                              Dest);
6941         }
6942 
6943         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
6944       } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
6945         // Don't combine AND since emitComparison converts the AND to an ANDS
6946         // (a.k.a. TST) and the test in the test bit and branch instruction
6947         // becomes redundant.  This would also increase register pressure.
6948         uint64_t SignBitPos;
6949         std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6950         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
6951                            DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6952       }
6953     }
6954     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
6955         LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
6956       // Don't combine AND since emitComparison converts the AND to an ANDS
6957       // (a.k.a. TST) and the test in the test bit and branch instruction
6958       // becomes redundant.  This would also increase register pressure.
6959       uint64_t SignBitPos;
6960       std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6961       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
6962                          DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6963     }
6964 
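    // Otherwise, emit a flag-setting comparison and a conditional branch.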
6965     SDValue CCVal;
6966     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6967     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6968                        Cmp);
6969   }
6970 
6971   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
6972          LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
6973 
6974   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6975   // clean.  Some of them require two branches to implement.
6976   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6977   AArch64CC::CondCode CC1, CC2;
6978   changeFPCCToAArch64CC(CC, CC1, CC2);
6979   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6980   SDValue BR1 =
6981       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
6982   if (CC2 != AArch64CC::AL) {
6983     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
6984     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
6985                        Cmp);
6986   }
6987 
6988   return BR1;
6989 }
6990 
6991 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
6992                                               SelectionDAG &DAG) const {
6993   EVT VT = Op.getValueType();
6994   SDLoc DL(Op);
6995 
6996   SDValue In1 = Op.getOperand(0);
6997   SDValue In2 = Op.getOperand(1);
6998   EVT SrcVT = In2.getValueType();
6999 
7000   if (SrcVT.bitsLT(VT))
7001     In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
7002   else if (SrcVT.bitsGT(VT))
7003     In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
7004 
7005   EVT VecVT;
7006   uint64_t EltMask;
7007   SDValue VecVal1, VecVal2;
7008 
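  // Build vector-register views of both operands so the sign bit can be
  // transferred with a single BIT (bitwise insert if true) instruction, using
  // EltMask below as the selection mask.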
  auto setVecVal = [&](int Idx) {
7010     if (!VT.isVector()) {
7011       VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7012                                           DAG.getUNDEF(VecVT), In1);
7013       VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
7014                                           DAG.getUNDEF(VecVT), In2);
7015     } else {
7016       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
7017       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
7018     }
7019   };
7020 
7021   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
7022     VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
7023     EltMask = 0x80000000ULL;
7024     setVecVal(AArch64::ssub);
7025   } else if (VT == MVT::f64 || VT == MVT::v2f64) {
7026     VecVT = MVT::v2i64;
7027 
7028     // We want to materialize a mask with the high bit set, but the AdvSIMD
7029     // immediate moves cannot materialize that in a single instruction for
7030     // 64-bit elements. Instead, materialize zero and then negate it.
7031     EltMask = 0;
7032 
7033     setVecVal(AArch64::dsub);
7034   } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
7035     VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
7036     EltMask = 0x8000ULL;
7037     setVecVal(AArch64::hsub);
7038   } else {
7039     llvm_unreachable("Invalid type for copysign!");
7040   }
7041 
7042   SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
7043 
7044   // If we couldn't materialize the mask above, then the mask vector will be
7045   // the zero vector, and we need to negate it here.
7046   if (VT == MVT::f64 || VT == MVT::v2f64) {
7047     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
7048     BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
7049     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
7050   }
7051 
7052   SDValue Sel =
7053       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
7054 
  if (VT == MVT::f16)
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
  if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
  return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
7063 }
7064 
7065 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
7066   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
7067           Attribute::NoImplicitFloat))
7068     return SDValue();
7069 
7070   if (!Subtarget->hasNEON())
7071     return SDValue();
7072 
7073   // While there is no integer popcount instruction, it can
7074   // be more efficiently lowered to the following sequence that uses
7075   // AdvSIMD registers/instructions as long as the copies to/from
7076   // the AdvSIMD registers are cheap.
7077   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
7078   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
7079   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
7080   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
7081   SDValue Val = Op.getOperand(0);
7082   SDLoc DL(Op);
7083   EVT VT = Op.getValueType();
7084 
7085   if (VT == MVT::i32 || VT == MVT::i64) {
7086     if (VT == MVT::i32)
7087       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
7088     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
7089 
7090     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
7091     SDValue UaddLV = DAG.getNode(
7092         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7093         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7094 
7095     if (VT == MVT::i64)
7096       UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
7097     return UaddLV;
7098   } else if (VT == MVT::i128) {
7099     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
7100 
7101     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
7102     SDValue UaddLV = DAG.getNode(
7103         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7104         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7105 
7106     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
7107   }
7108 
7109   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
7110     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
7111 
7112   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
7113           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
7114          "Unexpected type for custom ctpop lowering");
7115 
7116   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
7117   Val = DAG.getBitcast(VT8Bit, Val);
7118   Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
7119 
7120   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
7121   unsigned EltSize = 8;
7122   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
7123   while (EltSize != VT.getScalarSizeInBits()) {
7124     EltSize *= 2;
7125     NumElts /= 2;
7126     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
7127     Val = DAG.getNode(
7128         ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
7129         DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
7130   }
7131 
7132   return Val;
7133 }
7134 
7135 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7136   EVT VT = Op.getValueType();
  assert((VT.isScalableVector() ||
          useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) &&
         "Unexpected type for custom CTTZ lowering");
7139 
7140   SDLoc DL(Op);
7141   SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
7142   return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
7143 }
7144 
7145 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
7146                                                SelectionDAG &DAG) const {
7147   EVT VT = Op.getValueType();
7148 
7149   if (VT.isScalableVector() ||
7150       useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7151     return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
7152                                true);
7153 
7154   SDLoc DL(Op);
7155   SDValue REVB;
7156   MVT VST;
7157 
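  // Byte-reverse each element with REV32/REV64, then bit-reverse every byte
  // with a byte-vector BITREVERSE (RBIT); together these reverse the bits of
  // each element.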
7158   switch (VT.getSimpleVT().SimpleTy) {
7159   default:
7160     llvm_unreachable("Invalid type for bitreverse!");
7161 
7162   case MVT::v2i32: {
7163     VST = MVT::v8i8;
7164     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7165 
7166     break;
7167   }
7168 
7169   case MVT::v4i32: {
7170     VST = MVT::v16i8;
7171     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
7172 
7173     break;
7174   }
7175 
7176   case MVT::v1i64: {
7177     VST = MVT::v8i8;
7178     REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7179 
7180     break;
7181   }
7182 
7183   case MVT::v2i64: {
7184     VST = MVT::v16i8;
7185     REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
7186 
7187     break;
7188   }
7189   }
7190 
7191   return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
7192                      DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
7193 }
7194 
7195 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
7196 
7197   if (Op.getValueType().isVector())
7198     return LowerVSETCC(Op, DAG);
7199 
7200   bool IsStrict = Op->isStrictFPOpcode();
7201   bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
7202   unsigned OpNo = IsStrict ? 1 : 0;
7203   SDValue Chain;
7204   if (IsStrict)
7205     Chain = Op.getOperand(0);
7206   SDValue LHS = Op.getOperand(OpNo + 0);
7207   SDValue RHS = Op.getOperand(OpNo + 1);
7208   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
7209   SDLoc dl(Op);
7210 
7211   // We chose ZeroOrOneBooleanContents, so use zero and one.
7212   EVT VT = Op.getValueType();
7213   SDValue TVal = DAG.getConstant(1, dl, VT);
7214   SDValue FVal = DAG.getConstant(0, dl, VT);
7215 
7216   // Handle f128 first, since one possible outcome is a normal integer
7217   // comparison which gets picked up by the next if statement.
7218   if (LHS.getValueType() == MVT::f128) {
7219     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
7220                         IsSignaling);
7221 
7222     // If softenSetCCOperands returned a scalar, use it.
7223     if (!RHS.getNode()) {
7224       assert(LHS.getValueType() == Op.getValueType() &&
7225              "Unexpected setcc expansion!");
7226       return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
7227     }
7228   }
7229 
7230   if (LHS.getValueType().isInteger()) {
7231     SDValue CCVal;
7232     SDValue Cmp = getAArch64Cmp(
7233         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
7234 
7235     // Note that we inverted the condition above, so we reverse the order of
7236     // the true and false operands here.  This will allow the setcc to be
7237     // matched to a single CSINC instruction.
7238     SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
7239     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
7240   }
7241 
7242   // Now we know we're dealing with FP values.
7243   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7244          LHS.getValueType() == MVT::f64);
7245 
7246   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
7247   // and do the comparison.
7248   SDValue Cmp;
7249   if (IsStrict)
7250     Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
7251   else
7252     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7253 
7254   AArch64CC::CondCode CC1, CC2;
7255   changeFPCCToAArch64CC(CC, CC1, CC2);
7256   SDValue Res;
7257   if (CC2 == AArch64CC::AL) {
7258     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
7259                           CC2);
7260     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7261 
7262     // Note that we inverted the condition above, so we reverse the order of
7263     // the true and false operands here.  This will allow the setcc to be
7264     // matched to a single CSINC instruction.
7265     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
7266   } else {
7267     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
7268     // totally clean.  Some of them require two CSELs to implement.  As is in
7269     // this case, we emit the first CSEL and then emit a second using the output
7270     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
7271 
7272     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
7273     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7274     SDValue CS1 =
7275         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7276 
7277     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
7278     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7279   }
7280   return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
7281 }
7282 
7283 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
7284                                               SDValue RHS, SDValue TVal,
7285                                               SDValue FVal, const SDLoc &dl,
7286                                               SelectionDAG &DAG) const {
7287   // Handle f128 first, because it will result in a comparison of some RTLIB
7288   // call result against zero.
7289   if (LHS.getValueType() == MVT::f128) {
7290     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
7291 
7292     // If softenSetCCOperands returned a scalar, we need to compare the result
7293     // against zero to select between true and false values.
7294     if (!RHS.getNode()) {
7295       RHS = DAG.getConstant(0, dl, LHS.getValueType());
7296       CC = ISD::SETNE;
7297     }
7298   }
7299 
  // Also handle f16: when FullFP16 is unavailable, promote to an f32
  // comparison.
7301   if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
7302     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
7303     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
7304   }
7305 
7306   // Next, handle integers.
7307   if (LHS.getValueType().isInteger()) {
7308     assert((LHS.getValueType() == RHS.getValueType()) &&
7309            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
7310 
7311     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
7312     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
7313     ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
7314     // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
7316     // supported types.
7317     if (CC == ISD::SETGT && RHSC && RHSC->isAllOnesValue() && CTVal && CFVal &&
7318         CTVal->isOne() && CFVal->isAllOnesValue() &&
7319         LHS.getValueType() == TVal.getValueType()) {
7320       EVT VT = LHS.getValueType();
7321       SDValue Shift =
7322           DAG.getNode(ISD::SRA, dl, VT, LHS,
7323                       DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
7324       return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
7325     }
7326 
7327     unsigned Opcode = AArch64ISD::CSEL;
7328 
7329     // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
7331     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
7332       std::swap(TVal, FVal);
7333       std::swap(CTVal, CFVal);
7334       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7335     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
7336       std::swap(TVal, FVal);
7337       std::swap(CTVal, CFVal);
7338       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7339     } else if (TVal.getOpcode() == ISD::XOR) {
7340       // If TVal is a NOT we want to swap TVal and FVal so that we can match
7341       // with a CSINV rather than a CSEL.
7342       if (isAllOnesConstant(TVal.getOperand(1))) {
7343         std::swap(TVal, FVal);
7344         std::swap(CTVal, CFVal);
7345         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7346       }
7347     } else if (TVal.getOpcode() == ISD::SUB) {
7348       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
7349       // that we can match with a CSNEG rather than a CSEL.
7350       if (isNullConstant(TVal.getOperand(0))) {
7351         std::swap(TVal, FVal);
7352         std::swap(CTVal, CFVal);
7353         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7354       }
7355     } else if (CTVal && CFVal) {
7356       const int64_t TrueVal = CTVal->getSExtValue();
7357       const int64_t FalseVal = CFVal->getSExtValue();
7358       bool Swap = false;
7359 
7360       // If both TVal and FVal are constants, see if FVal is the
7361       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
7362       // instead of a CSEL in that case.
7363       if (TrueVal == ~FalseVal) {
7364         Opcode = AArch64ISD::CSINV;
7365       } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
7366                  TrueVal == -FalseVal) {
7367         Opcode = AArch64ISD::CSNEG;
7368       } else if (TVal.getValueType() == MVT::i32) {
7369         // If our operands are only 32-bit wide, make sure we use 32-bit
7370         // arithmetic for the check whether we can use CSINC. This ensures that
7371         // the addition in the check will wrap around properly in case there is
7372         // an overflow (which would not be the case if we do the check with
7373         // 64-bit arithmetic).
7374         const uint32_t TrueVal32 = CTVal->getZExtValue();
7375         const uint32_t FalseVal32 = CFVal->getZExtValue();
7376 
7377         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
7378           Opcode = AArch64ISD::CSINC;
7379 
7380           if (TrueVal32 > FalseVal32) {
7381             Swap = true;
7382           }
7383         }
7384         // 64-bit check whether we can use CSINC.
7385       } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
7386         Opcode = AArch64ISD::CSINC;
7387 
7388         if (TrueVal > FalseVal) {
7389           Swap = true;
7390         }
7391       }
7392 
7393       // Swap TVal and FVal if necessary.
7394       if (Swap) {
7395         std::swap(TVal, FVal);
7396         std::swap(CTVal, CFVal);
7397         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
7398       }
7399 
7400       if (Opcode != AArch64ISD::CSEL) {
7401         // Drop FVal since we can get its value by simply inverting/negating
7402         // TVal.
7403         FVal = TVal;
7404       }
7405     }
7406 
7407     // Avoid materializing a constant when possible by reusing a known value in
7408     // a register.  However, don't perform this optimization if the known value
7409     // is one, zero or negative one in the case of a CSEL.  We can always
7410     // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
7411     // FVal, respectively.
7412     ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
7413     if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
7414         !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
7415       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7416       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
7417       // "a != C ? x : a" to avoid materializing C.
7418       if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
7419         TVal = LHS;
7420       else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
7421         FVal = LHS;
7422     } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
7424       // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
7425       // avoid materializing C.
7426       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7427       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
7428         Opcode = AArch64ISD::CSINV;
7429         TVal = LHS;
7430         FVal = DAG.getConstant(0, dl, FVal.getValueType());
7431       }
7432     }
7433 
7434     SDValue CCVal;
7435     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
7436     EVT VT = TVal.getValueType();
7437     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
7438   }
7439 
7440   // Now we know we're dealing with FP values.
7441   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
7442          LHS.getValueType() == MVT::f64);
7443   assert(LHS.getValueType() == RHS.getValueType());
7444   EVT VT = TVal.getValueType();
7445   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7446 
7447   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7448   // clean.  Some of them require two CSELs to implement.
7449   AArch64CC::CondCode CC1, CC2;
7450   changeFPCCToAArch64CC(CC, CC1, CC2);
7451 
7452   if (DAG.getTarget().Options.UnsafeFPMath) {
7453     // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
7454     // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
7455     ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
7456     if (RHSVal && RHSVal->isZero()) {
7457       ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
7458       ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
7459 
7460       if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
7461           CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
7462         TVal = LHS;
7463       else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
7464                CFVal && CFVal->isZero() &&
7465                FVal.getValueType() == LHS.getValueType())
7466         FVal = LHS;
7467     }
7468   }
7469 
7470   // Emit first, and possibly only, CSEL.
7471   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7472   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
7473 
7474   // If we need a second CSEL, emit it, using the output of the first as the
7475   // RHS.  We're effectively OR'ing the two CC's together.
7476   if (CC2 != AArch64CC::AL) {
7477     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
7478     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
7479   }
7480 
7481   // Otherwise, return the output of the first CSEL.
7482   return CS1;
7483 }
7484 
7485 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
7486                                                   SelectionDAG &DAG) const {
7487 
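  // A splice whose constant index lies in [-1, MinNumElts) is returned
  // unchanged; anything else falls back to the default expansion by returning
  // SDValue().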
7488   EVT Ty = Op.getValueType();
7489   auto Idx = Op.getConstantOperandAPInt(2);
7490   if (Idx.sge(-1) && Idx.slt(Ty.getVectorMinNumElements()))
7491     return Op;
7492   return SDValue();
7493 }
7494 
7495 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
7496                                               SelectionDAG &DAG) const {
7497   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
7498   SDValue LHS = Op.getOperand(0);
7499   SDValue RHS = Op.getOperand(1);
7500   SDValue TVal = Op.getOperand(2);
7501   SDValue FVal = Op.getOperand(3);
7502   SDLoc DL(Op);
7503   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7504 }
7505 
7506 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
7507                                            SelectionDAG &DAG) const {
7508   SDValue CCVal = Op->getOperand(0);
7509   SDValue TVal = Op->getOperand(1);
7510   SDValue FVal = Op->getOperand(2);
7511   SDLoc DL(Op);
7512 
7513   EVT Ty = Op.getValueType();
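  // Scalable-vector selects: splat the (truncated) i1 condition into a
  // predicate vector and use VSELECT.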
7514   if (Ty.isScalableVector()) {
7515     SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
7516     MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
7517     SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
7518     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7519   }
7520 
7521   if (useSVEForFixedLengthVectorVT(Ty)) {
7522     // FIXME: Ideally this would be the same as above using i1 types, however
7523     // for the moment we can't deal with fixed i1 vector types properly, so
7524     // instead extend the predicate to a result type sized integer vector.
7525     MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
7526     MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
7527     SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
7528     SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
7529     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
7530   }
7531 
7532   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
7533   // instruction.
7534   if (ISD::isOverflowIntrOpRes(CCVal)) {
7535     // Only lower legal XALUO ops.
7536     if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
7537       return SDValue();
7538 
7539     AArch64CC::CondCode OFCC;
7540     SDValue Value, Overflow;
7541     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
7542     SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
7543 
7544     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
7545                        CCVal, Overflow);
7546   }
7547 
7548   // Lower it the same way as we would lower a SELECT_CC node.
7549   ISD::CondCode CC;
7550   SDValue LHS, RHS;
7551   if (CCVal.getOpcode() == ISD::SETCC) {
7552     LHS = CCVal.getOperand(0);
7553     RHS = CCVal.getOperand(1);
7554     CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
7555   } else {
7556     LHS = CCVal;
7557     RHS = DAG.getConstant(0, DL, CCVal.getValueType());
7558     CC = ISD::SETNE;
7559   }
7560   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
7561 }
7562 
7563 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
7564                                               SelectionDAG &DAG) const {
  // Jump table entries are emitted as PC-relative offsets. No additional
  // tweaking is necessary here; just get the address of the jump table.
7567   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
7568 
7569   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7570       !Subtarget->isTargetMachO()) {
7571     return getAddrLarge(JT, DAG);
7572   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7573     return getAddrTiny(JT, DAG);
7574   }
7575   return getAddr(JT, DAG);
7576 }
7577 
7578 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
7579                                           SelectionDAG &DAG) const {
  // Jump table entries are emitted as PC-relative offsets. Materialize the
  // destination address from the selected entry and branch to it.
7582   SDLoc DL(Op);
7583   SDValue JT = Op.getOperand(1);
7584   SDValue Entry = Op.getOperand(2);
7585   int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
7586 
7587   auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7588   AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
7589 
7590   SDNode *Dest =
7591       DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
7592                          Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
7593   return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
7594                      SDValue(Dest, 0));
7595 }
7596 
7597 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
7598                                                  SelectionDAG &DAG) const {
7599   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
7600 
7601   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7602     // Use the GOT for the large code model on iOS.
7603     if (Subtarget->isTargetMachO()) {
7604       return getGOT(CP, DAG);
7605     }
7606     return getAddrLarge(CP, DAG);
7607   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7608     return getAddrTiny(CP, DAG);
7609   } else {
7610     return getAddr(CP, DAG);
7611   }
7612 }
7613 
7614 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
7615                                                SelectionDAG &DAG) const {
7616   BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
7617   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7618       !Subtarget->isTargetMachO()) {
7619     return getAddrLarge(BA, DAG);
7620   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7621     return getAddrTiny(BA, DAG);
7622   }
7623   return getAddr(BA, DAG);
7624 }
7625 
7626 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
7627                                                  SelectionDAG &DAG) const {
7628   AArch64FunctionInfo *FuncInfo =
7629       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7630 
7631   SDLoc DL(Op);
7632   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
7633                                  getPointerTy(DAG.getDataLayout()));
7634   FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
7635   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7636   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7637                       MachinePointerInfo(SV));
7638 }
7639 
7640 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
7641                                                   SelectionDAG &DAG) const {
7642   AArch64FunctionInfo *FuncInfo =
7643       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7644 
7645   SDLoc DL(Op);
7646   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
7647                                      ? FuncInfo->getVarArgsGPRIndex()
7648                                      : FuncInfo->getVarArgsStackIndex(),
7649                                  getPointerTy(DAG.getDataLayout()));
7650   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7651   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7652                       MachinePointerInfo(SV));
7653 }
7654 
7655 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
7656                                                   SelectionDAG &DAG) const {
7657   // The layout of the va_list struct is specified in the AArch64 Procedure Call
7658   // Standard, section B.3.
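  // For reference, the structure being initialized here is:
  //   struct va_list {
  //     void *__stack;   // next stack argument
  //     void *__gr_top;  // end of the general-purpose register save area
  //     void *__vr_top;  // end of the FP/SIMD register save area
  //     int   __gr_offs; // negative offset from __gr_top to the next GP arg
  //     int   __vr_offs; // negative offset from __vr_top to the next FP arg
  //   };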
7659   MachineFunction &MF = DAG.getMachineFunction();
7660   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7661   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7662   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7663   auto PtrVT = getPointerTy(DAG.getDataLayout());
7664   SDLoc DL(Op);
7665 
7666   SDValue Chain = Op.getOperand(0);
7667   SDValue VAList = Op.getOperand(1);
7668   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7669   SmallVector<SDValue, 4> MemOps;
7670 
7671   // void *__stack at offset 0
7672   unsigned Offset = 0;
7673   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
7674   Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
7675   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
7676                                 MachinePointerInfo(SV), Align(PtrSize)));
7677 
7678   // void *__gr_top at offset 8 (4 on ILP32)
7679   Offset += PtrSize;
7680   int GPRSize = FuncInfo->getVarArgsGPRSize();
7681   if (GPRSize > 0) {
7682     SDValue GRTop, GRTopAddr;
7683 
7684     GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7685                             DAG.getConstant(Offset, DL, PtrVT));
7686 
7687     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
7688     GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
7689                         DAG.getConstant(GPRSize, DL, PtrVT));
7690     GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
7691 
7692     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
7693                                   MachinePointerInfo(SV, Offset),
7694                                   Align(PtrSize)));
7695   }
7696 
7697   // void *__vr_top at offset 16 (8 on ILP32)
7698   Offset += PtrSize;
7699   int FPRSize = FuncInfo->getVarArgsFPRSize();
7700   if (FPRSize > 0) {
7701     SDValue VRTop, VRTopAddr;
7702     VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7703                             DAG.getConstant(Offset, DL, PtrVT));
7704 
7705     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
7706     VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
7707                         DAG.getConstant(FPRSize, DL, PtrVT));
7708     VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
7709 
7710     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
7711                                   MachinePointerInfo(SV, Offset),
7712                                   Align(PtrSize)));
7713   }
7714 
7715   // int __gr_offs at offset 24 (12 on ILP32)
7716   Offset += PtrSize;
7717   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7718                                    DAG.getConstant(Offset, DL, PtrVT));
7719   MemOps.push_back(
7720       DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
7721                    GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7722 
7723   // int __vr_offs at offset 28 (16 on ILP32)
7724   Offset += 4;
7725   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7726                                    DAG.getConstant(Offset, DL, PtrVT));
7727   MemOps.push_back(
7728       DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
7729                    VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7730 
7731   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7732 }
7733 
7734 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
7735                                             SelectionDAG &DAG) const {
7736   MachineFunction &MF = DAG.getMachineFunction();
7737 
7738   if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
7739     return LowerWin64_VASTART(Op, DAG);
7740   else if (Subtarget->isTargetDarwin())
7741     return LowerDarwin_VASTART(Op, DAG);
7742   else
7743     return LowerAAPCS_VASTART(Op, DAG);
7744 }
7745 
7746 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
7747                                            SelectionDAG &DAG) const {
  // AAPCS has three pointers and two ints (= 32 bytes); Darwin and Windows
  // use a single pointer.
7750   SDLoc DL(Op);
7751   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7752   unsigned VaListSize =
7753       (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7754           ? PtrSize
7755           : Subtarget->isTargetILP32() ? 20 : 32;
7756   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7757   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7758 
7759   return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
7760                        DAG.getConstant(VaListSize, DL, MVT::i32),
7761                        Align(PtrSize), false, false, false,
7762                        MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
7763 }
7764 
7765 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7766   assert(Subtarget->isTargetDarwin() &&
7767          "automatic va_arg instruction only works on Darwin");
7768 
7769   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7770   EVT VT = Op.getValueType();
7771   SDLoc DL(Op);
7772   SDValue Chain = Op.getOperand(0);
7773   SDValue Addr = Op.getOperand(1);
7774   MaybeAlign Align(Op.getConstantOperandVal(3));
7775   unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
7776   auto PtrVT = getPointerTy(DAG.getDataLayout());
7777   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7778   SDValue VAList =
7779       DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
7780   Chain = VAList.getValue(1);
7781   VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
7782 
7783   if (VT.isScalableVector())
7784     report_fatal_error("Passing SVE types to variadic functions is "
7785                        "currently not supported");
7786 
7787   if (Align && *Align > MinSlotSize) {
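    // Round VAList up to a multiple of the requested alignment:
    // (VAList + Align - 1) & ~(Align - 1).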
7788     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7789                          DAG.getConstant(Align->value() - 1, DL, PtrVT));
7790     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
7791                          DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
7792   }
7793 
7794   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
7795   unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
7796 
7797   // Scalar integer and FP values smaller than 64 bits are implicitly extended
7798   // up to 64 bits.  At the very least, we have to increase the striding of the
7799   // vaargs list to match this, and for FP values we need to introduce
7800   // FP_ROUND nodes as well.
7801   if (VT.isInteger() && !VT.isVector())
7802     ArgSize = std::max(ArgSize, MinSlotSize);
7803   bool NeedFPTrunc = false;
7804   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
7805     ArgSize = 8;
7806     NeedFPTrunc = true;
7807   }
7808 
7809   // Increment the pointer, VAList, to the next vaarg
7810   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7811                                DAG.getConstant(ArgSize, DL, PtrVT));
7812   VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
7813 
7814   // Store the incremented VAList to the legalized pointer
7815   SDValue APStore =
7816       DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
7817 
7818   // Load the actual argument out of the pointer VAList
7819   if (NeedFPTrunc) {
7820     // Load the value as an f64.
7821     SDValue WideFP =
7822         DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
    // Round the value down to the narrower result type.
7824     SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
7825                                    DAG.getIntPtrConstant(1, DL));
7826     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
7827     // Merge the rounded value with the chain output of the load.
7828     return DAG.getMergeValues(Ops, DL);
7829   }
7830 
7831   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
7832 }
7833 
7834 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
7835                                               SelectionDAG &DAG) const {
7836   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7837   MFI.setFrameAddressIsTaken(true);
7838 
7839   EVT VT = Op.getValueType();
7840   SDLoc DL(Op);
7841   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7842   SDValue FrameAddr =
7843       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
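  // Each AArch64 frame record stores the caller's frame pointer at offset 0,
  // so walking up Depth frames amounts to Depth successive loads.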
7844   while (Depth--)
7845     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
7846                             MachinePointerInfo());
7847 
7848   if (Subtarget->isTargetILP32())
7849     FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
7850                             DAG.getValueType(VT));
7851 
7852   return FrameAddr;
7853 }
7854 
7855 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
7856                                               SelectionDAG &DAG) const {
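  // llvm.sponentry returns the value the stack pointer had on entry to the
  // function; model it as the address of a fixed stack object at offset 0
  // from the incoming stack pointer.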
7857   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7858 
7859   EVT VT = getPointerTy(DAG.getDataLayout());
7860   SDLoc DL(Op);
7861   int FI = MFI.CreateFixedObject(4, 0, false);
7862   return DAG.getFrameIndex(FI, VT);
7863 }
7864 
7865 #define GET_REGISTER_MATCHER
7866 #include "AArch64GenAsmMatcher.inc"
7867 
7868 // FIXME? Maybe this could be a TableGen attribute on some registers and
7869 // this table could be generated automatically from RegInfo.
7870 Register AArch64TargetLowering::
7871 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
7872   Register Reg = MatchRegisterName(RegName);
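  // Named general-purpose registers X1-X28 are only accepted when they have
  // been reserved (e.g. via -ffixed-xN); otherwise the name is rejected below.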
7873   if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
7874     const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
7875     unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
7876     if (!Subtarget->isXRegisterReserved(DwarfRegNum))
7877       Reg = 0;
7878   }
7879   if (Reg)
7880     return Reg;
  report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) +
                           "\"."));
7883 }
7884 
7885 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
7886                                                      SelectionDAG &DAG) const {
7887   DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
7888 
7889   EVT VT = Op.getValueType();
7890   SDLoc DL(Op);
7891 
7892   SDValue FrameAddr =
7893       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
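  // The return address is saved in the frame record at [FP + 8], so its
  // address is simply FP + 8.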
7894   SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7895 
7896   return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
7897 }
7898 
7899 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
7900                                                SelectionDAG &DAG) const {
7901   MachineFunction &MF = DAG.getMachineFunction();
7902   MachineFrameInfo &MFI = MF.getFrameInfo();
7903   MFI.setReturnAddressIsTaken(true);
7904 
7905   EVT VT = Op.getValueType();
7906   SDLoc DL(Op);
7907   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7908   SDValue ReturnAddress;
7909   if (Depth) {
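    // For a non-zero depth, walk up to the requested frame and load the saved
    // LR from that frame record at [FP + 8].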
7910     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7911     SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7912     ReturnAddress = DAG.getLoad(
7913         VT, DL, DAG.getEntryNode(),
7914         DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
7915   } else {
7916     // Return LR, which contains the return address. Mark it an implicit
7917     // live-in.
7918     unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
7919     ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7920   }
7921 
  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A, so it can be used safely on any pre-Armv8.3-A architecture.
  // On Armv8.3-A and onwards XPACI is available, so use that instead.
7926   SDNode *St;
7927   if (Subtarget->hasPAuth()) {
7928     St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
7929   } else {
7930     // XPACLRI operates on LR therefore we must move the operand accordingly.
7931     SDValue Chain =
7932         DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
7933     St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
7934   }
7935   return SDValue(St, 0);
7936 }
7937 
/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
7940 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
7941                                                SelectionDAG &DAG) const {
7942   SDValue Lo, Hi;
7943   expandShiftParts(Op.getNode(), Lo, Hi, DAG);
7944   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
7945 }
7946 
7947 bool AArch64TargetLowering::isOffsetFoldingLegal(
7948     const GlobalAddressSDNode *GA) const {
7949   // Offsets are folded in the DAG combine rather than here so that we can
7950   // intelligently choose an offset based on the uses.
7951   return false;
7952 }
7953 
7954 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
7955                                          bool OptForSize) const {
7956   bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
  // and for the 16-bit case when the target has full fp16 support.
7959   // FIXME: We should be able to handle f128 as well with a clever lowering.
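  // For reference, FMOV (immediate) can encode values of the form
  // +/- n/16 * 2^r with n in [16, 31] and r in [-3, 4] (e.g. 1.0, 0.5, 31.0).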
7960   const APInt ImmInt = Imm.bitcastToAPInt();
7961   if (VT == MVT::f64)
7962     IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
7963   else if (VT == MVT::f32)
7964     IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
7965   else if (VT == MVT::f16 && Subtarget->hasFullFP16())
7966     IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  //       generate that fmov.
7969 
  // If we cannot materialize the value in the immediate field of an fmov,
  // check if it can be encoded as the immediate operand of a logical
  // instruction. The immediate value will be created with either MOVZ, MOVN,
  // or ORR.
7973   if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit to at most 2 instructions, or a few
    // more when the target can fuse the literal-generating MOVs.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
7982     unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
7983     IsLegal = Insn.size() <= Limit;
7984   }
7985 
7986   LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
7987                     << " imm value: "; Imm.dump(););
7988   return IsLegal;
7989 }
7990 
7991 //===----------------------------------------------------------------------===//
7992 //                          AArch64 Optimization Hooks
7993 //===----------------------------------------------------------------------===//
7994 
7995 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
7996                            SDValue Operand, SelectionDAG &DAG,
7997                            int &ExtraSteps) {
7998   EVT VT = Operand.getValueType();
7999   if (ST->hasNEON() &&
8000       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
8001        VT == MVT::f32 || VT == MVT::v1f32 ||
8002        VT == MVT::v2f32 || VT == MVT::v4f32)) {
8003     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
8004       // For the reciprocal estimates, convergence is quadratic, so the number
8005       // of digits is doubled after each iteration.  In ARMv8, the accuracy of
8006       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
8007       // the result for float (23 mantissa bits) is 2 and for double (52
8008       // mantissa bits) is 3.
8009       ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
8010 
8011     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
8012   }
8013 
8014   return SDValue();
8015 }
8016 
8017 SDValue
8018 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
8019                                         const DenormalMode &Mode) const {
8020   SDLoc DL(Op);
8021   EVT VT = Op.getValueType();
8022   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
8023   SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
8024   return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
8025 }
8026 
8027 SDValue
8028 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
8029                                                    SelectionDAG &DAG) const {
8030   return Op;
8031 }
8032 
8033 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
8034                                                SelectionDAG &DAG, int Enabled,
8035                                                int &ExtraSteps,
8036                                                bool &UseOneConst,
8037                                                bool Reciprocal) const {
8038   if (Enabled == ReciprocalEstimate::Enabled ||
8039       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
8040     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
8041                                        DAG, ExtraSteps)) {
8042       SDLoc DL(Operand);
8043       EVT VT = Operand.getValueType();
8044 
8045       SDNodeFlags Flags;
8046       Flags.setAllowReassociation(true);
8047 
8048       // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
8049       // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
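      // Each step therefore computes
      //   E_{n+1} = E_n * FRSQRTS(X, E_n * E_n) = E_n * 0.5 * (3 - X * E_n^2).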
8050       for (int i = ExtraSteps; i > 0; --i) {
8051         SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
8052                                    Flags);
8053         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
8054         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8055       }
8056       if (!Reciprocal)
8057         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
8058 
8059       ExtraSteps = 0;
8060       return Estimate;
8061     }
8062 
8063   return SDValue();
8064 }
8065 
8066 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
8067                                                 SelectionDAG &DAG, int Enabled,
8068                                                 int &ExtraSteps) const {
8069   if (Enabled == ReciprocalEstimate::Enabled)
8070     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
8071                                        DAG, ExtraSteps)) {
8072       SDLoc DL(Operand);
8073       EVT VT = Operand.getValueType();
8074 
8075       SDNodeFlags Flags;
8076       Flags.setAllowReassociation(true);
8077 
8078       // Newton reciprocal iteration: E * (2 - X * E)
8079       // AArch64 reciprocal iteration instruction: (2 - M * N)
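      // Each step therefore computes
      //   E_{n+1} = E_n * FRECPS(X, E_n) = E_n * (2 - X * E_n).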
8080       for (int i = ExtraSteps; i > 0; --i) {
8081         SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
8082                                    Estimate, Flags);
8083         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8084       }
8085 
8086       ExtraSteps = 0;
8087       return Estimate;
8088     }
8089 
8090   return SDValue();
8091 }
8092 
8093 //===----------------------------------------------------------------------===//
8094 //                          AArch64 Inline Assembly Support
8095 //===----------------------------------------------------------------------===//
8096 
8097 // Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense.
8100 //
8101 // r - A general register
8102 // w - An FP/SIMD register of some size in the range v0-v31
8103 // x - An FP/SIMD register of some size in the range v0-v15
8104 // I - Constant that can be used with an ADD instruction
8105 // J - Constant that can be used with a SUB instruction
8106 // K - Constant that can be used with a 32-bit logical instruction
8107 // L - Constant that can be used with a 64-bit logical instruction
8108 // M - Constant that can be used as a 32-bit MOV immediate
8109 // N - Constant that can be used as a 64-bit MOV immediate
8110 // Q - A memory reference with base register and no offset
8111 // S - A symbolic address
8112 // Y - Floating point constant zero
8113 // Z - Integer constant zero
8114 //
8115 //   Note that general register operands will be output using their 64-bit x
8116 // register name, whatever the size of the variable, unless the asm operand
8117 // is prefixed by the %w modifier. Floating-point and SIMD register operands
8118 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
8119 // %q modifier.
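//
// For example (illustrative only), the constraints above allow:
//   int Res;
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(In), "I"(4095));
// where 'r' selects general registers (printed as w0/w1 via the %w modifier)
// and 'I' requires the constant to be a valid ADD immediate.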
8120 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
8121   // At this point, we have to lower this constraint to something else, so we
8122   // lower it to an "r" or "w". However, by doing this we will force the result
8123   // to be in register, while the X constraint is much more permissive.
8124   //
8125   // Although we are correct (we are free to emit anything, without
8126   // constraints), we might break use cases that would expect us to be more
8127   // efficient and emit something else.
8128   if (!Subtarget->hasFPARMv8())
8129     return "r";
8130 
8131   if (ConstraintVT.isFloatingPoint())
8132     return "w";
8133 
8134   if (ConstraintVT.isVector() &&
8135      (ConstraintVT.getSizeInBits() == 64 ||
8136       ConstraintVT.getSizeInBits() == 128))
8137     return "w";
8138 
8139   return "r";
8140 }
8141 
8142 enum PredicateConstraint {
8143   Upl,
8144   Upa,
8145   Invalid
8146 };
8147 
8148 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
8149   PredicateConstraint P = PredicateConstraint::Invalid;
8150   if (Constraint == "Upa")
8151     P = PredicateConstraint::Upa;
8152   if (Constraint == "Upl")
8153     P = PredicateConstraint::Upl;
8154   return P;
8155 }
8156 
8157 /// getConstraintType - Given a constraint letter, return the type of
8158 /// constraint it is for this target.
8159 AArch64TargetLowering::ConstraintType
8160 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
8161   if (Constraint.size() == 1) {
8162     switch (Constraint[0]) {
8163     default:
8164       break;
8165     case 'x':
8166     case 'w':
8167     case 'y':
8168       return C_RegisterClass;
8169     // An address with a single base register. Due to the way we
8170     // currently handle addresses it is the same as 'r'.
8171     case 'Q':
8172       return C_Memory;
8173     case 'I':
8174     case 'J':
8175     case 'K':
8176     case 'L':
8177     case 'M':
8178     case 'N':
8179     case 'Y':
8180     case 'Z':
8181       return C_Immediate;
8182     case 'z':
8183     case 'S': // A symbolic address
8184       return C_Other;
8185     }
8186   } else if (parsePredicateConstraint(Constraint) !=
8187              PredicateConstraint::Invalid)
    return C_RegisterClass;
8189   return TargetLowering::getConstraintType(Constraint);
8190 }
8191 
8192 /// Examine constraint type and operand type and determine a weight value.
8193 /// This object must already have been set up with the operand type
8194 /// and the current alternative constraint selected.
8195 TargetLowering::ConstraintWeight
8196 AArch64TargetLowering::getSingleConstraintMatchWeight(
8197     AsmOperandInfo &info, const char *constraint) const {
8198   ConstraintWeight weight = CW_Invalid;
8199   Value *CallOperandVal = info.CallOperandVal;
8200   // If we don't have a value, we can't do a match,
8201   // but allow it at the lowest weight.
8202   if (!CallOperandVal)
8203     return CW_Default;
8204   Type *type = CallOperandVal->getType();
8205   // Look at the constraint type.
8206   switch (*constraint) {
8207   default:
8208     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
8209     break;
8210   case 'x':
8211   case 'w':
8212   case 'y':
8213     if (type->isFloatingPointTy() || type->isVectorTy())
8214       weight = CW_Register;
8215     break;
8216   case 'z':
8217     weight = CW_Constant;
8218     break;
8219   case 'U':
8220     if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
8221       weight = CW_Register;
8222     break;
8223   }
8224   return weight;
8225 }
8226 
8227 std::pair<unsigned, const TargetRegisterClass *>
8228 AArch64TargetLowering::getRegForInlineAsmConstraint(
8229     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
8230   if (Constraint.size() == 1) {
8231     switch (Constraint[0]) {
8232     case 'r':
8233       if (VT.isScalableVector())
8234         return std::make_pair(0U, nullptr);
8235       if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
8236         return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
8237       if (VT.getFixedSizeInBits() == 64)
8238         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
8239       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
8240     case 'w': {
8241       if (!Subtarget->hasFPARMv8())
8242         break;
8243       if (VT.isScalableVector()) {
8244         if (VT.getVectorElementType() != MVT::i1)
8245           return std::make_pair(0U, &AArch64::ZPRRegClass);
8246         return std::make_pair(0U, nullptr);
8247       }
8248       uint64_t VTSize = VT.getFixedSizeInBits();
8249       if (VTSize == 16)
8250         return std::make_pair(0U, &AArch64::FPR16RegClass);
8251       if (VTSize == 32)
8252         return std::make_pair(0U, &AArch64::FPR32RegClass);
8253       if (VTSize == 64)
8254         return std::make_pair(0U, &AArch64::FPR64RegClass);
8255       if (VTSize == 128)
8256         return std::make_pair(0U, &AArch64::FPR128RegClass);
8257       break;
8258     }
8259     // The instructions that this constraint is designed for can
8260     // only take 128-bit registers so just use that regclass.
8261     case 'x':
8262       if (!Subtarget->hasFPARMv8())
8263         break;
8264       if (VT.isScalableVector())
8265         return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
8266       if (VT.getSizeInBits() == 128)
8267         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
8268       break;
8269     case 'y':
8270       if (!Subtarget->hasFPARMv8())
8271         break;
8272       if (VT.isScalableVector())
8273         return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
8274       break;
8275     }
8276   } else {
8277     PredicateConstraint PC = parsePredicateConstraint(Constraint);
8278     if (PC != PredicateConstraint::Invalid) {
8279       if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
8280         return std::make_pair(0U, nullptr);
8281       bool restricted = (PC == PredicateConstraint::Upl);
8282       return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
8283                         : std::make_pair(0U, &AArch64::PPRRegClass);
8284     }
8285   }
8286   if (StringRef("{cc}").equals_insensitive(Constraint))
8287     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
8288 
8289   // Use the default implementation in TargetLowering to convert the register
8290   // constraint into a member of a register class.
8291   std::pair<unsigned, const TargetRegisterClass *> Res;
8292   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
8293 
8294   // Not found as a standard register?
8295   if (!Res.second) {
8296     unsigned Size = Constraint.size();
8297     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
8298         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
8299       int RegNo;
8300       bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
8301       if (!Failed && RegNo >= 0 && RegNo <= 31) {
8302         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this unless there's a modifier, in
        // which case we'll emit the correct register instead.
8305         if (VT != MVT::Other && VT.getSizeInBits() == 64) {
8306           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
8307           Res.second = &AArch64::FPR64RegClass;
8308         } else {
8309           Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
8310           Res.second = &AArch64::FPR128RegClass;
8311         }
8312       }
8313     }
8314   }
8315 
8316   if (Res.second && !Subtarget->hasFPARMv8() &&
8317       !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
8318       !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
8319     return std::make_pair(0U, nullptr);
8320 
8321   return Res;
8322 }
8323 
8324 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
8325                                                   llvm::Type *Ty,
8326                                                   bool AllowUnknown) const {
8327   if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
8328     return EVT(MVT::i64x8);
8329 
8330   return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
8331 }
8332 
8333 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8334 /// vector.  If it is invalid, don't add anything to Ops.
8335 void AArch64TargetLowering::LowerAsmOperandForConstraint(
8336     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
8337     SelectionDAG &DAG) const {
8338   SDValue Result;
8339 
8340   // Currently only support length 1 constraints.
8341   if (Constraint.length() != 1)
8342     return;
8343 
8344   char ConstraintLetter = Constraint[0];
8345   switch (ConstraintLetter) {
8346   default:
8347     break;
8348 
  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
8351   case 'z': {
8352     // 'z' maps to xzr or wzr so it needs an input of 0.
8353     if (!isNullConstant(Op))
8354       return;
8355 
8356     if (Op.getValueType() == MVT::i64)
8357       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
8358     else
8359       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
8360     break;
8361   }
8362   case 'S': {
8363     // An absolute symbolic address or label reference.
8364     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
8365       Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
8366                                           GA->getValueType(0));
8367     } else if (const BlockAddressSDNode *BA =
8368                    dyn_cast<BlockAddressSDNode>(Op)) {
8369       Result =
8370           DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
8371     } else
8372       return;
8373     break;
8374   }
8375 
8376   case 'I':
8377   case 'J':
8378   case 'K':
8379   case 'L':
8380   case 'M':
8381   case 'N':
8382     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
8383     if (!C)
8384       return;
8385 
8386     // Grab the value and do some validation.
8387     uint64_t CVal = C->getZExtValue();
8388     switch (ConstraintLetter) {
8389     // The I constraint applies only to simple ADD or SUB immediate operands:
8390     // i.e. 0 to 4095 with optional shift by 12
8391     // The J constraint applies only to ADD or SUB immediates that would be
8392     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
8393     // instruction [or vice versa], in other words -1 to -4095 with optional
8394     // left shift by 12.
8395     case 'I':
8396       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
8397         break;
8398       return;
8399     case 'J': {
8400       uint64_t NVal = -C->getSExtValue();
8401       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
8402         CVal = C->getSExtValue();
8403         break;
8404       }
8405       return;
8406     }
8407     // The K and L constraints apply *only* to logical immediates, including
8408     // what used to be the MOVI alias for ORR (though the MOVI alias has now
8409     // been removed and MOV should be used). So these constraints have to
8410     // distinguish between bit patterns that are valid 32-bit or 64-bit
8411     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
8412     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
8413     // versa.
8414     case 'K':
8415       if (AArch64_AM::isLogicalImmediate(CVal, 32))
8416         break;
8417       return;
8418     case 'L':
8419       if (AArch64_AM::isLogicalImmediate(CVal, 64))
8420         break;
8421       return;
8422     // The M and N constraints are a superset of K and L respectively, for use
8423     // with the MOV (immediate) alias. As well as the logical immediates they
8424     // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note, some of this code is liberally stolen from the asm parser.
8428     case 'M': {
8429       if (!isUInt<32>(CVal))
8430         return;
8431       if (AArch64_AM::isLogicalImmediate(CVal, 32))
8432         break;
8433       if ((CVal & 0xFFFF) == CVal)
8434         break;
8435       if ((CVal & 0xFFFF0000ULL) == CVal)
8436         break;
8437       uint64_t NCVal = ~(uint32_t)CVal;
8438       if ((NCVal & 0xFFFFULL) == NCVal)
8439         break;
8440       if ((NCVal & 0xFFFF0000ULL) == NCVal)
8441         break;
8442       return;
8443     }
8444     case 'N': {
8445       if (AArch64_AM::isLogicalImmediate(CVal, 64))
8446         break;
8447       if ((CVal & 0xFFFFULL) == CVal)
8448         break;
8449       if ((CVal & 0xFFFF0000ULL) == CVal)
8450         break;
8451       if ((CVal & 0xFFFF00000000ULL) == CVal)
8452         break;
8453       if ((CVal & 0xFFFF000000000000ULL) == CVal)
8454         break;
8455       uint64_t NCVal = ~CVal;
8456       if ((NCVal & 0xFFFFULL) == NCVal)
8457         break;
8458       if ((NCVal & 0xFFFF0000ULL) == NCVal)
8459         break;
8460       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
8461         break;
8462       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
8463         break;
8464       return;
8465     }
8466     default:
8467       return;
8468     }
8469 
8470     // All assembler immediates are 64-bit integers.
8471     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
8472     break;
8473   }
8474 
8475   if (Result.getNode()) {
8476     Ops.push_back(Result);
8477     return;
8478   }
8479 
8480   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
8481 }
8482 
8483 //===----------------------------------------------------------------------===//
8484 //                     AArch64 Advanced SIMD Support
8485 //===----------------------------------------------------------------------===//
8486 
8487 /// WidenVector - Given a value in the V64 register class, produce the
8488 /// equivalent value in the V128 register class.
8489 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
8490   EVT VT = V64Reg.getValueType();
8491   unsigned NarrowSize = VT.getVectorNumElements();
8492   MVT EltTy = VT.getVectorElementType().getSimpleVT();
8493   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
8494   SDLoc DL(V64Reg);
8495 
8496   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
8497                      V64Reg, DAG.getConstant(0, DL, MVT::i64));
8498 }
8499 
8500 /// getExtFactor - Determine the adjustment factor for the position when
8501 /// generating an "extract from vector registers" instruction.
8502 static unsigned getExtFactor(SDValue &V) {
8503   EVT EltType = V.getValueType().getVectorElementType();
8504   return EltType.getSizeInBits() / 8;
8505 }
8506 
8507 /// NarrowVector - Given a value in the V128 register class, produce the
8508 /// equivalent value in the V64 register class.
8509 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
8510   EVT VT = V128Reg.getValueType();
8511   unsigned WideSize = VT.getVectorNumElements();
8512   MVT EltTy = VT.getVectorElementType().getSimpleVT();
8513   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
8514   SDLoc DL(V128Reg);
8515 
8516   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
8517 }
8518 
8519 // Gather data to see if the operation can be modelled as a
8520 // shuffle in combination with VEXTs.
8521 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
8522                                                   SelectionDAG &DAG) const {
8523   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8524   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
8525   SDLoc dl(Op);
8526   EVT VT = Op.getValueType();
8527   assert(!VT.isScalableVector() &&
8528          "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
8529   unsigned NumElts = VT.getVectorNumElements();
8530 
8531   struct ShuffleSourceInfo {
8532     SDValue Vec;
8533     unsigned MinElt;
8534     unsigned MaxElt;
8535 
8536     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8537     // be compatible with the shuffle we intend to construct. As a result
8538     // ShuffleVec will be some sliding window into the original Vec.
8539     SDValue ShuffleVec;
8540 
8541     // Code should guarantee that element i in Vec starts at element "WindowBase
8542     // + i * WindowScale in ShuffleVec".
8543     int WindowBase;
8544     int WindowScale;
8545 
8546     ShuffleSourceInfo(SDValue Vec)
8547       : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
8548           ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
8549 
8550     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8551   };
8552 
8553   // First gather all vectors used as an immediate source for this BUILD_VECTOR
8554   // node.
8555   SmallVector<ShuffleSourceInfo, 2> Sources;
8556   for (unsigned i = 0; i < NumElts; ++i) {
8557     SDValue V = Op.getOperand(i);
8558     if (V.isUndef())
8559       continue;
8560     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8561              !isa<ConstantSDNode>(V.getOperand(1))) {
8562       LLVM_DEBUG(
8563           dbgs() << "Reshuffle failed: "
8564                     "a shuffle can only come from building a vector from "
8565                     "various elements of other vectors, provided their "
8566                     "indices are constant\n");
8567       return SDValue();
8568     }
8569 
8570     // Add this element source to the list if it's not already there.
8571     SDValue SourceVec = V.getOperand(0);
8572     auto Source = find(Sources, SourceVec);
8573     if (Source == Sources.end())
8574       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8575 
8576     // Update the minimum and maximum lane number seen.
8577     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
8578     Source->MinElt = std::min(Source->MinElt, EltNo);
8579     Source->MaxElt = std::max(Source->MaxElt, EltNo);
8580   }
8581 
8582   if (Sources.size() > 2) {
8583     LLVM_DEBUG(
8584         dbgs() << "Reshuffle failed: currently only do something sane when at "
8585                   "most two source vectors are involved\n");
8586     return SDValue();
8587   }
8588 
8589   // Find out the smallest element size among result and two sources, and use
8590   // it as element size to build the shuffle_vector.
8591   EVT SmallestEltTy = VT.getVectorElementType();
8592   for (auto &Source : Sources) {
8593     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8594     if (SrcEltTy.bitsLT(SmallestEltTy)) {
8595       SmallestEltTy = SrcEltTy;
8596     }
8597   }
8598   unsigned ResMultiplier =
8599       VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8600   uint64_t VTSize = VT.getFixedSizeInBits();
8601   NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
8602   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8603 
8604   // If the source vector is too wide or too narrow, we may nevertheless be able
8605   // to construct a compatible shuffle either by concatenating it with UNDEF or
8606   // extracting a suitable range of elements.
8607   for (auto &Src : Sources) {
8608     EVT SrcVT = Src.ShuffleVec.getValueType();
8609 
8610     uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8611     if (SrcVTSize == VTSize)
8612       continue;
8613 
8614     // This stage of the search produces a source with the same element type as
8615     // the original, but with a total width matching the BUILD_VECTOR output.
8616     EVT EltVT = SrcVT.getVectorElementType();
8617     unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8618     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8619 
8620     if (SrcVTSize < VTSize) {
8621       assert(2 * SrcVTSize == VTSize);
8622       // We can pad out the smaller vector for free, so if it's part of a
8623       // shuffle...
8624       Src.ShuffleVec =
8625           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8626                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8627       continue;
8628     }
8629 
8630     if (SrcVTSize != 2 * VTSize) {
8631       LLVM_DEBUG(
8632           dbgs() << "Reshuffle failed: result vector too small to extract\n");
8633       return SDValue();
8634     }
8635 
8636     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8637       LLVM_DEBUG(
8638           dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
8639       return SDValue();
8640     }
8641 
8642     if (Src.MinElt >= NumSrcElts) {
8643       // The extraction can just take the second half
8644       Src.ShuffleVec =
8645           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8646                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
8647       Src.WindowBase = -NumSrcElts;
8648     } else if (Src.MaxElt < NumSrcElts) {
8649       // The extraction can just take the first half
8650       Src.ShuffleVec =
8651           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8652                       DAG.getConstant(0, dl, MVT::i64));
8653     } else {
8654       // An actual VEXT is needed
8655       SDValue VEXTSrc1 =
8656           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8657                       DAG.getConstant(0, dl, MVT::i64));
8658       SDValue VEXTSrc2 =
8659           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8660                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
8661       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
8662 
8663       if (!SrcVT.is64BitVector()) {
8664         LLVM_DEBUG(
8665           dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
8666                     "for SVE vectors.");
8667         return SDValue();
8668       }
8669 
8670       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
8671                                    VEXTSrc2,
8672                                    DAG.getConstant(Imm, dl, MVT::i32));
8673       Src.WindowBase = -Src.MinElt;
8674     }
8675   }
8676 
8677   // Another possible incompatibility occurs from the vector element types. We
8678   // can fix this by bitcasting the source vectors to the same type we intend
8679   // for the shuffle.
8680   for (auto &Src : Sources) {
8681     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8682     if (SrcEltTy == SmallestEltTy)
8683       continue;
8684     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8685     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
8686     Src.WindowScale =
8687         SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8688     Src.WindowBase *= Src.WindowScale;
8689   }
8690 
8691   // Final sanity check before we try to actually produce a shuffle.
8692   LLVM_DEBUG(for (auto Src
8693                   : Sources)
8694                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8695 
8696   // The stars all align, our next step is to produce the mask for the shuffle.
8697   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8698   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8699   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8700     SDValue Entry = Op.getOperand(i);
8701     if (Entry.isUndef())
8702       continue;
8703 
8704     auto Src = find(Sources, Entry.getOperand(0));
8705     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8706 
8707     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8708     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8709     // segment.
8710     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8711     int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8712                                VT.getScalarSizeInBits());
8713     int LanesDefined = BitsDefined / BitsPerShuffleLane;
8714 
8715     // This source is expected to fill ResMultiplier lanes of the final shuffle,
8716     // starting at the appropriate offset.
8717     int *LaneMask = &Mask[i * ResMultiplier];
8718 
8719     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8720     ExtractBase += NumElts * (Src - Sources.begin());
8721     for (int j = 0; j < LanesDefined; ++j)
8722       LaneMask[j] = ExtractBase + j;
8723   }
8724 
8725   // Final check before we try to produce nonsense...
8726   if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
8727     LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
8728     return SDValue();
8729   }
8730 
8731   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8732   for (unsigned i = 0; i < Sources.size(); ++i)
8733     ShuffleOps[i] = Sources[i].ShuffleVec;
8734 
8735   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8736                                          ShuffleOps[1], Mask);
8737   SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
8738 
8739   LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
8740              dbgs() << "Reshuffle, creating node: "; V.dump(););
8741 
8742   return V;
8743 }
8744 
8745 // check if an EXT instruction can handle the shuffle mask when the
8746 // vector sources of the shuffle are the same.
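// For example, with a 4-element vector the mask <1, 2, 3, 0> is a singleton
// EXT with Imm == 1.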
8747 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
8748   unsigned NumElts = VT.getVectorNumElements();
8749 
8750   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
8751   if (M[0] < 0)
8752     return false;
8753 
8754   Imm = M[0];
8755 
8756   // If this is a VEXT shuffle, the immediate value is the index of the first
8757   // element.  The other shuffle indices must be the successive elements after
8758   // the first one.
8759   unsigned ExpectedElt = Imm;
8760   for (unsigned i = 1; i < NumElts; ++i) {
8761     // Increment the expected index.  If it wraps around, just follow it
8762     // back to index zero and keep going.
8763     ++ExpectedElt;
8764     if (ExpectedElt == NumElts)
8765       ExpectedElt = 0;
8766 
8767     if (M[i] < 0)
8768       continue; // ignore UNDEF indices
8769     if (ExpectedElt != static_cast<unsigned>(M[i]))
8770       return false;
8771   }
8772 
8773   return true;
8774 }
8775 
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
8780 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
8781                           unsigned &DupLaneOp) {
8782   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8783          "Only possible block sizes for wide DUP are: 16, 32, 64");
8784 
8785   if (BlockSize <= VT.getScalarSizeInBits())
8786     return false;
8787   if (BlockSize % VT.getScalarSizeInBits() != 0)
8788     return false;
8789   if (VT.getSizeInBits() % BlockSize != 0)
8790     return false;
8791 
8792   size_t SingleVecNumElements = VT.getVectorNumElements();
8793   size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
8794   size_t NumBlocks = VT.getSizeInBits() / BlockSize;
8795 
8796   // We are looking for masks like
8797   // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
8798   // might be replaced by 'undefined'. BlockIndices will eventually contain
8799   // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
8800   // for the above examples)
8801   SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
8802   for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
8803     for (size_t I = 0; I < NumEltsPerBlock; I++) {
8804       int Elt = M[BlockIndex * NumEltsPerBlock + I];
8805       if (Elt < 0)
8806         continue;
8807       // For now we don't support shuffles that use the second operand
8808       if ((unsigned)Elt >= SingleVecNumElements)
8809         return false;
8810       if (BlockElts[I] < 0)
8811         BlockElts[I] = Elt;
8812       else if (BlockElts[I] != Elt)
8813         return false;
8814     }
8815 
8816   // We found a candidate block (possibly with some undefs). It must be a
8817   // sequence of consecutive integers starting with a value divisible by
8818   // NumEltsPerBlock with some values possibly replaced by undef-s.
8819 
8820   // Find first non-undef element
8821   auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
8822   assert(FirstRealEltIter != BlockElts.end() &&
8823          "Shuffle with all-undefs must have been caught by previous cases, "
8824          "e.g. isSplat()");
8825   if (FirstRealEltIter == BlockElts.end()) {
8826     DupLaneOp = 0;
8827     return true;
8828   }
8829 
8830   // Index of FirstRealElt in BlockElts
8831   size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
8832 
8833   if ((unsigned)*FirstRealEltIter < FirstRealIndex)
8834     return false;
8835   // BlockElts[0] must have the following value if it isn't undef:
8836   size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
8837 
8838   // Check the first element
8839   if (Elt0 % NumEltsPerBlock != 0)
8840     return false;
8841   // Check that the sequence indeed consists of consecutive integers (modulo
8842   // undefs)
8843   for (size_t I = 0; I < NumEltsPerBlock; I++)
8844     if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
8845       return false;
8846 
8847   DupLaneOp = Elt0 / NumEltsPerBlock;
8848   return true;
8849 }
8850 
// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are different.
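// For example, for <4 x i32> the mask <2, 3, 4, 5> selects consecutive
// elements of the concatenated inputs starting at index 2, so it maps to an
// EXT with Imm == 2 and ReverseEXT left unchanged.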
8853 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
8854                       unsigned &Imm) {
8855   // Look for the first non-undef element.
8856   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
8857 
  // Benefit from APInt to handle overflow when calculating expected element.
8859   unsigned NumElts = VT.getVectorNumElements();
8860   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
8861   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
8862   // The following shuffle indices must be the successive elements after the
8863   // first real element.
8864   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
8865       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
8866   if (FirstWrongElt != M.end())
8867     return false;
8868 
8869   // The index of an EXT is the first element if it is not UNDEF.
8870   // Watch out for the beginning UNDEFs. The EXT index should be the expected
8871   // value of the first element.  E.g.
8872   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
8873   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
8874   // ExpectedElt is the last mask index plus 1.
8875   Imm = ExpectedElt.getZExtValue();
8876 
  // There are two different cases that require reversing the input vectors.
  // For example, for vector <4 x i32> we have the following cases:
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use the mask <5, 6, 7, 0>, which requires
  // swapping the two input vectors.
8883   if (Imm < NumElts)
8884     ReverseEXT = true;
8885   else
8886     Imm -= NumElts;
8887 
8888   return true;
8889 }
8890 
8891 /// isREVMask - Check if a vector shuffle corresponds to a REV
8892 /// instruction with the specified blocksize.  (The order of the elements
8893 /// within each block of the vector is reversed.)
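/// For example, a REV32 on <8 x i16> reverses the 16-bit lanes within each
/// 32-bit block, corresponding to the mask <1, 0, 3, 2, 5, 4, 7, 6>.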
8894 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
8895   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8896          "Only possible block sizes for REV are: 16, 32, 64");
8897 
8898   unsigned EltSz = VT.getScalarSizeInBits();
8899   if (EltSz == 64)
8900     return false;
8901 
8902   unsigned NumElts = VT.getVectorNumElements();
8903   unsigned BlockElts = M[0] + 1;
8904   // If the first shuffle index is UNDEF, be optimistic.
8905   if (M[0] < 0)
8906     BlockElts = BlockSize / EltSz;
8907 
8908   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
8909     return false;
8910 
8911   for (unsigned i = 0; i < NumElts; ++i) {
8912     if (M[i] < 0)
8913       continue; // ignore UNDEF indices
8914     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
8915       return false;
8916   }
8917 
8918   return true;
8919 }
8920 
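/// isZIPMask - Check if a vector shuffle corresponds to a ZIP1/ZIP2
/// instruction, which interleaves the two inputs. For <4 x i32>, ZIP1 uses
/// the mask <0, 4, 1, 5> and ZIP2 uses <2, 6, 3, 7>; WhichResult is set to 0
/// for the former and 1 for the latter.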
8921 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8922   unsigned NumElts = VT.getVectorNumElements();
8923   if (NumElts % 2 != 0)
8924     return false;
8925   WhichResult = (M[0] == 0 ? 0 : 1);
8926   unsigned Idx = WhichResult * NumElts / 2;
8927   for (unsigned i = 0; i != NumElts; i += 2) {
8928     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8929         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
8930       return false;
8931     Idx += 1;
8932   }
8933 
8934   return true;
8935 }
8936 
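/// isUZPMask - Check if a vector shuffle corresponds to a UZP1/UZP2
/// instruction, which de-interleaves the concatenated inputs. For <4 x i32>,
/// UZP1 uses the mask <0, 2, 4, 6> and UZP2 uses <1, 3, 5, 7>; WhichResult is
/// set to 0 for the former and 1 for the latter.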
8937 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8938   unsigned NumElts = VT.getVectorNumElements();
8939   WhichResult = (M[0] == 0 ? 0 : 1);
8940   for (unsigned i = 0; i != NumElts; ++i) {
8941     if (M[i] < 0)
8942       continue; // ignore UNDEF indices
8943     if ((unsigned)M[i] != 2 * i + WhichResult)
8944       return false;
8945   }
8946 
8947   return true;
8948 }
8949 
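/// isTRNMask - Check if a vector shuffle corresponds to a TRN1/TRN2
/// instruction, which transposes the even/odd lanes of the two inputs. For
/// <4 x i32>, TRN1 uses the mask <0, 4, 2, 6> and TRN2 uses <1, 5, 3, 7>;
/// WhichResult is set to 0 for the former and 1 for the latter.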
8950 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8951   unsigned NumElts = VT.getVectorNumElements();
8952   if (NumElts % 2 != 0)
8953     return false;
8954   WhichResult = (M[0] == 0 ? 0 : 1);
8955   for (unsigned i = 0; i < NumElts; i += 2) {
8956     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8957         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
8958       return false;
8959   }
8960   return true;
8961 }
8962 
8963 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
8964 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8965 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
8966 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8967   unsigned NumElts = VT.getVectorNumElements();
8968   if (NumElts % 2 != 0)
8969     return false;
8970   WhichResult = (M[0] == 0 ? 0 : 1);
8971   unsigned Idx = WhichResult * NumElts / 2;
8972   for (unsigned i = 0; i != NumElts; i += 2) {
8973     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8974         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
8975       return false;
8976     Idx += 1;
8977   }
8978 
8979   return true;
8980 }
8981 
8982 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
8983 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
8985 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8986   unsigned Half = VT.getVectorNumElements() / 2;
8987   WhichResult = (M[0] == 0 ? 0 : 1);
8988   for (unsigned j = 0; j != 2; ++j) {
8989     unsigned Idx = WhichResult;
8990     for (unsigned i = 0; i != Half; ++i) {
8991       int MIdx = M[i + j * Half];
8992       if (MIdx >= 0 && (unsigned)MIdx != Idx)
8993         return false;
8994       Idx += 2;
8995     }
8996   }
8997 
8998   return true;
8999 }
9000 
9001 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
9002 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9003 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
9004 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9005   unsigned NumElts = VT.getVectorNumElements();
9006   if (NumElts % 2 != 0)
9007     return false;
9008   WhichResult = (M[0] == 0 ? 0 : 1);
9009   for (unsigned i = 0; i < NumElts; i += 2) {
9010     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
9011         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
9012       return false;
9013   }
9014   return true;
9015 }
9016 
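/// isINSMask - Check if the mask is an identity copy of one input except for
/// a single "anomalous" lane, which can be materialised with an INS (insert
/// lane) instruction. For example, for <4 x i32> the mask <0, 5, 2, 3> copies
/// the left-hand input and inserts element 1 of the right-hand input into
/// lane 1, so DstIsLeft is set and Anomaly == 1.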
9017 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
9018                       bool &DstIsLeft, int &Anomaly) {
9019   if (M.size() != static_cast<size_t>(NumInputElements))
9020     return false;
9021 
9022   int NumLHSMatch = 0, NumRHSMatch = 0;
9023   int LastLHSMismatch = -1, LastRHSMismatch = -1;
9024 
9025   for (int i = 0; i < NumInputElements; ++i) {
9026     if (M[i] == -1) {
9027       ++NumLHSMatch;
9028       ++NumRHSMatch;
9029       continue;
9030     }
9031 
9032     if (M[i] == i)
9033       ++NumLHSMatch;
9034     else
9035       LastLHSMismatch = i;
9036 
9037     if (M[i] == i + NumInputElements)
9038       ++NumRHSMatch;
9039     else
9040       LastRHSMismatch = i;
9041   }
9042 
9043   if (NumLHSMatch == NumInputElements - 1) {
9044     DstIsLeft = true;
9045     Anomaly = LastLHSMismatch;
9046     return true;
9047   } else if (NumRHSMatch == NumInputElements - 1) {
9048     DstIsLeft = false;
9049     Anomaly = LastRHSMismatch;
9050     return true;
9051   }
9052 
9053   return false;
9054 }
9055 
9056 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
9057   if (VT.getSizeInBits() != 128)
9058     return false;
9059 
9060   unsigned NumElts = VT.getVectorNumElements();
9061 
9062   for (int I = 0, E = NumElts / 2; I != E; I++) {
9063     if (Mask[I] != I)
9064       return false;
9065   }
9066 
9067   int Offset = NumElts / 2;
9068   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
9069     if (Mask[I] != I + SplitLHS * Offset)
9070       return false;
9071   }
9072 
9073   return true;
9074 }
9075 
9076 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
9077   SDLoc DL(Op);
9078   EVT VT = Op.getValueType();
9079   SDValue V0 = Op.getOperand(0);
9080   SDValue V1 = Op.getOperand(1);
9081   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
9082 
9083   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
9084       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
9085     return SDValue();
9086 
9087   bool SplitV0 = V0.getValueSizeInBits() == 128;
9088 
9089   if (!isConcatMask(Mask, VT, SplitV0))
9090     return SDValue();
9091 
9092   EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
9093   if (SplitV0) {
9094     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
9095                      DAG.getConstant(0, DL, MVT::i64));
9096   }
9097   if (V1.getValueSizeInBits() == 128) {
9098     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
9099                      DAG.getConstant(0, DL, MVT::i64));
9100   }
9101   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
9102 }
9103 
9104 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9105 /// the specified operations to build the shuffle.
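/// Each PFEntry packs a cost in its top two bits, a 4-bit OP_* opcode and two
/// 13-bit operand IDs that recursively index the perfect-shuffle table, as
/// decoded at the start of this function.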
9106 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
9107                                       SDValue RHS, SelectionDAG &DAG,
9108                                       const SDLoc &dl) {
9109   unsigned OpNum = (PFEntry >> 26) & 0x0F;
9110   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
9111   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
9112 
9113   enum {
9114     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
9115     OP_VREV,
9116     OP_VDUP0,
9117     OP_VDUP1,
9118     OP_VDUP2,
9119     OP_VDUP3,
9120     OP_VEXT1,
9121     OP_VEXT2,
9122     OP_VEXT3,
9123     OP_VUZPL, // VUZP, left result
9124     OP_VUZPR, // VUZP, right result
9125     OP_VZIPL, // VZIP, left result
9126     OP_VZIPR, // VZIP, right result
9127     OP_VTRNL, // VTRN, left result
9128     OP_VTRNR  // VTRN, right result
9129   };
9130 
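  // Operand IDs encode a 4-element shuffle mask as base-9 digits (8 meaning
  // undef), so 0123 is the identity shuffle of LHS and 4567 the identity
  // shuffle of RHS.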
9131   if (OpNum == OP_COPY) {
9132     if (LHSID == (1 * 9 + 2) * 9 + 3)
9133       return LHS;
9134     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
9135     return RHS;
9136   }
9137 
9138   SDValue OpLHS, OpRHS;
9139   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9140   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
9141   EVT VT = OpLHS.getValueType();
9142 
9143   switch (OpNum) {
9144   default:
9145     llvm_unreachable("Unknown shuffle opcode!");
9146   case OP_VREV:
9147     // VREV divides the vector in half and swaps within the half.
9148     if (VT.getVectorElementType() == MVT::i32 ||
9149         VT.getVectorElementType() == MVT::f32)
9150       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
9151     // vrev <4 x i16> -> REV32
9152     if (VT.getVectorElementType() == MVT::i16 ||
9153         VT.getVectorElementType() == MVT::f16 ||
9154         VT.getVectorElementType() == MVT::bf16)
9155       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
9156     // vrev <4 x i8> -> REV16
9157     assert(VT.getVectorElementType() == MVT::i8);
9158     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
9159   case OP_VDUP0:
9160   case OP_VDUP1:
9161   case OP_VDUP2:
9162   case OP_VDUP3: {
9163     EVT EltTy = VT.getVectorElementType();
9164     unsigned Opcode;
9165     if (EltTy == MVT::i8)
9166       Opcode = AArch64ISD::DUPLANE8;
9167     else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
9168       Opcode = AArch64ISD::DUPLANE16;
9169     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
9170       Opcode = AArch64ISD::DUPLANE32;
9171     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
9172       Opcode = AArch64ISD::DUPLANE64;
9173     else
9174       llvm_unreachable("Invalid vector element type?");
9175 
9176     if (VT.getSizeInBits() == 64)
9177       OpLHS = WidenVector(OpLHS, DAG);
9178     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
9179     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
9180   }
9181   case OP_VEXT1:
9182   case OP_VEXT2:
9183   case OP_VEXT3: {
9184     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
9185     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
9186                        DAG.getConstant(Imm, dl, MVT::i32));
9187   }
9188   case OP_VUZPL:
9189     return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
9190                        OpRHS);
9191   case OP_VUZPR:
9192     return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
9193                        OpRHS);
9194   case OP_VZIPL:
9195     return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
9196                        OpRHS);
9197   case OP_VZIPR:
9198     return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
9199                        OpRHS);
9200   case OP_VTRNL:
9201     return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
9202                        OpRHS);
9203   case OP_VTRNR:
9204     return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
9205                        OpRHS);
9206   }
9207 }
9208 
9209 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
9210                            SelectionDAG &DAG) {
9211   // Check to see if we can use the TBL instruction.
9212   SDValue V1 = Op.getOperand(0);
9213   SDValue V2 = Op.getOperand(1);
9214   SDLoc DL(Op);
9215 
9216   EVT EltVT = Op.getValueType().getVectorElementType();
9217   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
9218 
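  // TBL indexes individual bytes, so expand each element index in the shuffle
  // mask into BytesPerElt consecutive byte indices (e.g. with 16-bit elements
  // a mask entry of 3 becomes byte indices 6 and 7).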
9219   SmallVector<SDValue, 8> TBLMask;
9220   for (int Val : ShuffleMask) {
9221     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
9222       unsigned Offset = Byte + Val * BytesPerElt;
9223       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
9224     }
9225   }
9226 
9227   MVT IndexVT = MVT::v8i8;
9228   unsigned IndexLen = 8;
9229   if (Op.getValueSizeInBits() == 128) {
9230     IndexVT = MVT::v16i8;
9231     IndexLen = 16;
9232   }
9233 
9234   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
9235   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
9236 
9237   SDValue Shuffle;
9238   if (V2.getNode()->isUndef()) {
9239     if (IndexLen == 8)
9240       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
9241     Shuffle = DAG.getNode(
9242         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9243         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9244         DAG.getBuildVector(IndexVT, DL,
9245                            makeArrayRef(TBLMask.data(), IndexLen)));
9246   } else {
9247     if (IndexLen == 8) {
9248       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
9249       Shuffle = DAG.getNode(
9250           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9251           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
9252           DAG.getBuildVector(IndexVT, DL,
9253                              makeArrayRef(TBLMask.data(), IndexLen)));
9254     } else {
9255       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
9256       // cannot currently represent the register constraints on the input
9257       // table registers.
9258       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
9259       //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
9260       //                   IndexLen));
9261       Shuffle = DAG.getNode(
9262           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
9263           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
9264           V2Cst, DAG.getBuildVector(IndexVT, DL,
9265                                     makeArrayRef(TBLMask.data(), IndexLen)));
9266     }
9267   }
9268   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
9269 }
9270 
9271 static unsigned getDUPLANEOp(EVT EltType) {
9272   if (EltType == MVT::i8)
9273     return AArch64ISD::DUPLANE8;
9274   if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
9275     return AArch64ISD::DUPLANE16;
9276   if (EltType == MVT::i32 || EltType == MVT::f32)
9277     return AArch64ISD::DUPLANE32;
9278   if (EltType == MVT::i64 || EltType == MVT::f64)
9279     return AArch64ISD::DUPLANE64;
9280 
9281   llvm_unreachable("Invalid vector element type?");
9282 }
9283 
9284 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
9285                             unsigned Opcode, SelectionDAG &DAG) {
9286   // Try to eliminate a bitcasted extract subvector before a DUPLANE.
9287   auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
9288     // Match: dup (bitcast (extract_subv X, C)), LaneC
9289     if (BitCast.getOpcode() != ISD::BITCAST ||
9290         BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
9291       return false;
9292 
    // The extract index must align in the destination type. That may not
    // happen if the bitcast is from a narrow to a wide type.
9295     SDValue Extract = BitCast.getOperand(0);
9296     unsigned ExtIdx = Extract.getConstantOperandVal(1);
9297     unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
9298     unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
9299     unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
9300     if (ExtIdxInBits % CastedEltBitWidth != 0)
9301       return false;
9302 
9303     // Update the lane value by offsetting with the scaled extract index.
9304     LaneC += ExtIdxInBits / CastedEltBitWidth;
9305 
9306     // Determine the casted vector type of the wide vector input.
9307     // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
9308     // Examples:
9309     // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
9310     // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
9311     unsigned SrcVecNumElts =
9312         Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
9313     CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
9314                               SrcVecNumElts);
9315     return true;
9316   };
9317   MVT CastVT;
9318   if (getScaledOffsetDup(V, Lane, CastVT)) {
9319     V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
9320   } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
9321     // The lane is incremented by the index of the extract.
9322     // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
9323     Lane += V.getConstantOperandVal(1);
9324     V = V.getOperand(0);
9325   } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
9326     // The lane is decremented if we are splatting from the 2nd operand.
9327     // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
9328     unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
9329     Lane -= Idx * VT.getVectorNumElements() / 2;
9330     V = WidenVector(V.getOperand(Idx), DAG);
9331   } else if (VT.getSizeInBits() == 64) {
9332     // Widen the operand to 128-bit register with undef.
9333     V = WidenVector(V, DAG);
9334   }
9335   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
9336 }
9337 
9338 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
9339                                                    SelectionDAG &DAG) const {
9340   SDLoc dl(Op);
9341   EVT VT = Op.getValueType();
9342 
9343   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
9344 
9345   if (useSVEForFixedLengthVectorVT(VT))
9346     return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
9347 
9348   // Convert shuffles that are directly supported on NEON to target-specific
9349   // DAG nodes, instead of keeping them as shuffles and matching them again
9350   // during code selection.  This is more efficient and avoids the possibility
9351   // of inconsistencies between legalization and selection.
9352   ArrayRef<int> ShuffleMask = SVN->getMask();
9353 
9354   SDValue V1 = Op.getOperand(0);
9355   SDValue V2 = Op.getOperand(1);
9356 
9357   assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
9358   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
9359          "Unexpected VECTOR_SHUFFLE mask size!");
9360 
9361   if (SVN->isSplat()) {
9362     int Lane = SVN->getSplatIndex();
    // If this is an undef splat, generate it via "just" vdup, if possible.
9364     if (Lane == -1)
9365       Lane = 0;
9366 
9367     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
9368       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
9369                          V1.getOperand(0));
9370     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
9371     // constant. If so, we can just reference the lane's definition directly.
9372     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
9373         !isa<ConstantSDNode>(V1.getOperand(Lane)))
9374       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
9375 
9376     // Otherwise, duplicate from the lane of the input vector.
9377     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
9378     return constructDup(V1, Lane, dl, VT, Opcode, DAG);
9379   }
9380 
9381   // Check if the mask matches a DUP for a wider element
9382   for (unsigned LaneSize : {64U, 32U, 16U}) {
9383     unsigned Lane = 0;
9384     if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
9385       unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
9386                                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
9387                                                         : AArch64ISD::DUPLANE16;
9388       // Cast V1 to an integer vector with required lane size
9389       MVT NewEltTy = MVT::getIntegerVT(LaneSize);
9390       unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
9391       MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
9392       V1 = DAG.getBitcast(NewVecTy, V1);
      // Construct the DUP instruction
9394       V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
9395       // Cast back to the original type
9396       return DAG.getBitcast(VT, V1);
9397     }
9398   }
9399 
9400   if (isREVMask(ShuffleMask, VT, 64))
9401     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
9402   if (isREVMask(ShuffleMask, VT, 32))
9403     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
9404   if (isREVMask(ShuffleMask, VT, 16))
9405     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
9406 
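  // A full reverse of an 8 x 16-bit or 16 x 8-bit vector can be done as a
  // REV64 (which reverses the lanes within each 64-bit half) followed by an
  // EXT #8 that swaps the two halves.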
9407   if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
9408        (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
9409       ShuffleVectorInst::isReverseMask(ShuffleMask)) {
9410     SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
9411     return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
9412                        DAG.getConstant(8, dl, MVT::i32));
9413   }
9414 
9415   bool ReverseEXT = false;
9416   unsigned Imm;
9417   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
9418     if (ReverseEXT)
9419       std::swap(V1, V2);
9420     Imm *= getExtFactor(V1);
9421     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
9422                        DAG.getConstant(Imm, dl, MVT::i32));
9423   } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
9424     Imm *= getExtFactor(V1);
9425     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
9426                        DAG.getConstant(Imm, dl, MVT::i32));
9427   }
9428 
9429   unsigned WhichResult;
9430   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
9431     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9432     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9433   }
9434   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
9435     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9436     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9437   }
9438   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
9439     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9440     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
9441   }
9442 
9443   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9444     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
9445     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9446   }
9447   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9448     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
9449     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9450   }
9451   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
9452     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
9453     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
9454   }
9455 
9456   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
9457     return Concat;
9458 
9459   bool DstIsLeft;
9460   int Anomaly;
9461   int NumInputElements = V1.getValueType().getVectorNumElements();
9462   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
9463     SDValue DstVec = DstIsLeft ? V1 : V2;
9464     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
9465 
9466     SDValue SrcVec = V1;
9467     int SrcLane = ShuffleMask[Anomaly];
9468     if (SrcLane >= NumInputElements) {
9469       SrcVec = V2;
9470       SrcLane -= VT.getVectorNumElements();
9471     }
9472     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
9473 
9474     EVT ScalarVT = VT.getVectorElementType();
9475 
9476     if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
9477       ScalarVT = MVT::i32;
9478 
9479     return DAG.getNode(
9480         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9481         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
9482         DstLaneV);
9483   }
9484 
9485   // If the shuffle is not directly supported and it has 4 elements, use
9486   // the PerfectShuffle-generated table to synthesize it from other shuffles.
9487   unsigned NumElts = VT.getVectorNumElements();
9488   if (NumElts == 4) {
9489     unsigned PFIndexes[4];
9490     for (unsigned i = 0; i != 4; ++i) {
9491       if (ShuffleMask[i] < 0)
9492         PFIndexes[i] = 8;
9493       else
9494         PFIndexes[i] = ShuffleMask[i];
9495     }
9496 
9497     // Compute the index in the perfect shuffle table.
9498     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
9499                             PFIndexes[2] * 9 + PFIndexes[3];
9500     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
9501     unsigned Cost = (PFEntry >> 30);
9502 
9503     if (Cost <= 4)
9504       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
9505   }
9506 
9507   return GenerateTBL(Op, ShuffleMask, DAG);
9508 }
9509 
9510 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
9511                                                  SelectionDAG &DAG) const {
9512   SDLoc dl(Op);
9513   EVT VT = Op.getValueType();
9514   EVT ElemVT = VT.getScalarType();
9515   SDValue SplatVal = Op.getOperand(0);
9516 
9517   if (useSVEForFixedLengthVectorVT(VT))
9518     return LowerToScalableOp(Op, DAG);
9519 
  // Extend the input splat value where needed to fit into a GPR (32b or 64b
  // only); FPRs don't have this restriction.
9522   switch (ElemVT.getSimpleVT().SimpleTy) {
9523   case MVT::i1: {
9524     // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
9525     // lowering code.
9526     if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
9527       if (ConstVal->isOne())
9528         return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
9529       // TODO: Add special case for constant false
9530     }
9531     // The general case of i1.  There isn't any natural way to do this,
9532     // so we use some trickery with whilelo.
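    // After the sign-extension the splat value is either 0 or all-ones, so
    // whilelo(0, value) produces an all-false or an all-true predicate
    // respectively.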
9533     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
9534     SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
9535                            DAG.getValueType(MVT::i1));
9536     SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
9537                                        MVT::i64);
9538     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
9539                        DAG.getConstant(0, dl, MVT::i64), SplatVal);
9540   }
9541   case MVT::i8:
9542   case MVT::i16:
9543   case MVT::i32:
9544     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
9545     break;
9546   case MVT::i64:
9547     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
9548     break;
9549   case MVT::f16:
9550   case MVT::bf16:
9551   case MVT::f32:
9552   case MVT::f64:
9553     // Fine as is
9554     break;
9555   default:
9556     report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
9557   }
9558 
9559   return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
9560 }
9561 
9562 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
9563                                              SelectionDAG &DAG) const {
9564   SDLoc DL(Op);
9565 
9566   EVT VT = Op.getValueType();
9567   if (!isTypeLegal(VT) || !VT.isScalableVector())
9568     return SDValue();
9569 
9570   // Current lowering only supports the SVE-ACLE types.
9571   if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
9572     return SDValue();
9573 
  // The DUPQ operation is independent of element type so normalise to i64s.
9575   SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
9576   SDValue Idx128 = Op.getOperand(2);
9577 
9578   // DUPQ can be used when idx is in range.
9579   auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
9580   if (CIdx && (CIdx->getZExtValue() <= 3)) {
9581     SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
9582     SDNode *DUPQ =
9583         DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
9584     return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
9585   }
9586 
9587   // The ACLE says this must produce the same result as:
9588   //   svtbl(data, svadd_x(svptrue_b64(),
9589   //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
9590   //                       index * 2))
9591   SDValue One = DAG.getConstant(1, DL, MVT::i64);
9592   SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
9593 
  // Create the vector 0,1,0,1,...
9595   SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
9596   SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
9597 
  // Create the vector idx64,idx64+1,idx64,idx64+1,...
9599   SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
9600   SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
9601   SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
9602 
  // Create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
9604   SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
9605   return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
9606 }
9608 
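// Expand a constant splat BUILD_VECTOR into full-width bit patterns: CnstBits
// receives the splat value replicated across the vector width, and UndefBits
// receives the corresponding pattern derived from the undefined lanes.
// Returns false if BVN is not a constant splat.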
9609 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
9610                                APInt &UndefBits) {
9611   EVT VT = BVN->getValueType(0);
9612   APInt SplatBits, SplatUndef;
9613   unsigned SplatBitSize;
9614   bool HasAnyUndefs;
9615   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9616     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
9617 
9618     for (unsigned i = 0; i < NumSplats; ++i) {
9619       CnstBits <<= SplatBitSize;
9620       UndefBits <<= SplatBitSize;
9621       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
9622       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
9623     }
9624 
9625     return true;
9626   }
9627 
9628   return false;
9629 }
9630 
9631 // Try 64-bit splatted SIMD immediate.
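// (Modified-immediate type 10 covers 64-bit values in which each byte is
// either 0x00 or 0xff, the form accepted by MOVI with a 64-bit element.)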
9632 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9633                                  const APInt &Bits) {
9634   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9635     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9636     EVT VT = Op.getValueType();
9637     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
9638 
9639     if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
9640       Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
9641 
9642       SDLoc dl(Op);
9643       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9644                                 DAG.getConstant(Value, dl, MVT::i32));
9645       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9646     }
9647   }
9648 
9649   return SDValue();
9650 }
9651 
9652 // Try 32-bit splatted SIMD immediate.
9653 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9654                                   const APInt &Bits,
9655                                   const SDValue *LHS = nullptr) {
9656   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9657     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9658     EVT VT = Op.getValueType();
9659     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9660     bool isAdvSIMDModImm = false;
9661     uint64_t Shift;
9662 
9663     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
9664       Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
9665       Shift = 0;
9666     }
9667     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
9668       Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
9669       Shift = 8;
9670     }
9671     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
9672       Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
9673       Shift = 16;
9674     }
9675     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
9676       Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
9677       Shift = 24;
9678     }
9679 
9680     if (isAdvSIMDModImm) {
9681       SDLoc dl(Op);
9682       SDValue Mov;
9683 
9684       if (LHS)
9685         Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9686                           DAG.getConstant(Value, dl, MVT::i32),
9687                           DAG.getConstant(Shift, dl, MVT::i32));
9688       else
9689         Mov = DAG.getNode(NewOp, dl, MovTy,
9690                           DAG.getConstant(Value, dl, MVT::i32),
9691                           DAG.getConstant(Shift, dl, MVT::i32));
9692 
9693       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9694     }
9695   }
9696 
9697   return SDValue();
9698 }
9699 
9700 // Try 16-bit splatted SIMD immediate.
9701 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9702                                   const APInt &Bits,
9703                                   const SDValue *LHS = nullptr) {
9704   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9705     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9706     EVT VT = Op.getValueType();
9707     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
9708     bool isAdvSIMDModImm = false;
9709     uint64_t Shift;
9710 
9711     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
9712       Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
9713       Shift = 0;
9714     }
9715     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
9716       Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
9717       Shift = 8;
9718     }
9719 
9720     if (isAdvSIMDModImm) {
9721       SDLoc dl(Op);
9722       SDValue Mov;
9723 
9724       if (LHS)
9725         Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9726                           DAG.getConstant(Value, dl, MVT::i32),
9727                           DAG.getConstant(Shift, dl, MVT::i32));
9728       else
9729         Mov = DAG.getNode(NewOp, dl, MovTy,
9730                           DAG.getConstant(Value, dl, MVT::i32),
9731                           DAG.getConstant(Shift, dl, MVT::i32));
9732 
9733       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9734     }
9735   }
9736 
9737   return SDValue();
9738 }
9739 
9740 // Try 32-bit splatted SIMD immediate with shifted ones.
9741 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
9742                                     SelectionDAG &DAG, const APInt &Bits) {
9743   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9744     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9745     EVT VT = Op.getValueType();
9746     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9747     bool isAdvSIMDModImm = false;
9748     uint64_t Shift;
9749 
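    // The shift values 264 and 272 used below correspond to the MSL #8 and
    // MSL #16 "shift ones" forms rather than ordinary LSL shift amounts.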
9750     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
9751       Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
9752       Shift = 264;
9753     }
9754     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
9755       Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
9756       Shift = 272;
9757     }
9758 
9759     if (isAdvSIMDModImm) {
9760       SDLoc dl(Op);
9761       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9762                                 DAG.getConstant(Value, dl, MVT::i32),
9763                                 DAG.getConstant(Shift, dl, MVT::i32));
9764       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9765     }
9766   }
9767 
9768   return SDValue();
9769 }
9770 
9771 // Try 8-bit splatted SIMD immediate.
9772 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9773                                  const APInt &Bits) {
9774   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9775     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9776     EVT VT = Op.getValueType();
9777     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
9778 
9779     if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
9780       Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
9781 
9782       SDLoc dl(Op);
9783       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9784                                 DAG.getConstant(Value, dl, MVT::i32));
9785       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9786     }
9787   }
9788 
9789   return SDValue();
9790 }
9791 
9792 // Try FP splatted SIMD immediate.
9793 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9794                                   const APInt &Bits) {
9795   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9796     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9797     EVT VT = Op.getValueType();
9798     bool isWide = (VT.getSizeInBits() == 128);
9799     MVT MovTy;
9800     bool isAdvSIMDModImm = false;
9801 
9802     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
9803       Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
9804       MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
9805     }
9806     else if (isWide &&
9807              (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
9808       Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
9809       MovTy = MVT::v2f64;
9810     }
9811 
9812     if (isAdvSIMDModImm) {
9813       SDLoc dl(Op);
9814       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9815                                 DAG.getConstant(Value, dl, MVT::i32));
9816       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9817     }
9818   }
9819 
9820   return SDValue();
9821 }
9822 
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, which is returned in the
// reference argument ConstVal.
9826 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
9827                                      uint64_t &ConstVal) {
9828   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
9829   if (!Bvec)
9830     return false;
9831   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
9832   if (!FirstElt)
9833     return false;
9834   EVT VT = Bvec->getValueType(0);
9835   unsigned NumElts = VT.getVectorNumElements();
9836   for (unsigned i = 1; i < NumElts; ++i)
9837     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
9838       return false;
9839   ConstVal = FirstElt->getZExtValue();
9840   return true;
9841 }
9842 
9843 static unsigned getIntrinsicID(const SDNode *N) {
9844   unsigned Opcode = N->getOpcode();
9845   switch (Opcode) {
9846   default:
9847     return Intrinsic::not_intrinsic;
9848   case ISD::INTRINSIC_WO_CHAIN: {
9849     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9850     if (IID < Intrinsic::num_intrinsics)
9851       return IID;
9852     return Intrinsic::not_intrinsic;
9853   }
9854   }
9855 }
9856 
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
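// For example, for <8 x i16> with C2 == 8 the SLI case requires C1 == 0x00ff,
// so (or (and X, 0x00ff), (shl Y, 8)) becomes (SLI X, Y, 8).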
9863 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
9864   EVT VT = N->getValueType(0);
9865 
9866   if (!VT.isVector())
9867     return SDValue();
9868 
9869   SDLoc DL(N);
9870 
9871   SDValue And;
9872   SDValue Shift;
9873 
9874   SDValue FirstOp = N->getOperand(0);
9875   unsigned FirstOpc = FirstOp.getOpcode();
9876   SDValue SecondOp = N->getOperand(1);
9877   unsigned SecondOpc = SecondOp.getOpcode();
9878 
9879   // Is one of the operands an AND or a BICi? The AND may have been optimised to
9880   // a BICi in order to use an immediate instead of a register.
9881   // Is the other operand an shl or lshr? This will have been turned into:
9882   // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
9883   if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
9884       (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
9885     And = FirstOp;
9886     Shift = SecondOp;
9887 
9888   } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
9889              (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
9890     And = SecondOp;
9891     Shift = FirstOp;
9892   } else
9893     return SDValue();
9894 
9895   bool IsAnd = And.getOpcode() == ISD::AND;
9896   bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
9897 
9898   // Is the shift amount constant?
9899   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
9900   if (!C2node)
9901     return SDValue();
9902 
9903   uint64_t C1;
9904   if (IsAnd) {
9905     // Is the and mask vector all constant?
9906     if (!isAllConstantBuildVector(And.getOperand(1), C1))
9907       return SDValue();
9908   } else {
9909     // Reconstruct the corresponding AND immediate from the two BICi immediates.
9910     ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
9911     ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
9912     assert(C1nodeImm && C1nodeShift);
9913     C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
9914   }
9915 
9916   // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
9917   // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
9918   // how much one can shift elements of a particular size?
9919   uint64_t C2 = C2node->getZExtValue();
9920   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
9921   if (C2 > ElemSizeInBits)
9922     return SDValue();
9923 
9924   APInt C1AsAPInt(ElemSizeInBits, C1);
9925   APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
9926                                   : APInt::getLowBitsSet(ElemSizeInBits, C2);
9927   if (C1AsAPInt != RequiredC1)
9928     return SDValue();
9929 
9930   SDValue X = And.getOperand(0);
9931   SDValue Y = Shift.getOperand(0);
9932 
9933   unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
9934   SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
9935 
9936   LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
9937   LLVM_DEBUG(N->dump(&DAG));
9938   LLVM_DEBUG(dbgs() << "into: \n");
9939   LLVM_DEBUG(ResultSLI->dump(&DAG));
9940 
9941   ++NumShiftInserts;
9942   return ResultSLI;
9943 }
9944 
9945 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
9946                                              SelectionDAG &DAG) const {
9947   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
9948     return LowerToScalableOp(Op, DAG);
9949 
9950   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
9951   if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
9952     return Res;
9953 
9954   EVT VT = Op.getValueType();
9955 
9956   SDValue LHS = Op.getOperand(0);
9957   BuildVectorSDNode *BVN =
9958       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
9959   if (!BVN) {
9960     // OR commutes, so try swapping the operands.
9961     LHS = Op.getOperand(1);
9962     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
9963   }
9964   if (!BVN)
9965     return Op;
9966 
9967   APInt DefBits(VT.getSizeInBits(), 0);
9968   APInt UndefBits(VT.getSizeInBits(), 0);
9969   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
9970     SDValue NewOp;
9971 
9972     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9973                                     DefBits, &LHS)) ||
9974         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9975                                     DefBits, &LHS)))
9976       return NewOp;
9977 
9978     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9979                                     UndefBits, &LHS)) ||
9980         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9981                                     UndefBits, &LHS)))
9982       return NewOp;
9983   }
9984 
9985   // We can always fall back to a non-immediate OR.
9986   return Op;
9987 }
9988 
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit the element width.
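// For example, for a v8i8 BUILD_VECTOR a constant lane value of 0x1ff is
// truncated to 0xff and re-emitted as an i32 constant operand.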
9991 static SDValue NormalizeBuildVector(SDValue Op,
9992                                     SelectionDAG &DAG) {
9993   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9994   SDLoc dl(Op);
9995   EVT VT = Op.getValueType();
9996   EVT EltTy= VT.getVectorElementType();
9997 
9998   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
9999     return Op;
10000 
10001   SmallVector<SDValue, 16> Ops;
10002   for (SDValue Lane : Op->ops()) {
10003     // For integer vectors, type legalization would have promoted the
10004     // operands already. Otherwise, if Op is a floating-point splat
10005     // (with operands cast to integers), then the only possibilities
10006     // are constants and UNDEFs.
10007     if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
10008       APInt LowBits(EltTy.getSizeInBits(),
10009                     CstLane->getZExtValue());
10010       Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
10011     } else if (Lane.getNode()->isUndef()) {
10012       Lane = DAG.getUNDEF(MVT::i32);
10013     } else {
10014       assert(Lane.getValueType() == MVT::i32 &&
10015              "Unexpected BUILD_VECTOR operand type");
10016     }
10017     Ops.push_back(Lane);
10018   }
10019   return DAG.getBuildVector(VT, dl, Ops);
10020 }
10021 
10022 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
10023   EVT VT = Op.getValueType();
10024 
10025   APInt DefBits(VT.getSizeInBits(), 0);
10026   APInt UndefBits(VT.getSizeInBits(), 0);
10027   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
10028   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
10029     SDValue NewOp;
10030     if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10031         (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10032         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10033         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10034         (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10035         (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10036       return NewOp;
10037 
10038     DefBits = ~DefBits;
10039     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10040         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10041         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10042       return NewOp;
10043 
10044     DefBits = UndefBits;
10045     if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
10046         (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10047         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
10048         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
10049         (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
10050         (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
10051       return NewOp;
10052 
10053     DefBits = ~UndefBits;
10054     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
10055         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
10056         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
10057       return NewOp;
10058   }
10059 
10060   return SDValue();
10061 }
10062 
10063 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
10064                                                  SelectionDAG &DAG) const {
10065   EVT VT = Op.getValueType();
10066 
10067   // Try to build a simple constant vector.
10068   Op = NormalizeBuildVector(Op, DAG);
10069   if (VT.isInteger()) {
10070     // Certain vector constants, used to express things like logical NOT and
10071     // arithmetic NEG, are passed through unmodified.  This allows special
10072     // patterns for these operations to match, which will lower these constants
10073     // to whatever is proven necessary.
10074     BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
10075     if (BVN->isConstant())
10076       if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
10077         unsigned BitSize = VT.getVectorElementType().getSizeInBits();
10078         APInt Val(BitSize,
10079                   Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
10080         if (Val.isNullValue() || Val.isAllOnesValue())
10081           return Op;
10082       }
10083   }
10084 
10085   if (SDValue V = ConstantBuildVector(Op, DAG))
10086     return V;
10087 
10088   // Scan through the operands to find some interesting properties we can
10089   // exploit:
10090   //   1) If only one value is used, we can use a DUP, or
10091   //   2) if only the low element is not undef, we can just insert that, or
10092   //   3) if only one constant value is used (w/ some non-constant lanes),
10093   //      we can splat the constant value into the whole vector then fill
10094   //      in the non-constant lanes.
10095   //   4) FIXME: If different constant values are used, but we can intelligently
10096   //             select the values we'll be overwriting for the non-constant
10097   //             lanes such that we can directly materialize the vector
10098   //             some other way (MOVI, e.g.), we can be sneaky.
10099   //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
10100   SDLoc dl(Op);
10101   unsigned NumElts = VT.getVectorNumElements();
10102   bool isOnlyLowElement = true;
10103   bool usesOnlyOneValue = true;
10104   bool usesOnlyOneConstantValue = true;
10105   bool isConstant = true;
10106   bool AllLanesExtractElt = true;
10107   unsigned NumConstantLanes = 0;
10108   unsigned NumDifferentLanes = 0;
10109   unsigned NumUndefLanes = 0;
10110   SDValue Value;
10111   SDValue ConstantValue;
10112   for (unsigned i = 0; i < NumElts; ++i) {
10113     SDValue V = Op.getOperand(i);
10114     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10115       AllLanesExtractElt = false;
10116     if (V.isUndef()) {
10117       ++NumUndefLanes;
10118       continue;
10119     }
10120     if (i > 0)
10121       isOnlyLowElement = false;
10122     if (!isIntOrFPConstant(V))
10123       isConstant = false;
10124 
10125     if (isIntOrFPConstant(V)) {
10126       ++NumConstantLanes;
10127       if (!ConstantValue.getNode())
10128         ConstantValue = V;
10129       else if (ConstantValue != V)
10130         usesOnlyOneConstantValue = false;
10131     }
10132 
10133     if (!Value.getNode())
10134       Value = V;
10135     else if (V != Value) {
10136       usesOnlyOneValue = false;
10137       ++NumDifferentLanes;
10138     }
10139   }
10140 
10141   if (!Value.getNode()) {
10142     LLVM_DEBUG(
10143         dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
10144     return DAG.getUNDEF(VT);
10145   }
10146 
10147   // Convert BUILD_VECTOR where all elements but the lowest are undef into
10148   // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
10149   // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
10150   if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
10151     LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
10152                          "SCALAR_TO_VECTOR node\n");
10153     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
10154   }
10155 
10156   if (AllLanesExtractElt) {
10157     SDNode *Vector = nullptr;
10158     bool Even = false;
10159     bool Odd = false;
10160     // Check whether the extract elements match the Even pattern <0,2,4,...> or
10161     // the Odd pattern <1,3,5,...>.
10162     for (unsigned i = 0; i < NumElts; ++i) {
10163       SDValue V = Op.getOperand(i);
10164       const SDNode *N = V.getNode();
10165       if (!isa<ConstantSDNode>(N->getOperand(1)))
10166         break;
10167       SDValue N0 = N->getOperand(0);
10168 
10169       // All elements are extracted from the same vector.
10170       if (!Vector) {
10171         Vector = N0.getNode();
10172         // Check that the type of EXTRACT_VECTOR_ELT matches the type of
10173         // BUILD_VECTOR.
10174         if (VT.getVectorElementType() !=
10175             N0.getValueType().getVectorElementType())
10176           break;
10177       } else if (Vector != N0.getNode()) {
10178         Odd = false;
10179         Even = false;
10180         break;
10181       }
10182 
10183       // Extracted values are either at Even indices <0,2,4,...> or at Odd
10184       // indices <1,3,5,...>.
10185       uint64_t Val = N->getConstantOperandVal(1);
10186       if (Val == 2 * i) {
10187         Even = true;
10188         continue;
10189       }
10190       if (Val - 1 == 2 * i) {
10191         Odd = true;
10192         continue;
10193       }
10194 
10195       // Something does not match: abort.
10196       Odd = false;
10197       Even = false;
10198       break;
10199     }
10200     if (Even || Odd) {
10201       SDValue LHS =
10202           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10203                       DAG.getConstant(0, dl, MVT::i64));
10204       SDValue RHS =
10205           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
10206                       DAG.getConstant(NumElts, dl, MVT::i64));
10207 
10208       if (Even && !Odd)
10209         return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
10210                            RHS);
10211       if (Odd && !Even)
10212         return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
10213                            RHS);
10214     }
10215   }
10216 
  // Use DUP for non-constant splats. For floating-point constant splats,
  // bitcast the elements to the equivalent-width integer type and try again.
10219   if (usesOnlyOneValue) {
10220     if (!isConstant) {
10221       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10222           Value.getValueType() != VT) {
10223         LLVM_DEBUG(
10224             dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
10225         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
10226       }
10227 
      // This is actually a DUPLANExx operation, which keeps the whole
      // computation in vector registers.
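      // For example, a v4i32 BUILD_VECTOR splatting (extract_vector_elt v, n)
      // becomes (DUPLANE32 v, n), with v widened to 128 bits below if needed.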
10229 
10230       SDValue Lane = Value.getOperand(1);
10231       Value = Value.getOperand(0);
10232       if (Value.getValueSizeInBits() == 64) {
10233         LLVM_DEBUG(
10234             dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
10235                       "widening it\n");
10236         Value = WidenVector(Value, DAG);
10237       }
10238 
10239       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
10240       return DAG.getNode(Opcode, dl, VT, Value, Lane);
10241     }
10242 
10243     if (VT.getVectorElementType().isFloatingPoint()) {
10244       SmallVector<SDValue, 8> Ops;
10245       EVT EltTy = VT.getVectorElementType();
      assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
              EltTy == MVT::f64) && "Unsupported floating-point vector type");
      LLVM_DEBUG(
          dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
                    "BITCASTs and trying again\n");
10251       MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
10252       for (unsigned i = 0; i < NumElts; ++i)
10253         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
10254       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
10255       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
10256       LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
10257                  Val.dump(););
10258       Val = LowerBUILD_VECTOR(Val, DAG);
10259       if (Val.getNode())
10260         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
10261     }
10262   }
10263 
10264   // If we need to insert a small number of different non-constant elements and
10265   // the vector width is sufficiently large, prefer using DUP with the common
10266   // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
10267   // skip the constant lane handling below.
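  // For example, <x, x, x, y, x, x, x, x> with non-constant x and y is
  // cheaper as a DUP of x followed by a single lane insert of y.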
10268   bool PreferDUPAndInsert =
10269       !isConstant && NumDifferentLanes >= 1 &&
10270       NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
10271       NumDifferentLanes >= NumConstantLanes;
10272 
  // If only one constant value was used, and it was used for more than one
  // lane, start by splatting that value, then replace the non-constant
  // lanes. This
10275   // is better than the default, which will perform a separate initialization
10276   // for each lane.
10277   if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
10278     // Firstly, try to materialize the splat constant.
10279     SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
10280             Val = ConstantBuildVector(Vec, DAG);
10281     if (!Val) {
10282       // Otherwise, materialize the constant and splat it.
10283       Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
10284       DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
10285     }
10286 
10287     // Now insert the non-constant lanes.
10288     for (unsigned i = 0; i < NumElts; ++i) {
10289       SDValue V = Op.getOperand(i);
10290       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10291       if (!isIntOrFPConstant(V))
10292         // Note that type legalization likely mucked about with the VT of the
10293         // source operand, so we may have to convert it here before inserting.
10294         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
10295     }
10296     return Val;
10297   }
10298 
10299   // This will generate a load from the constant pool.
10300   if (isConstant) {
10301     LLVM_DEBUG(
10302         dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
10303                   "expansion\n");
10304     return SDValue();
10305   }
10306 
10307   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
10308   if (NumElts >= 4) {
10309     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
10310       return shuffle;
10311   }
10312 
10313   if (PreferDUPAndInsert) {
10314     // First, build a constant vector with the common element.
10315     SmallVector<SDValue, 8> Ops(NumElts, Value);
10316     SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
10317     // Next, insert the elements that do not match the common value.
10318     for (unsigned I = 0; I < NumElts; ++I)
10319       if (Op.getOperand(I) != Value)
10320         NewVector =
10321             DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
10322                         Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
10323 
10324     return NewVector;
10325   }
10326 
10327   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
10328   // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that would be a
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target); for everything else, it would be
  // materialization element by element on the stack followed by a load.
10333   if (!isConstant && !usesOnlyOneValue) {
10334     LLVM_DEBUG(
10335         dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
10336                   "of INSERT_VECTOR_ELT\n");
10337 
10338     SDValue Vec = DAG.getUNDEF(VT);
10339     SDValue Op0 = Op.getOperand(0);
10340     unsigned i = 0;
10341 
10342     // Use SCALAR_TO_VECTOR for lane zero to
10343     // a) Avoid a RMW dependency on the full vector register, and
10344     // b) Allow the register coalescer to fold away the copy if the
10345     //    value is already in an S or D register, and we're forced to emit an
10346     //    INSERT_SUBREG that we can't fold anywhere.
10347     //
10348     // We also allow types like i8 and i16 which are illegal scalar but legal
10349     // vector element types. After type-legalization the inserted value is
10350     // extended (i32) and it is safe to cast them to the vector type by ignoring
10351     // the upper bits of the lowest lane (e.g. v8i8, v4i16).
10352     if (!Op0.isUndef()) {
10353       LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
10354       Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
10355       ++i;
10356     }
10357     LLVM_DEBUG(if (i < NumElts) dbgs()
10358                    << "Creating nodes for the other vector elements:\n";);
10359     for (; i < NumElts; ++i) {
10360       SDValue V = Op.getOperand(i);
10361       if (V.isUndef())
10362         continue;
10363       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
10364       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
10365     }
10366     return Vec;
10367   }
10368 
10369   LLVM_DEBUG(
10370       dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
10371                 "better alternative\n");
10372   return SDValue();
10373 }
10374 
10375 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
10376                                                    SelectionDAG &DAG) const {
10377   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10378     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
10379 
10380   assert(Op.getValueType().isScalableVector() &&
10381          isTypeLegal(Op.getValueType()) &&
10382          "Expected legal scalable vector type!");
10383 
10384   if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
10385     return Op;
10386 
10387   return SDValue();
10388 }
10389 
10390 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
10391                                                       SelectionDAG &DAG) const {
10392   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
10393 
10394   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
10395     return LowerFixedLengthInsertVectorElt(Op, DAG);
10396 
10397   // Check for non-constant or out of range lane.
10398   EVT VT = Op.getOperand(0).getValueType();
10399 
10400   if (VT.getScalarType() == MVT::i1) {
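    // Predicate (i1) vectors have no direct lane insert; promote the
    // predicate to an integer vector, insert into that, and truncate the
    // result back to a predicate.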
10401     EVT VectorVT = getPromotedVTForPredicate(VT);
10402     SDLoc DL(Op);
10403     SDValue ExtendedVector =
10404         DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
10405     SDValue ExtendedValue =
10406         DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
10407                              VectorVT.getScalarType().getSizeInBits() < 32
10408                                  ? MVT::i32
10409                                  : VectorVT.getScalarType());
10410     ExtendedVector =
10411         DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
10412                     ExtendedValue, Op.getOperand(2));
10413     return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
10414   }
10415 
10416   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
10417   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10418     return SDValue();
10419 
10420   // Insertion/extraction are legal for V128 types.
10421   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10422       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10423       VT == MVT::v8f16 || VT == MVT::v8bf16)
10424     return Op;
10425 
10426   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10427       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10428       VT != MVT::v4bf16)
10429     return SDValue();
10430 
10431   // For V64 types, we perform insertion by expanding the value
10432   // to a V128 type and perform the insertion on that.
10433   SDLoc DL(Op);
10434   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10435   EVT WideTy = WideVec.getValueType();
10436 
10437   SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
10438                              Op.getOperand(1), Op.getOperand(2));
10439   // Re-narrow the resultant vector.
10440   return NarrowVector(Node, DAG);
10441 }
10442 
10443 SDValue
10444 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
10445                                                SelectionDAG &DAG) const {
10446   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
10447   EVT VT = Op.getOperand(0).getValueType();
10448 
10449   if (VT.getScalarType() == MVT::i1) {
10450     // We can't directly extract from an SVE predicate; extend it first.
10451     // (This isn't the only possible lowering, but it's straightforward.)
10452     EVT VectorVT = getPromotedVTForPredicate(VT);
10453     SDLoc DL(Op);
10454     SDValue Extend =
10455         DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
10456     MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
10457     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
10458                                   Extend, Op.getOperand(1));
10459     return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
10460   }
10461 
10462   if (useSVEForFixedLengthVectorVT(VT))
10463     return LowerFixedLengthExtractVectorElt(Op, DAG);
10464 
10465   // Check for non-constant or out of range lane.
10466   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
10467   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
10468     return SDValue();
10469 
10470   // Insertion/extraction are legal for V128 types.
10471   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
10472       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
10473       VT == MVT::v8f16 || VT == MVT::v8bf16)
10474     return Op;
10475 
10476   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
10477       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
10478       VT != MVT::v4bf16)
10479     return SDValue();
10480 
10481   // For V64 types, we perform extraction by expanding the value
10482   // to a V128 type and perform the extraction on that.
10483   SDLoc DL(Op);
10484   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
10485   EVT WideTy = WideVec.getValueType();
10486 
10487   EVT ExtrTy = WideTy.getVectorElementType();
10488   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
10489     ExtrTy = MVT::i32;
10490 
10491   // For extractions, we just return the result directly.
10492   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
10493                      Op.getOperand(1));
10494 }
10495 
10496 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
10497                                                       SelectionDAG &DAG) const {
10498   assert(Op.getValueType().isFixedLengthVector() &&
10499          "Only cases that extract a fixed length vector are supported!");
10500 
10501   EVT InVT = Op.getOperand(0).getValueType();
10502   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
10503   unsigned Size = Op.getValueSizeInBits();
10504 
10505   if (InVT.isScalableVector()) {
10506     // This will be matched by custom code during ISelDAGToDAG.
10507     if (Idx == 0 && isPackedVectorType(InVT, DAG))
10508       return Op;
10509 
10510     return SDValue();
10511   }
10512 
10513   // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
10514   if (Idx == 0 && InVT.getSizeInBits() <= 128)
10515     return Op;
10516 
10517   // If this is extracting the upper 64-bits of a 128-bit vector, we match
10518   // that directly.
10519   if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
10520       InVT.getSizeInBits() == 128)
10521     return Op;
10522 
10523   return SDValue();
10524 }
10525 
10526 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
10527                                                      SelectionDAG &DAG) const {
10528   assert(Op.getValueType().isScalableVector() &&
10529          "Only expect to lower inserts into scalable vectors!");
10530 
10531   EVT InVT = Op.getOperand(1).getValueType();
10532   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10533 
10534   if (InVT.isScalableVector()) {
10535     SDLoc DL(Op);
10536     EVT VT = Op.getValueType();
10537 
10538     if (!isTypeLegal(VT) || !VT.isInteger())
10539       return SDValue();
10540 
10541     SDValue Vec0 = Op.getOperand(0);
10542     SDValue Vec1 = Op.getOperand(1);
10543 
10544     // Ensure the subvector is half the size of the main vector.
10545     if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
10546       return SDValue();
10547 
10548     // Extend elements of smaller vector...
10549     EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
10550     SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
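    // UZP1 below takes the even (low) sub-elements of its two wide operands,
    // which truncates them back to the original element width and
    // concatenates the kept half of Vec0 with the inserted subvector.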
10551 
10552     if (Idx == 0) {
10553       SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
10554       return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
10555     } else if (Idx == InVT.getVectorMinNumElements()) {
10556       SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
10557       return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
10558     }
10559 
10560     return SDValue();
10561   }
10562 
10563   // This will be matched by custom code during ISelDAGToDAG.
10564   if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
10565     return Op;
10566 
10567   return SDValue();
10568 }
10569 
10570 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
10571   EVT VT = Op.getValueType();
10572 
10573   if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10574     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
10575 
10576   assert(VT.isScalableVector() && "Expected a scalable vector.");
10577 
10578   bool Signed = Op.getOpcode() == ISD::SDIV;
10579   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
10580 
10581   if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
10582     return LowerToPredicatedOp(Op, DAG, PredOpcode);
10583 
10584   // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
10585   // operations, and truncate the result.
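  // Each operand is unpacked into its low and high halves at the wider
  // element type, the halves are divided separately (widening again if
  // needed), and UZP1 concatenates the truncated results back together.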
10586   EVT WidenedVT;
10587   if (VT == MVT::nxv16i8)
10588     WidenedVT = MVT::nxv8i16;
10589   else if (VT == MVT::nxv8i16)
10590     WidenedVT = MVT::nxv4i32;
10591   else
10592     llvm_unreachable("Unexpected Custom DIV operation");
10593 
10594   SDLoc dl(Op);
10595   unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
10596   unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
10597   SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
10598   SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
10599   SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
10600   SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
10601   SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
10602   SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
10603   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
10604 }
10605 
10606 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
10607   // Currently no fixed length shuffles that require SVE are legal.
10608   if (useSVEForFixedLengthVectorVT(VT))
10609     return false;
10610 
10611   if (VT.getVectorNumElements() == 4 &&
10612       (VT.is128BitVector() || VT.is64BitVector())) {
10613     unsigned PFIndexes[4];
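    // An index of 8 encodes an undef lane; the perfect shuffle table is
    // indexed in base 9, allowing 9 possible values per lane.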
10614     for (unsigned i = 0; i != 4; ++i) {
10615       if (M[i] < 0)
10616         PFIndexes[i] = 8;
10617       else
10618         PFIndexes[i] = M[i];
10619     }
10620 
10621     // Compute the index in the perfect shuffle table.
10622     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10623                             PFIndexes[2] * 9 + PFIndexes[3];
10624     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10625     unsigned Cost = (PFEntry >> 30);
10626 
10627     if (Cost <= 4)
10628       return true;
10629   }
10630 
10631   bool DummyBool;
10632   int DummyInt;
10633   unsigned DummyUnsigned;
10634 
10635   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
10636           isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
10637           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
10638           // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
10639           isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
10640           isZIPMask(M, VT, DummyUnsigned) ||
10641           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
10642           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
10643           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
10644           isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
10645           isConcatMask(M, VT, VT.getSizeInBits() == 128));
10646 }
10647 
10648 /// getVShiftImm - Check if this is a valid build_vector for the immediate
10649 /// operand of a vector shift operation, where all the elements of the
10650 /// build_vector must have the same constant integer value.
10651 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10652   // Ignore bit_converts.
10653   while (Op.getOpcode() == ISD::BITCAST)
10654     Op = Op.getOperand(0);
10655   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10656   APInt SplatBits, SplatUndef;
10657   unsigned SplatBitSize;
10658   bool HasAnyUndefs;
10659   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10660                                     HasAnyUndefs, ElementBits) ||
10661       SplatBitSize > ElementBits)
10662     return false;
10663   Cnt = SplatBits.getSExtValue();
10664   return true;
10665 }
10666 
10667 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10668 /// operand of a vector shift left operation.  That value must be in the range:
10669 ///   0 <= Value < ElementBits for a left shift; or
10670 ///   0 <= Value <= ElementBits for a long left shift.
10671 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10672   assert(VT.isVector() && "vector shift count is not a vector type");
10673   int64_t ElementBits = VT.getScalarSizeInBits();
10674   if (!getVShiftImm(Op, ElementBits, Cnt))
10675     return false;
10676   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
10677 }
10678 
10679 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10680 /// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits/2 for a narrow right shift.
10682 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
10683   assert(VT.isVector() && "vector shift count is not a vector type");
10684   int64_t ElementBits = VT.getScalarSizeInBits();
10685   if (!getVShiftImm(Op, ElementBits, Cnt))
10686     return false;
10687   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
10688 }
10689 
10690 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
10691                                              SelectionDAG &DAG) const {
10692   EVT VT = Op.getValueType();
10693 
10694   if (VT.getScalarType() == MVT::i1) {
10695     // Lower i1 truncate to `(x & 1) != 0`.
10696     SDLoc dl(Op);
10697     EVT OpVT = Op.getOperand(0).getValueType();
10698     SDValue Zero = DAG.getConstant(0, dl, OpVT);
10699     SDValue One = DAG.getConstant(1, dl, OpVT);
10700     SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
10701     return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
10702   }
10703 
10704   if (!VT.isVector() || VT.isScalableVector())
10705     return SDValue();
10706 
10707   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10708     return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
10709 
10710   return SDValue();
10711 }
10712 
10713 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
10714                                                       SelectionDAG &DAG) const {
10715   EVT VT = Op.getValueType();
10716   SDLoc DL(Op);
10717   int64_t Cnt;
10718 
10719   if (!Op.getOperand(1).getValueType().isVector())
10720     return Op;
10721   unsigned EltSize = VT.getScalarSizeInBits();
10722 
10723   switch (Op.getOpcode()) {
10724   default:
10725     llvm_unreachable("unexpected shift opcode");
10726 
10727   case ISD::SHL:
10728     if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
10729       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
10730 
10731     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
10732       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
10733                          DAG.getConstant(Cnt, DL, MVT::i32));
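    // Otherwise the shift amount is not an immediate splat; fall back to the
    // register form via the ushl intrinsic.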
10734     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10735                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
10736                                        MVT::i32),
10737                        Op.getOperand(0), Op.getOperand(1));
10738   case ISD::SRA:
10739   case ISD::SRL:
10740     if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
10741       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
10742                                                 : AArch64ISD::SRL_PRED;
10743       return LowerToPredicatedOp(Op, DAG, Opc);
10744     }
10745 
10746     // Right shift immediate
10747     if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
10748       unsigned Opc =
10749           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
10750       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
10751                          DAG.getConstant(Cnt, DL, MVT::i32));
10752     }
10753 
    // Right shift register. Note that there is no shift-right-by-register
    // instruction; the shift-left-by-register instruction takes a signed
    // value, where negative amounts specify a right shift.
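    // For example, (srl x, y) is emitted as ushl(x, (sub 0, y)) and
    // (sra x, y) as sshl(x, (sub 0, y)).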
10757     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
10758                                                 : Intrinsic::aarch64_neon_ushl;
10759     // negate the shift amount
10760     SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
10761                                    Op.getOperand(1));
10762     SDValue NegShiftLeft =
10763         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10764                     DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
10765                     NegShift);
10766     return NegShiftLeft;
10767   }
10768 
10769   return SDValue();
10770 }
10771 
10772 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
10773                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
10774                                     const SDLoc &dl, SelectionDAG &DAG) {
10775   EVT SrcVT = LHS.getValueType();
10776   assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10777          "function only supposed to emit natural comparisons");
10778 
10779   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
10780   APInt CnstBits(VT.getSizeInBits(), 0);
10781   APInt UndefBits(VT.getSizeInBits(), 0);
10782   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
10783   bool IsZero = IsCnst && (CnstBits == 0);
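  // When the RHS is an all-zeros constant we can use the compare-against-zero
  // forms of the instructions (CMEQz, FCMGTz, and so on) below.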
10784 
10785   if (SrcVT.getVectorElementType().isFloatingPoint()) {
10786     switch (CC) {
10787     default:
10788       return SDValue();
10789     case AArch64CC::NE: {
10790       SDValue Fcmeq;
10791       if (IsZero)
10792         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10793       else
10794         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10795       return DAG.getNOT(dl, Fcmeq, VT);
10796     }
10797     case AArch64CC::EQ:
10798       if (IsZero)
10799         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10800       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10801     case AArch64CC::GE:
10802       if (IsZero)
10803         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
10804       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10805     case AArch64CC::GT:
10806       if (IsZero)
10807         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
10808       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10809     case AArch64CC::LS:
10810       if (IsZero)
10811         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
10812       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10813     case AArch64CC::LT:
10814       if (!NoNans)
10815         return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
10817       LLVM_FALLTHROUGH;
10818     case AArch64CC::MI:
10819       if (IsZero)
10820         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
10821       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10822     }
10823   }
10824 
10825   switch (CC) {
10826   default:
10827     return SDValue();
10828   case AArch64CC::NE: {
10829     SDValue Cmeq;
10830     if (IsZero)
10831       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10832     else
10833       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10834     return DAG.getNOT(dl, Cmeq, VT);
10835   }
10836   case AArch64CC::EQ:
10837     if (IsZero)
10838       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10839     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10840   case AArch64CC::GE:
10841     if (IsZero)
10842       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
10843     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
10844   case AArch64CC::GT:
10845     if (IsZero)
10846       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
10847     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
10848   case AArch64CC::LE:
10849     if (IsZero)
10850       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
10851     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
10852   case AArch64CC::LS:
10853     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
10854   case AArch64CC::LO:
10855     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
10856   case AArch64CC::LT:
10857     if (IsZero)
10858       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
10859     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
10860   case AArch64CC::HI:
10861     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
10862   case AArch64CC::HS:
10863     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
10864   }
10865 }
10866 
10867 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
10868                                            SelectionDAG &DAG) const {
10869   if (Op.getValueType().isScalableVector())
10870     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
10871 
10872   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10873     return LowerFixedLengthVectorSetccToSVE(Op, DAG);
10874 
10875   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10876   SDValue LHS = Op.getOperand(0);
10877   SDValue RHS = Op.getOperand(1);
10878   EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
10879   SDLoc dl(Op);
10880 
10881   if (LHS.getValueType().getVectorElementType().isInteger()) {
10882     assert(LHS.getValueType() == RHS.getValueType());
10883     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10884     SDValue Cmp =
10885         EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
10886     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10887   }
10888 
10889   const bool FullFP16 =
10890     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
10891 
  // Make v4f16 (only) fcmp operations utilise vector instructions.
  // v8f16 support will be a little more complicated.
10894   if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
10895     if (LHS.getValueType().getVectorNumElements() == 4) {
10896       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
10897       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
10898       SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
10899       DAG.ReplaceAllUsesWith(Op, NewSetcc);
10900       CmpVT = MVT::v4i32;
10901     } else
10902       return SDValue();
10903   }
10904 
10905   assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
10906           LHS.getValueType().getVectorElementType() != MVT::f128);
10907 
  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two comparisons to implement.
10910   AArch64CC::CondCode CC1, CC2;
10911   bool ShouldInvert;
10912   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
10913 
10914   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
10915   SDValue Cmp =
10916       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
10917   if (!Cmp.getNode())
10918     return SDValue();
10919 
10920   if (CC2 != AArch64CC::AL) {
10921     SDValue Cmp2 =
10922         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
10923     if (!Cmp2.getNode())
10924       return SDValue();
10925 
10926     Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
10927   }
10928 
10929   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10930 
10931   if (ShouldInvert)
10932     Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
10933 
10934   return Cmp;
10935 }
10936 
10937 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
10938                                   SelectionDAG &DAG) {
10939   SDValue VecOp = ScalarOp.getOperand(0);
10940   auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
10941   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
10942                      DAG.getConstant(0, DL, MVT::i64));
10943 }
10944 
10945 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
10946                                               SelectionDAG &DAG) const {
10947   SDValue Src = Op.getOperand(0);
10948 
10949   // Try to lower fixed length reductions to SVE.
10950   EVT SrcVT = Src.getValueType();
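  // NEON has no single-instruction forms of the bitwise reductions, FADD, or
  // 64-bit-element min/max reductions, so prefer SVE for those when it is
  // usable.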
10951   bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
10952                       Op.getOpcode() == ISD::VECREDUCE_OR ||
10953                       Op.getOpcode() == ISD::VECREDUCE_XOR ||
10954                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
10955                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
10956                        SrcVT.getVectorElementType() == MVT::i64);
10957   if (SrcVT.isScalableVector() ||
10958       useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
10959 
10960     if (SrcVT.getVectorElementType() == MVT::i1)
10961       return LowerPredReductionToSVE(Op, DAG);
10962 
10963     switch (Op.getOpcode()) {
10964     case ISD::VECREDUCE_ADD:
10965       return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
10966     case ISD::VECREDUCE_AND:
10967       return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
10968     case ISD::VECREDUCE_OR:
10969       return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
10970     case ISD::VECREDUCE_SMAX:
10971       return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
10972     case ISD::VECREDUCE_SMIN:
10973       return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
10974     case ISD::VECREDUCE_UMAX:
10975       return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
10976     case ISD::VECREDUCE_UMIN:
10977       return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
10978     case ISD::VECREDUCE_XOR:
10979       return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
10980     case ISD::VECREDUCE_FADD:
10981       return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
10982     case ISD::VECREDUCE_FMAX:
10983       return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
10984     case ISD::VECREDUCE_FMIN:
10985       return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
10986     default:
10987       llvm_unreachable("Unhandled fixed length reduction");
10988     }
10989   }
10990 
10991   // Lower NEON reductions.
10992   SDLoc dl(Op);
10993   switch (Op.getOpcode()) {
10994   case ISD::VECREDUCE_ADD:
10995     return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
10996   case ISD::VECREDUCE_SMAX:
10997     return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
10998   case ISD::VECREDUCE_SMIN:
10999     return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
11000   case ISD::VECREDUCE_UMAX:
11001     return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
11002   case ISD::VECREDUCE_UMIN:
11003     return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
11004   case ISD::VECREDUCE_FMAX: {
11005     return DAG.getNode(
11006         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11007         DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
11008         Src);
11009   }
11010   case ISD::VECREDUCE_FMIN: {
11011     return DAG.getNode(
11012         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
11013         DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
11014         Src);
11015   }
11016   default:
11017     llvm_unreachable("Unhandled reduction");
11018   }
11019 }
11020 
11021 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
11022                                                     SelectionDAG &DAG) const {
11023   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11024   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11025     return SDValue();
11026 
11027   // LSE has an atomic load-add instruction, but not a load-sub.
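  // Negate the operand and emit an atomic load-add instead: x - y == x + (-y).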
11028   SDLoc dl(Op);
11029   MVT VT = Op.getSimpleValueType();
11030   SDValue RHS = Op.getOperand(2);
11031   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11032   RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
11033   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
11034                        Op.getOperand(0), Op.getOperand(1), RHS,
11035                        AN->getMemOperand());
11036 }
11037 
11038 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
11039                                                     SelectionDAG &DAG) const {
11040   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
11041   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
11042     return SDValue();
11043 
11044   // LSE has an atomic load-clear instruction, but not a load-and.
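  // LDCLR computes old & ~operand, so invert the operand and emit an atomic
  // load-clear instead: x & y == x & ~(~y).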
11045   SDLoc dl(Op);
11046   MVT VT = Op.getSimpleValueType();
11047   SDValue RHS = Op.getOperand(2);
11048   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
11049   RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
11050   return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
11051                        Op.getOperand(0), Op.getOperand(1), RHS,
11052                        AN->getMemOperand());
11053 }
11054 
11055 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
11056     SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
11057   SDLoc dl(Op);
11058   EVT PtrVT = getPointerTy(DAG.getDataLayout());
11059   SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
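  // On Windows, __chkstk expects the requested allocation size in X15,
  // expressed in units of 16 bytes; scale the size down before the call and
  // back up afterwards.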
11060 
11061   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
11062   const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
11063   if (Subtarget->hasCustomCallingConv())
11064     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
11065 
11066   Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
11067                      DAG.getConstant(4, dl, MVT::i64));
11068   Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
11069   Chain =
11070       DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
11071                   Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
11072                   DAG.getRegisterMask(Mask), Chain.getValue(1));
11073   // To match the actual intent better, we should read the output from X15 here
11074   // again (instead of potentially spilling it to the stack), but rereading Size
11075   // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
11076   // here.
11077 
11078   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
11079                      DAG.getConstant(4, dl, MVT::i64));
11080   return Chain;
11081 }
11082 
11083 SDValue
11084 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
11085                                                SelectionDAG &DAG) const {
11086   assert(Subtarget->isTargetWindows() &&
11087          "Only Windows alloca probing supported");
11088   SDLoc dl(Op);
11089   // Get the inputs.
11090   SDNode *Node = Op.getNode();
11091   SDValue Chain = Op.getOperand(0);
11092   SDValue Size = Op.getOperand(1);
11093   MaybeAlign Align =
11094       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
11095   EVT VT = Node->getValueType(0);
11096 
11097   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11098           "no-stack-arg-probe")) {
11099     SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11100     Chain = SP.getValue(1);
11101     SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11102     if (Align)
11103       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11104                        DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11105     Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11106     SDValue Ops[2] = {SP, Chain};
11107     return DAG.getMergeValues(Ops, dl);
11108   }
11109 
11110   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
11111 
11112   Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
11113 
11114   SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
11115   Chain = SP.getValue(1);
11116   SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
11117   if (Align)
11118     SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
11119                      DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
11120   Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
11121 
11122   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
11123                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
11124 
11125   SDValue Ops[2] = {SP, Chain};
11126   return DAG.getMergeValues(Ops, dl);
11127 }
11128 
11129 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
11130                                            SelectionDAG &DAG) const {
11131   EVT VT = Op.getValueType();
11132   assert(VT != MVT::i64 && "Expected illegal VSCALE node");
11133 
11134   SDLoc DL(Op);
11135   APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
11136   return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
11137                             DL, VT);
11138 }
11139 
11140 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
11141 template <unsigned NumVecs>
11142 static bool
11143 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
11144               AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
11145   Info.opc = ISD::INTRINSIC_VOID;
11146   // Retrieve EC from first vector argument.
11147   const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
11148   ElementCount EC = VT.getVectorElementCount();
11149 #ifndef NDEBUG
11150   // Check the assumption that all input vectors are the same type.
11151   for (unsigned I = 0; I < NumVecs; ++I)
11152     assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
11153            "Invalid type.");
11154 #endif
11155   // memVT is `NumVecs * VT`.
11156   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
11157                                 EC * NumVecs);
11158   Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
11159   Info.offset = 0;
11160   Info.align.reset();
11161   Info.flags = MachineMemOperand::MOStore;
11162   return true;
11163 }
11164 
11165 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
11166 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
11167 /// specified in the intrinsic calls.
11168 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
11169                                                const CallInst &I,
11170                                                MachineFunction &MF,
11171                                                unsigned Intrinsic) const {
11172   auto &DL = I.getModule()->getDataLayout();
11173   switch (Intrinsic) {
11174   case Intrinsic::aarch64_sve_st2:
11175     return setInfoSVEStN<2>(*this, DL, Info, I);
11176   case Intrinsic::aarch64_sve_st3:
11177     return setInfoSVEStN<3>(*this, DL, Info, I);
11178   case Intrinsic::aarch64_sve_st4:
11179     return setInfoSVEStN<4>(*this, DL, Info, I);
11180   case Intrinsic::aarch64_neon_ld2:
11181   case Intrinsic::aarch64_neon_ld3:
11182   case Intrinsic::aarch64_neon_ld4:
11183   case Intrinsic::aarch64_neon_ld1x2:
11184   case Intrinsic::aarch64_neon_ld1x3:
11185   case Intrinsic::aarch64_neon_ld1x4:
11186   case Intrinsic::aarch64_neon_ld2lane:
11187   case Intrinsic::aarch64_neon_ld3lane:
11188   case Intrinsic::aarch64_neon_ld4lane:
11189   case Intrinsic::aarch64_neon_ld2r:
11190   case Intrinsic::aarch64_neon_ld3r:
11191   case Intrinsic::aarch64_neon_ld4r: {
11192     Info.opc = ISD::INTRINSIC_W_CHAIN;
11193     // Conservatively set memVT to the entire set of vectors loaded.
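    // For example, an ld3 of three v4i32 vectors loads 384 bits in total,
    // giving a memVT of v6i64.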
11194     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
11195     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11196     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11197     Info.offset = 0;
11198     Info.align.reset();
11199     // volatile loads with NEON intrinsics not supported
11200     Info.flags = MachineMemOperand::MOLoad;
11201     return true;
11202   }
11203   case Intrinsic::aarch64_neon_st2:
11204   case Intrinsic::aarch64_neon_st3:
11205   case Intrinsic::aarch64_neon_st4:
11206   case Intrinsic::aarch64_neon_st1x2:
11207   case Intrinsic::aarch64_neon_st1x3:
11208   case Intrinsic::aarch64_neon_st1x4:
11209   case Intrinsic::aarch64_neon_st2lane:
11210   case Intrinsic::aarch64_neon_st3lane:
11211   case Intrinsic::aarch64_neon_st4lane: {
11212     Info.opc = ISD::INTRINSIC_VOID;
11213     // Conservatively set memVT to the entire set of vectors stored.
11214     unsigned NumElts = 0;
11215     for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
11216       Type *ArgTy = I.getArgOperand(ArgI)->getType();
11217       if (!ArgTy->isVectorTy())
11218         break;
11219       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
11220     }
11221     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
11222     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
11223     Info.offset = 0;
11224     Info.align.reset();
11225     // volatile stores with NEON intrinsics not supported
11226     Info.flags = MachineMemOperand::MOStore;
11227     return true;
11228   }
11229   case Intrinsic::aarch64_ldaxr:
11230   case Intrinsic::aarch64_ldxr: {
11231     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
11232     Info.opc = ISD::INTRINSIC_W_CHAIN;
11233     Info.memVT = MVT::getVT(PtrTy->getElementType());
11234     Info.ptrVal = I.getArgOperand(0);
11235     Info.offset = 0;
11236     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11237     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
11238     return true;
11239   }
11240   case Intrinsic::aarch64_stlxr:
11241   case Intrinsic::aarch64_stxr: {
11242     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11243     Info.opc = ISD::INTRINSIC_W_CHAIN;
11244     Info.memVT = MVT::getVT(PtrTy->getElementType());
11245     Info.ptrVal = I.getArgOperand(1);
11246     Info.offset = 0;
11247     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11248     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
11249     return true;
11250   }
11251   case Intrinsic::aarch64_ldaxp:
11252   case Intrinsic::aarch64_ldxp:
11253     Info.opc = ISD::INTRINSIC_W_CHAIN;
11254     Info.memVT = MVT::i128;
11255     Info.ptrVal = I.getArgOperand(0);
11256     Info.offset = 0;
11257     Info.align = Align(16);
11258     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
11259     return true;
11260   case Intrinsic::aarch64_stlxp:
11261   case Intrinsic::aarch64_stxp:
11262     Info.opc = ISD::INTRINSIC_W_CHAIN;
11263     Info.memVT = MVT::i128;
11264     Info.ptrVal = I.getArgOperand(2);
11265     Info.offset = 0;
11266     Info.align = Align(16);
11267     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
11268     return true;
11269   case Intrinsic::aarch64_sve_ldnt1: {
11270     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
11271     Info.opc = ISD::INTRINSIC_W_CHAIN;
11272     Info.memVT = MVT::getVT(I.getType());
11273     Info.ptrVal = I.getArgOperand(1);
11274     Info.offset = 0;
11275     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11276     Info.flags = MachineMemOperand::MOLoad;
11277     if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
11278       Info.flags |= MachineMemOperand::MONonTemporal;
11279     return true;
11280   }
11281   case Intrinsic::aarch64_sve_stnt1: {
11282     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
11283     Info.opc = ISD::INTRINSIC_W_CHAIN;
11284     Info.memVT = MVT::getVT(I.getOperand(0)->getType());
11285     Info.ptrVal = I.getArgOperand(2);
11286     Info.offset = 0;
11287     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
11288     Info.flags = MachineMemOperand::MOStore;
11289     if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
11290       Info.flags |= MachineMemOperand::MONonTemporal;
11291     return true;
11292   }
11293   default:
11294     break;
11295   }
11296 
11297   return false;
11298 }
11299 
11300 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
11301                                                   ISD::LoadExtType ExtTy,
11302                                                   EVT NewVT) const {
11303   // TODO: This may be worth removing. Check regression tests for diffs.
11304   if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
11305     return false;
11306 
11307   // If we're reducing the load width in order to avoid having to use an extra
11308   // instruction to do extension then it's probably a good idea.
11309   if (ExtTy != ISD::NON_EXTLOAD)
11310     return true;
11311   // Don't reduce load width if it would prevent us from combining a shift into
11312   // the offset.
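  // For example, in (load (add x, (shl y, 3))) of an i64 value, the shift
  // folds into the scaled register-offset addressing mode [x, y, lsl #3];
  // narrowing the load would make the shift amount no longer match.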
11313   MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
11314   assert(Mem);
11315   const SDValue &Base = Mem->getBasePtr();
11316   if (Base.getOpcode() == ISD::ADD &&
11317       Base.getOperand(1).getOpcode() == ISD::SHL &&
11318       Base.getOperand(1).hasOneUse() &&
11319       Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
11320     // The shift can be combined if it matches the size of the value being
11321     // loaded (and so reducing the width would make it not match).
11322     uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
11323     uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
11324     if (ShiftAmount == Log2_32(LoadBytes))
11325       return false;
11326   }
11327   // We have no reason to disallow reducing the load width, so allow it.
11328   return true;
11329 }
11330 
11331 // Truncations from 64-bit GPR to 32-bit GPR is free.
11332 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
11333   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11334     return false;
11335   uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
11336   uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
11337   return NumBits1 > NumBits2;
11338 }
11339 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
11340   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11341     return false;
11342   uint64_t NumBits1 = VT1.getFixedSizeInBits();
11343   uint64_t NumBits2 = VT2.getFixedSizeInBits();
11344   return NumBits1 > NumBits2;
11345 }
11346 
/// Check if it is profitable to hoist an instruction from a then/else block
/// into the preceding if-block. It is not profitable if I and its user can
/// form an FMA instruction, because we prefer FMSUB/FMADD.
11350 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
11351   if (I->getOpcode() != Instruction::FMul)
11352     return true;
11353 
11354   if (!I->hasOneUse())
11355     return true;
11356 
11357   Instruction *User = I->user_back();
11358 
11359   if (User &&
11360       !(User->getOpcode() == Instruction::FSub ||
11361         User->getOpcode() == Instruction::FAdd))
11362     return true;
11363 
11364   const TargetOptions &Options = getTargetMachine().Options;
11365   const Function *F = I->getFunction();
11366   const DataLayout &DL = F->getParent()->getDataLayout();
11367   Type *Ty = User->getOperand(0)->getType();
11368 
11369   return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
11370            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
11371            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
11372             Options.UnsafeFPMath));
11373 }
11374 
11375 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
11376 // 64-bit GPR.
11377 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
11378   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11379     return false;
11380   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
11381   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
11382   return NumBits1 == 32 && NumBits2 == 64;
11383 }
11384 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
11385   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
11386     return false;
11387   unsigned NumBits1 = VT1.getSizeInBits();
11388   unsigned NumBits2 = VT2.getSizeInBits();
11389   return NumBits1 == 32 && NumBits2 == 64;
11390 }
11391 
11392 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11393   EVT VT1 = Val.getValueType();
11394   if (isZExtFree(VT1, VT2)) {
11395     return true;
11396   }
11397 
11398   if (Val.getOpcode() != ISD::LOAD)
11399     return false;
11400 
11401   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
11402   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
11403           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
11404           VT1.getSizeInBits() <= 32);
11405 }
11406 
11407 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
11408   if (isa<FPExtInst>(Ext))
11409     return false;
11410 
11411   // Vector types are not free.
11412   if (Ext->getType()->isVectorTy())
11413     return false;
11414 
11415   for (const Use &U : Ext->uses()) {
11416     // The extension is free if we can fold it with a left shift in an
11417     // addressing mode or an arithmetic operation: add, sub, and cmp.
11418 
11419     // Is there a shift?
11420     const Instruction *Instr = cast<Instruction>(U.getUser());
11421 
11422     // Is this a constant shift?
11423     switch (Instr->getOpcode()) {
11424     case Instruction::Shl:
11425       if (!isa<ConstantInt>(Instr->getOperand(1)))
11426         return false;
11427       break;
11428     case Instruction::GetElementPtr: {
11429       gep_type_iterator GTI = gep_type_begin(Instr);
11430       auto &DL = Ext->getModule()->getDataLayout();
11431       std::advance(GTI, U.getOperandNo()-1);
11432       Type *IdxTy = GTI.getIndexedType();
11433       // This extension will end up with a shift because of the scaling factor.
11434       // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
11435       // Get the shift amount based on the scaling factor:
11436       // log2(sizeof(IdxTy)) - log2(8).
11437       uint64_t ShiftAmt =
11438         countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
11439       // Is the constant foldable in the shift of the addressing mode?
11440       // I.e., shift amount is between 1 and 4 inclusive.
11441       if (ShiftAmt == 0 || ShiftAmt > 4)
11442         return false;
11443       break;
11444     }
11445     case Instruction::Trunc:
11446       // Check if this is a noop.
11447       // trunc(sext ty1 to ty2) to ty1.
11448       if (Instr->getType() == Ext->getOperand(0)->getType())
11449         continue;
11450       LLVM_FALLTHROUGH;
11451     default:
11452       return false;
11453     }
11454 
11455     // At this point we can use the bfm family, so this extension is free
11456     // for that use.
11457   }
11458   return true;
11459 }
11460 
11461 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
11462 /// or upper half of the vector elements.
11463 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
11464   auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
11465     auto *FullTy = FullV->getType();
11466     auto *HalfTy = HalfV->getType();
11467     return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
11468            2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
11469   };
11470 
11471   auto extractHalf = [](Value *FullV, Value *HalfV) {
11472     auto *FullVT = cast<FixedVectorType>(FullV->getType());
11473     auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
11474     return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
11475   };
11476 
11477   ArrayRef<int> M1, M2;
11478   Value *S1Op1, *S2Op1;
11479   if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
11480       !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
11481     return false;
11482 
11483   // Check that the operands are half as wide as the result and we extract
11484   // half of the elements of the input vectors.
  if (!areTypesHalved(S1Op1, Op1) || !areTypesHalved(S2Op1, Op2) ||
11486       !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
11487     return false;
11488 
11489   // Check the mask extracts either the lower or upper half of vector
11490   // elements.
11491   int M1Start = -1;
11492   int M2Start = -1;
11493   int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
11494   if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
11495       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
11496       M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
11497     return false;
11498 
11499   return true;
11500 }
11501 
11502 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
11503 /// of the vector elements.
11504 static bool areExtractExts(Value *Ext1, Value *Ext2) {
11505   auto areExtDoubled = [](Instruction *Ext) {
11506     return Ext->getType()->getScalarSizeInBits() ==
11507            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
11508   };
11509 
11510   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
11511       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
11512       !areExtDoubled(cast<Instruction>(Ext1)) ||
11513       !areExtDoubled(cast<Instruction>(Ext2)))
11514     return false;
11515 
11516   return true;
11517 }
11518 
11519 /// Check if Op could be used with vmull_high_p64 intrinsic.
11520 static bool isOperandOfVmullHighP64(Value *Op) {
11521   Value *VectorOperand = nullptr;
11522   ConstantInt *ElementIndex = nullptr;
11523   return match(Op, m_ExtractElt(m_Value(VectorOperand),
11524                                 m_ConstantInt(ElementIndex))) &&
11525          ElementIndex->getValue() == 1 &&
11526          isa<FixedVectorType>(VectorOperand->getType()) &&
11527          cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
11528 }
11529 
11530 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
11531 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
11532   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
11533 }
11534 
11535 /// Check if sinking \p I's operands to I's basic block is profitable, because
11536 /// the operands can be folded into a target instruction, e.g.
11537 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
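/// For example (illustrative), if an aarch64.neon.umull call takes two
/// "extract upper half" shuffles that are defined in another block, sinking
/// those shuffles next to the call lets instruction selection form a single
/// UMULL2 instead of materializing the half-width vectors separately.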
11538 bool AArch64TargetLowering::shouldSinkOperands(
11539     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
11540   if (!I->getType()->isVectorTy())
11541     return false;
11542 
11543   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
11544     switch (II->getIntrinsicID()) {
11545     case Intrinsic::aarch64_neon_umull:
11546       if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
11547         return false;
11548       Ops.push_back(&II->getOperandUse(0));
11549       Ops.push_back(&II->getOperandUse(1));
11550       return true;
11551 
11552     case Intrinsic::aarch64_neon_pmull64:
11553       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
11554                                      II->getArgOperand(1)))
11555         return false;
11556       Ops.push_back(&II->getArgOperandUse(0));
11557       Ops.push_back(&II->getArgOperandUse(1));
11558       return true;
11559 
11560     default:
11561       return false;
11562     }
11563   }
11564 
11565   switch (I->getOpcode()) {
11566   case Instruction::Sub:
11567   case Instruction::Add: {
11568     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
11569       return false;
11570 
11571     // If the exts' operands extract either the lower or upper elements, we
11572     // can sink them too.
11573     auto Ext1 = cast<Instruction>(I->getOperand(0));
11574     auto Ext2 = cast<Instruction>(I->getOperand(1));
11575     if (areExtractShuffleVectors(Ext1, Ext2)) {
11576       Ops.push_back(&Ext1->getOperandUse(0));
11577       Ops.push_back(&Ext2->getOperandUse(0));
11578     }
11579 
11580     Ops.push_back(&I->getOperandUse(0));
11581     Ops.push_back(&I->getOperandUse(1));
11582 
11583     return true;
11584   }
11585   case Instruction::Mul: {
11586     bool IsProfitable = false;
11587     for (auto &Op : I->operands()) {
11588       // Make sure we are not already sinking this operand
11589       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
11590         continue;
11591 
11592       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
11593       if (!Shuffle || !Shuffle->isZeroEltSplat())
11594         continue;
11595 
11596       Value *ShuffleOperand = Shuffle->getOperand(0);
11597       InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
11598       if (!Insert)
11599         continue;
11600 
11601       Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
11602       if (!OperandInstr)
11603         continue;
11604 
11605       ConstantInt *ElementConstant =
11606           dyn_cast<ConstantInt>(Insert->getOperand(2));
11607       // Check that the insertelement is inserting into element 0
11608       if (!ElementConstant || ElementConstant->getZExtValue() != 0)
11609         continue;
11610 
11611       unsigned Opcode = OperandInstr->getOpcode();
11612       if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
11613         continue;
11614 
11615       Ops.push_back(&Shuffle->getOperandUse(0));
11616       Ops.push_back(&Op);
11617       IsProfitable = true;
11618     }
11619 
11620     return IsProfitable;
11621   }
11622   default:
11623     return false;
11624   }
11625   return false;
11626 }
11627 
11628 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          Align &RequiredAlignment) const {
11630   if (!LoadedType.isSimple() ||
11631       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
11632     return false;
11633   // Cyclone supports unaligned accesses.
  RequiredAlignment = Align(1);
11635   unsigned NumBits = LoadedType.getSizeInBits();
11636   return NumBits == 32 || NumBits == 64;
11637 }
11638 
11639 /// A helper function for determining the number of interleaved accesses we
11640 /// will generate when lowering accesses of the given type.
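/// For example (illustrative), a 512-bit <16 x i32> access is lowered as
/// (512 + 127) / 128 = 4 interleaved accesses, while a 64-bit <8 x i8>
/// access needs only one.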
11641 unsigned
11642 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
11643                                                  const DataLayout &DL) const {
11644   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
11645 }
11646 
11647 MachineMemOperand::Flags
11648 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
11649   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
11650       I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
11651     return MOStridedAccess;
11652   return MachineMemOperand::MONone;
11653 }
11654 
11655 bool AArch64TargetLowering::isLegalInterleavedAccessType(
11656     VectorType *VecTy, const DataLayout &DL) const {
11657 
11658   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
11659   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
11660 
11661   // Ensure the number of vector elements is greater than 1.
11662   if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
11663     return false;
11664 
11665   // Ensure the element type is legal.
11666   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
11667     return false;
11668 
11669   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
11670   // 128 will be split into multiple interleaved accesses.
11671   return VecSize == 64 || VecSize % 128 == 0;
11672 }
11673 
11674 /// Lower an interleaved load into a ldN intrinsic.
11675 ///
11676 /// E.g. Lower an interleaved load (Factor = 2):
11677 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
11678 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
11679 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
11680 ///
11681 ///      Into:
11682 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
11683 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
11684 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
11685 bool AArch64TargetLowering::lowerInterleavedLoad(
11686     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
11687     ArrayRef<unsigned> Indices, unsigned Factor) const {
11688   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11689          "Invalid interleave factor");
11690   assert(!Shuffles.empty() && "Empty shufflevector input");
11691   assert(Shuffles.size() == Indices.size() &&
11692          "Unmatched number of shufflevectors and indices");
11693 
11694   const DataLayout &DL = LI->getModule()->getDataLayout();
11695 
11696   VectorType *VTy = Shuffles[0]->getType();
11697 
  // Skip if we do not have NEON or if the vector type is not legal. We can
  // "legalize" wide vector types into multiple interleaved accesses as long
  // as the vector size is divisible by 128 bits.
11701   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
11702     return false;
11703 
11704   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
11705 
11706   auto *FVTy = cast<FixedVectorType>(VTy);
11707 
  // A pointer vector cannot be the return type of the ldN intrinsics. We need
  // to load integer vectors first and then convert to pointer vectors.
11710   Type *EltTy = FVTy->getElementType();
11711   if (EltTy->isPointerTy())
11712     FVTy =
11713         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
11714 
11715   IRBuilder<> Builder(LI);
11716 
11717   // The base address of the load.
11718   Value *BaseAddr = LI->getPointerOperand();
11719 
11720   if (NumLoads > 1) {
11721     // If we're going to generate more than one load, reset the sub-vector type
11722     // to something legal.
11723     FVTy = FixedVectorType::get(FVTy->getElementType(),
11724                                 FVTy->getNumElements() / NumLoads);
11725 
11726     // We will compute the pointer operand of each load from the original base
11727     // address using GEPs. Cast the base address to a pointer to the scalar
11728     // element type.
11729     BaseAddr = Builder.CreateBitCast(
11730         BaseAddr,
11731         FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
11732   }
11733 
11734   Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
11735   Type *Tys[2] = {FVTy, PtrTy};
11736   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
11737                                             Intrinsic::aarch64_neon_ld3,
11738                                             Intrinsic::aarch64_neon_ld4};
11739   Function *LdNFunc =
11740       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
11741 
11742   // Holds sub-vectors extracted from the load intrinsic return values. The
11743   // sub-vectors are associated with the shufflevector instructions they will
11744   // replace.
11745   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
11746 
11747   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
11748 
11749     // If we're generating more than one load, compute the base address of
11750     // subsequent loads as an offset from the previous.
11751     if (LoadCount > 0)
11752       BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
11753                                             FVTy->getNumElements() * Factor);
11754 
11755     CallInst *LdN = Builder.CreateCall(
11756         LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
11757 
11758     // Extract and store the sub-vectors returned by the load intrinsic.
11759     for (unsigned i = 0; i < Shuffles.size(); i++) {
11760       ShuffleVectorInst *SVI = Shuffles[i];
11761       unsigned Index = Indices[i];
11762 
11763       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
11764 
      // Convert the integer vector to a pointer vector if the element type is
      // a pointer.
11766       if (EltTy->isPointerTy())
11767         SubVec = Builder.CreateIntToPtr(
11768             SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
11769                                          FVTy->getNumElements()));
11770       SubVecs[SVI].push_back(SubVec);
11771     }
11772   }
11773 
11774   // Replace uses of the shufflevector instructions with the sub-vectors
11775   // returned by the load intrinsic. If a shufflevector instruction is
11776   // associated with more than one sub-vector, those sub-vectors will be
11777   // concatenated into a single wide vector.
11778   for (ShuffleVectorInst *SVI : Shuffles) {
11779     auto &SubVec = SubVecs[SVI];
11780     auto *WideVec =
11781         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
11782     SVI->replaceAllUsesWith(WideVec);
11783   }
11784 
11785   return true;
11786 }
11787 
11788 /// Lower an interleaved store into a stN intrinsic.
11789 ///
11790 /// E.g. Lower an interleaved store (Factor = 3):
11791 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11792 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11793 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11794 ///
11795 ///      Into:
11796 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11797 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11798 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11799 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11800 ///
11801 /// Note that the new shufflevectors will be removed and we'll only generate one
11802 /// st3 instruction in CodeGen.
11803 ///
11804 /// Example for a more general valid mask (Factor 3). Lower:
11805 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
11806 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
11807 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11808 ///
11809 ///      Into:
11810 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
11811 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
11812 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
11813 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11814 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
11815                                                   ShuffleVectorInst *SVI,
11816                                                   unsigned Factor) const {
11817   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11818          "Invalid interleave factor");
11819 
11820   auto *VecTy = cast<FixedVectorType>(SVI->getType());
11821   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
11822 
11823   unsigned LaneLen = VecTy->getNumElements() / Factor;
11824   Type *EltTy = VecTy->getElementType();
11825   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
11826 
11827   const DataLayout &DL = SI->getModule()->getDataLayout();
11828 
  // Skip if we do not have NEON or if the vector type is not legal. We can
  // "legalize" wide vector types into multiple interleaved accesses as long
  // as the vector size is divisible by 128 bits.
11832   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
11833     return false;
11834 
11835   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
11836 
11837   Value *Op0 = SVI->getOperand(0);
11838   Value *Op1 = SVI->getOperand(1);
11839   IRBuilder<> Builder(SI);
11840 
11841   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
11842   // vectors to integer vectors.
11843   if (EltTy->isPointerTy()) {
11844     Type *IntTy = DL.getIntPtrType(EltTy);
11845     unsigned NumOpElts =
11846         cast<FixedVectorType>(Op0->getType())->getNumElements();
11847 
11848     // Convert to the corresponding integer vector.
11849     auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
11850     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
11851     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
11852 
11853     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
11854   }
11855 
11856   // The base address of the store.
11857   Value *BaseAddr = SI->getPointerOperand();
11858 
11859   if (NumStores > 1) {
11860     // If we're going to generate more than one store, reset the lane length
11861     // and sub-vector type to something legal.
11862     LaneLen /= NumStores;
11863     SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
11864 
11865     // We will compute the pointer operand of each store from the original base
11866     // address using GEPs. Cast the base address to a pointer to the scalar
11867     // element type.
11868     BaseAddr = Builder.CreateBitCast(
11869         BaseAddr,
11870         SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
11871   }
11872 
11873   auto Mask = SVI->getShuffleMask();
11874 
11875   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
11876   Type *Tys[2] = {SubVecTy, PtrTy};
11877   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
11878                                              Intrinsic::aarch64_neon_st3,
11879                                              Intrinsic::aarch64_neon_st4};
11880   Function *StNFunc =
11881       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
11882 
11883   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
11884 
11885     SmallVector<Value *, 5> Ops;
11886 
11887     // Split the shufflevector operands into sub vectors for the new stN call.
11888     for (unsigned i = 0; i < Factor; i++) {
11889       unsigned IdxI = StoreCount * LaneLen * Factor + i;
11890       if (Mask[IdxI] >= 0) {
11891         Ops.push_back(Builder.CreateShuffleVector(
11892             Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
11893       } else {
11894         unsigned StartMask = 0;
11895         for (unsigned j = 1; j < LaneLen; j++) {
11896           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
11897           if (Mask[IdxJ * Factor + IdxI] >= 0) {
11898             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
11899             break;
11900           }
11901         }
        // Note: Filling undef gaps with random elements is OK, since those
        // elements were being written anyway (with undefs).
        // If the mask is all undefs, we default to using elements from lane 0.
        // Note: StartMask cannot be negative; that is checked in
        // isReInterleaveMask.
11907         Ops.push_back(Builder.CreateShuffleVector(
11908             Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
11909       }
11910     }
11911 
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
11914     if (StoreCount > 0)
11915       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
11916                                             BaseAddr, LaneLen * Factor);
11917 
11918     Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
11919     Builder.CreateCall(StNFunc, Ops);
11920   }
11921   return true;
11922 }
11923 
11924 // Lower an SVE structured load intrinsic returning a tuple type to target
11925 // specific intrinsic taking the same input but returning a multi-result value
11926 // of the split tuple type.
11927 //
11928 // E.g. Lowering an LD3:
11929 //
11930 //  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
11931 //                                                    <vscale x 4 x i1> %pred,
11932 //                                                    <vscale x 4 x i32>* %addr)
11933 //
11934 //  Output DAG:
11935 //
11936 //    t0: ch = EntryToken
11937 //        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
11938 //        t4: i64,ch = CopyFromReg t0, Register:i64 %1
11939 //    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
11940 //    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
11941 //
11942 // This is called pre-legalization to avoid widening/splitting issues with
11943 // non-power-of-2 tuple types used for LD3, such as nxv12i32.
11944 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
11945                                                   ArrayRef<SDValue> LoadOps,
11946                                                   EVT VT, SelectionDAG &DAG,
11947                                                   const SDLoc &DL) const {
11948   assert(VT.isScalableVector() && "Can only lower scalable vectors");
11949 
11950   unsigned N, Opcode;
11951   static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
11952       {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
11953       {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
11954       {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
11955 
11956   std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
11957   assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
11958          "invalid tuple vector type!");
11959 
11960   EVT SplitVT =
11961       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
11962                        VT.getVectorElementCount().divideCoefficientBy(N));
11963   assert(isTypeLegal(SplitVT));
11964 
11965   SmallVector<EVT, 5> VTs(N, SplitVT);
11966   VTs.push_back(MVT::Other); // Chain
11967   SDVTList NodeTys = DAG.getVTList(VTs);
11968 
11969   SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
11970   SmallVector<SDValue, 4> PseudoLoadOps;
11971   for (unsigned I = 0; I < N; ++I)
11972     PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
11973   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
11974 }
11975 
11976 EVT AArch64TargetLowering::getOptimalMemOpType(
11977     const MemOp &Op, const AttributeList &FuncAttributes) const {
11978   bool CanImplicitFloat =
11979       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
11980   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
11981   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
11982   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
11983   // taken one instruction to materialize the v2i64 zero and one store (with
11984   // restrictive addressing mode). Just do i64 stores.
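  // For example (illustrative), a 64-byte memset with 16-byte alignment is
  // expanded with v2i64 stores, whereas a 16-byte memset falls back to i64
  // stores.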
11985   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
11986   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
11987     if (Op.isAligned(AlignCheck))
11988       return true;
11989     bool Fast;
11990     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
11991                                           MachineMemOperand::MONone, &Fast) &&
11992            Fast;
11993   };
11994 
11995   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
11996       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
11997     return MVT::v2i64;
11998   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
11999     return MVT::f128;
12000   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12001     return MVT::i64;
12002   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12003     return MVT::i32;
12004   return MVT::Other;
12005 }
12006 
12007 LLT AArch64TargetLowering::getOptimalMemOpLLT(
12008     const MemOp &Op, const AttributeList &FuncAttributes) const {
12009   bool CanImplicitFloat =
12010       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
12011   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
12012   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
12013   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
12014   // taken one instruction to materialize the v2i64 zero and one store (with
12015   // restrictive addressing mode). Just do i64 stores.
12016   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
12017   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
12018     if (Op.isAligned(AlignCheck))
12019       return true;
12020     bool Fast;
12021     return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
12022                                           MachineMemOperand::MONone, &Fast) &&
12023            Fast;
12024   };
12025 
12026   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
12027       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
12028     return LLT::fixed_vector(2, 64);
12029   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
12030     return LLT::scalar(128);
12031   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
12032     return LLT::scalar(64);
12033   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
12034     return LLT::scalar(32);
12035   return LLT();
12036 }
12037 
12038 // 12-bit optionally shifted immediates are legal for adds.
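// For example (illustrative): 0xfff (fits in 12 bits) and 0x567000 (12 bits
// shifted left by 12) are legal, but 0x1001 is not, because it would need
// both a shifted and an unshifted part.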
12039 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
12040   if (Immed == std::numeric_limits<int64_t>::min()) {
12041     LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
12042                       << ": avoid UB for INT64_MIN\n");
12043     return false;
12044   }
12045   // Same encoding for add/sub, just flip the sign.
12046   Immed = std::abs(Immed);
12047   bool IsLegal = ((Immed >> 12) == 0 ||
12048                   ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
12049   LLVM_DEBUG(dbgs() << "Is " << Immed
12050                     << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
12051   return IsLegal;
12052 }
12053 
12054 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
12055 // immediates is the same as for an add or a sub.
12056 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
12057   return isLegalAddImmediate(Immed);
12058 }
12059 
12060 /// isLegalAddressingMode - Return true if the addressing mode represented
12061 /// by AM is legal for this target, for a load/store of the specified type.
12062 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
12063                                                   const AddrMode &AM, Type *Ty,
12064                                                   unsigned AS, Instruction *I) const {
12065   // AArch64 has five basic addressing modes:
12066   //  reg
12067   //  reg + 9-bit signed offset
12068   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
12069   //  reg1 + reg2
12070   //  reg + SIZE_IN_BYTES * reg
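  //
  // For example (illustrative):
  //   ldur x0, [x1, #-16]       ; reg + 9-bit signed offset
  //   ldr  x0, [x1, #32760]     ; reg + 8 * 12-bit unsigned offset
  //   ldr  x0, [x1, x2, lsl #3] ; reg + 8 * reg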
12071 
12072   // No global is ever allowed as a base.
12073   if (AM.BaseGV)
12074     return false;
12075 
12076   // No reg+reg+imm addressing.
12077   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
12078     return false;
12079 
12080   // FIXME: Update this method to support scalable addressing modes.
12081   if (isa<ScalableVectorType>(Ty)) {
12082     uint64_t VecElemNumBytes =
12083         DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
12084     return AM.HasBaseReg && !AM.BaseOffs &&
12085            (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
12086   }
12087 
  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
12090   uint64_t NumBytes = 0;
12091   if (Ty->isSized()) {
12092     uint64_t NumBits = DL.getTypeSizeInBits(Ty);
12093     NumBytes = NumBits / 8;
12094     if (!isPowerOf2_64(NumBits))
12095       NumBytes = 0;
12096   }
12097 
12098   if (!AM.Scale) {
12099     int64_t Offset = AM.BaseOffs;
12100 
12101     // 9-bit signed offset
12102     if (isInt<9>(Offset))
12103       return true;
12104 
12105     // 12-bit unsigned offset
12106     unsigned shift = Log2_64(NumBytes);
12107     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
12108         // Must be a multiple of NumBytes (NumBytes is a power of 2)
12109         (Offset >> shift) << shift == Offset)
12110       return true;
12111     return false;
12112   }
12113 
12114   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
12115 
12116   return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
12117 }
12118 
12119 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
12120   // Consider splitting large offset of struct or array.
12121   return true;
12122 }
12123 
12124 InstructionCost AArch64TargetLowering::getScalingFactorCost(
12125     const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const {
12126   // Scaling factors are not free at all.
12127   // Operands                     | Rt Latency
12128   // -------------------------------------------
12129   // Rt, [Xn, Xm]                 | 4
12130   // -------------------------------------------
12131   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
12132   // Rt, [Xn, Wm, <extend> #imm]  |
12133   if (isLegalAddressingMode(DL, AM, Ty, AS))
12134     // Scale represents reg2 * scale, thus account for 1 if
12135     // it is not equal to 0 or 1.
12136     return AM.Scale != 0 && AM.Scale != 1;
12137   return -1;
12138 }
12139 
12140 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
12141     const MachineFunction &MF, EVT VT) const {
12142   VT = VT.getScalarType();
12143 
12144   if (!VT.isSimple())
12145     return false;
12146 
12147   switch (VT.getSimpleVT().SimpleTy) {
12148   case MVT::f16:
12149     return Subtarget->hasFullFP16();
12150   case MVT::f32:
12151   case MVT::f64:
12152     return true;
12153   default:
12154     break;
12155   }
12156 
12157   return false;
12158 }
12159 
12160 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
12161                                                        Type *Ty) const {
12162   switch (Ty->getScalarType()->getTypeID()) {
12163   case Type::FloatTyID:
12164   case Type::DoubleTyID:
12165     return true;
12166   default:
12167     return false;
12168   }
12169 }
12170 
12171 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
12172     EVT VT, CodeGenOpt::Level OptLevel) const {
12173   return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector();
12174 }
12175 
12176 const MCPhysReg *
12177 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
12178   // LR is a callee-save register, but we must treat it as clobbered by any call
12179   // site. Hence we include LR in the scratch registers, which are in turn added
12180   // as implicit-defs for stackmaps and patchpoints.
12181   static const MCPhysReg ScratchRegs[] = {
12182     AArch64::X16, AArch64::X17, AArch64::LR, 0
12183   };
12184   return ScratchRegs;
12185 }
12186 
12187 bool
12188 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
12189                                                      CombineLevel Level) const {
12190   N = N->getOperand(0).getNode();
12191   EVT VT = N->getValueType(0);
  // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
  // it with shift to let it be lowered to UBFX.
  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t TruncMask = N->getConstantOperandVal(1);
    if (isMask_64(TruncMask) && N->getOperand(0).getOpcode() == ISD::SRL &&
        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
      return false;
12201   }
12202   return true;
12203 }
12204 
12205 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12206                                                               Type *Ty) const {
12207   assert(Ty->isIntegerTy());
12208 
12209   unsigned BitSize = Ty->getPrimitiveSizeInBits();
12210   if (BitSize == 0)
12211     return false;
12212 
12213   int64_t Val = Imm.getSExtValue();
12214   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
12215     return true;
12216 
12217   if ((int64_t)Val < 0)
12218     Val = ~Val;
12219   if (BitSize == 32)
12220     Val &= (1LL << 32) - 1;
12221 
12222   unsigned LZ = countLeadingZeros((uint64_t)Val);
12223   unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free, so return true when the value can be materialized with at
  // most two MOVKs (i.e., it fits in 48 bits).
12225   return Shift < 3;
12226 }
12227 
12228 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
12229                                                     unsigned Index) const {
12230   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
12231     return false;
12232 
12233   return (Index == 0 || Index == ResVT.getVectorNumElements());
12234 }
12235 
12236 /// Turn vector tests of the signbit in the form of:
12237 ///   xor (sra X, elt_size(X)-1), -1
12238 /// into:
12239 ///   cmge X, X, #0
12240 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
12241                                          const AArch64Subtarget *Subtarget) {
12242   EVT VT = N->getValueType(0);
12243   if (!Subtarget->hasNEON() || !VT.isVector())
12244     return SDValue();
12245 
12246   // There must be a shift right algebraic before the xor, and the xor must be a
12247   // 'not' operation.
12248   SDValue Shift = N->getOperand(0);
12249   SDValue Ones = N->getOperand(1);
12250   if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
12251       !ISD::isBuildVectorAllOnes(Ones.getNode()))
12252     return SDValue();
12253 
12254   // The shift should be smearing the sign bit across each vector element.
12255   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
12256   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
12257   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
12258     return SDValue();
12259 
12260   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
12261 }
12262 
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
12265 //
12266 // i32 vecreduce_add(
12267 //  v16i32 abs(
12268 //    v16i32 sub(
12269 //     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
12270 // =================>
12271 // i32 vecreduce_add(
12272 //   v4i32 UADDLP(
12273 //     v8i16 add(
12274 //       v8i16 zext(
12275 //         v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
12276 //       v8i16 zext(
12277 //         v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
12278 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
12279                                                     SelectionDAG &DAG) {
12280   // Assumed i32 vecreduce_add
12281   if (N->getValueType(0) != MVT::i32)
12282     return SDValue();
12283 
12284   SDValue VecReduceOp0 = N->getOperand(0);
12285   unsigned Opcode = VecReduceOp0.getOpcode();
12286   // Assumed v16i32 abs
12287   if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
12288     return SDValue();
12289 
12290   SDValue ABS = VecReduceOp0;
12291   // Assumed v16i32 sub
12292   if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
12293       ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
12294     return SDValue();
12295 
12296   SDValue SUB = ABS->getOperand(0);
12297   unsigned Opcode0 = SUB->getOperand(0).getOpcode();
12298   unsigned Opcode1 = SUB->getOperand(1).getOpcode();
12299   // Assumed v16i32 type
12300   if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
12301       SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
12302     return SDValue();
12303 
12304   // Assumed zext or sext
12305   bool IsZExt = false;
12306   if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
12307     IsZExt = true;
12308   } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
12309     IsZExt = false;
12310   } else
12311     return SDValue();
12312 
12313   SDValue EXT0 = SUB->getOperand(0);
12314   SDValue EXT1 = SUB->getOperand(1);
12315   // Assumed zext's operand has v16i8 type
12316   if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
12317       EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
12318     return SDValue();
12319 
  // The pattern is detected. Let's convert it to a sequence of nodes.
12321   SDLoc DL(N);
12322 
12323   // First, create the node pattern of UABD/SABD.
12324   SDValue UABDHigh8Op0 =
12325       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12326                   DAG.getConstant(8, DL, MVT::i64));
12327   SDValue UABDHigh8Op1 =
12328       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12329                   DAG.getConstant(8, DL, MVT::i64));
12330   SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
12331                                   UABDHigh8Op0, UABDHigh8Op1);
12332   SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
12333 
12334   // Second, create the node pattern of UABAL.
12335   SDValue UABDLo8Op0 =
12336       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
12337                   DAG.getConstant(0, DL, MVT::i64));
12338   SDValue UABDLo8Op1 =
12339       DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
12340                   DAG.getConstant(0, DL, MVT::i64));
12341   SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
12342                                 UABDLo8Op0, UABDLo8Op1);
12343   SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
12344   SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
12345 
12346   // Third, create the node of UADDLP.
12347   SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
12348 
12349   // Fourth, create the node of VECREDUCE_ADD.
12350   return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
12351 }
12352 
12353 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
12354 //   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
12355 //   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
12356 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
12357                                           const AArch64Subtarget *ST) {
12358   if (!ST->hasDotProd())
12359     return performVecReduceAddCombineWithUADDLP(N, DAG);
12360 
12361   SDValue Op0 = N->getOperand(0);
12362   if (N->getValueType(0) != MVT::i32 ||
12363       Op0.getValueType().getVectorElementType() != MVT::i32)
12364     return SDValue();
12365 
12366   unsigned ExtOpcode = Op0.getOpcode();
12367   SDValue A = Op0;
12368   SDValue B;
12369   if (ExtOpcode == ISD::MUL) {
12370     A = Op0.getOperand(0);
12371     B = Op0.getOperand(1);
12372     if (A.getOpcode() != B.getOpcode() ||
12373         A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
12374       return SDValue();
12375     ExtOpcode = A.getOpcode();
12376   }
12377   if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
12378     return SDValue();
12379 
12380   EVT Op0VT = A.getOperand(0).getValueType();
12381   if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
12382     return SDValue();
12383 
12384   SDLoc DL(Op0);
  // For non-MLA reductions, B can be set to 1. For MLA, we take the operand
  // of B's extend.
12387   if (!B)
12388     B = DAG.getConstant(1, DL, Op0VT);
12389   else
12390     B = B.getOperand(0);
12391 
12392   SDValue Zeros =
12393       DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
12394   auto DotOpcode =
12395       (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
12396   SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
12397                             A.getOperand(0), B);
12398   return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
12399 }
12400 
12401 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
12402                                  TargetLowering::DAGCombinerInfo &DCI,
12403                                  const AArch64Subtarget *Subtarget) {
12404   if (DCI.isBeforeLegalizeOps())
12405     return SDValue();
12406 
12407   return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
12408 }
12409 
12410 SDValue
12411 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
12412                                      SelectionDAG &DAG,
12413                                      SmallVectorImpl<SDNode *> &Created) const {
12414   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
12415   if (isIntDivCheap(N->getValueType(0), Attr))
12416     return SDValue(N,0); // Lower SDIV as SDIV
12417 
12418   // fold (sdiv X, pow2)
12419   EVT VT = N->getValueType(0);
12420   if ((VT != MVT::i32 && VT != MVT::i64) ||
12421       !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
12422     return SDValue();
12423 
12424   SDLoc DL(N);
12425   SDValue N0 = N->getOperand(0);
12426   unsigned Lg2 = Divisor.countTrailingZeros();
12427   SDValue Zero = DAG.getConstant(0, DL, VT);
12428   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
12429 
12430   // Add (N0 < 0) ? Pow2 - 1 : 0;
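  // For example (illustrative), a signed divide of w0 by 4 is emitted roughly
  // as:
  //   cmp  w0, #0
  //   add  w8, w0, #3
  //   csel w8, w8, w0, lt
  //   asr  w0, w8, #2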
12431   SDValue CCVal;
12432   SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
12433   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
12434   SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
12435 
12436   Created.push_back(Cmp.getNode());
12437   Created.push_back(Add.getNode());
12438   Created.push_back(CSel.getNode());
12439 
12440   // Divide by pow2.
12441   SDValue SRA =
12442       DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
12443 
12444   // If we're dividing by a positive value, we're done.  Otherwise, we must
12445   // negate the result.
12446   if (Divisor.isNonNegative())
12447     return SRA;
12448 
12449   Created.push_back(SRA.getNode());
12450   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
12451 }
12452 
12453 static bool IsSVECntIntrinsic(SDValue S) {
12454   switch(getIntrinsicID(S.getNode())) {
12455   default:
12456     break;
12457   case Intrinsic::aarch64_sve_cntb:
12458   case Intrinsic::aarch64_sve_cnth:
12459   case Intrinsic::aarch64_sve_cntw:
12460   case Intrinsic::aarch64_sve_cntd:
12461     return true;
12462   }
12463   return false;
12464 }
12465 
12466 /// Calculates what the pre-extend type is, based on the extension
12467 /// operation node provided by \p Extend.
12468 ///
12469 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
12470 /// pre-extend type is pulled directly from the operand, while other extend
12471 /// operations need a bit more inspection to get this information.
12472 ///
12473 /// \param Extend The SDNode from the DAG that represents the extend operation
12474 /// \param DAG The SelectionDAG hosting the \p Extend node
12475 ///
12476 /// \returns The type representing the \p Extend source type, or \p MVT::Other
12477 /// if no valid type can be determined
12478 static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
12479   switch (Extend.getOpcode()) {
12480   case ISD::SIGN_EXTEND:
12481   case ISD::ZERO_EXTEND:
12482     return Extend.getOperand(0).getValueType();
12483   case ISD::AssertSext:
12484   case ISD::AssertZext:
12485   case ISD::SIGN_EXTEND_INREG: {
12486     VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
12487     if (!TypeNode)
12488       return MVT::Other;
12489     return TypeNode->getVT();
12490   }
12491   case ISD::AND: {
12492     ConstantSDNode *Constant =
12493         dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
12494     if (!Constant)
12495       return MVT::Other;
12496 
12497     uint32_t Mask = Constant->getZExtValue();
12498 
12499     if (Mask == UCHAR_MAX)
12500       return MVT::i8;
12501     else if (Mask == USHRT_MAX)
12502       return MVT::i16;
12503     else if (Mask == UINT_MAX)
12504       return MVT::i32;
12505 
12506     return MVT::Other;
12507   }
12508   default:
12509     return MVT::Other;
12510   }
12511 
12512   llvm_unreachable("Code path unhandled in calculatePreExtendType!");
12513 }
12514 
12515 /// Combines a dup(sext/zext) node pattern into sext/zext(dup)
12516 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
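/// For example (illustrative), a v8i16 splat of (sext i8 %x to i16) becomes
/// (sext (v8i8 splat of %x) to v8i16), which lets a later multiply be
/// selected as SMULL rather than a full-width MUL.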
12517 static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
12518                                                 SelectionDAG &DAG) {
12519 
12520   ShuffleVectorSDNode *ShuffleNode =
12521       dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
12522   if (!ShuffleNode)
12523     return SDValue();
12524 
  // Ensure the shuffle is a splat of lane 0 before continuing.
12526   if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
12527     return SDValue();
12528 
12529   SDValue InsertVectorElt = VectorShuffle.getOperand(0);
12530 
12531   if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
12532     return SDValue();
12533 
12534   SDValue InsertLane = InsertVectorElt.getOperand(2);
12535   ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
12536   // Ensures the insert is inserting into lane 0
12537   if (!Constant || Constant->getZExtValue() != 0)
12538     return SDValue();
12539 
12540   SDValue Extend = InsertVectorElt.getOperand(1);
12541   unsigned ExtendOpcode = Extend.getOpcode();
12542 
12543   bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
12544                 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
12545                 ExtendOpcode == ISD::AssertSext;
12546   if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
12547       ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
12548     return SDValue();
12549 
12550   EVT TargetType = VectorShuffle.getValueType();
12551   EVT PreExtendType = calculatePreExtendType(Extend, DAG);
12552 
12553   if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
12554        TargetType != MVT::v2i64) ||
12555       (PreExtendType == MVT::Other))
12556     return SDValue();
12557 
12558   // Restrict valid pre-extend data type
12559   if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
12560       PreExtendType != MVT::i32)
12561     return SDValue();
12562 
12563   EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
12564 
12565   if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
12566     return SDValue();
12567 
12568   if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
12569     return SDValue();
12570 
12571   SDLoc DL(VectorShuffle);
12572 
12573   SDValue InsertVectorNode = DAG.getNode(
12574       InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
12575       DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
12576       DAG.getConstant(0, DL, MVT::i64));
12577 
12578   std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
12579 
12580   SDValue VectorShuffleNode =
12581       DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
12582                            DAG.getUNDEF(PreExtendVT), ShuffleMask);
12583 
12584   SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
12585                                    DL, TargetType, VectorShuffleNode);
12586 
12587   return ExtendNode;
12588 }
12589 
12590 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
12591 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
12592 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
12593   // If the value type isn't a vector, none of the operands are going to be dups
12594   if (!Mul->getValueType(0).isVector())
12595     return SDValue();
12596 
12597   SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
12598   SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
12599 
  // If neither operand has changed, don't make any further changes.
12601   if (!Op0 && !Op1)
12602     return SDValue();
12603 
12604   SDLoc DL(Mul);
12605   return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
12606                      Op0 ? Op0 : Mul->getOperand(0),
12607                      Op1 ? Op1 : Mul->getOperand(1));
12608 }
12609 
12610 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
12611                                  TargetLowering::DAGCombinerInfo &DCI,
12612                                  const AArch64Subtarget *Subtarget) {
12613 
12614   if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
12615     return Ext;
12616 
12617   if (DCI.isBeforeLegalizeOps())
12618     return SDValue();
12619 
12620   // The below optimizations require a constant RHS.
12621   if (!isa<ConstantSDNode>(N->getOperand(1)))
12622     return SDValue();
12623 
12624   SDValue N0 = N->getOperand(0);
12625   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
12626   const APInt &ConstValue = C->getAPIntValue();
12627 
12628   // Allow the scaling to be folded into the `cnt` instruction by preventing
12629   // the scaling to be obscured here. This makes it easier to pattern match.
12630   if (IsSVECntIntrinsic(N0) ||
12631      (N0->getOpcode() == ISD::TRUNCATE &&
12632       (IsSVECntIntrinsic(N0->getOperand(0)))))
12633        if (ConstValue.sge(1) && ConstValue.sle(16))
12634          return SDValue();
12635 
  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
12638   // future CPUs have a cheaper MADD instruction, this may need to be
12639   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
12640   // 64-bit is 5 cycles, so this is always a win.
12641   // More aggressively, some multiplications N0 * C can be lowered to
12642   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
12643   // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45,
  // which equals (1+2)*16-(1+2).
12646 
12647   // TrailingZeroes is used to test if the mul can be lowered to
12648   // shift+add+shift.
12649   unsigned TrailingZeroes = ConstValue.countTrailingZeros();
12650   if (TrailingZeroes) {
12651     // Conservatively do not lower to shift+add+shift if the mul might be
12652     // folded into smul or umul.
12653     if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
12654                             isZeroExtended(N0.getNode(), DAG)))
12655       return SDValue();
12656     // Conservatively do not lower to shift+add+shift if the mul might be
12657     // folded into madd or msub.
12658     if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
12659                            N->use_begin()->getOpcode() == ISD::SUB))
12660       return SDValue();
12661   }
12662   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
12663   // and shift+add+shift.
12664   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
12665 
12666   unsigned ShiftAmt, AddSubOpc;
12667   // Is the shifted value the LHS operand of the add/sub?
12668   bool ShiftValUseIsN0 = true;
12669   // Do we need to negate the result?
12670   bool NegateResult = false;
12671 
12672   if (ConstValue.isNonNegative()) {
12673     // (mul x, 2^N + 1) => (add (shl x, N), x)
12674     // (mul x, 2^N - 1) => (sub (shl x, N), x)
12675     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
12676     APInt SCVMinus1 = ShiftedConstValue - 1;
12677     APInt CVPlus1 = ConstValue + 1;
12678     if (SCVMinus1.isPowerOf2()) {
12679       ShiftAmt = SCVMinus1.logBase2();
12680       AddSubOpc = ISD::ADD;
12681     } else if (CVPlus1.isPowerOf2()) {
12682       ShiftAmt = CVPlus1.logBase2();
12683       AddSubOpc = ISD::SUB;
12684     } else
12685       return SDValue();
12686   } else {
12687     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12688     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12689     APInt CVNegPlus1 = -ConstValue + 1;
12690     APInt CVNegMinus1 = -ConstValue - 1;
12691     if (CVNegPlus1.isPowerOf2()) {
12692       ShiftAmt = CVNegPlus1.logBase2();
12693       AddSubOpc = ISD::SUB;
12694       ShiftValUseIsN0 = false;
12695     } else if (CVNegMinus1.isPowerOf2()) {
12696       ShiftAmt = CVNegMinus1.logBase2();
12697       AddSubOpc = ISD::ADD;
12698       NegateResult = true;
12699     } else
12700       return SDValue();
12701   }
12702 
12703   SDLoc DL(N);
12704   EVT VT = N->getValueType(0);
12705   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
12706                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
12707 
12708   SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
12709   SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
12710   SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
12711   assert(!(NegateResult && TrailingZeroes) &&
12712          "NegateResult and TrailingZeroes cannot both be true for now.");
12713   // Negate the result.
12714   if (NegateResult)
12715     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
12716   // Shift the result.
12717   if (TrailingZeroes)
12718     return DAG.getNode(ISD::SHL, DL, VT, Res,
12719                        DAG.getConstant(TrailingZeroes, DL, MVT::i64));
12720   return Res;
12721 }
12722 
12723 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
12724                                                          SelectionDAG &DAG) {
12725   // Take advantage of vector comparisons producing 0 or -1 in each lane to
12726   // optimize away operation when it's from a constant.
12727   //
12728   // The general transformation is:
12729   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
12730   //       AND(VECTOR_CMP(x,y), constant2)
12731   //    constant2 = UNARYOP(constant)
12732 
12733   // Early exit if this isn't a vector operation, the operand of the
12734   // unary operation isn't a bitwise AND, or if the sizes of the operations
12735   // aren't the same.
12736   EVT VT = N->getValueType(0);
12737   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
12738       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
12739       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
12740     return SDValue();
12741 
12742   // Now check that the other operand of the AND is a constant. We could
12743   // make the transformation for non-constant splats as well, but it's unclear
12744   // that would be a benefit as it would not eliminate any operations, just
12745   // perform one more step in scalar code before moving to the vector unit.
12746   if (BuildVectorSDNode *BV =
12747           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
12748     // Bail out if the vector isn't a constant.
12749     if (!BV->isConstant())
12750       return SDValue();
12751 
12752     // Everything checks out. Build up the new and improved node.
12753     SDLoc DL(N);
12754     EVT IntVT = BV->getValueType(0);
12755     // Create a new constant of the appropriate type for the transformed
12756     // DAG.
12757     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
12758     // The AND node needs bitcasts to/from an integer vector type around it.
12759     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
12760     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
12761                                  N->getOperand(0)->getOperand(0), MaskConst);
12762     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
12763     return Res;
12764   }
12765 
12766   return SDValue();
12767 }
12768 
12769 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
12770                                      const AArch64Subtarget *Subtarget) {
12771   // First try to optimize away the conversion when it's conditionally from
12772   // a constant. Vectors only.
12773   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
12774     return Res;
12775 
12776   EVT VT = N->getValueType(0);
12777   if (VT != MVT::f32 && VT != MVT::f64)
12778     return SDValue();
12779 
12780   // Only optimize when the source and destination types have the same width.
12781   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
12782     return SDValue();
12783 
12784   // If the result of an integer load is only used by an integer-to-float
12785   // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
12786   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
12787   SDValue N0 = N->getOperand(0);
12788   if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
12789       // Do not change the width of a volatile load.
12790       !cast<LoadSDNode>(N0)->isVolatile()) {
12791     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12792     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
12793                                LN0->getPointerInfo(), LN0->getAlignment(),
12794                                LN0->getMemOperand()->getFlags());
12795 
12796     // Make sure successors of the original load stay after it by updating them
12797     // to use the new Chain.
12798     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
12799 
12800     unsigned Opcode =
12801         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
12802     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
12803   }
12804 
12805   return SDValue();
12806 }
12807 
12808 /// Fold a floating-point multiply by power of two into floating-point to
12809 /// fixed-point conversion.
12810 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
12811                                      TargetLowering::DAGCombinerInfo &DCI,
12812                                      const AArch64Subtarget *Subtarget) {
12813   if (!Subtarget->hasNEON())
12814     return SDValue();
12815 
12816   if (!N->getValueType(0).isSimple())
12817     return SDValue();
12818 
12819   SDValue Op = N->getOperand(0);
12820   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12821       Op.getOpcode() != ISD::FMUL)
12822     return SDValue();
12823 
12824   SDValue ConstVec = Op->getOperand(1);
12825   if (!isa<BuildVectorSDNode>(ConstVec))
12826     return SDValue();
12827 
12828   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
12829   uint32_t FloatBits = FloatTy.getSizeInBits();
12830   if (FloatBits != 32 && FloatBits != 64)
12831     return SDValue();
12832 
12833   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
12834   uint32_t IntBits = IntTy.getSizeInBits();
12835   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12836     return SDValue();
12837 
12838   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
12839   if (IntBits > FloatBits)
12840     return SDValue();
12841 
12842   BitVector UndefElements;
12843   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12844   int32_t Bits = IntBits == 64 ? 64 : 32;
12845   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
12846   if (C == -1 || C == 0 || C > Bits)
12847     return SDValue();
12848 
12849   MVT ResTy;
12850   unsigned NumLanes = Op.getValueType().getVectorNumElements();
12851   switch (NumLanes) {
12852   default:
12853     return SDValue();
12854   case 2:
12855     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12856     break;
12857   case 4:
12858     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12859     break;
12860   }
12861 
12862   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12863     return SDValue();
12864 
12865   assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
12866          "Illegal vector type after legalization");
12867 
12868   SDLoc DL(N);
12869   bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12870   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
12871                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
12872   SDValue FixConv =
12873       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
12874                   DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
12875                   Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
12876   // We can handle smaller integers by generating an extra trunc.
12877   if (IntBits < FloatBits)
12878     FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
12879 
12880   return FixConv;
12881 }
12882 
12883 /// Fold a floating-point divide by power of two into fixed-point to
12884 /// floating-point conversion.
12885 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
12886                                   TargetLowering::DAGCombinerInfo &DCI,
12887                                   const AArch64Subtarget *Subtarget) {
12888   if (!Subtarget->hasNEON())
12889     return SDValue();
12890 
12891   SDValue Op = N->getOperand(0);
12892   unsigned Opc = Op->getOpcode();
12893   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12894       !Op.getOperand(0).getValueType().isSimple() ||
12895       (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
12896     return SDValue();
12897 
12898   SDValue ConstVec = N->getOperand(1);
12899   if (!isa<BuildVectorSDNode>(ConstVec))
12900     return SDValue();
12901 
12902   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12903   int32_t IntBits = IntTy.getSizeInBits();
12904   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12905     return SDValue();
12906 
12907   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12908   int32_t FloatBits = FloatTy.getSizeInBits();
12909   if (FloatBits != 32 && FloatBits != 64)
12910     return SDValue();
12911 
12912   // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
12913   if (IntBits > FloatBits)
12914     return SDValue();
12915 
12916   BitVector UndefElements;
12917   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12918   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
12919   if (C == -1 || C == 0 || C > FloatBits)
12920     return SDValue();
12921 
12922   MVT ResTy;
12923   unsigned NumLanes = Op.getValueType().getVectorNumElements();
12924   switch (NumLanes) {
12925   default:
12926     return SDValue();
12927   case 2:
12928     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12929     break;
12930   case 4:
12931     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12932     break;
12933   }
12934 
12935   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12936     return SDValue();
12937 
12938   SDLoc DL(N);
12939   SDValue ConvInput = Op.getOperand(0);
12940   bool IsSigned = Opc == ISD::SINT_TO_FP;
12941   if (IntBits < FloatBits)
12942     ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
12943                             ResTy, ConvInput);
12944 
12945   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
12946                                       : Intrinsic::aarch64_neon_vcvtfxu2fp;
12947   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
12948                      DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
12949                      DAG.getConstant(C, DL, MVT::i32));
12950 }
12951 
12952 /// An EXTR instruction is made up of two shifts, ORed together. This helper
12953 /// searches for and classifies those shifts.
12954 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
12955                          bool &FromHi) {
12956   if (N.getOpcode() == ISD::SHL)
12957     FromHi = false;
12958   else if (N.getOpcode() == ISD::SRL)
12959     FromHi = true;
12960   else
12961     return false;
12962 
12963   if (!isa<ConstantSDNode>(N.getOperand(1)))
12964     return false;
12965 
12966   ShiftAmount = N->getConstantOperandVal(1);
12967   Src = N->getOperand(0);
12968   return true;
12969 }
12970 
12971 /// EXTR instruction extracts a contiguous chunk of bits from two existing
12972 /// registers viewed as a high/low pair. This function looks for the pattern:
12973 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
12974 /// with an EXTR. Can't quite be done in TableGen because the two immediates
12975 /// aren't independent.
12976 static SDValue tryCombineToEXTR(SDNode *N,
12977                                 TargetLowering::DAGCombinerInfo &DCI) {
12978   SelectionDAG &DAG = DCI.DAG;
12979   SDLoc DL(N);
12980   EVT VT = N->getValueType(0);
12981 
12982   assert(N->getOpcode() == ISD::OR && "Unexpected root");
12983 
12984   if (VT != MVT::i32 && VT != MVT::i64)
12985     return SDValue();
12986 
12987   SDValue LHS;
12988   uint32_t ShiftLHS = 0;
12989   bool LHSFromHi = false;
12990   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
12991     return SDValue();
12992 
12993   SDValue RHS;
12994   uint32_t ShiftRHS = 0;
12995   bool RHSFromHi = false;
12996   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
12997     return SDValue();
12998 
12999   // If they're both trying to come from the high part of the register, they're
13000   // not really an EXTR.
13001   if (LHSFromHi == RHSFromHi)
13002     return SDValue();
13003 
13004   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
13005     return SDValue();
13006 
13007   if (LHSFromHi) {
13008     std::swap(LHS, RHS);
13009     std::swap(ShiftLHS, ShiftRHS);
13010   }
13011 
13012   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
13013                      DAG.getConstant(ShiftRHS, DL, MVT::i64));
13014 }
13015 
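// Attempt to fold (or (and a, b), (and (not a), c)) into a bitwise-select
// node (AArch64ISD::BSP), either for constant masks or for the
// (sub 0, a) / (add a, -1) pair that InstCombine produces for a mask and its
// complement.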
13016 static SDValue tryCombineToBSL(SDNode *N,
13017                                 TargetLowering::DAGCombinerInfo &DCI) {
13018   EVT VT = N->getValueType(0);
13019   SelectionDAG &DAG = DCI.DAG;
13020   SDLoc DL(N);
13021 
13022   if (!VT.isVector())
13023     return SDValue();
13024 
13025   // The combining code currently only works for NEON vectors. In particular,
13026   // it does not work for SVE when dealing with vectors wider than 128 bits.
13027   if (!VT.is64BitVector() && !VT.is128BitVector())
13028     return SDValue();
13029 
13030   SDValue N0 = N->getOperand(0);
13031   if (N0.getOpcode() != ISD::AND)
13032     return SDValue();
13033 
13034   SDValue N1 = N->getOperand(1);
13035   if (N1.getOpcode() != ISD::AND)
13036     return SDValue();
13037 
13038   // InstCombine does (not (neg a)) => (add a -1).
13039   // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
13040   // Loop over all combinations of AND operands.
13041   for (int i = 1; i >= 0; --i) {
13042     for (int j = 1; j >= 0; --j) {
13043       SDValue O0 = N0->getOperand(i);
13044       SDValue O1 = N1->getOperand(j);
13045       SDValue Sub, Add, SubSibling, AddSibling;
13046 
13047       // Find a SUB and an ADD operand, one from each AND.
13048       if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
13049         Sub = O0;
13050         Add = O1;
13051         SubSibling = N0->getOperand(1 - i);
13052         AddSibling = N1->getOperand(1 - j);
13053       } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
13054         Add = O0;
13055         Sub = O1;
13056         AddSibling = N0->getOperand(1 - i);
13057         SubSibling = N1->getOperand(1 - j);
13058       } else
13059         continue;
13060 
13061       if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
13062         continue;
13063 
13064       // The all-ones constant is always the right-hand operand of the Add.
13065       if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
13066         continue;
13067 
13068       if (Sub.getOperand(1) != Add.getOperand(0))
13069         continue;
13070 
13071       return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
13072     }
13073   }
13074 
13075   // (or (and a b) (and (not a) c)) => (bsl a b c)
13076   // We only have to look for constant vectors here since the general, variable
13077   // case can be handled in TableGen.
13078   unsigned Bits = VT.getScalarSizeInBits();
13079   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
13080   for (int i = 1; i >= 0; --i)
13081     for (int j = 1; j >= 0; --j) {
13082       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
13083       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
13084       if (!BVN0 || !BVN1)
13085         continue;
13086 
13087       bool FoundMatch = true;
13088       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
13089         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
13090         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
13091         if (!CN0 || !CN1 ||
13092             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
13093           FoundMatch = false;
13094           break;
13095         }
13096       }
13097 
13098       if (FoundMatch)
13099         return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
13100                            N0->getOperand(1 - i), N1->getOperand(1 - j));
13101     }
13102 
13103   return SDValue();
13104 }
13105 
13106 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13107                                 const AArch64Subtarget *Subtarget) {
13108   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
13109   SelectionDAG &DAG = DCI.DAG;
13110   EVT VT = N->getValueType(0);
13111 
13112   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13113     return SDValue();
13114 
13115   if (SDValue Res = tryCombineToEXTR(N, DCI))
13116     return Res;
13117 
13118   if (SDValue Res = tryCombineToBSL(N, DCI))
13119     return Res;
13120 
13121   return SDValue();
13122 }
13123 
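// Return true if N is a constant splat (DUP or SPLAT_VECTOR) of the all-ones
// mask for MemVT's element type, i.e. the mask that a zero-extending load of
// MemVT already guarantees.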
13124 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
13125   if (!MemVT.getVectorElementType().isSimple())
13126     return false;
13127 
13128   uint64_t MaskForTy = 0ull;
13129   switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
13130   case MVT::i8:
13131     MaskForTy = 0xffull;
13132     break;
13133   case MVT::i16:
13134     MaskForTy = 0xffffull;
13135     break;
13136   case MVT::i32:
13137     MaskForTy = 0xffffffffull;
13138     break;
13139   default:
13140     return false;
13142   }
13143 
13144   if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
13145     if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
13146       return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
13147 
13148   return false;
13149 }
13150 
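// Simplify (and X, Mask) for scalable vectors: fold the mask into the operand
// of an unsigned unpack, or drop the AND entirely when an SVE load's implicit
// zero-extension already clears the masked-off bits.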
13151 static SDValue performSVEAndCombine(SDNode *N,
13152                                     TargetLowering::DAGCombinerInfo &DCI) {
13153   if (DCI.isBeforeLegalizeOps())
13154     return SDValue();
13155 
13156   SelectionDAG &DAG = DCI.DAG;
13157   SDValue Src = N->getOperand(0);
13158   unsigned Opc = Src->getOpcode();
13159 
13160   // Zero/any extend of an unsigned unpack
13161   if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
13162     SDValue UnpkOp = Src->getOperand(0);
13163     SDValue Dup = N->getOperand(1);
13164 
13165     if (Dup.getOpcode() != AArch64ISD::DUP)
13166       return SDValue();
13167 
13168     SDLoc DL(N);
13169     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
    // Bail out if the DUP operand is not a constant.
    if (!C)
      return SDValue();
13170     uint64_t ExtVal = C->getZExtValue();
13171 
13172     // If the mask is fully covered by the unpack, we don't need to push
13173     // a new AND onto the operand
13174     EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
13175     if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
13176         (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
13177         (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
13178       return Src;
13179 
13180     // Truncate to prevent a DUP with an over-wide constant
13181     APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
13182 
13183     // Otherwise, make sure we propagate the AND to the operand
13184     // of the unpack
13185     Dup = DAG.getNode(AArch64ISD::DUP, DL,
13186                       UnpkOp->getValueType(0),
13187                       DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
13188 
13189     SDValue And = DAG.getNode(ISD::AND, DL,
13190                               UnpkOp->getValueType(0), UnpkOp, Dup);
13191 
13192     return DAG.getNode(Opc, DL, N->getValueType(0), And);
13193   }
13194 
13195   if (!EnableCombineMGatherIntrinsics)
13196     return SDValue();
13197 
13198   SDValue Mask = N->getOperand(1);
13199 
13200   if (!Src.hasOneUse())
13201     return SDValue();
13202 
13203   EVT MemVT;
13204 
13205   // SVE load instructions perform an implicit zero-extend, which makes them
13206   // perfect candidates for combining.
13207   switch (Opc) {
13208   case AArch64ISD::LD1_MERGE_ZERO:
13209   case AArch64ISD::LDNF1_MERGE_ZERO:
13210   case AArch64ISD::LDFF1_MERGE_ZERO:
13211     MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
13212     break;
13213   case AArch64ISD::GLD1_MERGE_ZERO:
13214   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
13215   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
13216   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
13217   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
13218   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
13219   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
13220   case AArch64ISD::GLDFF1_MERGE_ZERO:
13221   case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
13222   case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
13223   case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
13224   case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
13225   case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
13226   case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
13227   case AArch64ISD::GLDNT1_MERGE_ZERO:
13228     MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
13229     break;
13230   default:
13231     return SDValue();
13232   }
13233 
13234   if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
13235     return Src;
13236 
13237   return SDValue();
13238 }
13239 
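// AND combine: scalable vectors are handled by performSVEAndCombine; for NEON
// vectors, try to select a BIC with an AdvSIMD modified immediate instead.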
13240 static SDValue performANDCombine(SDNode *N,
13241                                  TargetLowering::DAGCombinerInfo &DCI) {
13242   SelectionDAG &DAG = DCI.DAG;
13243   SDValue LHS = N->getOperand(0);
13244   EVT VT = N->getValueType(0);
13245   if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
13246     return SDValue();
13247 
13248   if (VT.isScalableVector())
13249     return performSVEAndCombine(N, DCI);
13250 
13251   // The combining code below works only for NEON vectors. In particular, it
13252   // does not work for SVE when dealing with vectors wider than 128 bits.
13253   if (!(VT.is64BitVector() || VT.is128BitVector()))
13254     return SDValue();
13255 
13256   BuildVectorSDNode *BVN =
13257       dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
13258   if (!BVN)
13259     return SDValue();
13260 
13261   // AND does not accept an immediate, so check if we can use a BIC immediate
13262   // instruction instead. We do this here instead of using a (and x, (mvni imm))
13263   // pattern in isel, because some immediates may be lowered to the preferred
13264   // (and x, (movi imm)) form, even though an mvni representation also exists.
13265   APInt DefBits(VT.getSizeInBits(), 0);
13266   APInt UndefBits(VT.getSizeInBits(), 0);
13267   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13268     SDValue NewOp;
13269 
13270     DefBits = ~DefBits;
13271     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
13272                                     DefBits, &LHS)) ||
13273         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
13274                                     DefBits, &LHS)))
13275       return NewOp;
13276 
13277     UndefBits = ~UndefBits;
13278     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
13279                                     UndefBits, &LHS)) ||
13280         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
13281                                     UndefBits, &LHS)))
13282       return NewOp;
13283   }
13284 
13285   return SDValue();
13286 }
13287 
13288 static SDValue performSRLCombine(SDNode *N,
13289                                  TargetLowering::DAGCombinerInfo &DCI) {
13290   SelectionDAG &DAG = DCI.DAG;
13291   EVT VT = N->getValueType(0);
13292   if (VT != MVT::i32 && VT != MVT::i64)
13293     return SDValue();
13294 
13295   // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
13296   // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
13297   // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
13298   SDValue N0 = N->getOperand(0);
13299   if (N0.getOpcode() == ISD::BSWAP) {
13300     SDLoc DL(N);
13301     SDValue N1 = N->getOperand(1);
13302     SDValue N00 = N0.getOperand(0);
13303     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
13304       uint64_t ShiftAmt = C->getZExtValue();
13305       if (VT == MVT::i32 && ShiftAmt == 16 &&
13306           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
13307         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13308       if (VT == MVT::i64 && ShiftAmt == 32 &&
13309           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
13310         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
13311     }
13312   }
13313   return SDValue();
13314 }
13315 
13316 // Attempt to form urhadd(OpA, OpB) from
13317 // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
13318 // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
13319 // The original form of the first expression is
13320 // truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
13321 // (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
13322 // Before this function is called the srl will have been lowered to
13323 // AArch64ISD::VLSHR.
13324 // This pass can also recognize signed variants of the patterns that use sign
13325 // extension instead of zero extension and form a srhadd(OpA, OpB) or a
13326 // shadd(OpA, OpB) from them.
13327 static SDValue
13328 performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13329                              SelectionDAG &DAG) {
13330   EVT VT = N->getValueType(0);
13331 
13332   // Since we are looking for a right shift by a constant value of 1 and we are
13333   // operating on types at least 16 bits in length (sign/zero extended OpA and
13334   // OpB, which are at least 8 bits), it follows that the truncate will always
13335   // discard the shifted-in bit and therefore the right shift will be logical
13336   // regardless of the signedness of OpA and OpB.
13337   SDValue Shift = N->getOperand(0);
13338   if (Shift.getOpcode() != AArch64ISD::VLSHR)
13339     return SDValue();
13340 
13341   // Is the right shift using an immediate value of 1?
13342   uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
13343   if (ShiftAmount != 1)
13344     return SDValue();
13345 
13346   SDValue ExtendOpA, ExtendOpB;
13347   SDValue ShiftOp0 = Shift.getOperand(0);
13348   unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
13349   if (ShiftOp0Opc == ISD::SUB) {
13350 
13351     SDValue Xor = ShiftOp0.getOperand(1);
13352     if (Xor.getOpcode() != ISD::XOR)
13353       return SDValue();
13354 
13355     // Is the XOR using a constant amount of all ones in the right hand side?
13356     uint64_t C;
13357     if (!isAllConstantBuildVector(Xor.getOperand(1), C))
13358       return SDValue();
13359 
13360     unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13361     APInt CAsAPInt(ElemSizeInBits, C);
13362     if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
13363       return SDValue();
13364 
13365     ExtendOpA = Xor.getOperand(0);
13366     ExtendOpB = ShiftOp0.getOperand(0);
13367   } else if (ShiftOp0Opc == ISD::ADD) {
13368     ExtendOpA = ShiftOp0.getOperand(0);
13369     ExtendOpB = ShiftOp0.getOperand(1);
13370   } else
13371     return SDValue();
13372 
13373   unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
13374   unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
13375   if (!(ExtendOpAOpc == ExtendOpBOpc &&
13376         (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
13377     return SDValue();
13378 
13379   // Is the result of the right shift being truncated to the same value type as
13380   // the original operands, OpA and OpB?
13381   SDValue OpA = ExtendOpA.getOperand(0);
13382   SDValue OpB = ExtendOpB.getOperand(0);
13383   EVT OpAVT = OpA.getValueType();
13384   assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
13385   if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
13386     return SDValue();
13387 
13388   SDLoc DL(N);
13389   bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
13390   bool IsRHADD = ShiftOp0Opc == ISD::SUB;
13391   unsigned HADDOpc = IsSignExtend
13392                          ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
13393                          : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
13394   SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
13395 
13396   return ResultHADD;
13397 }
13398 
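// Return true if a two-lane horizontal reduction with this opcode and element
// type can be selected as a pairwise ADDP/FADDP instruction.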
13399 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
13400   switch (Opcode) {
13401   case ISD::FADD:
13402     return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
13403   case ISD::ADD:
13404     return VT == MVT::i64;
13405   default:
13406     return false;
13407   }
13408 }
13409 
13410 static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
13411   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13412   ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
13413 
13414   EVT VT = N->getValueType(0);
13415   const bool FullFP16 =
13416       static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
13417 
13418   // Rewrite for pairwise fadd pattern
13419   //   (f32 (extract_vector_elt
13420   //           (fadd (vXf32 Other)
13421   //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
13422   // ->
13423   //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
13424   //              (extract_vector_elt (vXf32 Other) 1))
13425   if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
13426       hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
13427     SDLoc DL(N0);
13428     SDValue N00 = N0->getOperand(0);
13429     SDValue N01 = N0->getOperand(1);
13430 
13431     ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
13432     SDValue Other = N00;
13433 
13434     // And handle the commutative case.
13435     if (!Shuffle) {
13436       Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
13437       Other = N01;
13438     }
13439 
13440     if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
13441         Other == Shuffle->getOperand(0)) {
13442       return DAG.getNode(N0->getOpcode(), DL, VT,
13443                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
13444                                      DAG.getConstant(0, DL, MVT::i64)),
13445                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
13446                                      DAG.getConstant(1, DL, MVT::i64)));
13447     }
13448   }
13449 
13450   return SDValue();
13451 }
13452 
13453 static SDValue performConcatVectorsCombine(SDNode *N,
13454                                            TargetLowering::DAGCombinerInfo &DCI,
13455                                            SelectionDAG &DAG) {
13456   SDLoc dl(N);
13457   EVT VT = N->getValueType(0);
13458   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
13459   unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
13460 
13461   // Optimize concat_vectors of truncated vectors, where the intermediate
13462   // type is illegal, to avoid said illegality, e.g.,
13463   //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
13464   //                          (v2i16 (truncate (v2i64)))))
13465   // ->
13466   //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
13467   //                                    (v4i32 (bitcast (v2i64))),
13468   //                                    <0, 2, 4, 6>)))
13469   // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
13470   // on both input and result type, so we might generate worse code.
13471   // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
13472   if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
13473       N1Opc == ISD::TRUNCATE) {
13474     SDValue N00 = N0->getOperand(0);
13475     SDValue N10 = N1->getOperand(0);
13476     EVT N00VT = N00.getValueType();
13477 
13478     if (N00VT == N10.getValueType() &&
13479         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
13480         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
13481       MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
13482       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
13483       for (size_t i = 0; i < Mask.size(); ++i)
13484         Mask[i] = i * 2;
13485       return DAG.getNode(ISD::TRUNCATE, dl, VT,
13486                          DAG.getVectorShuffle(
13487                              MidVT, dl,
13488                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
13489                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
13490     }
13491   }
13492 
13493   // Wait 'til after everything is legalized to try this. That way we have
13494   // legal vector types and such.
13495   if (DCI.isBeforeLegalizeOps())
13496     return SDValue();
13497 
13498   // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
13499   // subvectors from the same original vectors. Combine these into a single
13500   // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
13501   //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
13502   //                                        extract_subvector (v16i8 OpB,
13503   //                                        <0>))),
13504   //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
13505   //                                        extract_subvector (v16i8 OpB,
13506   //                                        <8>)))))
13507   // ->
13508   //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
13509   if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
13510       (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
13511        N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
13512     SDValue N00 = N0->getOperand(0);
13513     SDValue N01 = N0->getOperand(1);
13514     SDValue N10 = N1->getOperand(0);
13515     SDValue N11 = N1->getOperand(1);
13516 
13517     EVT N00VT = N00.getValueType();
13518     EVT N10VT = N10.getValueType();
13519 
13520     if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13521         N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13522         N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13523         N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
13524       SDValue N00Source = N00->getOperand(0);
13525       SDValue N01Source = N01->getOperand(0);
13526       SDValue N10Source = N10->getOperand(0);
13527       SDValue N11Source = N11->getOperand(0);
13528 
13529       if (N00Source == N10Source && N01Source == N11Source &&
13530           N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
13531         assert(N0.getValueType() == N1.getValueType());
13532 
13533         uint64_t N00Index = N00.getConstantOperandVal(1);
13534         uint64_t N01Index = N01.getConstantOperandVal(1);
13535         uint64_t N10Index = N10.getConstantOperandVal(1);
13536         uint64_t N11Index = N11.getConstantOperandVal(1);
13537 
13538         if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
13539             N10Index == N00VT.getVectorNumElements())
13540           return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
13541       }
13542     }
13543   }
13544 
13545   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
13546   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
13547   // canonicalise to that.
13548   if (N0 == N1 && VT.getVectorNumElements() == 2) {
13549     assert(VT.getScalarSizeInBits() == 64);
13550     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
13551                        DAG.getConstant(0, dl, MVT::i64));
13552   }
13553 
13554   // Canonicalise concat_vectors so that the right-hand vector has as few
13555   // bit-casts as possible before its real operation. The primary matching
13556   // destination for these operations will be the narrowing "2" instructions,
13557   // which depend on the operation being performed on this right-hand vector.
13558   // For example,
13559   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
13560   // becomes
13561   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
13562 
13563   if (N1Opc != ISD::BITCAST)
13564     return SDValue();
13565   SDValue RHS = N1->getOperand(0);
13566   MVT RHSTy = RHS.getValueType().getSimpleVT();
13567   // If the RHS is not a vector, this is not the pattern we're looking for.
13568   if (!RHSTy.isVector())
13569     return SDValue();
13570 
13571   LLVM_DEBUG(
13572       dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
13573 
13574   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
13575                                   RHSTy.getVectorNumElements() * 2);
13576   return DAG.getNode(ISD::BITCAST, dl, VT,
13577                      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
13578                                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
13579                                  RHS));
13580 }
13581 
13582 static SDValue tryCombineFixedPointConvert(SDNode *N,
13583                                            TargetLowering::DAGCombinerInfo &DCI,
13584                                            SelectionDAG &DAG) {
13585   // Wait until after everything is legalized to try this. That way we have
13586   // legal vector types and such.
13587   if (DCI.isBeforeLegalizeOps())
13588     return SDValue();
13589   // Transform a scalar conversion of a value from a lane extract into a
13590   // lane extract of a vector conversion. E.g., from foo1 to foo2:
13591   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
13592   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
13593   //
13594   // The second form interacts better with instruction selection and the
13595   // register allocator to avoid cross-class register copies that aren't
13596   // coalescable due to a lane reference.
13597 
13598   // Check the operand and see if it originates from a lane extract.
13599   SDValue Op1 = N->getOperand(1);
13600   if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13601     // Yep, no additional predication needed. Perform the transform.
13602     SDValue IID = N->getOperand(0);
13603     SDValue Shift = N->getOperand(2);
13604     SDValue Vec = Op1.getOperand(0);
13605     SDValue Lane = Op1.getOperand(1);
13606     EVT ResTy = N->getValueType(0);
13607     EVT VecResTy;
13608     SDLoc DL(N);
13609 
13610     // The vector width should be 128 bits by the time we get here, even
13611     // if it started as 64 bits (the extract_vector handling will have
13612     // done so).
13613     assert(Vec.getValueSizeInBits() == 128 &&
13614            "unexpected vector size on extract_vector_elt!");
13615     if (Vec.getValueType() == MVT::v4i32)
13616       VecResTy = MVT::v4f32;
13617     else if (Vec.getValueType() == MVT::v2i64)
13618       VecResTy = MVT::v2f64;
13619     else
13620       llvm_unreachable("unexpected vector type!");
13621 
13622     SDValue Convert =
13623         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
13624     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
13625   }
13626   return SDValue();
13627 }
13628 
13629 // AArch64 high-vector "long" operations are formed by performing the non-high
13630 // version on an extract_subvector of each operand which gets the high half:
13631 //
13632 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
13633 //
13634 // However, there are cases which don't have an extract_high explicitly, but
13635 // have another operation that can be made compatible with one for free. For
13636 // example:
13637 //
13638 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
13639 //
13640 // This routine does the actual conversion of such DUPs, once outer routines
13641 // have determined that everything else is in order.
13642 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
13643 // similarly here.
13644 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
13645   switch (N.getOpcode()) {
13646   case AArch64ISD::DUP:
13647   case AArch64ISD::DUPLANE8:
13648   case AArch64ISD::DUPLANE16:
13649   case AArch64ISD::DUPLANE32:
13650   case AArch64ISD::DUPLANE64:
13651   case AArch64ISD::MOVI:
13652   case AArch64ISD::MOVIshift:
13653   case AArch64ISD::MOVIedit:
13654   case AArch64ISD::MOVImsl:
13655   case AArch64ISD::MVNIshift:
13656   case AArch64ISD::MVNImsl:
13657     break;
13658   default:
13659     // FMOV could be supported, but isn't very useful, as it would only occur
13660     // if you passed a bitcast' floating point immediate to an eligible long
13661     // integer op (addl, smull, ...).
13662     return SDValue();
13663   }
13664 
13665   MVT NarrowTy = N.getSimpleValueType();
13666   if (!NarrowTy.is64BitVector())
13667     return SDValue();
13668 
13669   MVT ElementTy = NarrowTy.getVectorElementType();
13670   unsigned NumElems = NarrowTy.getVectorNumElements();
13671   MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
13672 
13673   SDLoc dl(N);
13674   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
13675                      DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
13676                      DAG.getConstant(NumElems, dl, MVT::i64));
13677 }
13678 
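// Return true if N is (possibly a bitcast of) an extract_subvector that takes
// the high half of a fixed-length vector.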
13679 static bool isEssentiallyExtractHighSubvector(SDValue N) {
13680   if (N.getOpcode() == ISD::BITCAST)
13681     N = N.getOperand(0);
13682   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
13683     return false;
13684   if (N.getOperand(0).getValueType().isScalableVector())
13685     return false;
13686   return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
13687          N.getOperand(0).getValueType().getVectorNumElements() / 2;
13688 }
13689 
13690 /// Helper structure to keep track of ISD::SET_CC operands.
13691 struct GenericSetCCInfo {
13692   const SDValue *Opnd0;
13693   const SDValue *Opnd1;
13694   ISD::CondCode CC;
13695 };
13696 
13697 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
13698 struct AArch64SetCCInfo {
13699   const SDValue *Cmp;
13700   AArch64CC::CondCode CC;
13701 };
13702 
13703 /// Helper structure to keep track of SetCC information.
13704 union SetCCInfo {
13705   GenericSetCCInfo Generic;
13706   AArch64SetCCInfo AArch64;
13707 };
13708 
13709 /// Helper structure to be able to read SetCC information. If the IsAArch64
13710 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
13711 /// GenericSetCCInfo.
13712 struct SetCCInfoAndKind {
13713   SetCCInfo Info;
13714   bool IsAArch64;
13715 };
13716 
13717 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
13718 /// AArch64 lowered one.
13720 /// \p SetCCInfo is filled accordingly.
13721 /// \post SetCCInfo is meaningful only when this function returns true.
13722 /// \return True when Op is a kind of SET_CC operation.
13723 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
13724   // If this is a setcc, this is straightforward.
13725   if (Op.getOpcode() == ISD::SETCC) {
13726     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13727     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13728     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13729     SetCCInfo.IsAArch64 = false;
13730     return true;
13731   }
13732   // Otherwise, check if this is a matching csel instruction.
13733   // In other words:
13734   // - csel 1, 0, cc
13735   // - csel 0, 1, !cc
13736   if (Op.getOpcode() != AArch64ISD::CSEL)
13737     return false;
13738   // Set the information about the operands.
13739   // TODO: we want the operands of the Cmp not the csel
13740   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13741   SetCCInfo.IsAArch64 = true;
13742   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13743       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13744 
13745   // Check that the operands match the constraints:
13746   // (1) Both operands must be constants.
13747   // (2) One must be 1 and the other must be 0.
13748   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13749   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13750 
13751   // Check (1).
13752   if (!TValue || !FValue)
13753     return false;
13754 
13755   // Check (2).
13756   if (!TValue->isOne()) {
13757     // Update the comparison when we are interested in !cc.
13758     std::swap(TValue, FValue);
13759     SetCCInfo.Info.AArch64.CC =
13760         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
13761   }
13762   return TValue->isOne() && FValue->isNullValue();
13763 }
13764 
13765 // Returns true if Op is setcc or zext of setcc.
13766 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13767   if (isSetCC(Op, Info))
13768     return true;
13769   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13770     isSetCC(Op->getOperand(0), Info));
13771 }
13772 
13773 // The folding we want to perform is:
13774 // (add x, [zext] (setcc cc ...) )
13775 //   -->
13776 // (csel x, (add x, 1), !cc ...)
13777 //
13778 // The latter will get matched to a CSINC instruction.
13779 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
13780   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13781   SDValue LHS = Op->getOperand(0);
13782   SDValue RHS = Op->getOperand(1);
13783   SetCCInfoAndKind InfoAndKind;
13784 
13785   // If both operands are a SET_CC, then we don't want to perform this
13786   // folding and create another csel as this results in more instructions
13787   // (and higher register usage).
13788   if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
13789       isSetCCOrZExtSetCC(RHS, InfoAndKind))
13790     return SDValue();
13791 
13792   // If neither operand is a SET_CC, give up.
13793   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13794     std::swap(LHS, RHS);
13795     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
13796       return SDValue();
13797   }
13798 
13799   // FIXME: This could be generalized to work for FP comparisons.
13800   EVT CmpVT = InfoAndKind.IsAArch64
13801                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13802                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
13803   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13804     return SDValue();
13805 
13806   SDValue CCVal;
13807   SDValue Cmp;
13808   SDLoc dl(Op);
13809   if (InfoAndKind.IsAArch64) {
13810     CCVal = DAG.getConstant(
13811         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13812         MVT::i32);
13813     Cmp = *InfoAndKind.Info.AArch64.Cmp;
13814   } else
13815     Cmp = getAArch64Cmp(
13816         *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13817         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13818         dl);
13819 
13820   EVT VT = Op->getValueType(0);
13821   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13822   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13823 }
13824 
13825 // ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
13826 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13827   EVT VT = N->getValueType(0);
13828   // Only handle ADD nodes with a scalar integer result type.
13829   if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
13830     return SDValue();
13831 
13832   SDValue LHS = N->getOperand(0);
13833   SDValue RHS = N->getOperand(1);
13834   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13835       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
13836     return SDValue();
13837 
13838   auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13839   auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
13840   if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
13841     return SDValue();
13842 
13843   SDValue Op1 = LHS->getOperand(0);
13844   SDValue Op2 = RHS->getOperand(0);
13845   EVT OpVT1 = Op1.getValueType();
13846   EVT OpVT2 = Op2.getValueType();
13847   if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
13848       Op2.getOpcode() != AArch64ISD::UADDV ||
13849       OpVT1.getVectorElementType() != VT)
13850     return SDValue();
13851 
13852   SDValue Val1 = Op1.getOperand(0);
13853   SDValue Val2 = Op2.getOperand(0);
13854   EVT ValVT = Val1->getValueType(0);
13855   SDLoc DL(N);
13856   SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
13857   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13858                      DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
13859                      DAG.getConstant(0, DL, MVT::i64));
13860 }
13861 
13862 // ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)
13863 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
13864   EVT VT = N->getValueType(0);
13865   if (N->getOpcode() != ISD::ADD)
13866     return SDValue();
13867 
13868   SDValue Dot = N->getOperand(0);
13869   SDValue A = N->getOperand(1);
13870   // Handle commutativity
13871   auto isZeroDot = [](SDValue Dot) {
13872     return (Dot.getOpcode() == AArch64ISD::UDOT ||
13873             Dot.getOpcode() == AArch64ISD::SDOT) &&
13874            isZerosVector(Dot.getOperand(0).getNode());
13875   };
13876   if (!isZeroDot(Dot))
13877     std::swap(Dot, A);
13878   if (!isZeroDot(Dot))
13879     return SDValue();
13880 
13881   return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
13882                      Dot.getOperand(2));
13883 }
13884 
13885 // The basic add/sub long vector instructions have variants with "2" on the end
13886 // which act on the high-half of their inputs. They are normally matched by
13887 // patterns like:
13888 //
13889 // (add (zeroext (extract_high LHS)),
13890 //      (zeroext (extract_high RHS)))
13891 // -> uaddl2 vD, vN, vM
13892 //
13893 // However, if one of the extracts is something like a duplicate, this
13894 // instruction can still be used profitably. This function puts the DAG into a
13895 // more appropriate form for those patterns to trigger.
13896 static SDValue performAddSubLongCombine(SDNode *N,
13897                                         TargetLowering::DAGCombinerInfo &DCI,
13898                                         SelectionDAG &DAG) {
13899   if (DCI.isBeforeLegalizeOps())
13900     return SDValue();
13901 
13902   MVT VT = N->getSimpleValueType(0);
13903   if (!VT.is128BitVector()) {
13904     if (N->getOpcode() == ISD::ADD)
13905       return performSetccAddFolding(N, DAG);
13906     return SDValue();
13907   }
13908 
13909   // Make sure both branches are extended in the same way.
13910   SDValue LHS = N->getOperand(0);
13911   SDValue RHS = N->getOperand(1);
13912   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
13913        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
13914       LHS.getOpcode() != RHS.getOpcode())
13915     return SDValue();
13916 
13917   unsigned ExtType = LHS.getOpcode();
13918 
13919   // It's not worth doing if at least one of the inputs isn't already an
13920   // extract, but we don't know which it'll be so we have to try both.
13921   if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
13922     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
13923     if (!RHS.getNode())
13924       return SDValue();
13925 
13926     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
13927   } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
13928     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
13929     if (!LHS.getNode())
13930       return SDValue();
13931 
13932     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
13933   }
13934 
13935   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
13936 }
13937 
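// Combine entry point for ADD/SUB: try the UADDV and dot-product folds first,
// then fall back to the high-half "long" operation rewrite.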
13938 static SDValue performAddSubCombine(SDNode *N,
13939                                     TargetLowering::DAGCombinerInfo &DCI,
13940                                     SelectionDAG &DAG) {
13941   // Try to change sum of two reductions.
13942   if (SDValue Val = performUADDVCombine(N, DAG))
13943     return Val;
13944   if (SDValue Val = performAddDotCombine(N, DAG))
13945     return Val;
13946 
13947   return performAddSubLongCombine(N, DCI, DAG);
13948 }
13949 
13950 // Massage DAGs which we can use the high-half "long" operations on into
13951 // something isel will recognize better. E.g.
13952 //
13953 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
13954 //   (aarch64_neon_umull (extract_high (v2i64 vec))
13955 //                       (extract_high (v2i64 (dup128 scalar))))
13956 //
13957 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
13958                                        TargetLowering::DAGCombinerInfo &DCI,
13959                                        SelectionDAG &DAG) {
13960   if (DCI.isBeforeLegalizeOps())
13961     return SDValue();
13962 
13963   SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
13964   SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
13965   assert(LHS.getValueType().is64BitVector() &&
13966          RHS.getValueType().is64BitVector() &&
13967          "unexpected shape for long operation");
13968 
13969   // Either node could be a DUP, but it's not worth doing both of them (you'd
13970   // just as well use the non-high version) so look for a corresponding extract
13971   // operation on the other "wing".
13972   if (isEssentiallyExtractHighSubvector(LHS)) {
13973     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
13974     if (!RHS.getNode())
13975       return SDValue();
13976   } else if (isEssentiallyExtractHighSubvector(RHS)) {
13977     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
13978     if (!LHS.getNode())
13979       return SDValue();
13980   }
13981 
13982   if (IID == Intrinsic::not_intrinsic)
13983     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
13984 
13985   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
13986                      N->getOperand(0), LHS, RHS);
13987 }
13988 
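// Try to convert a vector shift by a constant (splat) amount from one of the
// shift intrinsics into the equivalent AArch64 immediate-shift node.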
13989 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
13990   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
13991   unsigned ElemBits = ElemTy.getSizeInBits();
13992 
13993   int64_t ShiftAmount;
13994   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
13995     APInt SplatValue, SplatUndef;
13996     unsigned SplatBitSize;
13997     bool HasAnyUndefs;
13998     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
13999                               HasAnyUndefs, ElemBits) ||
14000         SplatBitSize != ElemBits)
14001       return SDValue();
14002 
14003     ShiftAmount = SplatValue.getSExtValue();
14004   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
14005     ShiftAmount = CVN->getSExtValue();
14006   } else
14007     return SDValue();
14008 
14009   unsigned Opcode;
14010   bool IsRightShift;
14011   switch (IID) {
14012   default:
14013     llvm_unreachable("Unknown shift intrinsic");
14014   case Intrinsic::aarch64_neon_sqshl:
14015     Opcode = AArch64ISD::SQSHL_I;
14016     IsRightShift = false;
14017     break;
14018   case Intrinsic::aarch64_neon_uqshl:
14019     Opcode = AArch64ISD::UQSHL_I;
14020     IsRightShift = false;
14021     break;
14022   case Intrinsic::aarch64_neon_srshl:
14023     Opcode = AArch64ISD::SRSHR_I;
14024     IsRightShift = true;
14025     break;
14026   case Intrinsic::aarch64_neon_urshl:
14027     Opcode = AArch64ISD::URSHR_I;
14028     IsRightShift = true;
14029     break;
14030   case Intrinsic::aarch64_neon_sqshlu:
14031     Opcode = AArch64ISD::SQSHLU_I;
14032     IsRightShift = false;
14033     break;
14034   case Intrinsic::aarch64_neon_sshl:
14035   case Intrinsic::aarch64_neon_ushl:
14036     // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
14037     // left shift for positive shift amounts. Below, we only replace the current
14038     // node with VSHL, if this condition is met.
14039     Opcode = AArch64ISD::VSHL;
14040     IsRightShift = false;
14041     break;
14042   }
14043 
14044   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
14045     SDLoc dl(N);
14046     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14047                        DAG.getConstant(-ShiftAmount, dl, MVT::i32));
14048   } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
14049     SDLoc dl(N);
14050     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
14051                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
14052   }
14053 
14054   return SDValue();
14055 }
14056 
14057 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
14058 // the intrinsics must be legal and take an i32, this means there's almost
14059 // certainly going to be a zext in the DAG which we can eliminate.
14060 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
14061   SDValue AndN = N->getOperand(2);
14062   if (AndN.getOpcode() != ISD::AND)
14063     return SDValue();
14064 
14065   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
14066   if (!CMask || CMask->getZExtValue() != Mask)
14067     return SDValue();
14068 
14069   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
14070                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
14071 }
14072 
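// Lower an across-lanes reduction intrinsic to the corresponding AArch64
// reduction node, extracting lane 0 as the scalar result.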
14073 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
14074                                            SelectionDAG &DAG) {
14075   SDLoc dl(N);
14076   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
14077                      DAG.getNode(Opc, dl,
14078                                  N->getOperand(1).getSimpleValueType(),
14079                                  N->getOperand(1)),
14080                      DAG.getConstant(0, dl, MVT::i64));
14081 }
14082 
14083 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
14084   SDLoc DL(N);
14085   SDValue Op1 = N->getOperand(1);
14086   SDValue Op2 = N->getOperand(2);
14087   EVT ScalarTy = Op2.getValueType();
14088   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14089     ScalarTy = MVT::i32;
14090 
14091   // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
14092   SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
14093   SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
14094   SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
14095   SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
14096   return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
14097 }
14098 
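// Lower a predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, any-extending
// i8/i16 scalars to i32 first.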
14099 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
14100   SDLoc dl(N);
14101   SDValue Scalar = N->getOperand(3);
14102   EVT ScalarTy = Scalar.getValueType();
14103 
14104   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
14105     Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
14106 
14107   SDValue Passthru = N->getOperand(1);
14108   SDValue Pred = N->getOperand(2);
14109   return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
14110                      Pred, Scalar, Passthru);
14111 }
14112 
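// Lower the SVE EXT intrinsic in the byte domain: bitcast both vector operands
// to bytes, scale the index by the element size, and bitcast the result back.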
14113 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
14114   SDLoc dl(N);
14115   LLVMContext &Ctx = *DAG.getContext();
14116   EVT VT = N->getValueType(0);
14117 
14118   assert(VT.isScalableVector() && "Expected a scalable vector.");
14119 
14120   // Current lowering only supports the SVE-ACLE types.
14121   if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
14122     return SDValue();
14123 
14124   unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
14125   unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
14126   EVT ByteVT =
14127       EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
14128 
  // Convert everything to the domain of EXT (i.e. bytes).
14130   SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
14131   SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
14132   SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
14133                             DAG.getConstant(ElemSize, dl, MVT::i32));
14134 
14135   SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
14136   return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
14137 }
14138 
14139 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
14140                                         TargetLowering::DAGCombinerInfo &DCI,
14141                                         SelectionDAG &DAG) {
14142   if (DCI.isBeforeLegalize())
14143     return SDValue();
14144 
14145   SDValue Comparator = N->getOperand(3);
14146   if (Comparator.getOpcode() == AArch64ISD::DUP ||
14147       Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
14148     unsigned IID = getIntrinsicID(N);
14149     EVT VT = N->getValueType(0);
14150     EVT CmpVT = N->getOperand(2).getValueType();
14151     SDValue Pred = N->getOperand(1);
14152     SDValue Imm;
14153     SDLoc DL(N);
14154 
14155     switch (IID) {
14156     default:
14157       llvm_unreachable("Called with wrong intrinsic!");
14158       break;
14159 
14160     // Signed comparisons
14161     case Intrinsic::aarch64_sve_cmpeq_wide:
14162     case Intrinsic::aarch64_sve_cmpne_wide:
14163     case Intrinsic::aarch64_sve_cmpge_wide:
14164     case Intrinsic::aarch64_sve_cmpgt_wide:
14165     case Intrinsic::aarch64_sve_cmplt_wide:
14166     case Intrinsic::aarch64_sve_cmple_wide: {
14167       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14168         int64_t ImmVal = CN->getSExtValue();
14169         if (ImmVal >= -16 && ImmVal <= 15)
14170           Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14171         else
14172           return SDValue();
14173       }
14174       break;
14175     }
14176     // Unsigned comparisons
14177     case Intrinsic::aarch64_sve_cmphs_wide:
14178     case Intrinsic::aarch64_sve_cmphi_wide:
14179     case Intrinsic::aarch64_sve_cmplo_wide:
14180     case Intrinsic::aarch64_sve_cmpls_wide:  {
14181       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
14182         uint64_t ImmVal = CN->getZExtValue();
14183         if (ImmVal <= 127)
14184           Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
14185         else
14186           return SDValue();
14187       }
14188       break;
14189     }
14190     }
14191 
14192     if (!Imm)
14193       return SDValue();
14194 
14195     SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
14196     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
14197                        N->getOperand(2), Splat, DAG.getCondCode(CC));
14198   }
14199 
14200   return SDValue();
14201 }
14202 
14203 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
14204                         AArch64CC::CondCode Cond) {
14205   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14206 
14207   SDLoc DL(Op);
14208   assert(Op.getValueType().isScalableVector() &&
14209          TLI.isTypeLegal(Op.getValueType()) &&
14210          "Expected legal scalable vector type!");
14211 
  // Ensure target-specific opcodes use a legal type.
14213   EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
14214   SDValue TVal = DAG.getConstant(1, DL, OutVT);
14215   SDValue FVal = DAG.getConstant(0, DL, OutVT);
14216 
14217   // Set condition code (CC) flags.
14218   SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
14219 
14220   // Convert CC to integer based on requested condition.
14221   // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
14222   SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
14223   SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
14224   return DAG.getZExtOrTrunc(Res, DL, VT);
14225 }
14226 
14227 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
14228                                       SelectionDAG &DAG) {
14229   SDLoc DL(N);
14230 
14231   SDValue Pred = N->getOperand(1);
14232   SDValue VecToReduce = N->getOperand(2);
14233 
14234   // NOTE: The integer reduction's result type is not always linked to the
14235   // operand's element type so we construct it from the intrinsic's result type.
14236   EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
14237   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14238 
14239   // SVE reductions set the whole vector register with the first element
14240   // containing the reduction result, which we'll now extract.
14241   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14242   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14243                      Zero);
14244 }
14245 
14246 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
14247                                      SelectionDAG &DAG) {
14248   SDLoc DL(N);
14249 
14250   SDValue Pred = N->getOperand(1);
14251   SDValue VecToReduce = N->getOperand(2);
14252 
14253   EVT ReduceVT = VecToReduce.getValueType();
14254   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
14255 
14256   // SVE reductions set the whole vector register with the first element
14257   // containing the reduction result, which we'll now extract.
14258   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14259   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14260                      Zero);
14261 }
14262 
14263 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
14264                                             SelectionDAG &DAG) {
14265   SDLoc DL(N);
14266 
14267   SDValue Pred = N->getOperand(1);
14268   SDValue InitVal = N->getOperand(2);
14269   SDValue VecToReduce = N->getOperand(3);
14270   EVT ReduceVT = VecToReduce.getValueType();
14271 
14272   // Ordered reductions use the first lane of the result vector as the
14273   // reduction's initial value.
14274   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14275   InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
14276                         DAG.getUNDEF(ReduceVT), InitVal, Zero);
14277 
14278   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
14279 
14280   // SVE reductions set the whole vector register with the first element
14281   // containing the reduction result, which we'll now extract.
14282   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
14283                      Zero);
14284 }
14285 
14286 static bool isAllActivePredicate(SDValue N) {
14287   unsigned NumElts = N.getValueType().getVectorMinNumElements();
14288 
14289   // Look through cast.
14290   while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14291     N = N.getOperand(0);
14292     // When reinterpreting from a type with fewer elements the "new" elements
14293     // are not active, so bail if they're likely to be used.
14294     if (N.getValueType().getVectorMinNumElements() < NumElts)
14295       return false;
14296   }
14297 
14298   // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14299   // or smaller than the implicit element type represented by N.
14300   // NOTE: A larger element count implies a smaller element type.
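  // For example (a sketch): an nxv16i1 "ptrue p0.b, all" viewed as an nxv8i1
  // predicate still has every used lane active, whereas an nxv4i1 ptrue viewed
  // as nxv8i1 would not.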
14301   if (N.getOpcode() == AArch64ISD::PTRUE &&
14302       N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14303     return N.getValueType().getVectorMinNumElements() >= NumElts;
14304 
14305   return false;
14306 }
14307 
14308 // If a merged operation has no inactive lanes we can relax it to a predicated
14309 // or unpredicated operation, which potentially allows better isel (perhaps
14310 // using immediate forms) or relaxing register reuse requirements.
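//
// For example (a sketch of the intent): with an all-active governing
// predicate, aarch64.sve.add(pg, x, y) can be emitted as a plain ISD::ADD,
// while aarch64.sve.mul(pg, x, y) becomes the predicated MUL_PRED node.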
14311 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
14312                                        SelectionDAG &DAG,
14313                                        bool UnpredOp = false) {
14314   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
14315   assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
14316   SDValue Pg = N->getOperand(1);
14317 
14318   // ISD way to specify an all active predicate.
14319   if (isAllActivePredicate(Pg)) {
14320     if (UnpredOp)
14321       return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
14322                          N->getOperand(3));
14323     else
14324       return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
14325                          N->getOperand(2), N->getOperand(3));
14326   }
14327 
14328   // FUTURE: SplatVector(true)
14329   return SDValue();
14330 }
14331 
14332 static SDValue performIntrinsicCombine(SDNode *N,
14333                                        TargetLowering::DAGCombinerInfo &DCI,
14334                                        const AArch64Subtarget *Subtarget) {
14335   SelectionDAG &DAG = DCI.DAG;
14336   unsigned IID = getIntrinsicID(N);
14337   switch (IID) {
14338   default:
14339     break;
14340   case Intrinsic::aarch64_neon_vcvtfxs2fp:
14341   case Intrinsic::aarch64_neon_vcvtfxu2fp:
14342     return tryCombineFixedPointConvert(N, DCI, DAG);
14343   case Intrinsic::aarch64_neon_saddv:
14344     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
14345   case Intrinsic::aarch64_neon_uaddv:
14346     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
14347   case Intrinsic::aarch64_neon_sminv:
14348     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
14349   case Intrinsic::aarch64_neon_uminv:
14350     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
14351   case Intrinsic::aarch64_neon_smaxv:
14352     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
14353   case Intrinsic::aarch64_neon_umaxv:
14354     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
14355   case Intrinsic::aarch64_neon_fmax:
14356     return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
14357                        N->getOperand(1), N->getOperand(2));
14358   case Intrinsic::aarch64_neon_fmin:
14359     return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
14360                        N->getOperand(1), N->getOperand(2));
14361   case Intrinsic::aarch64_neon_fmaxnm:
14362     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
14363                        N->getOperand(1), N->getOperand(2));
14364   case Intrinsic::aarch64_neon_fminnm:
14365     return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
14366                        N->getOperand(1), N->getOperand(2));
14367   case Intrinsic::aarch64_neon_smull:
14368   case Intrinsic::aarch64_neon_umull:
14369   case Intrinsic::aarch64_neon_pmull:
14370   case Intrinsic::aarch64_neon_sqdmull:
14371     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
14372   case Intrinsic::aarch64_neon_sqshl:
14373   case Intrinsic::aarch64_neon_uqshl:
14374   case Intrinsic::aarch64_neon_sqshlu:
14375   case Intrinsic::aarch64_neon_srshl:
14376   case Intrinsic::aarch64_neon_urshl:
14377   case Intrinsic::aarch64_neon_sshl:
14378   case Intrinsic::aarch64_neon_ushl:
14379     return tryCombineShiftImm(IID, N, DAG);
14380   case Intrinsic::aarch64_crc32b:
14381   case Intrinsic::aarch64_crc32cb:
14382     return tryCombineCRC32(0xff, N, DAG);
14383   case Intrinsic::aarch64_crc32h:
14384   case Intrinsic::aarch64_crc32ch:
14385     return tryCombineCRC32(0xffff, N, DAG);
14386   case Intrinsic::aarch64_sve_saddv:
14387     // There is no i64 version of SADDV because the sign is irrelevant.
14388     if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
14389       return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
14390     else
14391       return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
14392   case Intrinsic::aarch64_sve_uaddv:
14393     return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
14394   case Intrinsic::aarch64_sve_smaxv:
14395     return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
14396   case Intrinsic::aarch64_sve_umaxv:
14397     return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
14398   case Intrinsic::aarch64_sve_sminv:
14399     return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
14400   case Intrinsic::aarch64_sve_uminv:
14401     return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
14402   case Intrinsic::aarch64_sve_orv:
14403     return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
14404   case Intrinsic::aarch64_sve_eorv:
14405     return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
14406   case Intrinsic::aarch64_sve_andv:
14407     return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
14408   case Intrinsic::aarch64_sve_index:
14409     return LowerSVEIntrinsicIndex(N, DAG);
14410   case Intrinsic::aarch64_sve_dup:
14411     return LowerSVEIntrinsicDUP(N, DAG);
14412   case Intrinsic::aarch64_sve_dup_x:
14413     return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
14414                        N->getOperand(1));
14415   case Intrinsic::aarch64_sve_ext:
14416     return LowerSVEIntrinsicEXT(N, DAG);
14417   case Intrinsic::aarch64_sve_mul:
14418     return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
14419   case Intrinsic::aarch64_sve_smulh:
14420     return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
14421   case Intrinsic::aarch64_sve_umulh:
14422     return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
14423   case Intrinsic::aarch64_sve_smin:
14424     return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
14425   case Intrinsic::aarch64_sve_umin:
14426     return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
14427   case Intrinsic::aarch64_sve_smax:
14428     return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
14429   case Intrinsic::aarch64_sve_umax:
14430     return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
14431   case Intrinsic::aarch64_sve_lsl:
14432     return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
14433   case Intrinsic::aarch64_sve_lsr:
14434     return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
14435   case Intrinsic::aarch64_sve_asr:
14436     return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
14437   case Intrinsic::aarch64_sve_fadd:
14438     return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
14439   case Intrinsic::aarch64_sve_fsub:
14440     return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
14441   case Intrinsic::aarch64_sve_fmul:
14442     return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
14443   case Intrinsic::aarch64_sve_add:
14444     return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
14445   case Intrinsic::aarch64_sve_sub:
14446     return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
14447   case Intrinsic::aarch64_sve_and:
14448     return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
14449   case Intrinsic::aarch64_sve_bic:
14450     return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
14451   case Intrinsic::aarch64_sve_eor:
14452     return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
14453   case Intrinsic::aarch64_sve_orr:
14454     return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
14455   case Intrinsic::aarch64_sve_sqadd:
14456     return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
14457   case Intrinsic::aarch64_sve_sqsub:
14458     return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
14459   case Intrinsic::aarch64_sve_uqadd:
14460     return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
14461   case Intrinsic::aarch64_sve_uqsub:
14462     return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
14463   case Intrinsic::aarch64_sve_sqadd_x:
14464     return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
14465                        N->getOperand(1), N->getOperand(2));
14466   case Intrinsic::aarch64_sve_sqsub_x:
14467     return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
14468                        N->getOperand(1), N->getOperand(2));
14469   case Intrinsic::aarch64_sve_uqadd_x:
14470     return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
14471                        N->getOperand(1), N->getOperand(2));
14472   case Intrinsic::aarch64_sve_uqsub_x:
14473     return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
14474                        N->getOperand(1), N->getOperand(2));
14475   case Intrinsic::aarch64_sve_cmphs:
14476     if (!N->getOperand(2).getValueType().isFloatingPoint())
14477       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14478                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
14479                          N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
14480     break;
14481   case Intrinsic::aarch64_sve_cmphi:
14482     if (!N->getOperand(2).getValueType().isFloatingPoint())
14483       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14484                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
14485                          N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
14486     break;
14487   case Intrinsic::aarch64_sve_fcmpge:
14488   case Intrinsic::aarch64_sve_cmpge:
14489     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14490                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
14491                        N->getOperand(3), DAG.getCondCode(ISD::SETGE));
14492     break;
14493   case Intrinsic::aarch64_sve_fcmpgt:
14494   case Intrinsic::aarch64_sve_cmpgt:
14495     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14496                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
14497                        N->getOperand(3), DAG.getCondCode(ISD::SETGT));
14498     break;
14499   case Intrinsic::aarch64_sve_fcmpeq:
14500   case Intrinsic::aarch64_sve_cmpeq:
14501     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14502                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
14503                        N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
14504     break;
14505   case Intrinsic::aarch64_sve_fcmpne:
14506   case Intrinsic::aarch64_sve_cmpne:
14507     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14508                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
14509                        N->getOperand(3), DAG.getCondCode(ISD::SETNE));
14510     break;
14511   case Intrinsic::aarch64_sve_fcmpuo:
14512     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
14513                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
14514                        N->getOperand(3), DAG.getCondCode(ISD::SETUO));
14515     break;
14516   case Intrinsic::aarch64_sve_fadda:
14517     return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
14518   case Intrinsic::aarch64_sve_faddv:
14519     return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
14520   case Intrinsic::aarch64_sve_fmaxnmv:
14521     return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
14522   case Intrinsic::aarch64_sve_fmaxv:
14523     return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
14524   case Intrinsic::aarch64_sve_fminnmv:
14525     return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
14526   case Intrinsic::aarch64_sve_fminv:
14527     return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
14528   case Intrinsic::aarch64_sve_sel:
14529     return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
14530                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
14531   case Intrinsic::aarch64_sve_cmpeq_wide:
14532     return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
14533   case Intrinsic::aarch64_sve_cmpne_wide:
14534     return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
14535   case Intrinsic::aarch64_sve_cmpge_wide:
14536     return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
14537   case Intrinsic::aarch64_sve_cmpgt_wide:
14538     return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
14539   case Intrinsic::aarch64_sve_cmplt_wide:
14540     return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
14541   case Intrinsic::aarch64_sve_cmple_wide:
14542     return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
14543   case Intrinsic::aarch64_sve_cmphs_wide:
14544     return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
14545   case Intrinsic::aarch64_sve_cmphi_wide:
14546     return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
14547   case Intrinsic::aarch64_sve_cmplo_wide:
14548     return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
14549   case Intrinsic::aarch64_sve_cmpls_wide:
14550     return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
14551   case Intrinsic::aarch64_sve_ptest_any:
14552     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14553                     AArch64CC::ANY_ACTIVE);
14554   case Intrinsic::aarch64_sve_ptest_first:
14555     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14556                     AArch64CC::FIRST_ACTIVE);
14557   case Intrinsic::aarch64_sve_ptest_last:
14558     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
14559                     AArch64CC::LAST_ACTIVE);
14560   }
14561   return SDValue();
14562 }
14563 
14564 static SDValue performExtendCombine(SDNode *N,
14565                                     TargetLowering::DAGCombinerInfo &DCI,
14566                                     SelectionDAG &DAG) {
14567   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
14568   // we can convert that DUP into another extract_high (of a bigger DUP), which
14569   // helps the backend to decide that an sabdl2 would be useful, saving a real
14570   // extract_high operation.
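  // For example (a sketch): (zext (abdu (extract_high x), (dup y))) can be
  // rewritten so the dup is itself the high half of a wider dup, letting the
  // zext+abd pair select as a single uabdl2.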
14571   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
14572       (N->getOperand(0).getOpcode() == ISD::ABDU ||
14573        N->getOperand(0).getOpcode() == ISD::ABDS)) {
14574     SDNode *ABDNode = N->getOperand(0).getNode();
14575     SDValue NewABD =
14576         tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
14577     if (!NewABD.getNode())
14578       return SDValue();
14579 
14580     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
14581   }
14582   return SDValue();
14583 }
14584 
14585 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
14586                                SDValue SplatVal, unsigned NumVecElts) {
14587   assert(!St.isTruncatingStore() && "cannot split truncating vector store");
14588   unsigned OrigAlignment = St.getAlignment();
14589   unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
14590 
14591   // Create scalar stores. This is at least as good as the code sequence for a
14592   // split unaligned store which is a dup.s, ext.b, and two stores.
  // Most of the time these scalar stores should be merged into store pair
  // instructions (stp).
14595   SDLoc DL(&St);
14596   SDValue BasePtr = St.getBasePtr();
14597   uint64_t BaseOffset = 0;
14598 
14599   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
14600   SDValue NewST1 =
14601       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
14602                    OrigAlignment, St.getMemOperand()->getFlags());
14603 
  // Since this is in ISel, we will not merge this add, which may degrade
  // results.
14605   if (BasePtr->getOpcode() == ISD::ADD &&
14606       isa<ConstantSDNode>(BasePtr->getOperand(1))) {
14607     BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
14608     BasePtr = BasePtr->getOperand(0);
14609   }
14610 
14611   unsigned Offset = EltOffset;
14612   while (--NumVecElts) {
14613     unsigned Alignment = MinAlign(OrigAlignment, Offset);
14614     SDValue OffsetPtr =
14615         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14616                     DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
14617     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
14618                           PtrInfo.getWithOffset(Offset), Alignment,
14619                           St.getMemOperand()->getFlags());
14620     Offset += EltOffset;
14621   }
14622   return NewST1;
14623 }
14624 
14625 // Returns an SVE type that ContentTy can be trivially sign or zero extended
14626 // into.
14627 static MVT getSVEContainerType(EVT ContentTy) {
14628   assert(ContentTy.isSimple() && "No SVE containers for extended types");
14629 
14630   switch (ContentTy.getSimpleVT().SimpleTy) {
14631   default:
14632     llvm_unreachable("No known SVE container for this MVT type");
14633   case MVT::nxv2i8:
14634   case MVT::nxv2i16:
14635   case MVT::nxv2i32:
14636   case MVT::nxv2i64:
14637   case MVT::nxv2f32:
14638   case MVT::nxv2f64:
14639     return MVT::nxv2i64;
14640   case MVT::nxv4i8:
14641   case MVT::nxv4i16:
14642   case MVT::nxv4i32:
14643   case MVT::nxv4f32:
14644     return MVT::nxv4i32;
14645   case MVT::nxv8i8:
14646   case MVT::nxv8i16:
14647   case MVT::nxv8f16:
14648   case MVT::nxv8bf16:
14649     return MVT::nxv8i16;
14650   case MVT::nxv16i8:
14651     return MVT::nxv16i8;
14652   }
14653 }
14654 
14655 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
14656   SDLoc DL(N);
14657   EVT VT = N->getValueType(0);
14658 
14659   if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
14660     return SDValue();
14661 
14662   EVT ContainerVT = VT;
14663   if (ContainerVT.isInteger())
14664     ContainerVT = getSVEContainerType(ContainerVT);
14665 
14666   SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
14667   SDValue Ops[] = { N->getOperand(0), // Chain
14668                     N->getOperand(2), // Pg
14669                     N->getOperand(3), // Base
14670                     DAG.getValueType(VT) };
14671 
14672   SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
14673   SDValue LoadChain = SDValue(Load.getNode(), 1);
14674 
14675   if (ContainerVT.isInteger() && (VT != ContainerVT))
14676     Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
14677 
14678   return DAG.getMergeValues({ Load, LoadChain }, DL);
14679 }
14680 
14681 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
14682   SDLoc DL(N);
14683   EVT VT = N->getValueType(0);
14684   EVT PtrTy = N->getOperand(3).getValueType();
14685 
14686   if (VT == MVT::nxv8bf16 &&
14687       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14688     return SDValue();
14689 
14690   EVT LoadVT = VT;
14691   if (VT.isFloatingPoint())
14692     LoadVT = VT.changeTypeToInteger();
14693 
14694   auto *MINode = cast<MemIntrinsicSDNode>(N);
14695   SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
14696   SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
14697                                 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
14698                                 MINode->getOperand(2), PassThru,
14699                                 MINode->getMemoryVT(), MINode->getMemOperand(),
14700                                 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
14701 
  if (VT.isFloatingPoint()) {
    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }
14706 
14707   return L;
14708 }
14709 
14710 template <unsigned Opcode>
14711 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
14712   static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
14713                     Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
14714                 "Unsupported opcode.");
14715   SDLoc DL(N);
14716   EVT VT = N->getValueType(0);
14717   if (VT == MVT::nxv8bf16 &&
14718       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14719     return SDValue();
14720 
14721   EVT LoadVT = VT;
14722   if (VT.isFloatingPoint())
14723     LoadVT = VT.changeTypeToInteger();
14724 
14725   SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
14726   SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
14727   SDValue LoadChain = SDValue(Load.getNode(), 1);
14728 
14729   if (VT.isFloatingPoint())
14730     Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
14731 
14732   return DAG.getMergeValues({Load, LoadChain}, DL);
14733 }
14734 
14735 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
14736   SDLoc DL(N);
14737   SDValue Data = N->getOperand(2);
14738   EVT DataVT = Data.getValueType();
14739   EVT HwSrcVt = getSVEContainerType(DataVT);
14740   SDValue InputVT = DAG.getValueType(DataVT);
14741 
14742   if (DataVT == MVT::nxv8bf16 &&
14743       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14744     return SDValue();
14745 
14746   if (DataVT.isFloatingPoint())
14747     InputVT = DAG.getValueType(HwSrcVt);
14748 
14749   SDValue SrcNew;
14750   if (Data.getValueType().isFloatingPoint())
14751     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
14752   else
14753     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
14754 
14755   SDValue Ops[] = { N->getOperand(0), // Chain
14756                     SrcNew,
14757                     N->getOperand(4), // Base
14758                     N->getOperand(3), // Pg
14759                     InputVT
14760                   };
14761 
14762   return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
14763 }
14764 
14765 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
14766   SDLoc DL(N);
14767 
14768   SDValue Data = N->getOperand(2);
14769   EVT DataVT = Data.getValueType();
14770   EVT PtrTy = N->getOperand(4).getValueType();
14771 
14772   if (DataVT == MVT::nxv8bf16 &&
14773       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14774     return SDValue();
14775 
14776   if (DataVT.isFloatingPoint())
14777     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
14778 
14779   auto *MINode = cast<MemIntrinsicSDNode>(N);
14780   return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
14781                             DAG.getUNDEF(PtrTy), MINode->getOperand(3),
14782                             MINode->getMemoryVT(), MINode->getMemOperand(),
14783                             ISD::UNINDEXED, false, false);
14784 }
14785 
/// Replace a vector store of an all-zero splat with scalar stores of WZR/XZR.
/// The load/store optimizer pass will merge them into store pair instructions.
/// This should be better than a movi to create the vector zero followed by a
/// vector store if the zero constant is not re-used, since one instruction and
/// one register live range will be removed.
14791 ///
14792 /// For example, the final generated code should be:
14793 ///
14794 ///   stp xzr, xzr, [x0]
14795 ///
14796 /// instead of:
14797 ///
14798 ///   movi v0.2d, #0
14799 ///   str q0, [x0]
14800 ///
14801 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14802   SDValue StVal = St.getValue();
14803   EVT VT = StVal.getValueType();
14804 
14805   // Avoid scalarizing zero splat stores for scalable vectors.
14806   if (VT.isScalableVector())
14807     return SDValue();
14808 
14809   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
14810   // 2, 3 or 4 i32 elements.
14811   int NumVecElts = VT.getVectorNumElements();
14812   if (!(((NumVecElts == 2 || NumVecElts == 3) &&
14813          VT.getVectorElementType().getSizeInBits() == 64) ||
14814         ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
14815          VT.getVectorElementType().getSizeInBits() == 32)))
14816     return SDValue();
14817 
14818   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
14819     return SDValue();
14820 
14821   // If the zero constant has more than one use then the vector store could be
14822   // better since the constant mov will be amortized and stp q instructions
14823   // should be able to be formed.
14824   if (!StVal.hasOneUse())
14825     return SDValue();
14826 
14827   // If the store is truncating then it's going down to i16 or smaller, which
14828   // means it can be implemented in a single store anyway.
14829   if (St.isTruncatingStore())
14830     return SDValue();
14831 
14832   // If the immediate offset of the address operand is too large for the stp
14833   // instruction, then bail out.
14834   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
14835     int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
14836     if (Offset < -512 || Offset > 504)
14837       return SDValue();
14838   }
14839 
14840   for (int I = 0; I < NumVecElts; ++I) {
14841     SDValue EltVal = StVal.getOperand(I);
14842     if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
14843       return SDValue();
14844   }
14845 
14846   // Use a CopyFromReg WZR/XZR here to prevent
14847   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
14848   SDLoc DL(&St);
14849   unsigned ZeroReg;
14850   EVT ZeroVT;
14851   if (VT.getVectorElementType().getSizeInBits() == 32) {
14852     ZeroReg = AArch64::WZR;
14853     ZeroVT = MVT::i32;
14854   } else {
14855     ZeroReg = AArch64::XZR;
14856     ZeroVT = MVT::i64;
14857   }
14858   SDValue SplatVal =
14859       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
14860   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14861 }
14862 
/// Replace a vector store of a splatted scalar with scalar stores of that
/// scalar value. The load/store optimizer pass will merge them into store pair
/// instructions. This has better performance than a splat of the scalar
/// followed by a split vector store. Even if the stores are not merged, it is
/// four stores versus a dup, followed by an ext.b and two stores.
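///
/// For example (a sketch of the intent), a v4i32 splat of w8 stored to [x0]
/// would ideally become:
///
///   stp w8, w8, [x0]
///   stp w8, w8, [x0, #8]
///
/// instead of a dup followed by a q-register store.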
14868 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14869   SDValue StVal = St.getValue();
14870   EVT VT = StVal.getValueType();
14871 
14872   // Don't replace floating point stores, they possibly won't be transformed to
14873   // stp because of the store pair suppress pass.
14874   if (VT.isFloatingPoint())
14875     return SDValue();
14876 
14877   // We can express a splat as store pair(s) for 2 or 4 elements.
14878   unsigned NumVecElts = VT.getVectorNumElements();
14879   if (NumVecElts != 4 && NumVecElts != 2)
14880     return SDValue();
14881 
14882   // If the store is truncating then it's going down to i16 or smaller, which
14883   // means it can be implemented in a single store anyway.
14884   if (St.isTruncatingStore())
14885     return SDValue();
14886 
14887   // Check that this is a splat.
  // Make sure that each of the relevant vector element locations is inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
14890   std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
14891   SDValue SplatVal;
14892   for (unsigned I = 0; I < NumVecElts; ++I) {
14893     // Check for insert vector elements.
14894     if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
14895       return SDValue();
14896 
14897     // Check that same value is inserted at each vector element.
14898     if (I == 0)
14899       SplatVal = StVal.getOperand(1);
14900     else if (StVal.getOperand(1) != SplatVal)
14901       return SDValue();
14902 
14903     // Check insert element index.
14904     ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
14905     if (!CIndex)
14906       return SDValue();
14907     uint64_t IndexVal = CIndex->getZExtValue();
14908     if (IndexVal >= NumVecElts)
14909       return SDValue();
14910     IndexNotInserted.reset(IndexVal);
14911 
14912     StVal = StVal.getOperand(0);
14913   }
14914   // Check that all vector element locations were inserted to.
14915   if (IndexNotInserted.any())
    return SDValue();
14917 
14918   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14919 }
14920 
14921 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
14922                            SelectionDAG &DAG,
14923                            const AArch64Subtarget *Subtarget) {
14924 
14925   StoreSDNode *S = cast<StoreSDNode>(N);
14926   if (S->isVolatile() || S->isIndexed())
14927     return SDValue();
14928 
14929   SDValue StVal = S->getValue();
14930   EVT VT = StVal.getValueType();
14931 
14932   if (!VT.isFixedLengthVector())
14933     return SDValue();
14934 
14935   // If we get a splat of zeros, convert this vector store to a store of
14936   // scalars. They will be merged into store pairs of xzr thereby removing one
14937   // instruction and one register.
14938   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
14939     return ReplacedZeroSplat;
14940 
14941   // FIXME: The logic for deciding if an unaligned store should be split should
14942   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
14943   // a call to that function here.
14944 
14945   if (!Subtarget->isMisaligned128StoreSlow())
14946     return SDValue();
14947 
14948   // Don't split at -Oz.
14949   if (DAG.getMachineFunction().getFunction().hasMinSize())
14950     return SDValue();
14951 
14952   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
14953   // those up regresses performance on micro-benchmarks and olden/bh.
14954   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
14955     return SDValue();
14956 
14957   // Split unaligned 16B stores. They are terrible for performance.
14958   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
14959   // extensions can use this to mark that it does not want splitting to happen
14960   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
14961   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
14962   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
14963       S->getAlignment() <= 2)
14964     return SDValue();
14965 
14966   // If we get a splat of a scalar convert this vector store to a store of
14967   // scalars. They will be merged into store pairs thereby removing two
14968   // instructions.
14969   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
14970     return ReplacedSplat;
14971 
14972   SDLoc DL(S);
14973 
14974   // Split VT into two.
14975   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14976   unsigned NumElts = HalfVT.getVectorNumElements();
14977   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14978                                    DAG.getConstant(0, DL, MVT::i64));
14979   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14980                                    DAG.getConstant(NumElts, DL, MVT::i64));
14981   SDValue BasePtr = S->getBasePtr();
14982   SDValue NewST1 =
14983       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
14984                    S->getAlignment(), S->getMemOperand()->getFlags());
14985   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14986                                   DAG.getConstant(8, DL, MVT::i64));
14987   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
14988                       S->getPointerInfo(), S->getAlignment(),
14989                       S->getMemOperand()->getFlags());
14990 }
14991 
14992 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
14994 
14995   // splice(pg, op1, undef) -> op1
14996   if (N->getOperand(2).isUndef())
14997     return N->getOperand(1);
14998 
14999   return SDValue();
15000 }
15001 
15002 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
15003   SDLoc DL(N);
15004   SDValue Op0 = N->getOperand(0);
15005   SDValue Op1 = N->getOperand(1);
15006   EVT ResVT = N->getValueType(0);
15007 
15008   // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
15009   if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
15010     if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15011       SDValue X = Op0.getOperand(0).getOperand(0);
15012       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
15013     }
15014   }
15015 
15016   // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
15017   if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
15018     if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
15019       SDValue Z = Op1.getOperand(0).getOperand(1);
15020       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
15021     }
15022   }
15023 
15024   return SDValue();
15025 }
15026 
15027 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
15028   unsigned Opc = N->getOpcode();
15029 
15030   assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
15031            Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
15032           (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
15033            Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
15034          "Invalid opcode.");
15035 
15036   const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
15037                       Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
15038   const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
15039                       Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
15040   const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
15041                         Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
15042                         Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
15043                         Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
15044 
15045   SDLoc DL(N);
15046   SDValue Chain = N->getOperand(0);
15047   SDValue Pg = N->getOperand(1);
15048   SDValue Base = N->getOperand(2);
15049   SDValue Offset = N->getOperand(3);
15050   SDValue Ty = N->getOperand(4);
15051 
15052   EVT ResVT = N->getValueType(0);
15053 
15054   const auto OffsetOpc = Offset.getOpcode();
15055   const bool OffsetIsZExt =
15056       OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
15057   const bool OffsetIsSExt =
15058       OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
15059 
15060   // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
15061   if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
15062     SDValue ExtPg = Offset.getOperand(0);
15063     VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
15064     EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
15065 
    // If the predicate for the sign- or zero-extended offset is the same as
    // the predicate used for this load, and the sign-/zero-extension was from
    // a 32-bit value, the extension can be folded into the gather load itself.
15069     if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
15070       SDValue UnextendedOffset = Offset.getOperand(1);
15071 
15072       unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
15073       if (Signed)
15074         NewOpc = getSignExtendedGatherOpcode(NewOpc);
15075 
15076       return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
15077                          {Chain, Pg, Base, UnextendedOffset, Ty});
15078     }
15079   }
15080 
15081   return SDValue();
15082 }
15083 
/// Optimize a vector shift instruction and its operand if the shifted-out
/// bits are not used.
15086 static SDValue performVectorShiftCombine(SDNode *N,
15087                                          const AArch64TargetLowering &TLI,
15088                                          TargetLowering::DAGCombinerInfo &DCI) {
15089   assert(N->getOpcode() == AArch64ISD::VASHR ||
15090          N->getOpcode() == AArch64ISD::VLSHR);
15091 
15092   SDValue Op = N->getOperand(0);
15093   unsigned OpScalarSize = Op.getScalarValueSizeInBits();
15094 
15095   unsigned ShiftImm = N->getConstantOperandVal(1);
15096   assert(OpScalarSize > ShiftImm && "Invalid shift imm");
15097 
15098   APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
15099   APInt DemandedMask = ~ShiftedOutBits;
15100 
15101   if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15102     return SDValue(N, 0);
15103 
15104   return SDValue();
15105 }
15106 
15107 /// Target-specific DAG combine function for post-increment LD1 (lane) and
15108 /// post-increment LD1R.
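///
/// For example (a sketch), an LD1R whose address is also incremented by the
/// element size can use the post-indexed form, e.g. ld1r { v0.4s }, [x0], #4.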
15109 static SDValue performPostLD1Combine(SDNode *N,
15110                                      TargetLowering::DAGCombinerInfo &DCI,
15111                                      bool IsLaneOp) {
15112   if (DCI.isBeforeLegalizeOps())
15113     return SDValue();
15114 
15115   SelectionDAG &DAG = DCI.DAG;
15116   EVT VT = N->getValueType(0);
15117 
15118   if (VT.isScalableVector())
15119     return SDValue();
15120 
15121   unsigned LoadIdx = IsLaneOp ? 1 : 0;
15122   SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not a LOAD, we cannot do this combine.
15124   if (LD->getOpcode() != ISD::LOAD)
15125     return SDValue();
15126 
15127   // The vector lane must be a constant in the LD1LANE opcode.
15128   SDValue Lane;
15129   if (IsLaneOp) {
15130     Lane = N->getOperand(2);
15131     auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
15132     if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
15133       return SDValue();
15134   }
15135 
15136   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
15137   EVT MemVT = LoadSDN->getMemoryVT();
15138   // Check if memory operand is the same type as the vector element.
15139   if (MemVT != VT.getVectorElementType())
15140     return SDValue();
15141 
15142   // Check if there are other uses. If so, do not combine as it will introduce
15143   // an extra load.
15144   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
15145        ++UI) {
15146     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
15147       continue;
15148     if (*UI != N)
15149       return SDValue();
15150   }
15151 
15152   SDValue Addr = LD->getOperand(1);
15153   SDValue Vector = N->getOperand(0);
15154   // Search for a use of the address operand that is an increment.
15155   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
15156        Addr.getNode()->use_end(); UI != UE; ++UI) {
15157     SDNode *User = *UI;
15158     if (User->getOpcode() != ISD::ADD
15159         || UI.getUse().getResNo() != Addr.getResNo())
15160       continue;
15161 
15162     // If the increment is a constant, it must match the memory ref size.
15163     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15164     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
15165       uint32_t IncVal = CInc->getZExtValue();
15166       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
15167       if (IncVal != NumBytes)
15168         continue;
15169       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
15170     }
15171 
    // To avoid creating a cycle, make sure that neither the load nor the add
    // is a predecessor of the other or of the Vector.
15174     SmallPtrSet<const SDNode *, 32> Visited;
15175     SmallVector<const SDNode *, 16> Worklist;
15176     Visited.insert(Addr.getNode());
15177     Worklist.push_back(User);
15178     Worklist.push_back(LD);
15179     Worklist.push_back(Vector.getNode());
15180     if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
15181         SDNode::hasPredecessorHelper(User, Visited, Worklist))
15182       continue;
15183 
15184     SmallVector<SDValue, 8> Ops;
15185     Ops.push_back(LD->getOperand(0));  // Chain
15186     if (IsLaneOp) {
15187       Ops.push_back(Vector);           // The vector to be inserted
15188       Ops.push_back(Lane);             // The lane to be inserted in the vector
15189     }
15190     Ops.push_back(Addr);
15191     Ops.push_back(Inc);
15192 
15193     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
15194     SDVTList SDTys = DAG.getVTList(Tys);
15195     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
15196     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
15197                                            MemVT,
15198                                            LoadSDN->getMemOperand());
15199 
15200     // Update the uses.
15201     SDValue NewResults[] = {
15202         SDValue(LD, 0),            // The result of load
15203         SDValue(UpdN.getNode(), 2) // Chain
15204     };
15205     DCI.CombineTo(LD, NewResults);
15206     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
15207     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
15208 
15209     break;
15210   }
15211   return SDValue();
15212 }
15213 
15214 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
15215 /// address translation.
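///
/// For example (a sketch of the intent), masking that only clears bits 56-63
/// of a pointer used purely as a memory address can be removed, since those
/// bits are never demanded here.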
15216 static bool performTBISimplification(SDValue Addr,
15217                                      TargetLowering::DAGCombinerInfo &DCI,
15218                                      SelectionDAG &DAG) {
15219   APInt DemandedMask = APInt::getLowBitsSet(64, 56);
15220   KnownBits Known;
15221   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
15222                                         !DCI.isBeforeLegalizeOps());
15223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15224   if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
15225     DCI.CommitTargetLoweringOpt(TLO);
15226     return true;
15227   }
15228   return false;
15229 }
15230 
15231 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
15232   assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
15233          "Expected STORE dag node in input!");
15234 
15235   if (auto Store = dyn_cast<StoreSDNode>(N)) {
15236     if (!Store->isTruncatingStore() || Store->isIndexed())
15237       return SDValue();
15238     SDValue Ext = Store->getValue();
15239     auto ExtOpCode = Ext.getOpcode();
15240     if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
15241         ExtOpCode != ISD::ANY_EXTEND)
15242       return SDValue();
15243     SDValue Orig = Ext->getOperand(0);
15244     if (Store->getMemoryVT() != Orig->getValueType(0))
15245       return SDValue();
15246     return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
15247                         Store->getBasePtr(), Store->getPointerInfo(),
15248                         Store->getAlign());
15249   }
15250 
15251   return SDValue();
15252 }
15253 
15254 static SDValue performSTORECombine(SDNode *N,
15255                                    TargetLowering::DAGCombinerInfo &DCI,
15256                                    SelectionDAG &DAG,
15257                                    const AArch64Subtarget *Subtarget) {
15258   if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
15259     return Split;
15260 
15261   if (Subtarget->supportsAddressTopByteIgnored() &&
15262       performTBISimplification(N->getOperand(2), DCI, DAG))
15263     return SDValue(N, 0);
15264 
15265   if (SDValue Store = foldTruncStoreOfExt(DAG, N))
15266     return Store;
15267 
15268   return SDValue();
15269 }
15270 
15271 /// Target-specific DAG combine function for NEON load/store intrinsics
15272 /// to merge base address updates.
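///
/// For example (a sketch), an ld2 whose base register is also advanced by the
/// total access size can use the post-indexed form, e.g.
/// ld2 { v0.4s, v1.4s }, [x0], #32.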
15273 static SDValue performNEONPostLDSTCombine(SDNode *N,
15274                                           TargetLowering::DAGCombinerInfo &DCI,
15275                                           SelectionDAG &DAG) {
15276   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15277     return SDValue();
15278 
15279   unsigned AddrOpIdx = N->getNumOperands() - 1;
15280   SDValue Addr = N->getOperand(AddrOpIdx);
15281 
15282   // Search for a use of the address operand that is an increment.
15283   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15284        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
15285     SDNode *User = *UI;
15286     if (User->getOpcode() != ISD::ADD ||
15287         UI.getUse().getResNo() != Addr.getResNo())
15288       continue;
15289 
15290     // Check that the add is independent of the load/store.  Otherwise, folding
15291     // it would create a cycle.
15292     SmallPtrSet<const SDNode *, 32> Visited;
15293     SmallVector<const SDNode *, 16> Worklist;
15294     Visited.insert(Addr.getNode());
15295     Worklist.push_back(N);
15296     Worklist.push_back(User);
15297     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15298         SDNode::hasPredecessorHelper(User, Visited, Worklist))
15299       continue;
15300 
15301     // Find the new opcode for the updating load/store.
15302     bool IsStore = false;
15303     bool IsLaneOp = false;
15304     bool IsDupOp = false;
15305     unsigned NewOpc = 0;
15306     unsigned NumVecs = 0;
15307     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15308     switch (IntNo) {
15309     default: llvm_unreachable("unexpected intrinsic for Neon base update");
15310     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
15311       NumVecs = 2; break;
15312     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
15313       NumVecs = 3; break;
15314     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
15315       NumVecs = 4; break;
15316     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
15317       NumVecs = 2; IsStore = true; break;
15318     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
15319       NumVecs = 3; IsStore = true; break;
15320     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
15321       NumVecs = 4; IsStore = true; break;
15322     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
15323       NumVecs = 2; break;
15324     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
15325       NumVecs = 3; break;
15326     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
15327       NumVecs = 4; break;
15328     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
15329       NumVecs = 2; IsStore = true; break;
15330     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
15331       NumVecs = 3; IsStore = true; break;
15332     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
15333       NumVecs = 4; IsStore = true; break;
15334     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
15335       NumVecs = 2; IsDupOp = true; break;
15336     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
15337       NumVecs = 3; IsDupOp = true; break;
15338     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
15339       NumVecs = 4; IsDupOp = true; break;
15340     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
15341       NumVecs = 2; IsLaneOp = true; break;
15342     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
15343       NumVecs = 3; IsLaneOp = true; break;
15344     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
15345       NumVecs = 4; IsLaneOp = true; break;
15346     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
15347       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
15348     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
15349       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
15350     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
15351       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
15352     }
15353 
15354     EVT VecTy;
15355     if (IsStore)
15356       VecTy = N->getOperand(2).getValueType();
15357     else
15358       VecTy = N->getValueType(0);
15359 
15360     // If the increment is a constant, it must match the memory ref size.
15361     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15362     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
15363       uint32_t IncVal = CInc->getZExtValue();
15364       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15365       if (IsLaneOp || IsDupOp)
15366         NumBytes /= VecTy.getVectorNumElements();
15367       if (IncVal != NumBytes)
15368         continue;
15369       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
15370     }
15371     SmallVector<SDValue, 8> Ops;
15372     Ops.push_back(N->getOperand(0)); // Incoming chain
    // Lane operations and stores have a vector list as input.
15374     if (IsLaneOp || IsStore)
15375       for (unsigned i = 2; i < AddrOpIdx; ++i)
15376         Ops.push_back(N->getOperand(i));
15377     Ops.push_back(Addr); // Base register
15378     Ops.push_back(Inc);
15379 
    // Return types.
15381     EVT Tys[6];
15382     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
15383     unsigned n;
15384     for (n = 0; n < NumResultVecs; ++n)
15385       Tys[n] = VecTy;
15386     Tys[n++] = MVT::i64;  // Type of write back register
15387     Tys[n] = MVT::Other;  // Type of the chain
15388     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
15389 
15390     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
15391     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
15392                                            MemInt->getMemoryVT(),
15393                                            MemInt->getMemOperand());
15394 
15395     // Update the uses.
15396     std::vector<SDValue> NewResults;
15397     for (unsigned i = 0; i < NumResultVecs; ++i) {
15398       NewResults.push_back(SDValue(UpdN.getNode(), i));
15399     }
15400     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
15401     DCI.CombineTo(N, NewResults);
15402     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15403 
15404     break;
15405   }
15406   return SDValue();
15407 }
15408 
15409 // Checks to see if the value is the prescribed width and returns information
15410 // about its extension mode.
15411 static
15412 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
15413   ExtType = ISD::NON_EXTLOAD;
  switch (V.getNode()->getOpcode()) {
15415   default:
15416     return false;
15417   case ISD::LOAD: {
15418     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
15419     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
15420        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
15421       ExtType = LoadNode->getExtensionType();
15422       return true;
15423     }
15424     return false;
15425   }
15426   case ISD::AssertSext: {
15427     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15428     if ((TypeNode->getVT() == MVT::i8 && width == 8)
15429        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15430       ExtType = ISD::SEXTLOAD;
15431       return true;
15432     }
15433     return false;
15434   }
15435   case ISD::AssertZext: {
15436     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
15437     if ((TypeNode->getVT() == MVT::i8 && width == 8)
15438        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
15439       ExtType = ISD::ZEXTLOAD;
15440       return true;
15441     }
15442     return false;
15443   }
15444   case ISD::Constant:
15445   case ISD::TargetConstant: {
15446     return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
15447            1LL << (width - 1);
15448   }
15449   }
15450 
15451   return true;
15452 }
15453 
15454 // This function does a whole lot of voodoo to determine if the tests are
15455 // equivalent without and with a mask. Essentially what happens is that given a
15456 // DAG resembling:
15457 //
15458 //  +-------------+ +-------------+ +-------------+ +-------------+
15459 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
15460 //  +-------------+ +-------------+ +-------------+ +-------------+
15461 //           |           |           |               |
15462 //           V           V           |    +----------+
15463 //          +-------------+  +----+  |    |
15464 //          |     ADD     |  |0xff|  |    |
15465 //          +-------------+  +----+  |    |
15466 //                  |           |    |    |
15467 //                  V           V    |    |
15468 //                 +-------------+   |    |
15469 //                 |     AND     |   |    |
15470 //                 +-------------+   |    |
15471 //                      |            |    |
15472 //                      +-----+      |    |
15473 //                            |      |    |
15474 //                            V      V    V
15475 //                           +-------------+
15476 //                           |     CMP     |
15477 //                           +-------------+
15478 //
15479 // The AND node may be safely removed for some combinations of inputs. In
15480 // particular we need to take into account the extension type of the Input,
15481 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width of input; the above graph is
// specific to 8 bits).
15484 //
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4-bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
// patterns present in both extensions (0..7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns of the current extension type of Input (w0).
15494 //
15495 //   sub      w8, w0, w1
15496 //   and      w10, w8, #0x0f
15497 //   cmp      w8, w2
15498 //   cset     w9, AArch64CC
15499 //   cmp      w10, w2
15500 //   cset     w11, AArch64CC
15501 //   cmp      w9, w11
15502 //   cset     w0, eq
15503 //   ret
15504 //
// Since the above function shows when the outputs are equivalent, it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave outputs equivalent to the above
// function for all inputs, so they can be used to determine if the removal is
// legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition since the DAG can take several forms.
15514 
15515 static bool isEquivalentMaskless(unsigned CC, unsigned width,
15516                                  ISD::LoadExtType ExtType, int AddConstant,
15517                                  int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
  // make them generally applicable to all bit widths.
15521   int MaxUInt = (1 << width);
15522 
15523   // For the purposes of these comparisons sign extending the type is
15524   // equivalent to zero extending the add and displacing it by half the integer
15525   // width. Provided we are careful and make sure our equations are valid over
15526   // the whole range we can just adjust the input and avoid writing equations
15527   // for sign extended inputs.
15528   if (ExtType == ISD::SEXTLOAD)
15529     AddConstant -= (1 << (width-1));
15530 
  switch (CC) {
15532   case AArch64CC::LE:
15533   case AArch64CC::GT:
15534     if ((AddConstant == 0) ||
15535         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
15536         (AddConstant >= 0 && CompConstant < 0) ||
15537         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
15538       return true;
15539     break;
15540   case AArch64CC::LT:
15541   case AArch64CC::GE:
15542     if ((AddConstant == 0) ||
15543         (AddConstant >= 0 && CompConstant <= 0) ||
15544         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
15545       return true;
15546     break;
15547   case AArch64CC::HI:
15548   case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
15554   case AArch64CC::PL:
15555   case AArch64CC::MI:
15556     if ((AddConstant == 0) ||
15557         (AddConstant > 0 && CompConstant <= 0) ||
15558         (AddConstant < 0 && CompConstant <= AddConstant))
15559       return true;
15560     break;
15561   case AArch64CC::LO:
15562   case AArch64CC::HS:
15563     if ((AddConstant >= 0 && CompConstant <= 0) ||
15564         (AddConstant <= 0 && CompConstant >= 0 &&
15565          CompConstant <= AddConstant + MaxUInt))
15566       return true;
15567     break;
15568   case AArch64CC::EQ:
15569   case AArch64CC::NE:
15570     if ((AddConstant > 0 && CompConstant < 0) ||
15571         (AddConstant < 0 && CompConstant >= 0 &&
15572          CompConstant < AddConstant + MaxUInt) ||
15573         (AddConstant >= 0 && CompConstant >= 0 &&
15574          CompConstant >= AddConstant) ||
15575         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
15576       return true;
15577     break;
15578   case AArch64CC::VS:
15579   case AArch64CC::VC:
15580   case AArch64CC::AL:
15581   case AArch64CC::NV:
15582     return true;
15583   case AArch64CC::Invalid:
15584     break;
15585   }
15586 
15587   return false;
15588 }
15589 
15590 static
15591 SDValue performCONDCombine(SDNode *N,
15592                            TargetLowering::DAGCombinerInfo &DCI,
15593                            SelectionDAG &DAG, unsigned CCIndex,
15594                            unsigned CmpIndex) {
15595   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
15596   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
15597   unsigned CondOpcode = SubsNode->getOpcode();
15598 
15599   if (CondOpcode != AArch64ISD::SUBS)
15600     return SDValue();
15601 
15602   // There is a SUBS feeding this condition. Is it fed by a mask we can
15603   // use?
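  // As an illustrative sketch, the shape being matched is:
  //   SUBS (AND (ADD x, AddConstant), 0xff), CompConstant
  // where x is known (via an i8 load or AssertSext/AssertZext) to fit in 8
  // bits, or 16 bits when the mask is 0xffff. If isEquivalentMaskless() holds
  // for the condition code, the AND is dropped and the SUBS operates on the
  // ADD directly.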
15604 
15605   SDNode *AndNode = SubsNode->getOperand(0).getNode();
15606   unsigned MaskBits = 0;
15607 
15608   if (AndNode->getOpcode() != ISD::AND)
15609     return SDValue();
15610 
15611   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
15612     uint32_t CNV = CN->getZExtValue();
15613     if (CNV == 255)
15614       MaskBits = 8;
15615     else if (CNV == 65535)
15616       MaskBits = 16;
15617   }
15618 
15619   if (!MaskBits)
15620     return SDValue();
15621 
15622   SDValue AddValue = AndNode->getOperand(0);
15623 
15624   if (AddValue.getOpcode() != ISD::ADD)
15625     return SDValue();
15626 
15627   // The basic dag structure is correct, grab the inputs and validate them.
15628 
15629   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
15630   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
15631   SDValue SubsInputValue = SubsNode->getOperand(1);
15632 
  // The mask is present and the provenance of all the values is a smaller type,
  // so let's see if the mask is superfluous.
15635 
15636   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
15637       !isa<ConstantSDNode>(SubsInputValue.getNode()))
15638     return SDValue();
15639 
15640   ISD::LoadExtType ExtType;
15641 
15642   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
15643       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType))
15645     return SDValue();
15646 
  if (!isEquivalentMaskless(CC, MaskBits, ExtType,
15648                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
15649                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
15650     return SDValue();
15651 
15652   // The AND is not necessary, remove it.
15653 
15654   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
15655                                SubsNode->getValueType(1));
15656   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
15657 
15658   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
15659   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
15660 
15661   return SDValue(N, 0);
15662 }
15663 
15664 // Optimize compare with zero and branch.
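// For example (illustrative): a BRCOND with condition EQ on the flags of
// (SUBS x, #0), where the value result of the SUBS is unused and x is not a
// shift, becomes (CBZ x, dest); with condition NE it becomes (CBNZ x, dest).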
15665 static SDValue performBRCONDCombine(SDNode *N,
15666                                     TargetLowering::DAGCombinerInfo &DCI,
15667                                     SelectionDAG &DAG) {
15668   MachineFunction &MF = DAG.getMachineFunction();
15669   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
15670   // will not be produced, as they are conditional branch instructions that do
15671   // not set flags.
15672   if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
15673     return SDValue();
15674 
15675   if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
15676     N = NV.getNode();
15677   SDValue Chain = N->getOperand(0);
15678   SDValue Dest = N->getOperand(1);
15679   SDValue CCVal = N->getOperand(2);
15680   SDValue Cmp = N->getOperand(3);
15681 
15682   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
15683   unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
15684   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
15685     return SDValue();
15686 
15687   unsigned CmpOpc = Cmp.getOpcode();
15688   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
15689     return SDValue();
15690 
15691   // Only attempt folding if there is only one use of the flag and no use of the
15692   // value.
15693   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
15694     return SDValue();
15695 
15696   SDValue LHS = Cmp.getOperand(0);
15697   SDValue RHS = Cmp.getOperand(1);
15698 
15699   assert(LHS.getValueType() == RHS.getValueType() &&
15700          "Expected the value type to be the same for both operands!");
15701   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
15702     return SDValue();
15703 
15704   if (isNullConstant(LHS))
15705     std::swap(LHS, RHS);
15706 
15707   if (!isNullConstant(RHS))
15708     return SDValue();
15709 
15710   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
15711       LHS.getOpcode() == ISD::SRL)
15712     return SDValue();
15713 
15714   // Fold the compare into the branch instruction.
15715   SDValue BR;
15716   if (CC == AArch64CC::EQ)
15717     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15718   else
15719     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
15720 
15721   // Do not add new nodes to DAG combiner worklist.
15722   DCI.CombineTo(N, BR, false);
15723 
15724   return SDValue();
15725 }
15726 
15727 // Optimize CSEL instructions
15728 static SDValue performCSELCombine(SDNode *N,
15729                                   TargetLowering::DAGCombinerInfo &DCI,
15730                                   SelectionDAG &DAG) {
15731   // CSEL x, x, cc -> x
15732   if (N->getOperand(0) == N->getOperand(1))
15733     return N->getOperand(0);
15734 
15735   return performCONDCombine(N, DCI, DAG, 2, 3);
15736 }
15737 
15738 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
15739   assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
15740   SDValue LHS = N->getOperand(0);
15741   SDValue RHS = N->getOperand(1);
15742   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
15743 
15744   // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
15745   if (Cond == ISD::SETNE && isOneConstant(RHS) &&
15746       LHS->getOpcode() == AArch64ISD::CSEL &&
15747       isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
15748       LHS->hasOneUse()) {
15749     SDLoc DL(N);
15750 
15751     // Invert CSEL's condition.
15752     auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
15753     auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
15754     auto NewCond = getInvertedCondCode(OldCond);
15755 
15756     // csel 0, 1, !cond, X
15757     SDValue CSEL =
15758         DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
15759                     LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
15760                     LHS.getOperand(3));
15761     return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
15762   }
15763 
15764   return SDValue();
15765 }
15766 
15767 static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
15768   assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15769          "Unexpected opcode!");
15770 
15771   SDValue Pred = N->getOperand(0);
15772   SDValue LHS = N->getOperand(1);
15773   SDValue RHS = N->getOperand(2);
15774   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
15775 
15776   // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
15777   //    => inner setcc_merge_zero
15778   if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
15779       LHS->getOpcode() == ISD::SIGN_EXTEND &&
15780       LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
15781       LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
15782       LHS->getOperand(0)->getOperand(0) == Pred)
15783     return LHS->getOperand(0);
15784 
15785   return SDValue();
15786 }
15787 
15788 // Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
15789 // as well as whether the test should be inverted.  This code is required to
15790 // catch these cases (as opposed to standard dag combines) because
15791 // AArch64ISD::TBZ is matched during legalization.
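// For example (illustrative): (tbz (shl x, 2), 3) becomes (tbz x, 1), and
// (tbz (xor x, -1), 5) becomes (tbnz x, 5).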
15792 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
15793                                  SelectionDAG &DAG) {
15794 
15795   if (!Op->hasOneUse())
15796     return Op;
15797 
15798   // We don't handle undef/constant-fold cases below, as they should have
15799   // already been taken care of (e.g. and of 0, test of undefined shifted bits,
15800   // etc.)
15801 
15802   // (tbz (trunc x), b) -> (tbz x, b)
15803   // This case is just here to enable more of the below cases to be caught.
15804   if (Op->getOpcode() == ISD::TRUNCATE &&
15805       Bit < Op->getValueType(0).getSizeInBits()) {
15806     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15807   }
15808 
15809   // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
15810   if (Op->getOpcode() == ISD::ANY_EXTEND &&
15811       Bit < Op->getOperand(0).getValueSizeInBits()) {
15812     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15813   }
15814 
15815   if (Op->getNumOperands() != 2)
15816     return Op;
15817 
15818   auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
15819   if (!C)
15820     return Op;
15821 
15822   switch (Op->getOpcode()) {
15823   default:
15824     return Op;
15825 
15826   // (tbz (and x, m), b) -> (tbz x, b)
15827   case ISD::AND:
15828     if ((C->getZExtValue() >> Bit) & 1)
15829       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15830     return Op;
15831 
15832   // (tbz (shl x, c), b) -> (tbz x, b-c)
15833   case ISD::SHL:
15834     if (C->getZExtValue() <= Bit &&
15835         (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15836       Bit = Bit - C->getZExtValue();
15837       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15838     }
15839     return Op;
15840 
15841   // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
15842   case ISD::SRA:
15843     Bit = Bit + C->getZExtValue();
15844     if (Bit >= Op->getValueType(0).getSizeInBits())
15845       Bit = Op->getValueType(0).getSizeInBits() - 1;
15846     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15847 
15848   // (tbz (srl x, c), b) -> (tbz x, b+c)
15849   case ISD::SRL:
15850     if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
15851       Bit = Bit + C->getZExtValue();
15852       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15853     }
15854     return Op;
15855 
15856   // (tbz (xor x, -1), b) -> (tbnz x, b)
15857   case ISD::XOR:
15858     if ((C->getZExtValue() >> Bit) & 1)
15859       Invert = !Invert;
15860     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
15861   }
15862 }
15863 
15864 // Optimize test single bit zero/non-zero and branch.
15865 static SDValue performTBZCombine(SDNode *N,
15866                                  TargetLowering::DAGCombinerInfo &DCI,
15867                                  SelectionDAG &DAG) {
15868   unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
15869   bool Invert = false;
15870   SDValue TestSrc = N->getOperand(1);
15871   SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
15872 
15873   if (TestSrc == NewTestSrc)
15874     return SDValue();
15875 
15876   unsigned NewOpc = N->getOpcode();
15877   if (Invert) {
15878     if (NewOpc == AArch64ISD::TBZ)
15879       NewOpc = AArch64ISD::TBNZ;
15880     else {
15881       assert(NewOpc == AArch64ISD::TBNZ);
15882       NewOpc = AArch64ISD::TBZ;
15883     }
15884   }
15885 
15886   SDLoc DL(N);
15887   return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
15888                      DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
15889 }
15890 
15891 // vselect (v1i1 setcc) ->
15892 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
15893 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
15894 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
15895 // such VSELECT.
15896 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
15897   SDValue N0 = N->getOperand(0);
15898   EVT CCVT = N0.getValueType();
15899 
  // Check for the sign pattern (VSELECT (setgt iN lhs, -1), 1, -1) and
  // transform it into (OR (ASR lhs, N-1), 1), which requires fewer
  // instructions for the supported types.
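  // For example (illustrative): for v4i32,
  //   (vselect (setgt x, splat(-1)), splat(1), splat(-1))
  // becomes (or (sra x, splat(31)), splat(1)); the shift yields 0 for
  // non-negative lanes and -1 for negative ones, and or'ing in 1 produces the
  // required 1 / -1 result.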
15903   SDValue SetCC = N->getOperand(0);
15904   if (SetCC.getOpcode() == ISD::SETCC &&
15905       SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
15906     SDValue CmpLHS = SetCC.getOperand(0);
15907     EVT VT = CmpLHS.getValueType();
15908     SDNode *CmpRHS = SetCC.getOperand(1).getNode();
15909     SDNode *SplatLHS = N->getOperand(1).getNode();
15910     SDNode *SplatRHS = N->getOperand(2).getNode();
15911     APInt SplatLHSVal;
15912     if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
15913         VT.isSimple() &&
15914         is_contained(
15915             makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
15916                           MVT::v2i32, MVT::v4i32, MVT::v2i64}),
15917             VT.getSimpleVT().SimpleTy) &&
15918         ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
15919         SplatLHSVal.isOneValue() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
15920         ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
15921       unsigned NumElts = VT.getVectorNumElements();
15922       SmallVector<SDValue, 8> Ops(
15923           NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
15924                                    VT.getScalarType()));
15925       SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
15926 
15927       auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
15928       auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
15929       return Or;
15930     }
15931   }
15932 
15933   if (N0.getOpcode() != ISD::SETCC ||
15934       CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
15935       CCVT.getVectorElementType() != MVT::i1)
15936     return SDValue();
15937 
15938   EVT ResVT = N->getValueType(0);
15939   EVT CmpVT = N0.getOperand(0).getValueType();
15940   // Only combine when the result type is of the same size as the compared
15941   // operands.
15942   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
15943     return SDValue();
15944 
15945   SDValue IfTrue = N->getOperand(1);
15946   SDValue IfFalse = N->getOperand(2);
15947   SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
15948                        N0.getOperand(0), N0.getOperand(1),
15949                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
15950   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
15951                      IfTrue, IfFalse);
15952 }
15953 
15954 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
15955 /// the compare-mask instructions rather than going via NZCV, even if LHS and
15956 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
15957 /// with a vector one followed by a DUP shuffle on the result.
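/// For example (a sketch): (select (setcc f64 a, b, setolt), v2f64 x, v2f64 y)
/// is rewritten so the compare is performed as a v2f64 setcc on
/// scalar_to_vector(a) and scalar_to_vector(b), lane 0 of the resulting v2i64
/// mask is duplicated across both lanes, and that mask feeds a vselect of x
/// and y.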
15958 static SDValue performSelectCombine(SDNode *N,
15959                                     TargetLowering::DAGCombinerInfo &DCI) {
15960   SelectionDAG &DAG = DCI.DAG;
15961   SDValue N0 = N->getOperand(0);
15962   EVT ResVT = N->getValueType(0);
15963 
15964   if (N0.getOpcode() != ISD::SETCC)
15965     return SDValue();
15966 
15967   if (ResVT.isScalableVector())
15968     return SDValue();
15969 
15970   // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
15971   // scalar SetCCResultType. We also don't expect vectors, because we assume
15972   // that selects fed by vector SETCCs are canonicalized to VSELECT.
15973   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
15974          "Scalar-SETCC feeding SELECT has unexpected result type!");
15975 
15976   // If NumMaskElts == 0, the comparison is larger than select result. The
15977   // largest real NEON comparison is 64-bits per lane, which means the result is
15978   // at most 32-bits and an illegal vector. Just bail out for now.
15979   EVT SrcVT = N0.getOperand(0).getValueType();
15980 
15981   // Don't try to do this optimization when the setcc itself has i1 operands.
15982   // There are no legal vectors of i1, so this would be pointless.
15983   if (SrcVT == MVT::i1)
15984     return SDValue();
15985 
15986   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
15987   if (!ResVT.isVector() || NumMaskElts == 0)
15988     return SDValue();
15989 
15990   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
15991   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
15992 
15993   // Also bail out if the vector CCVT isn't the same size as ResVT.
15994   // This can happen if the SETCC operand size doesn't divide the ResVT size
15995   // (e.g., f64 vs v3f32).
15996   if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
15997     return SDValue();
15998 
15999   // Make sure we didn't create illegal types, if we're not supposed to.
16000   assert(DCI.isBeforeLegalize() ||
16001          DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
16002 
16003   // First perform a vector comparison, where lane 0 is the one we're interested
16004   // in.
16005   SDLoc DL(N0);
16006   SDValue LHS =
16007       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
16008   SDValue RHS =
16009       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
16010   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
16011 
16012   // Now duplicate the comparison mask we want across all other lanes.
16013   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
16014   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
16015   Mask = DAG.getNode(ISD::BITCAST, DL,
16016                      ResVT.changeVectorElementTypeToInteger(), Mask);
16017 
16018   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
16019 }
16020 
16021 /// Get rid of unnecessary NVCASTs (that don't change the type).
16022 static SDValue performNVCASTCombine(SDNode *N) {
16023   if (N->getValueType(0) == N->getOperand(0).getValueType())
16024     return N->getOperand(0);
16025 
16026   return SDValue();
16027 }
16028 
16029 // If all users of the globaladdr are of the form (globaladdr + constant), find
16030 // the smallest constant, fold it into the globaladdr's offset and rewrite the
16031 // globaladdr as (globaladdr + constant) - constant.
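// For example (illustrative): if the only uses are (add @g, 12) and
// (add @g, 20), then MinOffset is 12, the node is rewritten as
// ((@g + 12) - 12), and later combines can fold the uses into (@g + 12) and
// ((@g + 12) + 8).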
16032 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
16033                                            const AArch64Subtarget *Subtarget,
16034                                            const TargetMachine &TM) {
16035   auto *GN = cast<GlobalAddressSDNode>(N);
16036   if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
16037       AArch64II::MO_NO_FLAG)
16038     return SDValue();
16039 
16040   uint64_t MinOffset = -1ull;
16041   for (SDNode *N : GN->uses()) {
16042     if (N->getOpcode() != ISD::ADD)
16043       return SDValue();
16044     auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
16045     if (!C)
16046       C = dyn_cast<ConstantSDNode>(N->getOperand(1));
16047     if (!C)
16048       return SDValue();
16049     MinOffset = std::min(MinOffset, C->getZExtValue());
16050   }
16051   uint64_t Offset = MinOffset + GN->getOffset();
16052 
16053   // Require that the new offset is larger than the existing one. Otherwise, we
16054   // can end up oscillating between two possible DAGs, for example,
16055   // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
16056   if (Offset <= uint64_t(GN->getOffset()))
16057     return SDValue();
16058 
16059   // Check whether folding this offset is legal. It must not go out of bounds of
16060   // the referenced object to avoid violating the code model, and must be
16061   // smaller than 2^21 because this is the largest offset expressible in all
16062   // object formats.
16063   //
16064   // This check also prevents us from folding negative offsets, which will end
16065   // up being treated in the same way as large positive ones. They could also
16066   // cause code model violations, and aren't really common enough to matter.
16067   if (Offset >= (1 << 21))
16068     return SDValue();
16069 
16070   const GlobalValue *GV = GN->getGlobal();
16071   Type *T = GV->getValueType();
16072   if (!T->isSized() ||
16073       Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
16074     return SDValue();
16075 
16076   SDLoc DL(GN);
16077   SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
16078   return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
16079                      DAG.getConstant(MinOffset, DL, MVT::i64));
16080 }
16081 
// Turns the vector of indices into a vector of byte offsets by scaling Offset
16083 // by (BitWidth / 8).
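// For example (illustrative): with BitWidth == 32 each index is shifted left
// by log2(32 / 8) == 2, i.e. multiplied by 4, to produce a byte offset.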
16084 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
16085                                           SDLoc DL, unsigned BitWidth) {
16086   assert(Offset.getValueType().isScalableVector() &&
16087          "This method is only for scalable vectors of offsets");
16088 
16089   SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
16090   SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
16091 
16092   return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
16093 }
16094 
16095 /// Check if the value of \p OffsetInBytes can be used as an immediate for
16096 /// the gather load/prefetch and scatter store instructions with vector base and
16097 /// immediate offset addressing mode:
16098 ///
16099 ///      [<Zn>.[S|D]{, #<imm>}]
16100 ///
16101 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
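///
/// For example (illustrative): for 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124 (i.e. 31 * 4).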
16102 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
16103                                                   unsigned ScalarSizeInBytes) {
16104   // The immediate is not a multiple of the scalar size.
16105   if (OffsetInBytes % ScalarSizeInBytes)
16106     return false;
16107 
16108   // The immediate is out of range.
16109   if (OffsetInBytes / ScalarSizeInBytes > 31)
16110     return false;
16111 
16112   return true;
16113 }
16114 
16115 /// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
16117 /// immediate offset addressing mode:
16118 ///
16119 ///      [<Zn>.[S|D]{, #<imm>}]
16120 ///
16121 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
16122 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
16123                                            unsigned ScalarSizeInBytes) {
16124   ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
16125   return OffsetConst && isValidImmForSVEVecImmAddrMode(
16126                             OffsetConst->getZExtValue(), ScalarSizeInBytes);
16127 }
16128 
16129 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
16130                                           unsigned Opcode,
16131                                           bool OnlyPackedOffsets = true) {
16132   const SDValue Src = N->getOperand(2);
16133   const EVT SrcVT = Src->getValueType(0);
16134   assert(SrcVT.isScalableVector() &&
16135          "Scatter stores are only possible for SVE vectors");
16136 
16137   SDLoc DL(N);
16138   MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
16139 
16140   // Make sure that source data will fit into an SVE register
16141   if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16142     return SDValue();
16143 
16144   // For FPs, ACLE only supports _packed_ single and double precision types.
16145   if (SrcElVT.isFloatingPoint())
16146     if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
16147       return SDValue();
16148 
16149   // Depending on the addressing mode, this is either a pointer or a vector of
16150   // pointers (that fits into one register)
16151   SDValue Base = N->getOperand(4);
16152   // Depending on the addressing mode, this is either a single offset or a
16153   // vector of offsets  (that fits into one register)
16154   SDValue Offset = N->getOperand(5);
16155 
16156   // For "scalar + vector of indices", just scale the indices. This only
16157   // applies to non-temporal scatters because there's no instruction that takes
  // indices.
16159   if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
16160     Offset =
16161         getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
16162     Opcode = AArch64ISD::SSTNT1_PRED;
16163   }
16164 
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size: "scalar + vector", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
16168   // Since we do have intrinsics that allow the arguments to be in a different
16169   // order, we may need to swap them to match the spec.
16170   if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
16171     std::swap(Base, Offset);
16172 
16173   // SST1_IMM requires that the offset is an immediate that is:
16174   //    * a multiple of #SizeInBytes,
16175   //    * in the range [0, 31 x #SizeInBytes],
16176   // where #SizeInBytes is the size in bytes of the stored items. For
16177   // immediates outside that range and non-immediate scalar offsets use SST1 or
16178   // SST1_UXTW instead.
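  // For example (illustrative): a scatter of 32-bit elements with an immediate
  // offset of 128 exceeds 31 * 4 == 124, so the node falls back to SST1_PRED
  // (or SST1_UXTW_PRED for an nxv4i32 base) with Base and Offset swapped.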
16179   if (Opcode == AArch64ISD::SST1_IMM_PRED) {
16180     if (!isValidImmForSVEVecImmAddrMode(Offset,
16181                                         SrcVT.getScalarSizeInBits() / 8)) {
16182       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
16183         Opcode = AArch64ISD::SST1_UXTW_PRED;
16184       else
16185         Opcode = AArch64ISD::SST1_PRED;
16186 
16187       std::swap(Base, Offset);
16188     }
16189   }
16190 
16191   auto &TLI = DAG.getTargetLoweringInfo();
16192   if (!TLI.isTypeLegal(Base.getValueType()))
16193     return SDValue();
16194 
16195   // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
16197   // nxv2i64. Legalize accordingly.
16198   if (!OnlyPackedOffsets &&
16199       Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16200     Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
16201 
16202   if (!TLI.isTypeLegal(Offset.getValueType()))
16203     return SDValue();
16204 
16205   // Source value type that is representable in hardware
16206   EVT HwSrcVt = getSVEContainerType(SrcVT);
16207 
16208   // Keep the original type of the input data to store - this is needed to be
16209   // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
16210   // FP values we want the integer equivalent, so just use HwSrcVt.
16211   SDValue InputVT = DAG.getValueType(SrcVT);
16212   if (SrcVT.isFloatingPoint())
16213     InputVT = DAG.getValueType(HwSrcVt);
16214 
16215   SDVTList VTs = DAG.getVTList(MVT::Other);
16216   SDValue SrcNew;
16217 
16218   if (Src.getValueType().isFloatingPoint())
16219     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
16220   else
16221     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
16222 
16223   SDValue Ops[] = {N->getOperand(0), // Chain
16224                    SrcNew,
16225                    N->getOperand(3), // Pg
16226                    Base,
16227                    Offset,
16228                    InputVT};
16229 
16230   return DAG.getNode(Opcode, DL, VTs, Ops);
16231 }
16232 
16233 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
16234                                         unsigned Opcode,
16235                                         bool OnlyPackedOffsets = true) {
16236   const EVT RetVT = N->getValueType(0);
16237   assert(RetVT.isScalableVector() &&
16238          "Gather loads are only possible for SVE vectors");
16239 
16240   SDLoc DL(N);
16241 
16242   // Make sure that the loaded data will fit into an SVE register
16243   if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16244     return SDValue();
16245 
16246   // Depending on the addressing mode, this is either a pointer or a vector of
16247   // pointers (that fits into one register)
16248   SDValue Base = N->getOperand(3);
16249   // Depending on the addressing mode, this is either a single offset or a
16250   // vector of offsets  (that fits into one register)
16251   SDValue Offset = N->getOperand(4);
16252 
16253   // For "scalar + vector of indices", just scale the indices. This only
16254   // applies to non-temporal gathers because there's no instruction that takes
  // indices.
16256   if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
16257     Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
16258                                         RetVT.getScalarSizeInBits());
16259     Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
16260   }
16261 
16262   // In the case of non-temporal gather loads there's only one SVE instruction
16263   // per data-size: "scalar + vector", i.e.
16264   //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
16265   // Since we do have intrinsics that allow the arguments to be in a different
16266   // order, we may need to swap them to match the spec.
16267   if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
16268       Offset.getValueType().isVector())
16269     std::swap(Base, Offset);
16270 
16271   // GLD{FF}1_IMM requires that the offset is an immediate that is:
16272   //    * a multiple of #SizeInBytes,
16273   //    * in the range [0, 31 x #SizeInBytes],
16274   // where #SizeInBytes is the size in bytes of the loaded items. For
16275   // immediates outside that range and non-immediate scalar offsets use
16276   // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
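  // For example (illustrative): a gather of 64-bit elements with an immediate
  // offset of 256 exceeds 31 * 8 == 248, so the load falls back to
  // GLD1_MERGE_ZERO (or GLD1_UXTW_MERGE_ZERO for an nxv4i32 base) with Base
  // and Offset swapped.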
16277   if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
16278       Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
16279     if (!isValidImmForSVEVecImmAddrMode(Offset,
16280                                         RetVT.getScalarSizeInBits() / 8)) {
16281       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
16282         Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16283                      ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
16284                      : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
16285       else
16286         Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
16287                      ? AArch64ISD::GLD1_MERGE_ZERO
16288                      : AArch64ISD::GLDFF1_MERGE_ZERO;
16289 
16290       std::swap(Base, Offset);
16291     }
16292   }
16293 
16294   auto &TLI = DAG.getTargetLoweringInfo();
16295   if (!TLI.isTypeLegal(Base.getValueType()))
16296     return SDValue();
16297 
16298   // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
16300   // nxv2i64. Legalize accordingly.
16301   if (!OnlyPackedOffsets &&
16302       Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
16303     Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
16304 
16305   // Return value type that is representable in hardware
16306   EVT HwRetVt = getSVEContainerType(RetVT);
16307 
16308   // Keep the original output value type around - this is needed to be able to
16309   // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
  // values we want the integer equivalent, so just use HwRetVt.
16311   SDValue OutVT = DAG.getValueType(RetVT);
16312   if (RetVT.isFloatingPoint())
16313     OutVT = DAG.getValueType(HwRetVt);
16314 
16315   SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
16316   SDValue Ops[] = {N->getOperand(0), // Chain
16317                    N->getOperand(2), // Pg
16318                    Base, Offset, OutVT};
16319 
16320   SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
16321   SDValue LoadChain = SDValue(Load.getNode(), 1);
16322 
16323   if (RetVT.isInteger() && (RetVT != HwRetVt))
16324     Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
16325 
16326   // If the original return value was FP, bitcast accordingly. Doing it here
16327   // means that we can avoid adding TableGen patterns for FPs.
16328   if (RetVT.isFloatingPoint())
16329     Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
16330 
16331   return DAG.getMergeValues({Load, LoadChain}, DL);
16332 }
16333 
16334 static SDValue
16335 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16336                               SelectionDAG &DAG) {
16337   SDLoc DL(N);
16338   SDValue Src = N->getOperand(0);
16339   unsigned Opc = Src->getOpcode();
16340 
16341   // Sign extend of an unsigned unpack -> signed unpack
16342   if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
16343 
16344     unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
16345                                                : AArch64ISD::SUNPKLO;
16346 
16347     // Push the sign extend to the operand of the unpack
16348     // This is necessary where, for example, the operand of the unpack
16349     // is another unpack:
16350     // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
16351     // ->
16352     // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
16353     // ->
16354     // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
16355     SDValue ExtOp = Src->getOperand(0);
16356     auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
16357     EVT EltTy = VT.getVectorElementType();
16358     (void)EltTy;
16359 
16360     assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
16361            "Sign extending from an invalid type");
16362 
16363     EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
16364 
16365     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
16366                               ExtOp, DAG.getValueType(ExtVT));
16367 
16368     return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
16369   }
16370 
16371   if (DCI.isBeforeLegalizeOps())
16372     return SDValue();
16373 
16374   if (!EnableCombineMGatherIntrinsics)
16375     return SDValue();
16376 
16377   // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
16378   // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
16379   unsigned NewOpc;
16380   unsigned MemVTOpNum = 4;
16381   switch (Opc) {
16382   case AArch64ISD::LD1_MERGE_ZERO:
16383     NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
16384     MemVTOpNum = 3;
16385     break;
16386   case AArch64ISD::LDNF1_MERGE_ZERO:
16387     NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
16388     MemVTOpNum = 3;
16389     break;
16390   case AArch64ISD::LDFF1_MERGE_ZERO:
16391     NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
16392     MemVTOpNum = 3;
16393     break;
16394   case AArch64ISD::GLD1_MERGE_ZERO:
16395     NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
16396     break;
16397   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
16398     NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
16399     break;
16400   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
16401     NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
16402     break;
16403   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
16404     NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
16405     break;
16406   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
16407     NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
16408     break;
16409   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
16410     NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
16411     break;
16412   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
16413     NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
16414     break;
16415   case AArch64ISD::GLDFF1_MERGE_ZERO:
16416     NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
16417     break;
16418   case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
16419     NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
16420     break;
16421   case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
16422     NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
16423     break;
16424   case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
16425     NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
16426     break;
16427   case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
16428     NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
16429     break;
16430   case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
16431     NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
16432     break;
16433   case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
16434     NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
16435     break;
16436   case AArch64ISD::GLDNT1_MERGE_ZERO:
16437     NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
16438     break;
16439   default:
16440     return SDValue();
16441   }
16442 
16443   EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
16444   EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
16445 
16446   if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
16447     return SDValue();
16448 
16449   EVT DstVT = N->getValueType(0);
16450   SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
16451 
16452   SmallVector<SDValue, 5> Ops;
16453   for (unsigned I = 0; I < Src->getNumOperands(); ++I)
16454     Ops.push_back(Src->getOperand(I));
16455 
16456   SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
16457   DCI.CombineTo(N, ExtLoad);
16458   DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
16459 
16460   // Return N so it doesn't get rechecked
16461   return SDValue(N, 0);
16462 }
16463 
16464 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
16465 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
16466 /// != nxv2i32) do not need legalization.
16467 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
16468   const unsigned OffsetPos = 4;
16469   SDValue Offset = N->getOperand(OffsetPos);
16470 
16471   // Not an unpacked vector, bail out.
16472   if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
16473     return SDValue();
16474 
16475   // Extend the unpacked offset vector to 64-bit lanes.
16476   SDLoc DL(N);
16477   Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
16478   SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16479   // Replace the offset operand with the 64-bit one.
16480   Ops[OffsetPos] = Offset;
16481 
16482   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16483 }
16484 
16485 /// Combines a node carrying the intrinsic
16486 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
16487 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
16488 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// SVE gather prefetch instruction with vector plus immediate addressing mode.
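///
/// For example (illustrative): an aarch64_sve_prfh_gather_scalar_offset
/// (2-byte elements) whose scalar offset is 17 is not a multiple of 2, so the
/// node is rewritten to use aarch64_sve_prfb_gather_uxtw_index with the base
/// and offset operands swapped.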
16490 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
16491                                                unsigned ScalarSizeInBytes) {
16492   const unsigned ImmPos = 4, OffsetPos = 3;
16493   // No need to combine the node if the immediate is valid...
16494   if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
16495     return SDValue();
16496 
16497   // ...otherwise swap the offset base with the offset...
16498   SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
16499   std::swap(Ops[ImmPos], Ops[OffsetPos]);
16500   // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
16501   // `aarch64_sve_prfb_gather_uxtw_index`.
16502   SDLoc DL(N);
16503   Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
16504                            MVT::i64);
16505 
16506   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
16507 }
16508 
// Return true if the vector operation can guarantee that only the first lane
// of its result contains data, with all bits in the other lanes set to zero.
16511 static bool isLanes1toNKnownZero(SDValue Op) {
16512   switch (Op.getOpcode()) {
16513   default:
16514     return false;
16515   case AArch64ISD::ANDV_PRED:
16516   case AArch64ISD::EORV_PRED:
16517   case AArch64ISD::FADDA_PRED:
16518   case AArch64ISD::FADDV_PRED:
16519   case AArch64ISD::FMAXNMV_PRED:
16520   case AArch64ISD::FMAXV_PRED:
16521   case AArch64ISD::FMINNMV_PRED:
16522   case AArch64ISD::FMINV_PRED:
16523   case AArch64ISD::ORV_PRED:
16524   case AArch64ISD::SADDV_PRED:
16525   case AArch64ISD::SMAXV_PRED:
16526   case AArch64ISD::SMINV_PRED:
16527   case AArch64ISD::UADDV_PRED:
16528   case AArch64ISD::UMAXV_PRED:
16529   case AArch64ISD::UMINV_PRED:
16530     return true;
16531   }
16532 }
16533 
16534 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
16535   assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
16536   SDValue InsertVec = N->getOperand(0);
16537   SDValue InsertElt = N->getOperand(1);
16538   SDValue InsertIdx = N->getOperand(2);
16539 
16540   // We only care about inserts into the first element...
16541   if (!isNullConstant(InsertIdx))
16542     return SDValue();
  // ...of a zeroed vector...
16544   if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
16545     return SDValue();
16546   // ...where the inserted data was previously extracted...
16547   if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16548     return SDValue();
16549 
16550   SDValue ExtractVec = InsertElt.getOperand(0);
16551   SDValue ExtractIdx = InsertElt.getOperand(1);
16552 
16553   // ...from the first element of a vector.
16554   if (!isNullConstant(ExtractIdx))
16555     return SDValue();
16556 
16557   // If we get here we are effectively trying to zero lanes 1-N of a vector.
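  // For example (illustrative): (insert_vector_elt (splat 0),
  //   (extract_vector_elt (UADDV_PRED pg, z), 0), 0)
  // can be replaced by the UADDV_PRED result itself, since the reduction
  // already leaves lanes 1-N of its result zeroed.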
16558 
16559   // Ensure there's no type conversion going on.
16560   if (N->getValueType(0) != ExtractVec.getValueType())
16561     return SDValue();
16562 
16563   if (!isLanes1toNKnownZero(ExtractVec))
16564     return SDValue();
16565 
16566   // The explicit zeroing is redundant.
16567   return ExtractVec;
16568 }
16569 
16570 static SDValue
16571 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16572   if (SDValue Res = removeRedundantInsertVectorElt(N))
16573     return Res;
16574 
16575   return performPostLD1Combine(N, DCI, true);
16576 }
16577 
16578 SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
16579   EVT Ty = N->getValueType(0);
16580   if (Ty.isInteger())
16581     return SDValue();
16582 
16583   EVT IntTy = Ty.changeVectorElementTypeToInteger();
16584   EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
16585   if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
16586       IntTy.getVectorElementType().getScalarSizeInBits())
16587     return SDValue();
16588 
16589   SDLoc DL(N);
16590   SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
16591                                      DL, ExtIntTy);
16592   SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
16593                                      DL, ExtIntTy);
16594   SDValue Idx = N->getOperand(2);
16595   SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
16596   SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
16597   return DAG.getBitcast(Ty, Trunc);
16598 }
16599 
16600 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
16601                                                  DAGCombinerInfo &DCI) const {
16602   SelectionDAG &DAG = DCI.DAG;
16603   switch (N->getOpcode()) {
16604   default:
16605     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
16606     break;
16607   case ISD::ADD:
16608   case ISD::SUB:
16609     return performAddSubCombine(N, DCI, DAG);
16610   case ISD::XOR:
16611     return performXorCombine(N, DAG, DCI, Subtarget);
16612   case ISD::MUL:
16613     return performMulCombine(N, DAG, DCI, Subtarget);
16614   case ISD::SINT_TO_FP:
16615   case ISD::UINT_TO_FP:
16616     return performIntToFpCombine(N, DAG, Subtarget);
16617   case ISD::FP_TO_SINT:
16618   case ISD::FP_TO_UINT:
16619     return performFpToIntCombine(N, DAG, DCI, Subtarget);
16620   case ISD::FDIV:
16621     return performFDivCombine(N, DAG, DCI, Subtarget);
16622   case ISD::OR:
16623     return performORCombine(N, DCI, Subtarget);
16624   case ISD::AND:
16625     return performANDCombine(N, DCI);
16626   case ISD::SRL:
16627     return performSRLCombine(N, DCI);
16628   case ISD::INTRINSIC_WO_CHAIN:
16629     return performIntrinsicCombine(N, DCI, Subtarget);
16630   case ISD::ANY_EXTEND:
16631   case ISD::ZERO_EXTEND:
16632   case ISD::SIGN_EXTEND:
16633     return performExtendCombine(N, DCI, DAG);
16634   case ISD::SIGN_EXTEND_INREG:
16635     return performSignExtendInRegCombine(N, DCI, DAG);
16636   case ISD::TRUNCATE:
16637     return performVectorTruncateCombine(N, DCI, DAG);
16638   case ISD::CONCAT_VECTORS:
16639     return performConcatVectorsCombine(N, DCI, DAG);
16640   case ISD::SELECT:
16641     return performSelectCombine(N, DCI);
16642   case ISD::VSELECT:
16643     return performVSelectCombine(N, DCI.DAG);
16644   case ISD::SETCC:
16645     return performSETCCCombine(N, DAG);
16646   case ISD::LOAD:
16647     if (performTBISimplification(N->getOperand(1), DCI, DAG))
16648       return SDValue(N, 0);
16649     break;
16650   case ISD::STORE:
16651     return performSTORECombine(N, DCI, DAG, Subtarget);
16652   case ISD::VECTOR_SPLICE:
16653     return performSVESpliceCombine(N, DAG);
16654   case AArch64ISD::BRCOND:
16655     return performBRCONDCombine(N, DCI, DAG);
16656   case AArch64ISD::TBNZ:
16657   case AArch64ISD::TBZ:
16658     return performTBZCombine(N, DCI, DAG);
16659   case AArch64ISD::CSEL:
16660     return performCSELCombine(N, DCI, DAG);
16661   case AArch64ISD::DUP:
16662     return performPostLD1Combine(N, DCI, false);
16663   case AArch64ISD::NVCAST:
16664     return performNVCASTCombine(N);
16665   case AArch64ISD::SPLICE:
16666     return performSpliceCombine(N, DAG);
16667   case AArch64ISD::UZP1:
16668     return performUzpCombine(N, DAG);
16669   case AArch64ISD::SETCC_MERGE_ZERO:
16670     return performSetccMergeZeroCombine(N, DAG);
16671   case AArch64ISD::GLD1_MERGE_ZERO:
16672   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
16673   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
16674   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
16675   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
16676   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
16677   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
16678   case AArch64ISD::GLD1S_MERGE_ZERO:
16679   case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
16680   case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
16681   case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
16682   case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
16683   case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
16684   case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
16685     return performGLD1Combine(N, DAG);
16686   case AArch64ISD::VASHR:
16687   case AArch64ISD::VLSHR:
16688     return performVectorShiftCombine(N, *this, DCI);
16689   case ISD::INSERT_VECTOR_ELT:
16690     return performInsertVectorEltCombine(N, DCI);
16691   case ISD::EXTRACT_VECTOR_ELT:
16692     return performExtractVectorEltCombine(N, DAG);
16693   case ISD::VECREDUCE_ADD:
16694     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
16695   case ISD::INTRINSIC_VOID:
16696   case ISD::INTRINSIC_W_CHAIN:
16697     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
16698     case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
16699       return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
16700     case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
16701       return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
16702     case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
16703       return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
16704     case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
16705       return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
16706     case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
16707     case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
16708     case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
16709     case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
16710     case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
16711     case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
16712     case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
16713     case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
16714       return legalizeSVEGatherPrefetchOffsVec(N, DAG);
16715     case Intrinsic::aarch64_neon_ld2:
16716     case Intrinsic::aarch64_neon_ld3:
16717     case Intrinsic::aarch64_neon_ld4:
16718     case Intrinsic::aarch64_neon_ld1x2:
16719     case Intrinsic::aarch64_neon_ld1x3:
16720     case Intrinsic::aarch64_neon_ld1x4:
16721     case Intrinsic::aarch64_neon_ld2lane:
16722     case Intrinsic::aarch64_neon_ld3lane:
16723     case Intrinsic::aarch64_neon_ld4lane:
16724     case Intrinsic::aarch64_neon_ld2r:
16725     case Intrinsic::aarch64_neon_ld3r:
16726     case Intrinsic::aarch64_neon_ld4r:
16727     case Intrinsic::aarch64_neon_st2:
16728     case Intrinsic::aarch64_neon_st3:
16729     case Intrinsic::aarch64_neon_st4:
16730     case Intrinsic::aarch64_neon_st1x2:
16731     case Intrinsic::aarch64_neon_st1x3:
16732     case Intrinsic::aarch64_neon_st1x4:
16733     case Intrinsic::aarch64_neon_st2lane:
16734     case Intrinsic::aarch64_neon_st3lane:
16735     case Intrinsic::aarch64_neon_st4lane:
16736       return performNEONPostLDSTCombine(N, DCI, DAG);
16737     case Intrinsic::aarch64_sve_ldnt1:
16738       return performLDNT1Combine(N, DAG);
16739     case Intrinsic::aarch64_sve_ld1rq:
16740       return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
16741     case Intrinsic::aarch64_sve_ld1ro:
16742       return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
16743     case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
16744       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16745     case Intrinsic::aarch64_sve_ldnt1_gather:
16746       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16747     case Intrinsic::aarch64_sve_ldnt1_gather_index:
16748       return performGatherLoadCombine(N, DAG,
16749                                       AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
16750     case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
16751       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
16752     case Intrinsic::aarch64_sve_ld1:
16753       return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
16754     case Intrinsic::aarch64_sve_ldnf1:
16755       return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
16756     case Intrinsic::aarch64_sve_ldff1:
16757       return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
16758     case Intrinsic::aarch64_sve_st1:
16759       return performST1Combine(N, DAG);
16760     case Intrinsic::aarch64_sve_stnt1:
16761       return performSTNT1Combine(N, DAG);
16762     case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
16763       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16764     case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
16765       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16766     case Intrinsic::aarch64_sve_stnt1_scatter:
16767       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
16768     case Intrinsic::aarch64_sve_stnt1_scatter_index:
16769       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
16770     case Intrinsic::aarch64_sve_ld1_gather:
16771       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
16772     case Intrinsic::aarch64_sve_ld1_gather_index:
16773       return performGatherLoadCombine(N, DAG,
16774                                       AArch64ISD::GLD1_SCALED_MERGE_ZERO);
16775     case Intrinsic::aarch64_sve_ld1_gather_sxtw:
16776       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
16777                                       /*OnlyPackedOffsets=*/false);
16778     case Intrinsic::aarch64_sve_ld1_gather_uxtw:
16779       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
16780                                       /*OnlyPackedOffsets=*/false);
16781     case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
16782       return performGatherLoadCombine(N, DAG,
16783                                       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
16784                                       /*OnlyPackedOffsets=*/false);
16785     case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
16786       return performGatherLoadCombine(N, DAG,
16787                                       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
16788                                       /*OnlyPackedOffsets=*/false);
16789     case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
16790       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
16791     case Intrinsic::aarch64_sve_ldff1_gather:
16792       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
16793     case Intrinsic::aarch64_sve_ldff1_gather_index:
16794       return performGatherLoadCombine(N, DAG,
16795                                       AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
16796     case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
16797       return performGatherLoadCombine(N, DAG,
16798                                       AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
16799                                       /*OnlyPackedOffsets=*/false);
16800     case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
16801       return performGatherLoadCombine(N, DAG,
16802                                       AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
16803                                       /*OnlyPackedOffsets=*/false);
16804     case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
16805       return performGatherLoadCombine(N, DAG,
16806                                       AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
16807                                       /*OnlyPackedOffsets=*/false);
16808     case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
16809       return performGatherLoadCombine(N, DAG,
16810                                       AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
16811                                       /*OnlyPackedOffsets=*/false);
16812     case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
16813       return performGatherLoadCombine(N, DAG,
16814                                       AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
16815     case Intrinsic::aarch64_sve_st1_scatter:
16816       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
16817     case Intrinsic::aarch64_sve_st1_scatter_index:
16818       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
16819     case Intrinsic::aarch64_sve_st1_scatter_sxtw:
16820       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
16821                                         /*OnlyPackedOffsets=*/false);
16822     case Intrinsic::aarch64_sve_st1_scatter_uxtw:
16823       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
16824                                         /*OnlyPackedOffsets=*/false);
16825     case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
16826       return performScatterStoreCombine(N, DAG,
16827                                         AArch64ISD::SST1_SXTW_SCALED_PRED,
16828                                         /*OnlyPackedOffsets=*/false);
16829     case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
16830       return performScatterStoreCombine(N, DAG,
16831                                         AArch64ISD::SST1_UXTW_SCALED_PRED,
16832                                         /*OnlyPackedOffsets=*/false);
16833     case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
16834       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
16835     case Intrinsic::aarch64_sve_tuple_get: {
16836       SDLoc DL(N);
16837       SDValue Chain = N->getOperand(0);
16838       SDValue Src1 = N->getOperand(2);
16839       SDValue Idx = N->getOperand(3);
16840 
16841       uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16842       EVT ResVT = N->getValueType(0);
16843       uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
16844       SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
16845       SDValue Val =
16846           DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
16847       return DAG.getMergeValues({Val, Chain}, DL);
16848     }
16849     case Intrinsic::aarch64_sve_tuple_set: {
16850       SDLoc DL(N);
16851       SDValue Chain = N->getOperand(0);
16852       SDValue Tuple = N->getOperand(2);
16853       SDValue Idx = N->getOperand(3);
16854       SDValue Vec = N->getOperand(4);
16855 
16856       EVT TupleVT = Tuple.getValueType();
16857       uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
16858 
16859       uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
16860       uint64_t NumLanes =
16861           Vec.getValueType().getVectorElementCount().getKnownMinValue();
16862 
16863       if ((TupleLanes % NumLanes) != 0)
16864         report_fatal_error("invalid tuple vector!");
16865 
16866       uint64_t NumVecs = TupleLanes / NumLanes;
16867 
16868       SmallVector<SDValue, 4> Opnds;
16869       for (unsigned I = 0; I < NumVecs; ++I) {
16870         if (I == IdxConst)
16871           Opnds.push_back(Vec);
16872         else {
16873           SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
16874           Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
16875                                       Vec.getValueType(), Tuple, ExtIdx));
16876         }
16877       }
16878       SDValue Concat =
16879           DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
16880       return DAG.getMergeValues({Concat, Chain}, DL);
16881     }
16882     case Intrinsic::aarch64_sve_tuple_create2:
16883     case Intrinsic::aarch64_sve_tuple_create3:
16884     case Intrinsic::aarch64_sve_tuple_create4: {
16885       SDLoc DL(N);
16886       SDValue Chain = N->getOperand(0);
16887 
16888       SmallVector<SDValue, 4> Opnds;
16889       for (unsigned I = 2; I < N->getNumOperands(); ++I)
16890         Opnds.push_back(N->getOperand(I));
16891 
16892       EVT VT = Opnds[0].getValueType();
16893       EVT EltVT = VT.getVectorElementType();
16894       EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
16895                                     VT.getVectorElementCount() *
16896                                         (N->getNumOperands() - 2));
16897       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
16898       return DAG.getMergeValues({Concat, Chain}, DL);
16899     }
16900     case Intrinsic::aarch64_sve_ld2:
16901     case Intrinsic::aarch64_sve_ld3:
16902     case Intrinsic::aarch64_sve_ld4: {
16903       SDLoc DL(N);
16904       SDValue Chain = N->getOperand(0);
16905       SDValue Mask = N->getOperand(2);
16906       SDValue BasePtr = N->getOperand(3);
16907       SDValue LoadOps[] = {Chain, Mask, BasePtr};
16908       unsigned IntrinsicID =
16909           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16910       SDValue Result =
16911           LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
16912       return DAG.getMergeValues({Result, Chain}, DL);
16913     }
16914     case Intrinsic::aarch64_rndr:
16915     case Intrinsic::aarch64_rndrrs: {
16916       unsigned IntrinsicID =
16917           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
16918       auto Register =
16919           (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
16920                                                   : AArch64SysReg::RNDRRS);
16921       SDLoc DL(N);
16922       SDValue A = DAG.getNode(
16923           AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
16924           N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
16925       SDValue B = DAG.getNode(
16926           AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
16927           DAG.getConstant(0, DL, MVT::i32),
16928           DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
16929       return DAG.getMergeValues(
16930           {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
16931     }
16932     default:
16933       break;
16934     }
16935     break;
16936   case ISD::GlobalAddress:
16937     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
16938   }
16939   return SDValue();
16940 }
16941 
// Check if the return value is used only as a return value, as otherwise
16943 // we can't perform a tail-call. In particular, we need to check for
16944 // target ISD nodes that are returns and any other "odd" constructs
16945 // that the generic analysis code won't necessarily catch.
16946 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
16947                                                SDValue &Chain) const {
16948   if (N->getNumValues() != 1)
16949     return false;
16950   if (!N->hasNUsesOfValue(1, 0))
16951     return false;
16952 
16953   SDValue TCChain = Chain;
16954   SDNode *Copy = *N->use_begin();
16955   if (Copy->getOpcode() == ISD::CopyToReg) {
16956     // If the copy has a glue operand, we conservatively assume it isn't safe to
16957     // perform a tail call.
16958     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
16959         MVT::Glue)
16960       return false;
16961     TCChain = Copy->getOperand(0);
16962   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
16963     return false;
16964 
16965   bool HasRet = false;
16966   for (SDNode *Node : Copy->uses()) {
16967     if (Node->getOpcode() != AArch64ISD::RET_FLAG)
16968       return false;
16969     HasRet = true;
16970   }
16971 
16972   if (!HasRet)
16973     return false;
16974 
16975   Chain = TCChain;
16976   return true;
16977 }
16978 
// Return whether an instruction can potentially be optimized to a tail
16980 // call. This will cause the optimizers to attempt to move, or duplicate,
16981 // return instructions to help enable tail call optimizations for this
16982 // instruction.
16983 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
16984   return CI->isTailCall();
16985 }
16986 
16987 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
16988                                                    SDValue &Offset,
16989                                                    ISD::MemIndexedMode &AM,
16990                                                    bool &IsInc,
16991                                                    SelectionDAG &DAG) const {
16992   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
16993     return false;
16994 
16995   Base = Op->getOperand(0);
16996   // All of the indexed addressing mode instructions take a signed
  // 9-bit immediate offset.
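  // For example (illustrative only), that range covers the pre-indexed form
  // "ldr x0, [x1, #16]!" and the post-indexed form "ldr x0, [x1], #16", both
  // of which write the updated address back to x1 as part of the load.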
16998   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
16999     int64_t RHSC = RHS->getSExtValue();
17000     if (Op->getOpcode() == ISD::SUB)
17001       RHSC = -(uint64_t)RHSC;
17002     if (!isInt<9>(RHSC))
17003       return false;
17004     IsInc = (Op->getOpcode() == ISD::ADD);
17005     Offset = Op->getOperand(1);
17006     return true;
17007   }
17008   return false;
17009 }
17010 
17011 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
17012                                                       SDValue &Offset,
17013                                                       ISD::MemIndexedMode &AM,
17014                                                       SelectionDAG &DAG) const {
17015   EVT VT;
17016   SDValue Ptr;
17017   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17018     VT = LD->getMemoryVT();
17019     Ptr = LD->getBasePtr();
17020   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17021     VT = ST->getMemoryVT();
17022     Ptr = ST->getBasePtr();
17023   } else
17024     return false;
17025 
17026   bool IsInc;
17027   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
17028     return false;
17029   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
17030   return true;
17031 }
17032 
17033 bool AArch64TargetLowering::getPostIndexedAddressParts(
17034     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
17035     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
17036   EVT VT;
17037   SDValue Ptr;
17038   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17039     VT = LD->getMemoryVT();
17040     Ptr = LD->getBasePtr();
17041   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17042     VT = ST->getMemoryVT();
17043     Ptr = ST->getBasePtr();
17044   } else
17045     return false;
17046 
17047   bool IsInc;
17048   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
17049     return false;
17050   // Post-indexing updates the base, so it's not a valid transform
17051   // if that's not the same as the load's pointer.
17052   if (Ptr != Base)
17053     return false;
17054   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
17055   return true;
17056 }
17057 
17058 void AArch64TargetLowering::ReplaceBITCASTResults(
17059     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17060   SDLoc DL(N);
17061   SDValue Op = N->getOperand(0);
17062   EVT VT = N->getValueType(0);
17063   EVT SrcVT = Op.getValueType();
17064 
17065   if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
17066     assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
17067            "Expected fp->int bitcast!");
17068     SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
17069     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
17070     return;
17071   }
17072 
17073   if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
17074     return;
17075 
17076   Op = SDValue(
17077       DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
17078                          DAG.getUNDEF(MVT::i32), Op,
17079                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
17080       0);
17081   Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
17082   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
17083 }
17084 
17085 static void ReplaceReductionResults(SDNode *N,
17086                                     SmallVectorImpl<SDValue> &Results,
17087                                     SelectionDAG &DAG, unsigned InterOp,
17088                                     unsigned AcrossOp) {
17089   EVT LoVT, HiVT;
17090   SDValue Lo, Hi;
17091   SDLoc dl(N);
17092   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
17093   std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
17094   SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
17095   SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
17096   Results.push_back(SplitVal);
17097 }
17098 
17099 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
17100   SDLoc DL(N);
17101   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
17102   SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
17103                            DAG.getNode(ISD::SRL, DL, MVT::i128, N,
17104                                        DAG.getConstant(64, DL, MVT::i64)));
17105   return std::make_pair(Lo, Hi);
17106 }
17107 
17108 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
17109     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17110   SDValue In = N->getOperand(0);
17111   EVT InVT = In.getValueType();
17112 
17113   // Common code will handle these just fine.
17114   if (!InVT.isScalableVector() || !InVT.isInteger())
17115     return;
17116 
17117   SDLoc DL(N);
17118   EVT VT = N->getValueType(0);
17119 
17120   // The following checks bail if this is not a halving operation.
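  // For example (illustrative), an EXTRACT_SUBVECTOR taking the low half of a
  // scalable integer vector is replaced below by UUNPKLO, which zero-extends
  // the low half into the widened half type, followed by a TRUNCATE back to
  // the requested element type; the high half uses UUNPKHI instead.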
17121 
17122   ElementCount ResEC = VT.getVectorElementCount();
17123 
17124   if (InVT.getVectorElementCount() != (ResEC * 2))
17125     return;
17126 
17127   auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
17128   if (!CIndex)
17129     return;
17130 
17131   unsigned Index = CIndex->getZExtValue();
17132   if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
17133     return;
17134 
17135   unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
17136   EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
17137 
17138   SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
17139   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
17140 }
17141 
17142 // Create an even/odd pair of X registers holding integer value V.
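// The result is an untyped XSeqPairs register pair (an even/odd pair such as
// X0_X1), which is the operand form expected by the 128-bit CASP instructions
// used below. (Illustrative note; the pair is built via REG_SEQUENCE.)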
17143 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
17144   SDLoc dl(V.getNode());
17145   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
17146   SDValue VHi = DAG.getAnyExtOrTrunc(
17147       DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
17148       dl, MVT::i64);
17149   if (DAG.getDataLayout().isBigEndian())
    std::swap(VLo, VHi);
17151   SDValue RegClass =
17152       DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
17153   SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
17154   SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
17155   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
17156   return SDValue(
17157       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
17158 }
17159 
17160 static void ReplaceCMP_SWAP_128Results(SDNode *N,
17161                                        SmallVectorImpl<SDValue> &Results,
17162                                        SelectionDAG &DAG,
17163                                        const AArch64Subtarget *Subtarget) {
17164   assert(N->getValueType(0) == MVT::i128 &&
17165          "AtomicCmpSwap on types less than 128 should be legal");
17166 
17167   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
17168   if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
17169     // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
17170     // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
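    // For example (sketch; the allocator picks the actual registers), an
    // acq_rel i128 cmpxchg becomes "caspal x0, x1, x2, x3, [x4]", with the
    // compare value and the new value each held in an even/odd register pair.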
17171     SDValue Ops[] = {
17172         createGPRPairNode(DAG, N->getOperand(2)), // Compare value
17173         createGPRPairNode(DAG, N->getOperand(3)), // Store value
17174         N->getOperand(1), // Ptr
17175         N->getOperand(0), // Chain in
17176     };
17177 
17178     unsigned Opcode;
17179     switch (MemOp->getMergedOrdering()) {
17180     case AtomicOrdering::Monotonic:
17181       Opcode = AArch64::CASPX;
17182       break;
17183     case AtomicOrdering::Acquire:
17184       Opcode = AArch64::CASPAX;
17185       break;
17186     case AtomicOrdering::Release:
17187       Opcode = AArch64::CASPLX;
17188       break;
17189     case AtomicOrdering::AcquireRelease:
17190     case AtomicOrdering::SequentiallyConsistent:
17191       Opcode = AArch64::CASPALX;
17192       break;
17193     default:
17194       llvm_unreachable("Unexpected ordering!");
17195     }
17196 
17197     MachineSDNode *CmpSwap = DAG.getMachineNode(
17198         Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
17199     DAG.setNodeMemRefs(CmpSwap, {MemOp});
17200 
17201     unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
17202     if (DAG.getDataLayout().isBigEndian())
17203       std::swap(SubReg1, SubReg2);
17204     SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
17205                                             SDValue(CmpSwap, 0));
17206     SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
17207                                             SDValue(CmpSwap, 0));
17208     Results.push_back(
17209         DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
17210     Results.push_back(SDValue(CmpSwap, 1)); // Chain out
17211     return;
17212   }
17213 
17214   unsigned Opcode;
17215   switch (MemOp->getMergedOrdering()) {
17216   case AtomicOrdering::Monotonic:
17217     Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
17218     break;
17219   case AtomicOrdering::Acquire:
17220     Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
17221     break;
17222   case AtomicOrdering::Release:
17223     Opcode = AArch64::CMP_SWAP_128_RELEASE;
17224     break;
17225   case AtomicOrdering::AcquireRelease:
17226   case AtomicOrdering::SequentiallyConsistent:
17227     Opcode = AArch64::CMP_SWAP_128;
17228     break;
17229   default:
17230     llvm_unreachable("Unexpected ordering!");
17231   }
17232 
17233   auto Desired = splitInt128(N->getOperand(2), DAG);
17234   auto New = splitInt128(N->getOperand(3), DAG);
17235   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
17236                    New.first,        New.second,    N->getOperand(0)};
17237   SDNode *CmpSwap = DAG.getMachineNode(
17238       Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
17239       Ops);
17240   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
17241 
17242   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
17243                                 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
17244   Results.push_back(SDValue(CmpSwap, 3));
17245 }
17246 
17247 void AArch64TargetLowering::ReplaceNodeResults(
17248     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
17249   switch (N->getOpcode()) {
17250   default:
17251     llvm_unreachable("Don't know how to custom expand this");
17252   case ISD::BITCAST:
17253     ReplaceBITCASTResults(N, Results, DAG);
17254     return;
17255   case ISD::VECREDUCE_ADD:
17256   case ISD::VECREDUCE_SMAX:
17257   case ISD::VECREDUCE_SMIN:
17258   case ISD::VECREDUCE_UMAX:
17259   case ISD::VECREDUCE_UMIN:
17260     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
17261     return;
17262 
17263   case ISD::CTPOP:
17264     if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
17265       Results.push_back(Result);
17266     return;
17267   case AArch64ISD::SADDV:
17268     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
17269     return;
17270   case AArch64ISD::UADDV:
17271     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
17272     return;
17273   case AArch64ISD::SMINV:
17274     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
17275     return;
17276   case AArch64ISD::UMINV:
17277     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
17278     return;
17279   case AArch64ISD::SMAXV:
17280     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
17281     return;
17282   case AArch64ISD::UMAXV:
17283     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
17284     return;
17285   case ISD::FP_TO_UINT:
17286   case ISD::FP_TO_SINT:
17287     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
17288     // Let normal code take care of it by not adding anything to Results.
17289     return;
17290   case ISD::ATOMIC_CMP_SWAP:
17291     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
17292     return;
17293   case ISD::LOAD: {
17294     assert(SDValue(N, 0).getValueType() == MVT::i128 &&
17295            "unexpected load's value type");
17296     LoadSDNode *LoadNode = cast<LoadSDNode>(N);
17297     if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
17298       // Non-volatile loads are optimized later in AArch64's load/store
17299       // optimizer.
17300       return;
17301     }
17302 
17303     SDValue Result = DAG.getMemIntrinsicNode(
17304         AArch64ISD::LDP, SDLoc(N),
17305         DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
17306         {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
17307         LoadNode->getMemOperand());
17308 
17309     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
17310                                Result.getValue(0), Result.getValue(1));
17311     Results.append({Pair, Result.getValue(2) /* Chain */});
17312     return;
17313   }
17314   case ISD::EXTRACT_SUBVECTOR:
17315     ReplaceExtractSubVectorResults(N, Results, DAG);
17316     return;
17317   case ISD::INSERT_SUBVECTOR:
    // Custom lowering has been requested for INSERT_SUBVECTOR, but delegate
    // to common code for result type legalisation.
17320     return;
17321   case ISD::INTRINSIC_WO_CHAIN: {
17322     EVT VT = N->getValueType(0);
17323     assert((VT == MVT::i8 || VT == MVT::i16) &&
17324            "custom lowering for unexpected type");
17325 
17326     ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
17327     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
17328     switch (IntID) {
17329     default:
17330       return;
17331     case Intrinsic::aarch64_sve_clasta_n: {
17332       SDLoc DL(N);
17333       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17334       auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
17335                            N->getOperand(1), Op2, N->getOperand(3));
17336       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17337       return;
17338     }
17339     case Intrinsic::aarch64_sve_clastb_n: {
17340       SDLoc DL(N);
17341       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
17342       auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
17343                            N->getOperand(1), Op2, N->getOperand(3));
17344       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17345       return;
17346     }
17347     case Intrinsic::aarch64_sve_lasta: {
17348       SDLoc DL(N);
17349       auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
17350                            N->getOperand(1), N->getOperand(2));
17351       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17352       return;
17353     }
17354     case Intrinsic::aarch64_sve_lastb: {
17355       SDLoc DL(N);
17356       auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
17357                            N->getOperand(1), N->getOperand(2));
17358       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
17359       return;
17360     }
17361     }
17362   }
17363   }
17364 }
17365 
17366 bool AArch64TargetLowering::useLoadStackGuardNode() const {
17367   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
17368     return TargetLowering::useLoadStackGuardNode();
17369   return true;
17370 }
17371 
17372 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
17373   // Combine multiple FDIVs with the same divisor into multiple FMULs by the
17374   // reciprocal if there are three or more FDIVs.
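  // For example, {a/d, b/d, c/d} can become r = 1.0/d followed by
  // {a*r, b*r, c*r}, trading three divides for one divide and three
  // multiplies (subject to the usual fast-math constraints).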
17375   return 3;
17376 }
17377 
17378 TargetLoweringBase::LegalizeTypeAction
17379 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
  // to v8i8, v4i16, v2i32 and v2f32 respectively, instead of promoting.
17382   if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
17383       VT == MVT::v1f32)
17384     return TypeWidenVector;
17385 
17386   return TargetLoweringBase::getPreferredVectorAction(VT);
17387 }
17388 
// Loads and stores less than 128 bits are already atomic; ones above that
17390 // are doomed anyway, so defer to the default libcall and blame the OS when
17391 // things go wrong.
17392 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17393   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
17394   return Size == 128;
17395 }
17396 
// Loads and stores less than 128 bits are already atomic; ones above that
17398 // are doomed anyway, so defer to the default libcall and blame the OS when
17399 // things go wrong.
17400 TargetLowering::AtomicExpansionKind
17401 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17402   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
17403   return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
17404 }
17405 
// For the real atomic operations, we have ldxr/stxr up to 128 bits.
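// As an illustrative summary: on an LSE-capable target a 32-bit "atomicrmw
// add" is left intact and later selects to an LSE instruction such as LDADD,
// while without LSE it is expanded to an LL/SC (LDXR/STXR) loop, or to a CAS
// loop at -O0 for the reason described below.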
17407 TargetLowering::AtomicExpansionKind
17408 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
17409   if (AI->isFloatingPointOperation())
17410     return AtomicExpansionKind::CmpXChg;
17411 
17412   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
17413   if (Size > 128) return AtomicExpansionKind::None;
17414 
17415   // Nand is not supported in LSE.
17416   // Leave 128 bits to LLSC or CmpXChg.
17417   if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
17418     if (Subtarget->hasLSE())
17419       return AtomicExpansionKind::None;
17420     if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
17422       // Don't outline them unless
17423       // (1) high level <atomic> support approved:
17424       //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
17425       // (2) low level libgcc and compiler-rt support implemented by:
17426       //   min/max outline atomics helpers
17427       if (AI->getOperation() != AtomicRMWInst::Min &&
17428           AI->getOperation() != AtomicRMWInst::Max &&
17429           AI->getOperation() != AtomicRMWInst::UMin &&
17430           AI->getOperation() != AtomicRMWInst::UMax) {
17431         return AtomicExpansionKind::None;
17432       }
17433     }
17434   }
17435 
17436   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17437   // implement atomicrmw without spilling. If the target address is also on the
17438   // stack and close enough to the spill slot, this can lead to a situation
17439   // where the monitor always gets cleared and the atomic operation can never
17440   // succeed. So at -O0 lower this operation to a CAS loop.
17441   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17442     return AtomicExpansionKind::CmpXChg;
17443 
17444   return AtomicExpansionKind::LLSC;
17445 }
17446 
17447 TargetLowering::AtomicExpansionKind
17448 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
17449     AtomicCmpXchgInst *AI) const {
17450   // If subtarget has LSE, leave cmpxchg intact for codegen.
17451   if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
17452     return AtomicExpansionKind::None;
17453   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
17454   // implement cmpxchg without spilling. If the address being exchanged is also
17455   // on the stack and close enough to the spill slot, this can lead to a
17456   // situation where the monitor always gets cleared and the atomic operation
17457   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
17458   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
17459     return AtomicExpansionKind::None;
17460 
17461   // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
17462   // it.
17463   unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
17464   if (Size > 64)
17465     return AtomicExpansionKind::None;
17466 
17467   return AtomicExpansionKind::LLSC;
17468 }
17469 
17470 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
17471                                              Type *ValueTy, Value *Addr,
17472                                              AtomicOrdering Ord) const {
17473   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17474   bool IsAcquire = isAcquireOrStronger(Ord);
17475 
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // and ldaxp intrinsics must return {i64, i64} and we have to recombine the
  // two halves into a single i128 here.
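  // A minimal sketch (value names illustrative): the builder emits
  //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
  // and then zero-extends both halves to i128 and ORs in the high half,
  // shifted left by 64, to form the full value.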
17479   if (ValueTy->getPrimitiveSizeInBits() == 128) {
17480     Intrinsic::ID Int =
17481         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
17482     Function *Ldxr = Intrinsic::getDeclaration(M, Int);
17483 
17484     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17485     Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
17486 
17487     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
17488     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
17489     Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
17490     Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
17491     return Builder.CreateOr(
17492         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
17493   }
17494 
17495   Type *Tys[] = { Addr->getType() };
17496   Intrinsic::ID Int =
17497       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
17498   Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
17499 
17500   const DataLayout &DL = M->getDataLayout();
17501   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
17502   Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
17503 
17504   return Builder.CreateBitCast(Trunc, ValueTy);
17505 }
17506 
17507 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
17508     IRBuilderBase &Builder) const {
17509   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17510   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
17511 }
17512 
17513 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
17514                                                    Value *Val, Value *Addr,
17515                                                    AtomicOrdering Ord) const {
17516   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
17517   bool IsRelease = isReleaseOrStronger(Ord);
17518 
17519   // Since the intrinsics must have legal type, the i128 intrinsics take two
17520   // parameters: "i64, i64". We must marshal Val into the appropriate form
17521   // before the call.
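  // A minimal sketch (value names illustrative): Val is split with trunc and
  // lshr-by-64 into two i64 halves, which are passed as
  //   call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, i8* %addr)
  // (or @llvm.aarch64.stxp for a non-release store).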
17522   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
17523     Intrinsic::ID Int =
17524         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
17525     Function *Stxr = Intrinsic::getDeclaration(M, Int);
17526     Type *Int64Ty = Type::getInt64Ty(M->getContext());
17527 
17528     Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
17529     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
17530     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
17531     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
17532   }
17533 
17534   Intrinsic::ID Int =
17535       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
17536   Type *Tys[] = { Addr->getType() };
17537   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
17538 
17539   const DataLayout &DL = M->getDataLayout();
17540   IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
17541   Val = Builder.CreateBitCast(Val, IntValTy);
17542 
17543   return Builder.CreateCall(Stxr,
17544                             {Builder.CreateZExtOrBitCast(
17545                                  Val, Stxr->getFunctionType()->getParamType(0)),
17546                              Addr});
17547 }
17548 
17549 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
17550     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
17551     const DataLayout &DL) const {
17552   if (!Ty->isArrayTy()) {
17553     const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
17554     return TySize.isScalable() && TySize.getKnownMinSize() > 128;
17555   }
17556 
  // All non-aggregate members of the type must have the same type.
17558   SmallVector<EVT> ValueVTs;
17559   ComputeValueVTs(*this, DL, Ty, ValueVTs);
17560   return is_splat(ValueVTs);
17561 }
17562 
17563 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
17564                                                             EVT) const {
17565   return false;
17566 }
17567 
17568 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
17569   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
17570   Function *ThreadPointerFunc =
17571       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
17572   return IRB.CreatePointerCast(
17573       IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
17574                              Offset),
17575       IRB.getInt8PtrTy()->getPointerTo(0));
17576 }
17577 
17578 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
17579   // Android provides a fixed TLS slot for the stack cookie. See the definition
17580   // of TLS_SLOT_STACK_GUARD in
17581   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17582   if (Subtarget->isTargetAndroid())
17583     return UseTlsOffset(IRB, 0x28);
17584 
17585   // Fuchsia is similar.
17586   // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
17587   if (Subtarget->isTargetFuchsia())
17588     return UseTlsOffset(IRB, -0x10);
17589 
17590   return TargetLowering::getIRStackGuard(IRB);
17591 }
17592 
17593 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
17594   // MSVC CRT provides functionalities for stack protection.
17595   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
17596     // MSVC CRT has a global variable holding security cookie.
17597     M.getOrInsertGlobal("__security_cookie",
17598                         Type::getInt8PtrTy(M.getContext()));
17599 
17600     // MSVC CRT has a function to validate security cookie.
17601     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
17602         "__security_check_cookie", Type::getVoidTy(M.getContext()),
17603         Type::getInt8PtrTy(M.getContext()));
17604     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
17605       F->setCallingConv(CallingConv::Win64);
17606       F->addAttribute(1, Attribute::AttrKind::InReg);
17607     }
17608     return;
17609   }
17610   TargetLowering::insertSSPDeclarations(M);
17611 }
17612 
17613 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
17614   // MSVC CRT has a global variable holding security cookie.
17615   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17616     return M.getGlobalVariable("__security_cookie");
17617   return TargetLowering::getSDagStackGuard(M);
17618 }
17619 
17620 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
17621   // MSVC CRT has a function to validate security cookie.
17622   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
17623     return M.getFunction("__security_check_cookie");
17624   return TargetLowering::getSSPStackGuardCheck(M);
17625 }
17626 
17627 Value *
17628 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
17629   // Android provides a fixed TLS slot for the SafeStack pointer. See the
17630   // definition of TLS_SLOT_SAFESTACK in
17631   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
17632   if (Subtarget->isTargetAndroid())
17633     return UseTlsOffset(IRB, 0x48);
17634 
17635   // Fuchsia is similar.
17636   // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
17637   if (Subtarget->isTargetFuchsia())
17638     return UseTlsOffset(IRB, -0x8);
17639 
17640   return TargetLowering::getSafeStackPointerLocation(IRB);
17641 }
17642 
17643 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
17644     const Instruction &AndI) const {
17645   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // this is likely to fold the and/cmp/br into a single tbz instruction.  It
17647   // may be beneficial to sink in other cases, but we would have to check that
17648   // the cmp would not get folded into the br to form a cbz for these to be
17649   // beneficial.
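  // For example, "(x & 0x10) == 0" feeding a conditional branch can become a
  // single "tbz x0, #4, <label>" (illustrative; the final selection depends
  // on the surrounding code).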
17650   ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
17651   if (!Mask)
17652     return false;
17653   return Mask->getValue().isPowerOf2();
17654 }
17655 
17656 bool AArch64TargetLowering::
17657     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
17658         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
17659         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
17660         SelectionDAG &DAG) const {
17661   // Does baseline recommend not to perform the fold by default?
17662   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
17663           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
17664     return false;
17665   // Else, if this is a vector shift, prefer 'shl'.
17666   return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
17667 }
17668 
17669 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
17670                                               SDNode *N) const {
17671   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
17672       !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
17673     return false;
17674   return true;
17675 }
17676 
17677 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
17679   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
17680   AFI->setIsSplitCSR(true);
17681 }
17682 
17683 void AArch64TargetLowering::insertCopiesSplitCSR(
17684     MachineBasicBlock *Entry,
17685     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
17686   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17687   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
17688   if (!IStart)
17689     return;
17690 
17691   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
17692   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
17693   MachineBasicBlock::iterator MBBI = Entry->begin();
17694   for (const MCPhysReg *I = IStart; *I; ++I) {
17695     const TargetRegisterClass *RC = nullptr;
17696     if (AArch64::GPR64RegClass.contains(*I))
17697       RC = &AArch64::GPR64RegClass;
17698     else if (AArch64::FPR64RegClass.contains(*I))
17699       RC = &AArch64::FPR64RegClass;
17700     else
17701       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
17702 
17703     Register NewVR = MRI->createVirtualRegister(RC);
17704     // Create copy from CSR to a virtual register.
17705     // FIXME: this currently does not emit CFI pseudo-instructions, it works
17706     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
17707     // nounwind. If we want to generalize this later, we may need to emit
17708     // CFI pseudo-instructions.
17709     assert(Entry->getParent()->getFunction().hasFnAttribute(
17710                Attribute::NoUnwind) &&
17711            "Function should be nounwind in insertCopiesSplitCSR!");
17712     Entry->addLiveIn(*I);
17713     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
17714         .addReg(*I);
17715 
17716     // Insert the copy-back instructions right before the terminator.
17717     for (auto *Exit : Exits)
17718       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
17719               TII->get(TargetOpcode::COPY), *I)
17720           .addReg(NewVR);
17721   }
17722 }
17723 
17724 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
17725   // Integer division on AArch64 is expensive. However, when aggressively
17726   // optimizing for code size, we prefer to use a div instruction, as it is
17727   // usually smaller than the alternative sequence.
17728   // The exception to this is vector division. Since AArch64 doesn't have vector
17729   // integer division, leaving the division as-is is a loss even in terms of
17730   // size, because it will have to be scalarized, while the alternative code
17731   // sequence can be performed in vector form.
17732   bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
17733   return OptSize && !VT.isVector();
17734 }
17735 
17736 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
17737   // We want inc-of-add for scalars and sub-of-not for vectors.
17738   return VT.isScalarInteger();
17739 }
17740 
17741 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
17742   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
17743 }
17744 
17745 unsigned
17746 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
17747   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
17748     return getPointerTy(DL).getSizeInBits();
17749 
17750   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
17751 }
17752 
17753 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
17754   MF.getFrameInfo().computeMaxCallFrameSize(MF);
17755   TargetLoweringBase::finalizeLowering(MF);
17756 }
17757 
17758 // Unlike X86, we let frame lowering assign offsets to all catch objects.
17759 bool AArch64TargetLowering::needsFixedCatchObjects() const {
17760   return false;
17761 }
17762 
17763 bool AArch64TargetLowering::shouldLocalize(
17764     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
17765   switch (MI.getOpcode()) {
17766   case TargetOpcode::G_GLOBAL_VALUE: {
17767     // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
17770     const GlobalValue &GV = *MI.getOperand(1).getGlobal();
17771     if (GV.isThreadLocal() && Subtarget->isTargetMachO())
17772       return false;
17773     break;
17774   }
17775   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
17776   // localizable.
17777   case AArch64::ADRP:
17778   case AArch64::G_ADD_LOW:
17779     return true;
17780   default:
17781     break;
17782   }
17783   return TargetLoweringBase::shouldLocalize(MI, TTI);
17784 }
17785 
17786 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
17787   if (isa<ScalableVectorType>(Inst.getType()))
17788     return true;
17789 
17790   for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
17791     if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
17792       return true;
17793 
17794   if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
17795     if (isa<ScalableVectorType>(AI->getAllocatedType()))
17796       return true;
17797   }
17798 
17799   return false;
17800 }
17801 
17802 // Return the largest legal scalable vector type that matches VT's element type.
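// For example, a fixed v8i32 maps to nxv4i32 and a fixed v4f16 maps to
// nxv8f16; only the element type matters, not the fixed element count.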
17803 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
17804   assert(VT.isFixedLengthVector() &&
17805          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17806          "Expected legal fixed length vector!");
17807   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17808   default:
17809     llvm_unreachable("unexpected element type for SVE container");
17810   case MVT::i8:
17811     return EVT(MVT::nxv16i8);
17812   case MVT::i16:
17813     return EVT(MVT::nxv8i16);
17814   case MVT::i32:
17815     return EVT(MVT::nxv4i32);
17816   case MVT::i64:
17817     return EVT(MVT::nxv2i64);
17818   case MVT::f16:
17819     return EVT(MVT::nxv8f16);
17820   case MVT::f32:
17821     return EVT(MVT::nxv4f32);
17822   case MVT::f64:
17823     return EVT(MVT::nxv2f64);
17824   }
17825 }
17826 
17827 // Return a PTRUE with active lanes corresponding to the extent of VT.
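// For example, v8i32 yields an nxv4i1 predicate built as PTRUE with pattern
// vl8, i.e. roughly "ptrue p0.s, vl8" (illustrative; the actual predicate
// register is chosen by the register allocator).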
17828 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
17829                                                 EVT VT) {
17830   assert(VT.isFixedLengthVector() &&
17831          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17832          "Expected legal fixed length vector!");
17833 
17834   int PgPattern;
17835   switch (VT.getVectorNumElements()) {
17836   default:
17837     llvm_unreachable("unexpected element count for SVE predicate");
17838   case 1:
17839     PgPattern = AArch64SVEPredPattern::vl1;
17840     break;
17841   case 2:
17842     PgPattern = AArch64SVEPredPattern::vl2;
17843     break;
17844   case 4:
17845     PgPattern = AArch64SVEPredPattern::vl4;
17846     break;
17847   case 8:
17848     PgPattern = AArch64SVEPredPattern::vl8;
17849     break;
17850   case 16:
17851     PgPattern = AArch64SVEPredPattern::vl16;
17852     break;
17853   case 32:
17854     PgPattern = AArch64SVEPredPattern::vl32;
17855     break;
17856   case 64:
17857     PgPattern = AArch64SVEPredPattern::vl64;
17858     break;
17859   case 128:
17860     PgPattern = AArch64SVEPredPattern::vl128;
17861     break;
17862   case 256:
17863     PgPattern = AArch64SVEPredPattern::vl256;
17864     break;
17865   }
17866 
17867   // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
17868   // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
17869   // variants of instructions when available.
17870 
17871   MVT MaskVT;
17872   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
17873   default:
17874     llvm_unreachable("unexpected element type for SVE predicate");
17875   case MVT::i8:
17876     MaskVT = MVT::nxv16i1;
17877     break;
17878   case MVT::i16:
17879   case MVT::f16:
17880     MaskVT = MVT::nxv8i1;
17881     break;
17882   case MVT::i32:
17883   case MVT::f32:
17884     MaskVT = MVT::nxv4i1;
17885     break;
17886   case MVT::i64:
17887   case MVT::f64:
17888     MaskVT = MVT::nxv2i1;
17889     break;
17890   }
17891 
17892   return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
17893                      DAG.getTargetConstant(PgPattern, DL, MVT::i64));
17894 }
17895 
17896 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
17897                                              EVT VT) {
17898   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
17899          "Expected legal scalable vector!");
17900   auto PredTy = VT.changeVectorElementType(MVT::i1);
17901   return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
17902 }
17903 
17904 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
17905   if (VT.isFixedLengthVector())
17906     return getPredicateForFixedLengthVector(DAG, DL, VT);
17907 
17908   return getPredicateForScalableVector(DAG, DL, VT);
17909 }
17910 
17911 // Grow V to consume an entire SVE register.
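// For example, a fixed v4i32 value becomes an nxv4i32 by inserting it at
// element index 0 of an undef scalable vector; the remaining lanes are
// undefined and are expected to be masked off by the caller's predicate.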
17912 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
17913   assert(VT.isScalableVector() &&
17914          "Expected to convert into a scalable vector!");
17915   assert(V.getValueType().isFixedLengthVector() &&
17916          "Expected a fixed length vector operand!");
17917   SDLoc DL(V);
17918   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17919   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
17920 }
17921 
17922 // Shrink V so it's just big enough to maintain a VT's worth of data.
17923 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
17924   assert(VT.isFixedLengthVector() &&
17925          "Expected to convert into a fixed length vector!");
17926   assert(V.getValueType().isScalableVector() &&
17927          "Expected a scalable vector operand!");
17928   SDLoc DL(V);
17929   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17930   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
17931 }
17932 
17933 // Convert all fixed length vector loads larger than NEON to masked_loads.
17934 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
17935     SDValue Op, SelectionDAG &DAG) const {
17936   auto Load = cast<LoadSDNode>(Op);
17937 
17938   SDLoc DL(Op);
17939   EVT VT = Op.getValueType();
17940   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17941 
17942   auto NewLoad = DAG.getMaskedLoad(
17943       ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17944       getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
17945       Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
17946       Load->getExtensionType());
17947 
17948   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
17949   SDValue MergedValues[2] = {Result, Load->getChain()};
17950   return DAG.getMergeValues(MergedValues, DL);
17951 }
17952 
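      // Convert a fixed length vector mask into an SVE predicate by moving it
      // into an SVE register and comparing it against zero under a predicate
      // that covers only the original fixed length lanes.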
17953 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
17954                                                 SelectionDAG &DAG) {
17955   SDLoc DL(Mask);
17956   EVT InVT = Mask.getValueType();
17957   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
17958 
17959   auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
17960   auto Op2 = DAG.getConstant(0, DL, ContainerVT);
17961   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17962 
17963   EVT CmpVT = Pg.getValueType();
17964   return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
17965                      {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
17966 }
17967 
17968 // Convert all fixed length vector masked loads larger than NEON to SVE
      // masked loads.
17969 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
17970     SDValue Op, SelectionDAG &DAG) const {
17971   auto Load = cast<MaskedLoadSDNode>(Op);
17972 
17973   if (Load->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD)
17974     return SDValue();
17975 
17976   SDLoc DL(Op);
17977   EVT VT = Op.getValueType();
17978   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17979 
17980   SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
17981 
17982   SDValue PassThru;
17983   bool IsPassThruZeroOrUndef = false;
17984 
17985   if (Load->getPassThru()->isUndef()) {
17986     PassThru = DAG.getUNDEF(ContainerVT);
17987     IsPassThruZeroOrUndef = true;
17988   } else {
17989     if (ContainerVT.isInteger())
17990       PassThru = DAG.getConstant(0, DL, ContainerVT);
17991     else
17992       PassThru = DAG.getConstantFP(0, DL, ContainerVT);
17993     if (isZerosVector(Load->getPassThru().getNode()))
17994       IsPassThruZeroOrUndef = true;
17995   }
17996 
17997   auto NewLoad = DAG.getMaskedLoad(
17998       ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
17999       Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
18000       Load->getAddressingMode(), Load->getExtensionType());
18001 
18002   if (!IsPassThruZeroOrUndef) {
18003     SDValue OldPassThru =
18004         convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
18005     NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru);
18006   }
18007 
18008   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
18009   SDValue MergedValues[2] = {Result, Load->getChain()};
18010   return DAG.getMergeValues(MergedValues, DL);
18011 }
18012 
18013 // Convert all fixed length vector stores larger than NEON to masked_stores.
18014 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
18015     SDValue Op, SelectionDAG &DAG) const {
18016   auto Store = cast<StoreSDNode>(Op);
18017 
18018   SDLoc DL(Op);
18019   EVT VT = Store->getValue().getValueType();
18020   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18021 
18022   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18023   return DAG.getMaskedStore(
18024       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18025       getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
18026       Store->getMemOperand(), Store->getAddressingMode(),
18027       Store->isTruncatingStore());
18028 }
18029 
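      // Convert all fixed length vector masked stores larger than NEON to SVE
      // masked stores.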
18030 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
18031     SDValue Op, SelectionDAG &DAG) const {
18032   auto Store = cast<MaskedStoreSDNode>(Op);
18033 
18034   if (Store->isTruncatingStore())
18035     return SDValue();
18036 
18037   SDLoc DL(Op);
18038   EVT VT = Store->getValue().getValueType();
18039   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18040 
18041   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
18042   SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
18043 
18044   return DAG.getMaskedStore(
18045       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
18046       Mask, Store->getMemoryVT(), Store->getMemOperand(),
18047       Store->getAddressingMode(), Store->isTruncatingStore());
18048 }
18049 
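      // Lower fixed length vector SDIV/UDIV. i32/i64 element types map directly
      // onto SVE's predicated DIV nodes; i8/i16 element types are widened first
      // because SVE has no narrower integer divide.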
18050 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
18051     SDValue Op, SelectionDAG &DAG) const {
18052   SDLoc dl(Op);
18053   EVT VT = Op.getValueType();
18054   EVT EltVT = VT.getVectorElementType();
18055 
18056   bool Signed = Op.getOpcode() == ISD::SDIV;
18057   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
18058 
18059   // Scalable vector i32/i64 DIV is supported.
18060   if (EltVT == MVT::i32 || EltVT == MVT::i64)
18061     return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
18062 
18063   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
18064   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18065   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18066   EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
18067   EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
18068 
18069   // If the element-widened type is still legal, extend, divide, and truncate.
18070   EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
18071   if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
18072     unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18073     SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
18074     SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
18075     SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
18076     return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
18077   }
18078 
18079   // Convert the operands to scalable vectors.
18080   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18081   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18082 
18083   // Extend the scalable operands.
18084   unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
18085   unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
18086   SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
18087   SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
18088   SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
18089   SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
18090 
18091   // Convert back to fixed vectors so the DIV can be further lowered.
18092   Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
18093   Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
18094   Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
18095   Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
18096   SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
18097                                  Op0Lo, Op1Lo);
18098   SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
18099                                  Op0Hi, Op1Hi);
18100 
18101   // Convert back to scalable vectors, then truncate and concatenate via UZP1.
18102   ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
18103   ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
18104   SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
18105                                        ResultLo, ResultHi);
18106 
18107   return convertFromScalableVector(DAG, VT, ScalableResult);
18108 }
18109 
18110 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
18111     SDValue Op, SelectionDAG &DAG) const {
18112   EVT VT = Op.getValueType();
18113   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18114 
18115   SDLoc DL(Op);
18116   SDValue Val = Op.getOperand(0);
18117   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
18118   Val = convertToScalableVector(DAG, ContainerVT, Val);
18119 
18120   bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
18121   unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
18122 
18123   // Repeatedly unpack Val until the result is of the desired element type.
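      // Each SUNPKLO/UUNPKLO extends the low half of the source vector,
      // doubling the element size; the fixed length data occupies the low
      // lanes, so repeating this produces the required extension.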
18124   switch (ContainerVT.getSimpleVT().SimpleTy) {
18125   default:
18126     llvm_unreachable("unimplemented container type");
18127   case MVT::nxv16i8:
18128     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
18129     if (VT.getVectorElementType() == MVT::i16)
18130       break;
18131     LLVM_FALLTHROUGH;
18132   case MVT::nxv8i16:
18133     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
18134     if (VT.getVectorElementType() == MVT::i32)
18135       break;
18136     LLVM_FALLTHROUGH;
18137   case MVT::nxv4i32:
18138     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
18139     assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
18140     break;
18141   }
18142 
18143   return convertFromScalableVector(DAG, VT, Val);
18144 }
18145 
18146 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
18147     SDValue Op, SelectionDAG &DAG) const {
18148   EVT VT = Op.getValueType();
18149   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18150 
18151   SDLoc DL(Op);
18152   SDValue Val = Op.getOperand(0);
18153   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
18154   Val = convertToScalableVector(DAG, ContainerVT, Val);
18155 
18156   // Repeatedly truncate Val until the result is of the desired element type.
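      // Each step bitcasts to the next narrower element type and uses UZP1
      // with identical operands to keep the even-numbered sub-elements, which
      // hold the truncated values on little-endian targets.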
18157   switch (ContainerVT.getSimpleVT().SimpleTy) {
18158   default:
18159     llvm_unreachable("unimplemented container type");
18160   case MVT::nxv2i64:
18161     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
18162     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
18163     if (VT.getVectorElementType() == MVT::i32)
18164       break;
18165     LLVM_FALLTHROUGH;
18166   case MVT::nxv4i32:
18167     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
18168     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
18169     if (VT.getVectorElementType() == MVT::i16)
18170       break;
18171     LLVM_FALLTHROUGH;
18172   case MVT::nxv8i16:
18173     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
18174     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
18175     assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
18176     break;
18177   }
18178 
18179   return convertFromScalableVector(DAG, VT, Val);
18180 }
18181 
18182 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
18183     SDValue Op, SelectionDAG &DAG) const {
18184   EVT VT = Op.getValueType();
18185   EVT InVT = Op.getOperand(0).getValueType();
18186   assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
18187 
18188   SDLoc DL(Op);
18189   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18190   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18191 
18192   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
18193 }
18194 
18195 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
18196     SDValue Op, SelectionDAG &DAG) const {
18197   EVT VT = Op.getValueType();
18198   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18199 
18200   SDLoc DL(Op);
18201   EVT InVT = Op.getOperand(0).getValueType();
18202   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18203   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
18204 
18205   auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
18206                                  Op.getOperand(1), Op.getOperand(2));
18207 
18208   return convertFromScalableVector(DAG, VT, ScalableRes);
18209 }
18210 
18211 // Convert vector operation 'Op' to an equivalent predicated operation whereby
18212 // the original operation's type is used to construct a suitable predicate.
18213 // NOTE: The results for inactive lanes are undefined.
18214 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
18215                                                    SelectionDAG &DAG,
18216                                                    unsigned NewOp,
18217                                                    bool OverrideNEON) const {
18218   EVT VT = Op.getValueType();
18219   SDLoc DL(Op);
18220   auto Pg = getPredicateForVector(DAG, DL, VT);
18221 
18222   if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
18223     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18224 
18225     // Create list of operands by converting existing ones to scalable types.
18226     SmallVector<SDValue, 4> Operands = {Pg};
18227     for (const SDValue &V : Op->op_values()) {
18228       if (isa<CondCodeSDNode>(V)) {
18229         Operands.push_back(V);
18230         continue;
18231       }
18232 
18233       if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
18234         EVT VTArg = VTNode->getVT().getVectorElementType();
18235         EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
18236         Operands.push_back(DAG.getValueType(NewVTArg));
18237         continue;
18238       }
18239 
18240       assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
18241              "Only fixed length vectors are supported!");
18242       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
18243     }
18244 
18245     if (isMergePassthruOpcode(NewOp))
18246       Operands.push_back(DAG.getUNDEF(ContainerVT));
18247 
18248     auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
18249     return convertFromScalableVector(DAG, VT, ScalableRes);
18250   }
18251 
18252   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
18253 
18254   SmallVector<SDValue, 4> Operands = {Pg};
18255   for (const SDValue &V : Op->op_values()) {
18256     assert((!V.getValueType().isVector() ||
18257             V.getValueType().isScalableVector()) &&
18258            "Only scalable vectors are supported!");
18259     Operands.push_back(V);
18260   }
18261 
18262   if (isMergePassthruOpcode(NewOp))
18263     Operands.push_back(DAG.getUNDEF(VT));
18264 
18265   return DAG.getNode(NewOp, DL, VT, Operands);
18266 }
18267 
18268 // If a fixed length vector operation has no side effects when applied to
18269 // undefined elements, we can safely use scalable vectors to perform the same
18270 // operation without needing to worry about predication.
18271 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
18272                                                  SelectionDAG &DAG) const {
18273   EVT VT = Op.getValueType();
18274   assert(useSVEForFixedLengthVectorVT(VT) &&
18275          "Only expected to lower fixed length vector operation!");
18276   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18277 
18278   // Create list of operands by converting existing ones to scalable types.
18279   SmallVector<SDValue, 4> Ops;
18280   for (const SDValue &V : Op->op_values()) {
18281     assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
18282 
18283     // Pass through non-vector operands.
18284     if (!V.getValueType().isVector()) {
18285       Ops.push_back(V);
18286       continue;
18287     }
18288 
18289     // "cast" fixed length vector to a scalable vector.
18290     assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
18291            "Only fixed length vectors are supported!");
18292     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
18293   }
18294 
18295   auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
18296   return convertFromScalableVector(DAG, VT, ScalableRes);
18297 }
18298 
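      // Lower strictly ordered floating point add reductions to SVE's FADDA,
      // which accumulates the vector lanes in order starting from an initial
      // scalar value held in lane 0 of the accumulator operand.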
18299 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
18300     SelectionDAG &DAG) const {
18301   SDLoc DL(ScalarOp);
18302   SDValue AccOp = ScalarOp.getOperand(0);
18303   SDValue VecOp = ScalarOp.getOperand(1);
18304   EVT SrcVT = VecOp.getValueType();
18305   EVT ResVT = SrcVT.getVectorElementType();
18306 
18307   EVT ContainerVT = SrcVT;
18308   if (SrcVT.isFixedLengthVector()) {
18309     ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
18310     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
18311   }
18312 
18313   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
18314   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
18315 
18316   // Place the scalar accumulator into lane 0 of a scalable vector.
18317   AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
18318                       DAG.getUNDEF(ContainerVT), AccOp, Zero);
18319 
18320   // Perform reduction.
18321   SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
18322                             Pg, AccOp, VecOp);
18323 
18324   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
18325 }
18326 
18327 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
18328                                                        SelectionDAG &DAG) const {
18329   SDLoc DL(ReduceOp);
18330   SDValue Op = ReduceOp.getOperand(0);
18331   EVT OpVT = Op.getValueType();
18332   EVT VT = ReduceOp.getValueType();
18333 
18334   if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18335     return SDValue();
18336 
18337   SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
18338 
18339   switch (ReduceOp.getOpcode()) {
18340   default:
18341     return SDValue();
18342   case ISD::VECREDUCE_OR:
18343     return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
18344   case ISD::VECREDUCE_AND: {
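        // An AND reduction is true iff every lane of Op is set: invert Op by
        // XORing it with the all-active predicate and test that no lane of
        // the result is active.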
18345     Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
18346     return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
18347   }
18348   case ISD::VECREDUCE_XOR: {
18349     SDValue ID =
18350         DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
18351     SDValue Cntp =
18352         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
18353     return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
18354   }
18355   }
18356 
18357   return SDValue();
18358 }
18359 
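      // Lower vector reductions to SVE's predicated reduction nodes, widening
      // fixed length operands into scalable containers first.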
18360 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
18361                                                    SDValue ScalarOp,
18362                                                    SelectionDAG &DAG) const {
18363   SDLoc DL(ScalarOp);
18364   SDValue VecOp = ScalarOp.getOperand(0);
18365   EVT SrcVT = VecOp.getValueType();
18366 
18367   if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
18368     EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
18369     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
18370   }
18371 
18372   // UADDV always returns an i64 result.
18373   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
18374                                                    SrcVT.getVectorElementType();
18375   EVT RdxVT = SrcVT;
18376   if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
18377     RdxVT = getPackedSVEVectorVT(ResVT);
18378 
18379   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
18380   SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
18381   SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
18382                             Rdx, DAG.getConstant(0, DL, MVT::i64));
18383 
18384   // The VECREDUCE nodes expect a result matching their original value type.
18385   if (ResVT != ScalarOp.getValueType())
18386     Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
18387 
18388   return Res;
18389 }
18390 
18391 SDValue
18392 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
18393     SelectionDAG &DAG) const {
18394   EVT VT = Op.getValueType();
18395   SDLoc DL(Op);
18396 
18397   EVT InVT = Op.getOperand(1).getValueType();
18398   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18399   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
18400   SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
18401 
18402   // Convert the mask to a predicate (NOTE: We don't need to worry about
18403   // inactive lanes since VSELECT is safe when given undefined elements).
18404   EVT MaskVT = Op.getOperand(0).getValueType();
18405   EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
18406   auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
18407   Mask = DAG.getNode(ISD::TRUNCATE, DL,
18408                      MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
18409 
18410   auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
18411                                 Mask, Op1, Op2);
18412 
18413   return convertFromScalableVector(DAG, VT, ScalableRes);
18414 }
18415 
18416 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
18417     SDValue Op, SelectionDAG &DAG) const {
18418   SDLoc DL(Op);
18419   EVT InVT = Op.getOperand(0).getValueType();
18420   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
18421 
18422   assert(useSVEForFixedLengthVectorVT(InVT) &&
18423          "Only expected to lower fixed length vector operation!");
18424   assert(Op.getValueType() == InVT.changeTypeToInteger() &&
18425          "Expected integer result of the same bit length as the inputs!");
18426 
18427   auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
18428   auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
18429   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
18430 
18431   EVT CmpVT = Pg.getValueType();
18432   auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
18433                          {Pg, Op1, Op2, Op.getOperand(2)});
18434 
18435   EVT PromoteVT = ContainerVT.changeTypeToInteger();
18436   auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
18437   return convertFromScalableVector(DAG, Op.getValueType(), Promote);
18438 }
18439 
18440 SDValue
18441 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
18442                                                     SelectionDAG &DAG) const {
18443   SDLoc DL(Op);
18444   auto SrcOp = Op.getOperand(0);
18445   EVT VT = Op.getValueType();
18446   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18447   EVT ContainerSrcVT =
18448       getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
18449 
18450   SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
18451   Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
18452   return convertFromScalableVector(DAG, VT, Op);
18453 }
18454 
18455 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
18456     SDValue Op, SelectionDAG &DAG) const {
18457   SDLoc DL(Op);
18458   unsigned NumOperands = Op->getNumOperands();
18459 
18460   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
18461          "Unexpected number of operands in CONCAT_VECTORS");
18462 
18463   auto SrcOp1 = Op.getOperand(0);
18464   auto SrcOp2 = Op.getOperand(1);
18465   EVT VT = Op.getValueType();
18466   EVT SrcVT = SrcOp1.getValueType();
18467 
18468   if (NumOperands > 2) {
18469     SmallVector<SDValue, 4> Ops;
18470     EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18471     for (unsigned I = 0; I < NumOperands; I += 2)
18472       Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
18473                                 Op->getOperand(I), Op->getOperand(I + 1)));
18474 
18475     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
18476   }
18477 
18478   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18479 
18480   SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
18481   SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
18482   SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
18483 
18484   Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
18485 
18486   return convertFromScalableVector(DAG, VT, Op);
18487 }
18488 
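      // Lower fixed length FP_EXTEND by widening each element with an integer
      // ANY_EXTEND so the narrow FP values sit inside destination sized lanes,
      // then performing a predicated FP extend on the unpacked result.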
18489 SDValue
18490 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
18491                                                      SelectionDAG &DAG) const {
18492   EVT VT = Op.getValueType();
18493   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18494 
18495   SDLoc DL(Op);
18496   SDValue Val = Op.getOperand(0);
18497   SDValue Pg = getPredicateForVector(DAG, DL, VT);
18498   EVT SrcVT = Val.getValueType();
18499   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18500   EVT ExtendVT = ContainerVT.changeVectorElementType(
18501       SrcVT.getVectorElementType());
18502 
18503   Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18504   Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
18505 
18506   Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
18507   Val = getSVESafeBitCast(ExtendVT, Val, DAG);
18508   Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
18509                     Pg, Val, DAG.getUNDEF(ContainerVT));
18510 
18511   return convertFromScalableVector(DAG, VT, Val);
18512 }
18513 
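      // Lower fixed length FP_ROUND by performing a predicated round on
      // scalable vectors, leaving each narrowed value inside a source sized
      // lane, then dropping the unused bits with an integer TRUNCATE.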
18514 SDValue
18515 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
18516                                                     SelectionDAG &DAG) const {
18517   EVT VT = Op.getValueType();
18518   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18519 
18520   SDLoc DL(Op);
18521   SDValue Val = Op.getOperand(0);
18522   EVT SrcVT = Val.getValueType();
18523   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18524   EVT RoundVT = ContainerSrcVT.changeVectorElementType(
18525       VT.getVectorElementType());
18526   SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
18527 
18528   Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18529   Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
18530                     Op.getOperand(1), DAG.getUNDEF(RoundVT));
18531   Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
18532   Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18533 
18534   Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18535   return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18536 }
18537 
18538 SDValue
18539 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
18540                                                     SelectionDAG &DAG) const {
18541   EVT VT = Op.getValueType();
18542   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18543 
18544   bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
18545   unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
18546                              : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
18547 
18548   SDLoc DL(Op);
18549   SDValue Val = Op.getOperand(0);
18550   EVT SrcVT = Val.getValueType();
18551   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18552   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18553 
18554   if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18555       ContainerDstVT.getVectorElementType().getSizeInBits()) {
18556     SDValue Pg = getPredicateForVector(DAG, DL, VT);
18557 
18558     Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
18559                       VT.changeTypeToInteger(), Val);
18560 
18561     Val = convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(),
18562                                   Val);
18563     // Safe to use a larger than specified operand because the extension above
18564     // ensures the extra bits hold the correct sign/zero extended data.
18565     Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18566                       DAG.getUNDEF(ContainerDstVT));
18567     return convertFromScalableVector(DAG, VT, Val);
18568   } else {
18569     EVT CvtVT = ContainerSrcVT.changeVectorElementType(
18570         ContainerDstVT.getVectorElementType());
18571     SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18572 
18573     Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18574     Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18575     Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
18576     Val = convertFromScalableVector(DAG, SrcVT, Val);
18577 
18578     Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
18579     return DAG.getNode(ISD::BITCAST, DL, VT, Val);
18580   }
18581 }
18582 
18583 SDValue
18584 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
18585                                                     SelectionDAG &DAG) const {
18586   EVT VT = Op.getValueType();
18587   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18588 
18589   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
18590   unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
18591                              : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
18592 
18593   SDLoc DL(Op);
18594   SDValue Val = Op.getOperand(0);
18595   EVT SrcVT = Val.getValueType();
18596   EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
18597   EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
18598 
18599   if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
18600       ContainerDstVT.getVectorElementType().getSizeInBits()) {
18601     EVT CvtVT = ContainerDstVT.changeVectorElementType(
18602         ContainerSrcVT.getVectorElementType());
18603     SDValue Pg = getPredicateForVector(DAG, DL, VT);
18604 
18605     Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
18606     Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
18607 
18608     Val = convertToScalableVector(DAG, ContainerDstVT, Val);
18609     Val = getSVESafeBitCast(CvtVT, Val, DAG);
18610     Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
18611                       DAG.getUNDEF(ContainerDstVT));
18612     return convertFromScalableVector(DAG, VT, Val);
18613   } else {
18614     EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
18615     SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
18616 
18617     // Safe to use a larger than specified result since an fp_to_int where the
18618     // result doesn't fit into the destination is undefined.
18619     Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
18620     Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
18621     Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
18622 
18623     return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
18624   }
18625 }
18626 
18627 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
18628     SDValue Op, SelectionDAG &DAG) const {
18629   EVT VT = Op.getValueType();
18630   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
18631 
18632   auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
18633   auto ShuffleMask = SVN->getMask();
18634 
18635   SDLoc DL(Op);
18636   SDValue Op1 = Op.getOperand(0);
18637   SDValue Op2 = Op.getOperand(1);
18638 
18639   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
18640   Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
18641   Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
18642 
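      // The only shuffle currently handled is an EXT whose immediate selects
      // the last element of the first source followed by the leading elements
      // of the second source, which maps onto SVE's INSR instruction.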
18643   bool ReverseEXT = false;
18644   unsigned Imm;
18645   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
18646       Imm == VT.getVectorNumElements() - 1) {
18647     if (ReverseEXT)
18648       std::swap(Op1, Op2);
18649 
18650     EVT ScalarTy = VT.getVectorElementType();
18651     if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
18652       ScalarTy = MVT::i32;
18653     SDValue Scalar = DAG.getNode(
18654         ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
18655         DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
18656     Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
18657     return convertFromScalableVector(DAG, VT, Op);
18658   }
18659 
18660   return SDValue();
18661 }
18662 
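      // Bitcast between two legal scalable vector types. Predicate types are
      // reinterpreted directly; data types are packed into full width vectors
      // so a plain BITCAST can be used, then unpacked again if required.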
18663 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
18664                                                  SelectionDAG &DAG) const {
18665   SDLoc DL(Op);
18666   EVT InVT = Op.getValueType();
18667   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18668   (void)TLI;
18669 
18670   assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
18671          InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
18672          "Only expect to cast between legal scalable vector types!");
18673   assert((VT.getVectorElementType() == MVT::i1) ==
18674              (InVT.getVectorElementType() == MVT::i1) &&
18675          "Cannot cast between data and predicate scalable vector types!");
18676 
18677   if (InVT == VT)
18678     return Op;
18679 
18680   if (VT.getVectorElementType() == MVT::i1)
18681     return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18682 
18683   EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
18684   EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
18685 
18686   // Pack input if required.
18687   if (InVT != PackedInVT)
18688     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
18689 
18690   Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
18691 
18692   // Unpack result if required.
18693   if (VT != PackedVT)
18694     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
18695 
18696   return Op;
18697 }
18698 
18699 bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
18700   return ::isAllActivePredicate(N);
18701 }
18702 
18703 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
18704   return ::getPromotedVTForPredicate(VT);
18705 }
18706 
18707 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
18708     SDValue Op, const APInt &OriginalDemandedBits,
18709     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
18710     unsigned Depth) const {
18711 
18712   unsigned Opc = Op.getOpcode();
18713   switch (Opc) {
18714   case AArch64ISD::VSHL: {
18715     // Match (VSHL (VLSHR Val X) X)
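        // The shift pair only clears the low X bits of each element; if none
        // of those bits are demanded, the shifts can be removed entirely.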
18716     SDValue ShiftL = Op;
18717     SDValue ShiftR = Op->getOperand(0);
18718     if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
18719       return false;
18720 
18721     if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
18722       return false;
18723 
18724     unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
18725     unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
18726 
18727     // Other shift amounts could be handled as well, but this is not
18728     // implemented yet.
18729     if (ShiftRBits != ShiftLBits)
18730       return false;
18731 
18732     unsigned ScalarSize = Op.getScalarValueSizeInBits();
18733     assert(ScalarSize > ShiftLBits && "Invalid shift imm");
18734 
18735     APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
18736     APInt UnusedBits = ~OriginalDemandedBits;
18737 
18738     if ((ZeroBits & UnusedBits) != ZeroBits)
18739       return false;
18740 
18741     // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
18742     // used - simplify to just Val.
18743     return TLO.CombineTo(Op, ShiftR->getOperand(0));
18744   }
18745   }
18746 
18747   return TargetLowering::SimplifyDemandedBitsForTargetNode(
18748       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
18749 }
18750 
18751 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal(
18752     unsigned Opc, LLT Ty1, LLT Ty2) const {
18753   return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
18754 }
18755