1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AArch64TargetTransformInfo.h"
10 #include "AArch64ExpandImm.h"
11 #include "AArch64PerfectShuffle.h"
12 #include "MCTargetDesc/AArch64AddressingModes.h"
13 #include "llvm/Analysis/IVDescriptors.h"
14 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/Analysis/TargetTransformInfo.h"
16 #include "llvm/CodeGen/BasicTTIImpl.h"
17 #include "llvm/CodeGen/CostTable.h"
18 #include "llvm/CodeGen/TargetLowering.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/Intrinsics.h"
21 #include "llvm/IR/IntrinsicsAArch64.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Transforms/InstCombine/InstCombiner.h"
25 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
26 #include <algorithm>
27 #include <optional>
28 using namespace llvm;
29 using namespace llvm::PatternMatch;
30 
31 #define DEBUG_TYPE "aarch64tti"
32 
33 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34                                                cl::init(true), cl::Hidden);
35 
36 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37                                            cl::Hidden);
38 
39 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40                                             cl::init(10), cl::Hidden);
41 
42 namespace {
43 class TailFoldingKind {
44 private:
45   uint8_t Bits = 0; // Currently defaults to disabled.
46 
47 public:
48   enum TailFoldingOpts {
49     TFDisabled = 0x0,
50     TFReductions = 0x01,
51     TFRecurrences = 0x02,
52     TFSimple = 0x80,
53     TFAll = TFReductions | TFRecurrences | TFSimple
54   };
55 
56   void operator=(const std::string &Val) {
57     if (Val.empty())
58       return;
59     SmallVector<StringRef, 6> TailFoldTypes;
60     StringRef(Val).split(TailFoldTypes, '+', -1, false);
61     for (auto TailFoldType : TailFoldTypes) {
62       if (TailFoldType == "disabled")
63         Bits = 0;
64       else if (TailFoldType == "all")
65         Bits = TFAll;
66       else if (TailFoldType == "default")
67         Bits = 0; // Currently defaults to never tail-folding.
68       else if (TailFoldType == "simple")
69         add(TFSimple);
70       else if (TailFoldType == "reductions")
71         add(TFReductions);
72       else if (TailFoldType == "recurrences")
73         add(TFRecurrences);
74       else if (TailFoldType == "noreductions")
75         remove(TFReductions);
76       else if (TailFoldType == "norecurrences")
77         remove(TFRecurrences);
78       else {
79         errs()
80             << "invalid argument " << TailFoldType.str()
81             << " to -sve-tail-folding=; each element must be one of: disabled, "
82                "all, default, simple, reductions, noreductions, recurrences, "
83                "norecurrences\n";
84       }
85     }
86   }
87 
88   operator uint8_t() const { return Bits; }
89 
90   void add(uint8_t Flag) { Bits |= Flag; }
91   void remove(uint8_t Flag) { Bits &= ~Flag; }
92 };
93 } // namespace
94 
95 TailFoldingKind TailFoldingKindLoc;
96 
97 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
98     "sve-tail-folding",
99     cl::desc(
100         "Control the use of vectorisation using tail-folding for SVE:"
101         "\ndisabled    No loop types will vectorize using tail-folding"
102         "\ndefault     Uses the default tail-folding settings for the target "
103         "CPU"
104         "\nall         All legal loop types will vectorize using tail-folding"
105         "\nsimple      Use tail-folding for simple loops (not reductions or "
106         "recurrences)"
107         "\nreductions  Use tail-folding for loops containing reductions"
108         "\nrecurrences Use tail-folding for loops containing fixed order "
109         "recurrences"),
110     cl::location(TailFoldingKindLoc));
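// For example (illustrative), -sve-tail-folding=all+noreductions enables
// tail-folding for all supported loop types except those containing
// reductions.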
111 
112 // Experimental option that will only be fully functional when the
113 // code-generator is changed to use SVE instead of NEON for all fixed-width
114 // operations.
115 static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
116     "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
117 
118 // Experimental option that will only be fully functional when the cost-model
119 // and code-generator have been changed to avoid using scalable vector
120 // instructions that are not legal in streaming SVE mode.
121 static cl::opt<bool> EnableScalableAutovecInStreamingMode(
122     "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
123 
124 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
125                                          const Function *Callee) const {
126   SMEAttrs CallerAttrs(*Caller);
127   SMEAttrs CalleeAttrs(*Callee);
128   if (CallerAttrs.requiresSMChange(CalleeAttrs,
129                                    /*BodyOverridesInterface=*/true) ||
130       CallerAttrs.requiresLazySave(CalleeAttrs) ||
131       CalleeAttrs.hasNewZAInterface())
132     return false;
133 
134   const TargetMachine &TM = getTLI()->getTargetMachine();
135 
136   const FeatureBitset &CallerBits =
137       TM.getSubtargetImpl(*Caller)->getFeatureBits();
138   const FeatureBitset &CalleeBits =
139       TM.getSubtargetImpl(*Callee)->getFeatureBits();
140 
  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
143   return (CallerBits & CalleeBits) == CalleeBits;
144 }
145 
146 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
147     TargetTransformInfo::RegisterKind K) const {
148   assert(K != TargetTransformInfo::RGK_Scalar);
149   return K == TargetTransformInfo::RGK_FixedWidthVector;
150 }
151 
152 /// Calculate the cost of materializing a 64-bit value. This helper
153 /// method might only calculate a fraction of a larger immediate. Therefore it
154 /// is valid to return a cost of ZERO.
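/// For example (illustrative): 0x123456789abcdef0 needs a MOVZ plus three
/// MOVKs (cost 4), whereas a 64-bit logical immediate such as
/// 0xff00ff00ff00ff00 is encodable directly and costs 0 here.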
155 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
156   // Check if the immediate can be encoded within an instruction.
157   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
158     return 0;
159 
160   if (Val < 0)
161     Val = ~Val;
162 
163   // Calculate how many moves we will need to materialize this constant.
164   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
165   AArch64_IMM::expandMOVImm(Val, 64, Insn);
166   return Insn.size();
167 }
168 
169 /// Calculate the cost of materializing the given constant.
170 InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
171                                               TTI::TargetCostKind CostKind) {
172   assert(Ty->isIntegerTy());
173 
174   unsigned BitSize = Ty->getPrimitiveSizeInBits();
175   if (BitSize == 0)
176     return ~0U;
177 
  // Sign-extend all constants to a multiple of 64 bits.
179   APInt ImmVal = Imm;
180   if (BitSize & 0x3f)
181     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
182 
183   // Split the constant into 64-bit chunks and calculate the cost for each
184   // chunk.
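  // For example (illustrative), a 96-bit constant is sign-extended to 128 bits
  // above and costed as two independent 64-bit chunks here.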
185   InstructionCost Cost = 0;
186   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
187     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
188     int64_t Val = Tmp.getSExtValue();
189     Cost += getIntImmCost(Val);
190   }
  // We need at least one instruction to materialize the constant.
192   return std::max<InstructionCost>(1, Cost);
193 }
194 
195 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
196                                                   const APInt &Imm, Type *Ty,
197                                                   TTI::TargetCostKind CostKind,
198                                                   Instruction *Inst) {
199   assert(Ty->isIntegerTy());
200 
201   unsigned BitSize = Ty->getPrimitiveSizeInBits();
202   // There is no cost model for constants with a bit size of 0. Return TCC_Free
203   // here, so that constant hoisting will ignore this constant.
204   if (BitSize == 0)
205     return TTI::TCC_Free;
206 
207   unsigned ImmIdx = ~0U;
208   switch (Opcode) {
209   default:
210     return TTI::TCC_Free;
211   case Instruction::GetElementPtr:
212     // Always hoist the base address of a GetElementPtr.
213     if (Idx == 0)
214       return 2 * TTI::TCC_Basic;
215     return TTI::TCC_Free;
216   case Instruction::Store:
217     ImmIdx = 0;
218     break;
219   case Instruction::Add:
220   case Instruction::Sub:
221   case Instruction::Mul:
222   case Instruction::UDiv:
223   case Instruction::SDiv:
224   case Instruction::URem:
225   case Instruction::SRem:
226   case Instruction::And:
227   case Instruction::Or:
228   case Instruction::Xor:
229   case Instruction::ICmp:
230     ImmIdx = 1;
231     break;
232   // Always return TCC_Free for the shift value of a shift instruction.
233   case Instruction::Shl:
234   case Instruction::LShr:
235   case Instruction::AShr:
236     if (Idx == 1)
237       return TTI::TCC_Free;
238     break;
239   case Instruction::Trunc:
240   case Instruction::ZExt:
241   case Instruction::SExt:
242   case Instruction::IntToPtr:
243   case Instruction::PtrToInt:
244   case Instruction::BitCast:
245   case Instruction::PHI:
246   case Instruction::Call:
247   case Instruction::Select:
248   case Instruction::Ret:
249   case Instruction::Load:
250     break;
251   }
252 
253   if (Idx == ImmIdx) {
254     int NumConstants = (BitSize + 63) / 64;
255     InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
256     return (Cost <= NumConstants * TTI::TCC_Basic)
257                ? static_cast<int>(TTI::TCC_Free)
258                : Cost;
259   }
260   return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
261 }
262 
263 InstructionCost
264 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
265                                     const APInt &Imm, Type *Ty,
266                                     TTI::TargetCostKind CostKind) {
267   assert(Ty->isIntegerTy());
268 
269   unsigned BitSize = Ty->getPrimitiveSizeInBits();
270   // There is no cost model for constants with a bit size of 0. Return TCC_Free
271   // here, so that constant hoisting will ignore this constant.
272   if (BitSize == 0)
273     return TTI::TCC_Free;
274 
275   // Most (all?) AArch64 intrinsics do not support folding immediates into the
276   // selected instruction, so we compute the materialization cost for the
277   // immediate directly.
278   if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
279     return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
280 
281   switch (IID) {
282   default:
283     return TTI::TCC_Free;
284   case Intrinsic::sadd_with_overflow:
285   case Intrinsic::uadd_with_overflow:
286   case Intrinsic::ssub_with_overflow:
287   case Intrinsic::usub_with_overflow:
288   case Intrinsic::smul_with_overflow:
289   case Intrinsic::umul_with_overflow:
290     if (Idx == 1) {
291       int NumConstants = (BitSize + 63) / 64;
292       InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
293       return (Cost <= NumConstants * TTI::TCC_Basic)
294                  ? static_cast<int>(TTI::TCC_Free)
295                  : Cost;
296     }
297     break;
298   case Intrinsic::experimental_stackmap:
299     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
300       return TTI::TCC_Free;
301     break;
302   case Intrinsic::experimental_patchpoint_void:
303   case Intrinsic::experimental_patchpoint_i64:
304     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
305       return TTI::TCC_Free;
306     break;
307   case Intrinsic::experimental_gc_statepoint:
308     if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
309       return TTI::TCC_Free;
310     break;
311   }
312   return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
313 }
314 
315 TargetTransformInfo::PopcntSupportKind
316 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
317   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
318   if (TyWidth == 32 || TyWidth == 64)
319     return TTI::PSK_FastHardware;
320   // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
321   return TTI::PSK_Software;
322 }
323 
324 InstructionCost
325 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
326                                       TTI::TargetCostKind CostKind) {
327   auto *RetTy = ICA.getReturnType();
328   switch (ICA.getID()) {
329   case Intrinsic::umin:
330   case Intrinsic::umax:
331   case Intrinsic::smin:
332   case Intrinsic::smax: {
333     static const auto ValidMinMaxTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
334                                         MVT::v8i16, MVT::v2i32, MVT::v4i32};
335     auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif, hence the cost of 2.
337     if (LT.second == MVT::v2i64)
338       return LT.first * 2;
339     if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
340       return LT.first;
341     break;
342   }
343   case Intrinsic::sadd_sat:
344   case Intrinsic::ssub_sat:
345   case Intrinsic::uadd_sat:
346   case Intrinsic::usub_sat: {
347     static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
348                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
349                                      MVT::v2i64};
350     auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extra shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
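    // E.g. (illustrative) a v4i8 saturating add is promoted to v4i16 and costs
    // 4 instructions per legalized vector: shl, shl, sqadd, sshr.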
353     unsigned Instrs =
354         LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
355     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
356       return LT.first * Instrs;
357     break;
358   }
359   case Intrinsic::abs: {
360     static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
361                                      MVT::v8i16, MVT::v2i32, MVT::v4i32,
362                                      MVT::v2i64};
363     auto LT = getTypeLegalizationCost(RetTy);
364     if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
365       return LT.first;
366     break;
367   }
368   case Intrinsic::experimental_stepvector: {
369     InstructionCost Cost = 1; // Cost of the `index' instruction
370     auto LT = getTypeLegalizationCost(RetTy);
371     // Legalisation of illegal vectors involves an `index' instruction plus
372     // (LT.first - 1) vector adds.
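    // E.g. (illustrative) a <vscale x 8 x i64> stepvector splits into four
    // legal parts, costing one `index' plus three vector adds.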
373     if (LT.first > 1) {
374       Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
375       InstructionCost AddCost =
376           getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
377       Cost += AddCost * (LT.first - 1);
378     }
379     return Cost;
380   }
381   case Intrinsic::bitreverse: {
382     static const CostTblEntry BitreverseTbl[] = {
383         {Intrinsic::bitreverse, MVT::i32, 1},
384         {Intrinsic::bitreverse, MVT::i64, 1},
385         {Intrinsic::bitreverse, MVT::v8i8, 1},
386         {Intrinsic::bitreverse, MVT::v16i8, 1},
387         {Intrinsic::bitreverse, MVT::v4i16, 2},
388         {Intrinsic::bitreverse, MVT::v8i16, 2},
389         {Intrinsic::bitreverse, MVT::v2i32, 2},
390         {Intrinsic::bitreverse, MVT::v4i32, 2},
391         {Intrinsic::bitreverse, MVT::v1i64, 2},
392         {Intrinsic::bitreverse, MVT::v2i64, 2},
393     };
394     const auto LegalisationCost = getTypeLegalizationCost(RetTy);
395     const auto *Entry =
396         CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
397     if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
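      // E.g. (illustrative) an i8 bitreverse lowers to a 32-bit RBIT followed
      // by a shift to move the result down, hence the extra 1.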
400       if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
401           TLI->getValueType(DL, RetTy, true) == MVT::i16)
402         return LegalisationCost.first * Entry->Cost + 1;
403 
404       return LegalisationCost.first * Entry->Cost;
405     }
406     break;
407   }
408   case Intrinsic::ctpop: {
409     if (!ST->hasNEON()) {
410       // 32-bit or 64-bit ctpop without NEON is 12 instructions.
411       return getTypeLegalizationCost(RetTy).first * 12;
412     }
413     static const CostTblEntry CtpopCostTbl[] = {
414         {ISD::CTPOP, MVT::v2i64, 4},
415         {ISD::CTPOP, MVT::v4i32, 3},
416         {ISD::CTPOP, MVT::v8i16, 2},
417         {ISD::CTPOP, MVT::v16i8, 1},
418         {ISD::CTPOP, MVT::i64,   4},
419         {ISD::CTPOP, MVT::v2i32, 3},
420         {ISD::CTPOP, MVT::v4i16, 2},
421         {ISD::CTPOP, MVT::v8i8,  1},
422         {ISD::CTPOP, MVT::i32,   5},
423     };
424     auto LT = getTypeLegalizationCost(RetTy);
425     MVT MTy = LT.second;
426     if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
427       // Extra cost of +1 when illegal vector types are legalized by promoting
428       // the integer type.
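      // E.g. (illustrative) a v2i16 ctpop is promoted to v2i32 and pays the
      // v2i32 table cost plus one extra instruction.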
429       int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
430                                             RetTy->getScalarSizeInBits()
431                           ? 1
432                           : 0;
433       return LT.first * Entry->Cost + ExtraCost;
434     }
435     break;
436   }
437   case Intrinsic::sadd_with_overflow:
438   case Intrinsic::uadd_with_overflow:
439   case Intrinsic::ssub_with_overflow:
440   case Intrinsic::usub_with_overflow:
441   case Intrinsic::smul_with_overflow:
442   case Intrinsic::umul_with_overflow: {
443     static const CostTblEntry WithOverflowCostTbl[] = {
444         {Intrinsic::sadd_with_overflow, MVT::i8, 3},
445         {Intrinsic::uadd_with_overflow, MVT::i8, 3},
446         {Intrinsic::sadd_with_overflow, MVT::i16, 3},
447         {Intrinsic::uadd_with_overflow, MVT::i16, 3},
448         {Intrinsic::sadd_with_overflow, MVT::i32, 1},
449         {Intrinsic::uadd_with_overflow, MVT::i32, 1},
450         {Intrinsic::sadd_with_overflow, MVT::i64, 1},
451         {Intrinsic::uadd_with_overflow, MVT::i64, 1},
452         {Intrinsic::ssub_with_overflow, MVT::i8, 3},
453         {Intrinsic::usub_with_overflow, MVT::i8, 3},
454         {Intrinsic::ssub_with_overflow, MVT::i16, 3},
455         {Intrinsic::usub_with_overflow, MVT::i16, 3},
456         {Intrinsic::ssub_with_overflow, MVT::i32, 1},
457         {Intrinsic::usub_with_overflow, MVT::i32, 1},
458         {Intrinsic::ssub_with_overflow, MVT::i64, 1},
459         {Intrinsic::usub_with_overflow, MVT::i64, 1},
460         {Intrinsic::smul_with_overflow, MVT::i8, 5},
461         {Intrinsic::umul_with_overflow, MVT::i8, 4},
462         {Intrinsic::smul_with_overflow, MVT::i16, 5},
463         {Intrinsic::umul_with_overflow, MVT::i16, 4},
464         {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
465         {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
466         {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
467         {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
468     };
469     EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
470     if (MTy.isSimple())
471       if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
472                                               MTy.getSimpleVT()))
473         return Entry->Cost;
474     break;
475   }
476   case Intrinsic::fptosi_sat:
477   case Intrinsic::fptoui_sat: {
478     if (ICA.getArgTypes().empty())
479       break;
480     bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
481     auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
482     EVT MTy = TLI->getValueType(DL, RetTy);
483     // Check for the legal types, which are where the size of the input and the
484     // output are the same, or we are using cvt f64->i32 or f32->i64.
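    // E.g. (illustrative) llvm.fptosi.sat.i32.f32 maps to a single FCVTZS,
    // which already saturates on out-of-range inputs.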
485     if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
486          LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
487          LT.second == MVT::v2f64) &&
488         (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
489          (LT.second == MVT::f64 && MTy == MVT::i32) ||
490          (LT.second == MVT::f32 && MTy == MVT::i64)))
491       return LT.first;
    // Similarly for fp16 sizes.
493     if (ST->hasFullFP16() &&
494         ((LT.second == MVT::f16 && MTy == MVT::i32) ||
495          ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
496           (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
497       return LT.first;
498 
    // Otherwise, we use a legal convert followed by a min+max clamp.
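    // E.g. (illustrative) llvm.fptosi.sat.i8.f32 is costed as one convert to
    // i32 plus an smin/smax pair to clamp the result into the i8 range.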
500     if ((LT.second.getScalarType() == MVT::f32 ||
501          LT.second.getScalarType() == MVT::f64 ||
502          (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
503         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
504       Type *LegalTy =
505           Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
506       if (LT.second.isVector())
507         LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
508       InstructionCost Cost = 1;
509       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
510                                     LegalTy, {LegalTy, LegalTy});
511       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
512       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
513                                     LegalTy, {LegalTy, LegalTy});
514       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
515       return LT.first * Cost;
516     }
517     break;
518   }
519   default:
520     break;
521   }
522   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
523 }
524 
/// Remove redundant reinterpret (svbool) casts in the presence of control
/// flow.
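/// For example (illustrative):
///   convert.from.svbool(phi(convert.to.svbool(%a), convert.to.svbool(%b)))
/// becomes phi(%a, %b) when %a and %b already have the required type.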
527 static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
528                                                    IntrinsicInst &II) {
529   SmallVector<Instruction *, 32> Worklist;
530   auto RequiredType = II.getType();
531 
532   auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
533   assert(PN && "Expected Phi Node!");
534 
535   // Don't create a new Phi unless we can remove the old one.
536   if (!PN->hasOneUse())
537     return std::nullopt;
538 
539   for (Value *IncValPhi : PN->incoming_values()) {
540     auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
541     if (!Reinterpret ||
542         Reinterpret->getIntrinsicID() !=
543             Intrinsic::aarch64_sve_convert_to_svbool ||
544         RequiredType != Reinterpret->getArgOperand(0)->getType())
545       return std::nullopt;
546   }
547 
548   // Create the new Phi
549   LLVMContext &Ctx = PN->getContext();
550   IRBuilder<> Builder(Ctx);
551   Builder.SetInsertPoint(PN);
552   PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
553   Worklist.push_back(PN);
554 
555   for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
556     auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
557     NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
558     Worklist.push_back(Reinterpret);
559   }
560 
561   // Cleanup Phi Node and reinterprets
562   return IC.replaceInstUsesWith(II, NPN);
563 }
564 
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
566 // => (binop (pred) (from_svbool _) (from_svbool _))
567 //
568 // The above transformation eliminates a `to_svbool` in the predicate
569 // operand of bitwise operation `binop` by narrowing the vector width of
570 // the operation. For example, it would convert a `<vscale x 16 x i1>
571 // and` into a `<vscale x 4 x i1> and`. This is profitable because
572 // to_svbool must zero the new lanes during widening, whereas
573 // from_svbool is free.
574 static std::optional<Instruction *>
575 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
576   auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
577   if (!BinOp)
578     return std::nullopt;
579 
580   auto IntrinsicID = BinOp->getIntrinsicID();
581   switch (IntrinsicID) {
582   case Intrinsic::aarch64_sve_and_z:
583   case Intrinsic::aarch64_sve_bic_z:
584   case Intrinsic::aarch64_sve_eor_z:
585   case Intrinsic::aarch64_sve_nand_z:
586   case Intrinsic::aarch64_sve_nor_z:
587   case Intrinsic::aarch64_sve_orn_z:
588   case Intrinsic::aarch64_sve_orr_z:
589     break;
590   default:
591     return std::nullopt;
592   }
593 
594   auto BinOpPred = BinOp->getOperand(0);
595   auto BinOpOp1 = BinOp->getOperand(1);
596   auto BinOpOp2 = BinOp->getOperand(2);
597 
598   auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
599   if (!PredIntr ||
600       PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
601     return std::nullopt;
602 
603   auto PredOp = PredIntr->getOperand(0);
604   auto PredOpTy = cast<VectorType>(PredOp->getType());
605   if (PredOpTy != II.getType())
606     return std::nullopt;
607 
608   IRBuilder<> Builder(II.getContext());
609   Builder.SetInsertPoint(&II);
610 
611   SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
612   auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
613       Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
614   NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
615   if (BinOpOp1 == BinOpOp2)
616     NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
617   else
618     NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
619         Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
620 
621   auto NarrowedBinOp =
622       Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
623   return IC.replaceInstUsesWith(II, NarrowedBinOp);
624 }
625 
626 static std::optional<Instruction *>
627 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
628   // If the reinterpret instruction operand is a PHI Node
629   if (isa<PHINode>(II.getArgOperand(0)))
630     return processPhiNode(IC, II);
631 
632   if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
633     return BinOpCombine;
634 
635   SmallVector<Instruction *, 32> CandidatesForRemoval;
636   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
637 
638   const auto *IVTy = cast<VectorType>(II.getType());
639 
640   // Walk the chain of conversions.
641   while (Cursor) {
642     // If the type of the cursor has fewer lanes than the final result, zeroing
643     // must take place, which breaks the equivalence chain.
644     const auto *CursorVTy = cast<VectorType>(Cursor->getType());
645     if (CursorVTy->getElementCount().getKnownMinValue() <
646         IVTy->getElementCount().getKnownMinValue())
647       break;
648 
    // If the cursor has the same type as II, it is a viable replacement.
650     if (Cursor->getType() == IVTy)
651       EarliestReplacement = Cursor;
652 
653     auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
654 
655     // If this is not an SVE conversion intrinsic, this is the end of the chain.
656     if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
657                                   Intrinsic::aarch64_sve_convert_to_svbool ||
658                               IntrinsicCursor->getIntrinsicID() ==
659                                   Intrinsic::aarch64_sve_convert_from_svbool))
660       break;
661 
662     CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
663     Cursor = IntrinsicCursor->getOperand(0);
664   }
665 
666   // If no viable replacement in the conversion chain was found, there is
667   // nothing to do.
668   if (!EarliestReplacement)
669     return std::nullopt;
670 
671   return IC.replaceInstUsesWith(II, EarliestReplacement);
672 }
673 
674 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
675                                                       IntrinsicInst &II) {
676   IRBuilder<> Builder(&II);
677   auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
678                                      II.getOperand(2));
679   return IC.replaceInstUsesWith(II, Select);
680 }
681 
682 static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
683                                                       IntrinsicInst &II) {
684   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
685   if (!Pg)
686     return std::nullopt;
687 
688   if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
689     return std::nullopt;
690 
691   const auto PTruePattern =
692       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
693   if (PTruePattern != AArch64SVEPredPattern::vl1)
694     return std::nullopt;
695 
696   // The intrinsic is inserting into lane zero so use an insert instead.
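  // E.g. (illustrative)
  //   sve.dup(%passthru, ptrue vl1, %x) --> insertelement %passthru, %x, i64 0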
697   auto *IdxTy = Type::getInt64Ty(II.getContext());
698   auto *Insert = InsertElementInst::Create(
699       II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
700   Insert->insertBefore(&II);
701   Insert->takeName(&II);
702 
703   return IC.replaceInstUsesWith(II, Insert);
704 }
705 
706 static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
707                                                        IntrinsicInst &II) {
708   // Replace DupX with a regular IR splat.
709   IRBuilder<> Builder(II.getContext());
710   Builder.SetInsertPoint(&II);
711   auto *RetTy = cast<ScalableVectorType>(II.getType());
712   Value *Splat =
713       Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
714   Splat->takeName(&II);
715   return IC.replaceInstUsesWith(II, Splat);
716 }
717 
718 static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
719                                                         IntrinsicInst &II) {
720   LLVMContext &Ctx = II.getContext();
721   IRBuilder<> Builder(Ctx);
722   Builder.SetInsertPoint(&II);
723 
724   // Check that the predicate is all active
725   auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
726   if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
727     return std::nullopt;
728 
729   const auto PTruePattern =
730       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
731   if (PTruePattern != AArch64SVEPredPattern::all)
732     return std::nullopt;
733 
734   // Check that we have a compare of zero..
735   auto *SplatValue =
736       dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
737   if (!SplatValue || !SplatValue->isZero())
738     return std::nullopt;
739 
740   // ..against a dupq
741   auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
742   if (!DupQLane ||
743       DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
744     return std::nullopt;
745 
746   // Where the dupq is a lane 0 replicate of a vector insert
747   if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
748     return std::nullopt;
749 
750   auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
751   if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
752     return std::nullopt;
753 
754   // Where the vector insert is a fixed constant vector insert into undef at
755   // index zero
756   if (!isa<UndefValue>(VecIns->getArgOperand(0)))
757     return std::nullopt;
758 
759   if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
760     return std::nullopt;
761 
762   auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
763   if (!ConstVec)
764     return std::nullopt;
765 
766   auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
767   auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
768   if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
769     return std::nullopt;
770 
771   unsigned NumElts = VecTy->getNumElements();
772   unsigned PredicateBits = 0;
773 
  // Expand the intrinsic operands to a 16-bit, byte-level predicate.
775   for (unsigned I = 0; I < NumElts; ++I) {
776     auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
777     if (!Arg)
778       return std::nullopt;
779     if (!Arg->isZero())
780       PredicateBits |= 1 << (I * (16 / NumElts));
781   }
782 
783   // If all bits are zero bail early with an empty predicate
784   if (PredicateBits == 0) {
785     auto *PFalse = Constant::getNullValue(II.getType());
786     PFalse->takeName(&II);
787     return IC.replaceInstUsesWith(II, PFalse);
788   }
789 
790   // Calculate largest predicate type used (where byte predicate is largest)
791   unsigned Mask = 8;
792   for (unsigned I = 0; I < 16; ++I)
793     if ((PredicateBits & (1 << I)) != 0)
794       Mask |= (I % 8);
795 
796   unsigned PredSize = Mask & -Mask;
797   auto *PredType = ScalableVectorType::get(
798       Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
799 
800   // Ensure all relevant bits are set
801   for (unsigned I = 0; I < 16; I += PredSize)
802     if ((PredicateBits & (1 << I)) == 0)
803       return std::nullopt;
804 
805   auto *PTruePat =
806       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
807   auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
808                                         {PredType}, {PTruePat});
809   auto *ConvertToSVBool = Builder.CreateIntrinsic(
810       Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
811   auto *ConvertFromSVBool =
812       Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
813                               {II.getType()}, {ConvertToSVBool});
814 
815   ConvertFromSVBool->takeName(&II);
816   return IC.replaceInstUsesWith(II, ConvertFromSVBool);
817 }
818 
819 static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
820                                                        IntrinsicInst &II) {
821   IRBuilder<> Builder(II.getContext());
822   Builder.SetInsertPoint(&II);
823   Value *Pg = II.getArgOperand(0);
824   Value *Vec = II.getArgOperand(1);
825   auto IntrinsicID = II.getIntrinsicID();
826   bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
827 
828   // lastX(splat(X)) --> X
829   if (auto *SplatVal = getSplatValue(Vec))
830     return IC.replaceInstUsesWith(II, SplatVal);
831 
832   // If x and/or y is a splat value then:
833   // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
834   Value *LHS, *RHS;
835   if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
836     if (isSplatValue(LHS) || isSplatValue(RHS)) {
837       auto *OldBinOp = cast<BinaryOperator>(Vec);
838       auto OpC = OldBinOp->getOpcode();
839       auto *NewLHS =
840           Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
841       auto *NewRHS =
842           Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
843       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
844           OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
845       return IC.replaceInstUsesWith(II, NewBinOp);
846     }
847   }
848 
849   auto *C = dyn_cast<Constant>(Pg);
850   if (IsAfter && C && C->isNullValue()) {
851     // The intrinsic is extracting lane 0 so use an extract instead.
852     auto *IdxTy = Type::getInt64Ty(II.getContext());
853     auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
854     Extract->insertBefore(&II);
855     Extract->takeName(&II);
856     return IC.replaceInstUsesWith(II, Extract);
857   }
858 
859   auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
860   if (!IntrPG)
861     return std::nullopt;
862 
863   if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
864     return std::nullopt;
865 
866   const auto PTruePattern =
867       cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
868 
869   // Can the intrinsic's predicate be converted to a known constant index?
870   unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
871   if (!MinNumElts)
872     return std::nullopt;
873 
874   unsigned Idx = MinNumElts - 1;
875   // Increment the index if extracting the element after the last active
876   // predicate element.
877   if (IsAfter)
878     ++Idx;
879 
880   // Ignore extracts whose index is larger than the known minimum vector
881   // length. NOTE: This is an artificial constraint where we prefer to
882   // maintain what the user asked for until an alternative is proven faster.
883   auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
884   if (Idx >= PgVTy->getMinNumElements())
885     return std::nullopt;
886 
887   // The intrinsic is extracting a fixed lane so use an extract instead.
888   auto *IdxTy = Type::getInt64Ty(II.getContext());
889   auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
890   Extract->insertBefore(&II);
891   Extract->takeName(&II);
892   return IC.replaceInstUsesWith(II, Extract);
893 }
894 
895 static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
896                                                            IntrinsicInst &II) {
897   // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
898   // integer variant across a variety of micro-architectures. Replace scalar
899   // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
900   // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
901   // depending on the micro-architecture, but has been observed as generally
902   // being faster, particularly when the CLAST[AB] op is a loop-carried
903   // dependency.
904   IRBuilder<> Builder(II.getContext());
905   Builder.SetInsertPoint(&II);
906   Value *Pg = II.getArgOperand(0);
907   Value *Fallback = II.getArgOperand(1);
908   Value *Vec = II.getArgOperand(2);
909   Type *Ty = II.getType();
910 
911   if (!Ty->isIntegerTy())
912     return std::nullopt;
913 
914   Type *FPTy;
915   switch (cast<IntegerType>(Ty)->getBitWidth()) {
916   default:
917     return std::nullopt;
918   case 16:
919     FPTy = Builder.getHalfTy();
920     break;
921   case 32:
922     FPTy = Builder.getFloatTy();
923     break;
924   case 64:
925     FPTy = Builder.getDoubleTy();
926     break;
927   }
928 
929   Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
930   auto *FPVTy = VectorType::get(
931       FPTy, cast<VectorType>(Vec->getType())->getElementCount());
932   Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
933   auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
934                                        {Pg, FPFallBack, FPVec});
935   Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
936   return IC.replaceInstUsesWith(II, FPIItoInt);
937 }
938 
939 static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
940                                                      IntrinsicInst &II) {
941   LLVMContext &Ctx = II.getContext();
942   IRBuilder<> Builder(Ctx);
943   Builder.SetInsertPoint(&II);
944   // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
945   // can work with RDFFR_PP for ptest elimination.
946   auto *AllPat =
947       ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
948   auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
949                                         {II.getType()}, {AllPat});
950   auto *RDFFR =
951       Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
952   RDFFR->takeName(&II);
953   return IC.replaceInstUsesWith(II, RDFFR);
954 }
955 
956 static std::optional<Instruction *>
957 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
958   const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
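  // E.g. (illustrative) cntw(all) becomes vscale * 4, cntw(vl2) becomes the
  // constant 2, and patterns such as vl8 that exceed the minimum element
  // count are left untouched.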
959 
960   if (Pattern == AArch64SVEPredPattern::all) {
961     LLVMContext &Ctx = II.getContext();
962     IRBuilder<> Builder(Ctx);
963     Builder.SetInsertPoint(&II);
964 
965     Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
966     auto *VScale = Builder.CreateVScale(StepVal);
967     VScale->takeName(&II);
968     return IC.replaceInstUsesWith(II, VScale);
969   }
970 
971   unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
972 
973   return MinNumElts && NumElts >= MinNumElts
974              ? std::optional<Instruction *>(IC.replaceInstUsesWith(
975                    II, ConstantInt::get(II.getType(), MinNumElts)))
976              : std::nullopt;
977 }
978 
979 static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
980                                                         IntrinsicInst &II) {
981   Value *PgVal = II.getArgOperand(0);
982   Value *OpVal = II.getArgOperand(1);
983 
984   IRBuilder<> Builder(II.getContext());
985   Builder.SetInsertPoint(&II);
986 
987   // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
988   // Later optimizations prefer this form.
989   if (PgVal == OpVal &&
990       (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
991        II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
992     Value *Ops[] = {PgVal, OpVal};
993     Type *Tys[] = {PgVal->getType()};
994 
995     auto *PTest =
996         Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
997     PTest->takeName(&II);
998 
999     return IC.replaceInstUsesWith(II, PTest);
1000   }
1001 
1002   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1003   IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1004 
1005   if (!Pg || !Op)
1006     return std::nullopt;
1007 
1008   Intrinsic::ID OpIID = Op->getIntrinsicID();
1009 
1010   if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1011       OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1012       Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1013     Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1014     Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1015 
1016     auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1017 
1018     PTest->takeName(&II);
1019     return IC.replaceInstUsesWith(II, PTest);
1020   }
1021 
  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
  // Later optimizations may rewrite the sequence to use the flag-setting
  // variant of instruction X to remove PTEST.
1025   if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1026       ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1027        (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1028        (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1029        (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1030        (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1031        (OpIID == Intrinsic::aarch64_sve_and_z) ||
1032        (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1033        (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1034        (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1035        (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1036        (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1037        (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1038     Value *Ops[] = {Pg->getArgOperand(0), Pg};
1039     Type *Tys[] = {Pg->getType()};
1040 
1041     auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1042     PTest->takeName(&II);
1043 
1044     return IC.replaceInstUsesWith(II, PTest);
1045   }
1046 
1047   return std::nullopt;
1048 }
1049 
template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
1051 static std::optional<Instruction *>
1052 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1053                                   bool MergeIntoAddendOp) {
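  // Fold a predicated multiply feeding a predicated add/sub into a single
  // fused multiply-accumulate, e.g. (illustrative)
  //   fadd(p, a, fmul(p, b, c)) --> fmla(p, a, b, c)
  // provided the multiply has no other users and, for FP, the fast-math
  // flags allow contraction.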
1054   Value *P = II.getOperand(0);
1055   Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1056   if (MergeIntoAddendOp) {
1057     AddendOp = II.getOperand(1);
1058     Mul = II.getOperand(2);
1059   } else {
1060     AddendOp = II.getOperand(2);
1061     Mul = II.getOperand(1);
1062   }
1063 
1064   if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1065                                       m_Value(MulOp1))))
1066     return std::nullopt;
1067 
1068   if (!Mul->hasOneUse())
1069     return std::nullopt;
1070 
1071   Instruction *FMFSource = nullptr;
1072   if (II.getType()->isFPOrFPVectorTy()) {
1073     llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1074     // Stop the combine when the flags on the inputs differ in case dropping
1075     // flags would lead to us missing out on more beneficial optimizations.
1076     if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1077       return std::nullopt;
1078     if (!FAddFlags.allowContract())
1079       return std::nullopt;
1080     FMFSource = &II;
1081   }
1082 
1083   IRBuilder<> Builder(II.getContext());
1084   Builder.SetInsertPoint(&II);
1085 
1086   CallInst *Res;
1087   if (MergeIntoAddendOp)
1088     Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1089                                   {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1090   else
1091     Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1092                                   {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1093 
1094   return IC.replaceInstUsesWith(II, Res);
1095 }
1096 
1097 static bool isAllActivePredicate(Value *Pred) {
  // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1099   Value *UncastedPred;
1100   if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1101                       m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1102                           m_Value(UncastedPred)))))
    // If the predicate has the same or fewer lanes than the uncasted
    // predicate, then we know the casting has no effect.
1105     if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1106         cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1107       Pred = UncastedPred;
1108 
1109   return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1110                          m_ConstantInt<AArch64SVEPredPattern::all>()));
1111 }
1112 
1113 static std::optional<Instruction *>
1114 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1115   IRBuilder<> Builder(II.getContext());
1116   Builder.SetInsertPoint(&II);
1117 
1118   Value *Pred = II.getOperand(0);
1119   Value *PtrOp = II.getOperand(1);
1120   Type *VecTy = II.getType();
1121   Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1122 
1123   if (isAllActivePredicate(Pred)) {
1124     LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1125     Load->copyMetadata(II);
1126     return IC.replaceInstUsesWith(II, Load);
1127   }
1128 
1129   CallInst *MaskedLoad =
1130       Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1131                                Pred, ConstantAggregateZero::get(VecTy));
1132   MaskedLoad->copyMetadata(II);
1133   return IC.replaceInstUsesWith(II, MaskedLoad);
1134 }
1135 
1136 static std::optional<Instruction *>
1137 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1138   IRBuilder<> Builder(II.getContext());
1139   Builder.SetInsertPoint(&II);
1140 
1141   Value *VecOp = II.getOperand(0);
1142   Value *Pred = II.getOperand(1);
1143   Value *PtrOp = II.getOperand(2);
1144   Value *VecPtr =
1145       Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1146 
1147   if (isAllActivePredicate(Pred)) {
1148     StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1149     Store->copyMetadata(II);
1150     return IC.eraseInstFromFunction(II);
1151   }
1152 
1153   CallInst *MaskedStore = Builder.CreateMaskedStore(
1154       VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1155   MaskedStore->copyMetadata(II);
1156   return IC.eraseInstFromFunction(II);
1157 }
1158 
1159 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1160   switch (Intrinsic) {
1161   case Intrinsic::aarch64_sve_fmul:
1162     return Instruction::BinaryOps::FMul;
1163   case Intrinsic::aarch64_sve_fadd:
1164     return Instruction::BinaryOps::FAdd;
1165   case Intrinsic::aarch64_sve_fsub:
1166     return Instruction::BinaryOps::FSub;
1167   default:
1168     return Instruction::BinaryOpsEnd;
1169   }
1170 }
1171 
1172 static std::optional<Instruction *>
1173 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
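  // When the governing predicate is an all-active ptrue, a predicated SVE
  // arithmetic intrinsic is equivalent to the plain IR binary operator,
  // e.g. (illustrative) sve.fmul(ptrue(all), %a, %b) --> fmul %a, %b.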
1174   auto *OpPredicate = II.getOperand(0);
1175   auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1176   if (BinOpCode == Instruction::BinaryOpsEnd ||
1177       !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1178                               m_ConstantInt<AArch64SVEPredPattern::all>())))
1179     return std::nullopt;
1180   IRBuilder<> Builder(II.getContext());
1181   Builder.SetInsertPoint(&II);
1182   Builder.setFastMathFlags(II.getFastMathFlags());
1183   auto BinOp =
1184       Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1185   return IC.replaceInstUsesWith(II, BinOp);
1186 }
1187 
1188 static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1189                                                             IntrinsicInst &II) {
1190   if (auto FMLA =
1191           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1192                                             Intrinsic::aarch64_sve_fmla>(IC, II,
1193                                                                          true))
1194     return FMLA;
1195   if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1196                                                    Intrinsic::aarch64_sve_mla>(
1197           IC, II, true))
1198     return MLA;
1199   if (auto FMAD =
1200           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1201                                             Intrinsic::aarch64_sve_fmad>(IC, II,
1202                                                                          false))
1203     return FMAD;
1204   if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1205                                                    Intrinsic::aarch64_sve_mad>(
1206           IC, II, false))
1207     return MAD;
1208   return instCombineSVEVectorBinOp(IC, II);
1209 }
1210 
1211 static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1212                                                             IntrinsicInst &II) {
1213   if (auto FMLS =
1214           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1215                                             Intrinsic::aarch64_sve_fmls>(IC, II,
1216                                                                          true))
1217     return FMLS;
1218   if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1219                                                    Intrinsic::aarch64_sve_mls>(
1220           IC, II, true))
1221     return MLS;
1222   if (auto FMSB =
1223           instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1224                                             Intrinsic::aarch64_sve_fnmsb>(
1225               IC, II, false))
1226     return FMSB;
1227   return instCombineSVEVectorBinOp(IC, II);
1228 }
1229 
1230 static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1231                                                             IntrinsicInst &II) {
1232   auto *OpPredicate = II.getOperand(0);
1233   auto *OpMultiplicand = II.getOperand(1);
1234   auto *OpMultiplier = II.getOperand(2);
1235 
1236   IRBuilder<> Builder(II.getContext());
1237   Builder.SetInsertPoint(&II);
1238 
1239   // Return true if a given instruction is a unit splat value, false otherwise.
1240   auto IsUnitSplat = [](auto *I) {
1241     auto *SplatValue = getSplatValue(I);
1242     if (!SplatValue)
1243       return false;
1244     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1245   };
1246 
1247   // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1248   // with a unit splat value, false otherwise.
1249   auto IsUnitDup = [](auto *I) {
1250     auto *IntrI = dyn_cast<IntrinsicInst>(I);
1251     if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1252       return false;
1253 
1254     auto *SplatValue = IntrI->getOperand(2);
1255     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1256   };
1257 
1258   if (IsUnitSplat(OpMultiplier)) {
1259     // [f]mul pg %n, (dupx 1) => %n
1260     OpMultiplicand->takeName(&II);
1261     return IC.replaceInstUsesWith(II, OpMultiplicand);
1262   } else if (IsUnitDup(OpMultiplier)) {
1263     // [f]mul pg %n, (dup pg 1) => %n
1264     auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1265     auto *DupPg = DupInst->getOperand(1);
1266     // TODO: this is naive. The optimization is still valid if DupPg
1267     // 'encompasses' OpPredicate, not only if they're the same predicate.
1268     if (OpPredicate == DupPg) {
1269       OpMultiplicand->takeName(&II);
1270       return IC.replaceInstUsesWith(II, OpMultiplicand);
1271     }
1272   }
1273 
1274   return instCombineSVEVectorBinOp(IC, II);
1275 }
1276 
1277 static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1278                                                          IntrinsicInst &II) {
1279   IRBuilder<> Builder(II.getContext());
1280   Builder.SetInsertPoint(&II);
1281   Value *UnpackArg = II.getArgOperand(0);
1282   auto *RetTy = cast<ScalableVectorType>(II.getType());
1283   bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1284                   II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1285 
1286   // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1287   // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1288   if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1289     ScalarArg =
1290         Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1291     Value *NewVal =
1292         Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1293     NewVal->takeName(&II);
1294     return IC.replaceInstUsesWith(II, NewVal);
1295   }
1296 
1297   return std::nullopt;
1298 }
1299 static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1300                                                       IntrinsicInst &II) {
1301   auto *OpVal = II.getOperand(0);
1302   auto *OpIndices = II.getOperand(1);
1303   VectorType *VTy = cast<VectorType>(II.getType());
1304 
  // Check whether OpIndices is a constant splat value that is less than the
  // minimum element count of the result.
1307   auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1308   if (!SplatValue ||
1309       SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1310     return std::nullopt;
1311 
  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1314   IRBuilder<> Builder(II.getContext());
1315   Builder.SetInsertPoint(&II);
1316   auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1317   auto *VectorSplat =
1318       Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1319 
1320   VectorSplat->takeName(&II);
1321   return IC.replaceInstUsesWith(II, VectorSplat);
1322 }
1323 
1324 static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1325                                                       IntrinsicInst &II) {
1326   // zip1(uzp1(A, B), uzp2(A, B)) --> A
1327   // zip2(uzp1(A, B), uzp2(A, B)) --> B
1328   Value *A, *B;
1329   if (match(II.getArgOperand(0),
1330             m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1331       match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1332                                      m_Specific(A), m_Specific(B))))
1333     return IC.replaceInstUsesWith(
1334         II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1335 
1336   return std::nullopt;
1337 }
1338 
1339 static std::optional<Instruction *>
1340 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1341   Value *Mask = II.getOperand(0);
1342   Value *BasePtr = II.getOperand(1);
1343   Value *Index = II.getOperand(2);
1344   Type *Ty = II.getType();
1345   Value *PassThru = ConstantAggregateZero::get(Ty);
1346 
1347   // Contiguous gather => masked load.
1348   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1349   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1350   Value *IndexBase;
1351   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1352                        m_Value(IndexBase), m_SpecificInt(1)))) {
1353     IRBuilder<> Builder(II.getContext());
1354     Builder.SetInsertPoint(&II);
1355 
1356     Align Alignment =
1357         BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1358 
1359     Type *VecPtrTy = PointerType::getUnqual(Ty);
1360     Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1361                                    BasePtr, IndexBase);
1362     Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1363     CallInst *MaskedLoad =
1364         Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1365     MaskedLoad->takeName(&II);
1366     return IC.replaceInstUsesWith(II, MaskedLoad);
1367   }
1368 
1369   return std::nullopt;
1370 }
1371 
1372 static std::optional<Instruction *>
1373 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1374   Value *Val = II.getOperand(0);
1375   Value *Mask = II.getOperand(1);
1376   Value *BasePtr = II.getOperand(2);
1377   Value *Index = II.getOperand(3);
1378   Type *Ty = Val->getType();
1379 
1380   // Contiguous scatter => masked store.
1381   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1382   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1383   Value *IndexBase;
1384   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1385                        m_Value(IndexBase), m_SpecificInt(1)))) {
1386     IRBuilder<> Builder(II.getContext());
1387     Builder.SetInsertPoint(&II);
1388 
1389     Align Alignment =
1390         BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1391 
1392     Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1393                                    BasePtr, IndexBase);
1394     Type *VecPtrTy = PointerType::getUnqual(Ty);
1395     Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1396 
1397     (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1398 
1399     return IC.eraseInstFromFunction(II);
1400   }
1401 
1402   return std::nullopt;
1403 }
1404 
1405 static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1406                                                        IntrinsicInst &II) {
1407   IRBuilder<> Builder(II.getContext());
1408   Builder.SetInsertPoint(&II);
1409   Type *Int32Ty = Builder.getInt32Ty();
1410   Value *Pred = II.getOperand(0);
1411   Value *Vec = II.getOperand(1);
1412   Value *DivVec = II.getOperand(2);
1413 
1414   Value *SplatValue = getSplatValue(DivVec);
1415   ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1416   if (!SplatConstantInt)
1417     return std::nullopt;
1418   APInt Divisor = SplatConstantInt->getValue();
1419 
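  // A signed divide by a power of two maps directly onto SVE's ASRD
  // instruction; dividing by a negative power of two additionally negates
  // the result:
  //   sdiv(Pred, Vec, splat( 2^K)) --> asrd(Pred, Vec, K)
  //   sdiv(Pred, Vec, splat(-2^K)) --> neg(asrd(Pred, Vec, K))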
1420   if (Divisor.isPowerOf2()) {
1421     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1422     auto ASRD = Builder.CreateIntrinsic(
1423         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1424     return IC.replaceInstUsesWith(II, ASRD);
1425   }
1426   if (Divisor.isNegatedPowerOf2()) {
1427     Divisor.negate();
1428     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1429     auto ASRD = Builder.CreateIntrinsic(
1430         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1431     auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1432                                        {ASRD->getType()}, {ASRD, Pred, ASRD});
1433     return IC.replaceInstUsesWith(II, NEG);
1434   }
1435 
1436   return std::nullopt;
1437 }
1438 
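// Returns true if the vector of values can be reduced to a repeating pattern,
// shrinking Vec in place to that pattern: the second half of Vec must repeat
// the first half, e.g. (A, B, A, B) is reduced to (A, B). A nullptr entry
// stands for poison and, when AllowPoison is set, matches any value.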
static bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1440   size_t VecSize = Vec.size();
1441   if (VecSize == 1)
1442     return true;
1443   if (!isPowerOf2_64(VecSize))
1444     return false;
1445   size_t HalfVecSize = VecSize / 2;
1446 
1447   for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1448        RHS != Vec.end(); LHS++, RHS++) {
1449     if (*LHS != nullptr && *RHS != nullptr) {
1450       if (*LHS == *RHS)
1451         continue;
1452       else
1453         return false;
1454     }
1455     if (!AllowPoison)
1456       return false;
1457     if (*LHS == nullptr && *RHS != nullptr)
1458       *LHS = *RHS;
1459   }
1460 
1461   Vec.resize(HalfVecSize);
1462   SimplifyValuePattern(Vec, AllowPoison);
1463   return true;
1464 }
1465 
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B.
1468 static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1469                                                            IntrinsicInst &II) {
1470   Value *CurrentInsertElt = nullptr, *Default = nullptr;
1471   if (!match(II.getOperand(0),
1472              m_Intrinsic<Intrinsic::vector_insert>(
1473                  m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1474       !isa<FixedVectorType>(CurrentInsertElt->getType()))
1475     return std::nullopt;
1476   auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1477 
1478   // Insert the scalars into a container ordered by InsertElement index
1479   SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1480   while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1481     auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1482     Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1483     CurrentInsertElt = InsertElt->getOperand(0);
1484   }
1485 
1486   bool AllowPoison =
1487       isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1488   if (!SimplifyValuePattern(Elts, AllowPoison))
1489     return std::nullopt;
1490 
1491   // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1492   IRBuilder<> Builder(II.getContext());
1493   Builder.SetInsertPoint(&II);
1494   Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1495   for (size_t I = 0; I < Elts.size(); I++) {
1496     if (Elts[I] == nullptr)
1497       continue;
1498     InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
1499                                                  Builder.getInt64(I));
1500   }
1501   if (InsertEltChain == nullptr)
1502     return std::nullopt;
1503 
  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one
  // i64 value or (f16 a, f16 b) as one i32 value. This requires an
  // InsertSubvector to be bitcast to a type wide enough to fit the sequence,
  // splatted, and then narrowed back to the original type.
1508   unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1509   unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1510                                  IIScalableTy->getMinNumElements() /
1511                                  PatternWidth;
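  // For example, a <vscale x 8 x half> dupqlane whose insert chain simplifies
  // to two elements gives PatternWidth = 16 * 2 = 32 and PatternElementCount
  // = 16 * 8 / 32 = 4, i.e. the (A, B) pair is splatted as an nxv4i32.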
1512 
1513   IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
1514   auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1515   auto *WideShuffleMaskTy =
1516       ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
1517 
1518   auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
1519   auto InsertSubvector = Builder.CreateInsertVector(
1520       II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1521   auto WideBitcast =
1522       Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1523   auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1524   auto WideShuffle = Builder.CreateShuffleVector(
1525       WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1526   auto NarrowBitcast =
1527       Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1528 
1529   return IC.replaceInstUsesWith(II, NarrowBitcast);
1530 }
1531 
1532 static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1533                                                         IntrinsicInst &II) {
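  // fmaxnm(X, X) and fminnm(X, X) are folded to X.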
1534   Value *A = II.getArgOperand(0);
1535   Value *B = II.getArgOperand(1);
1536   if (A == B)
1537     return IC.replaceInstUsesWith(II, A);
1538 
1539   return std::nullopt;
1540 }
1541 
1542 static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1543                                                         IntrinsicInst &II) {
1544   IRBuilder<> Builder(&II);
1545   Value *Pred = II.getOperand(0);
1546   Value *Vec = II.getOperand(1);
1547   Value *Shift = II.getOperand(2);
1548 
1549   // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1550   Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())))
    return std::nullopt;
1557 
1558   // Transform is valid if any of the following are true:
  // * The ABS merge value is undef or non-negative
1560   // * The ABS predicate is all active
1561   // * The ABS predicate and the SRSHL predicates are the same
1562   if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1563       AbsPred != Pred && !isAllActivePredicate(AbsPred))
1564     return std::nullopt;
1565 
1566   // Only valid when the shift amount is non-negative, otherwise the rounding
1567   // behaviour of SRSHL cannot be ignored.
1568   if (!match(Shift, m_NonNegative()))
1569     return std::nullopt;
1570 
1571   auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1572                                      {Pred, Vec, Shift});
1573 
1574   return IC.replaceInstUsesWith(II, LSL);
1575 }
1576 
1577 std::optional<Instruction *>
1578 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1579                                      IntrinsicInst &II) const {
1580   Intrinsic::ID IID = II.getIntrinsicID();
1581   switch (IID) {
1582   default:
1583     break;
1584   case Intrinsic::aarch64_neon_fmaxnm:
1585   case Intrinsic::aarch64_neon_fminnm:
1586     return instCombineMaxMinNM(IC, II);
1587   case Intrinsic::aarch64_sve_convert_from_svbool:
1588     return instCombineConvertFromSVBool(IC, II);
1589   case Intrinsic::aarch64_sve_dup:
1590     return instCombineSVEDup(IC, II);
1591   case Intrinsic::aarch64_sve_dup_x:
1592     return instCombineSVEDupX(IC, II);
1593   case Intrinsic::aarch64_sve_cmpne:
1594   case Intrinsic::aarch64_sve_cmpne_wide:
1595     return instCombineSVECmpNE(IC, II);
1596   case Intrinsic::aarch64_sve_rdffr:
1597     return instCombineRDFFR(IC, II);
1598   case Intrinsic::aarch64_sve_lasta:
1599   case Intrinsic::aarch64_sve_lastb:
1600     return instCombineSVELast(IC, II);
1601   case Intrinsic::aarch64_sve_clasta_n:
1602   case Intrinsic::aarch64_sve_clastb_n:
1603     return instCombineSVECondLast(IC, II);
1604   case Intrinsic::aarch64_sve_cntd:
1605     return instCombineSVECntElts(IC, II, 2);
1606   case Intrinsic::aarch64_sve_cntw:
1607     return instCombineSVECntElts(IC, II, 4);
1608   case Intrinsic::aarch64_sve_cnth:
1609     return instCombineSVECntElts(IC, II, 8);
1610   case Intrinsic::aarch64_sve_cntb:
1611     return instCombineSVECntElts(IC, II, 16);
1612   case Intrinsic::aarch64_sve_ptest_any:
1613   case Intrinsic::aarch64_sve_ptest_first:
1614   case Intrinsic::aarch64_sve_ptest_last:
1615     return instCombineSVEPTest(IC, II);
1616   case Intrinsic::aarch64_sve_mul:
1617   case Intrinsic::aarch64_sve_fmul:
1618     return instCombineSVEVectorMul(IC, II);
1619   case Intrinsic::aarch64_sve_fadd:
1620   case Intrinsic::aarch64_sve_add:
1621     return instCombineSVEVectorAdd(IC, II);
1622   case Intrinsic::aarch64_sve_fsub:
1623   case Intrinsic::aarch64_sve_sub:
1624     return instCombineSVEVectorSub(IC, II);
1625   case Intrinsic::aarch64_sve_tbl:
1626     return instCombineSVETBL(IC, II);
1627   case Intrinsic::aarch64_sve_uunpkhi:
1628   case Intrinsic::aarch64_sve_uunpklo:
1629   case Intrinsic::aarch64_sve_sunpkhi:
1630   case Intrinsic::aarch64_sve_sunpklo:
1631     return instCombineSVEUnpack(IC, II);
1632   case Intrinsic::aarch64_sve_zip1:
1633   case Intrinsic::aarch64_sve_zip2:
1634     return instCombineSVEZip(IC, II);
1635   case Intrinsic::aarch64_sve_ld1_gather_index:
1636     return instCombineLD1GatherIndex(IC, II);
1637   case Intrinsic::aarch64_sve_st1_scatter_index:
1638     return instCombineST1ScatterIndex(IC, II);
1639   case Intrinsic::aarch64_sve_ld1:
1640     return instCombineSVELD1(IC, II, DL);
1641   case Intrinsic::aarch64_sve_st1:
1642     return instCombineSVEST1(IC, II, DL);
1643   case Intrinsic::aarch64_sve_sdiv:
1644     return instCombineSVESDIV(IC, II);
1645   case Intrinsic::aarch64_sve_sel:
1646     return instCombineSVESel(IC, II);
1647   case Intrinsic::aarch64_sve_srshl:
1648     return instCombineSVESrshl(IC, II);
1649   case Intrinsic::aarch64_sve_dupq_lane:
1650     return instCombineSVEDupqLane(IC, II);
1651   }
1652 
1653   return std::nullopt;
1654 }
1655 
1656 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1657     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1658     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1659     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1660         SimplifyAndSetOp) const {
1661   switch (II.getIntrinsicID()) {
1662   default:
1663     break;
1664   case Intrinsic::aarch64_neon_fcvtxn:
1665   case Intrinsic::aarch64_neon_rshrn:
1666   case Intrinsic::aarch64_neon_sqrshrn:
1667   case Intrinsic::aarch64_neon_sqrshrun:
1668   case Intrinsic::aarch64_neon_sqshrn:
1669   case Intrinsic::aarch64_neon_sqshrun:
1670   case Intrinsic::aarch64_neon_sqxtn:
1671   case Intrinsic::aarch64_neon_sqxtun:
1672   case Intrinsic::aarch64_neon_uqrshrn:
1673   case Intrinsic::aarch64_neon_uqshrn:
1674   case Intrinsic::aarch64_neon_uqxtn:
1675     SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1676     break;
1677   }
1678 
1679   return std::nullopt;
1680 }
1681 
1682 TypeSize
1683 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1684   switch (K) {
1685   case TargetTransformInfo::RGK_Scalar:
1686     return TypeSize::getFixed(64);
1687   case TargetTransformInfo::RGK_FixedWidthVector:
1688     if (!ST->isStreamingSVEModeDisabled() &&
1689         !EnableFixedwidthAutovecInStreamingMode)
1690       return TypeSize::getFixed(0);
1691 
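    // SVE guarantees a vector length of at least 128 bits; prefer the
    // subtarget's configured minimum SVE vector length when it is larger.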
1692     if (ST->hasSVE())
1693       return TypeSize::getFixed(
1694           std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1695 
1696     return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1697   case TargetTransformInfo::RGK_ScalableVector:
    if (!ST->isStreamingSVEModeDisabled() &&
        !EnableScalableAutovecInStreamingMode)
1699       return TypeSize::getScalable(0);
1700 
1701     return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1702   }
1703   llvm_unreachable("Unsupported register kind");
1704 }
1705 
1706 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1707                                            ArrayRef<const Value *> Args) {
1708 
  // A helper that returns a vector type from the given type. The element
  // count is taken from DstTy; only the scalar type of the argument is used.
1711   auto toVectorTy = [&](Type *ArgTy) {
1712     return VectorType::get(ArgTy->getScalarType(),
1713                            cast<VectorType>(DstTy)->getElementCount());
1714   };
1715 
1716   // Exit early if DstTy is not a vector type whose elements are at least
1717   // 16-bits wide. SVE doesn't generally have the same set of instructions to
1718   // perform an extend with the add/sub/mul. There are SMULLB style
1719   // instructions, but they operate on top/bottom, requiring some sort of lane
1720   // interleaving to be used with zext/sext.
1721   if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1722     return false;
1723 
1724   // Determine if the operation has a widening variant. We consider both the
1725   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1726   // instructions.
1727   //
1728   // TODO: Add additional widening operations (e.g., shl, etc.) once we
1729   //       verify that their extending operands are eliminated during code
1730   //       generation.
1731   switch (Opcode) {
1732   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1733   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1734   case Instruction::Mul: // SMULL(2), UMULL(2)
1735     break;
1736   default:
1737     return false;
1738   }
1739 
1740   // To be a widening instruction (either the "wide" or "long" versions), the
1741   // second operand must be a sign- or zero extend.
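  // For example, with <8 x i16> values:
  //   add %x, (zext <8 x i8> %y)           can use uaddw
  //   add (zext <8 x i8> %x), (zext %y)    can use uaddl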
1742   if (Args.size() != 2 ||
1743       (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1744     return false;
1745   auto *Extend = cast<CastInst>(Args[1]);
1746   auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1747 
  // Unlike add/sub, mul only has a "long" (mull) version and no "wide" form,
  // so both operands need to be the same kind of extend from the same type.
1750   if (Opcode == Instruction::Mul &&
1751       (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1752        Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1753     return false;
1754 
1755   // Legalize the destination type and ensure it can be used in a widening
1756   // operation.
1757   auto DstTyL = getTypeLegalizationCost(DstTy);
1758   unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1759   if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1760     return false;
1761 
1762   // Legalize the source type and ensure it can be used in a widening
1763   // operation.
1764   auto *SrcTy = toVectorTy(Extend->getSrcTy());
1765   auto SrcTyL = getTypeLegalizationCost(SrcTy);
1766   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1767   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1768     return false;
1769 
1770   // Get the total number of vector elements in the legalized types.
1771   InstructionCost NumDstEls =
1772       DstTyL.first * DstTyL.second.getVectorMinNumElements();
1773   InstructionCost NumSrcEls =
1774       SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1775 
1776   // Return true if the legalized types have the same number of vector elements
1777   // and the destination element type size is twice that of the source type.
1778   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1779 }
1780 
1781 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1782                                                  Type *Src,
1783                                                  TTI::CastContextHint CCH,
1784                                                  TTI::TargetCostKind CostKind,
1785                                                  const Instruction *I) {
1786   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1787   assert(ISD && "Invalid opcode");
1788 
1789   // If the cast is observable, and it is used by a widening instruction (e.g.,
1790   // uaddl, saddw, etc.), it may be free.
1791   if (I && I->hasOneUser()) {
1792     auto *SingleUser = cast<Instruction>(*I->user_begin());
1793     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1794     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1795       // If the cast is the second operand, it is free. We will generate either
1796       // a "wide" or "long" version of the widening instruction.
1797       if (I == SingleUser->getOperand(1))
1798         return 0;
1799       // If the cast is not the second operand, it will be free if it looks the
1800       // same as the second operand. In this case, we will generate a "long"
1801       // version of the widening instruction.
1802       if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1803         if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1804             cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1805           return 0;
1806     }
1807   }
1808 
1809   // TODO: Allow non-throughput costs that aren't binary.
1810   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1811     if (CostKind != TTI::TCK_RecipThroughput)
1812       return Cost == 0 ? 0 : 1;
1813     return Cost;
1814   };
1815 
1816   EVT SrcTy = TLI->getValueType(DL, Src);
1817   EVT DstTy = TLI->getValueType(DL, Dst);
1818 
1819   if (!SrcTy.isSimple() || !DstTy.isSimple())
1820     return AdjustCost(
1821         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1822 
1823   static const TypeConversionCostTblEntry
1824   ConversionTbl[] = {
1825     { ISD::TRUNCATE, MVT::v2i8,   MVT::v2i64,  1},  // xtn
1826     { ISD::TRUNCATE, MVT::v2i16,  MVT::v2i64,  1},  // xtn
1827     { ISD::TRUNCATE, MVT::v2i32,  MVT::v2i64,  1},  // xtn
1828     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i32,  1},  // xtn
1829     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i64,  3},  // 2 xtn + 1 uzp1
1830     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i32,  1},  // xtn
1831     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i64,  2},  // 1 uzp1 + 1 xtn
1832     { ISD::TRUNCATE, MVT::v4i32,  MVT::v4i64,  1},  // 1 uzp1
1833     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i16,  1},  // 1 xtn
1834     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i32,  2},  // 1 uzp1 + 1 xtn
1835     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i64,  4},  // 3 x uzp1 + xtn
1836     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i32,  1},  // 1 uzp1
1837     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i64,  3},  // 3 x uzp1
1838     { ISD::TRUNCATE, MVT::v8i32,  MVT::v8i64,  2},  // 2 x uzp1
1839     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i16, 1},  // uzp1
1840     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i32, 3},  // (2 + 1) x uzp1
1841     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
1842     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2},  // 2 x uzp1
1843     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6},  // (4 + 2) x uzp1
1844     { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4},  // 4 x uzp1
1845 
1846     // Truncations on nxvmiN
1847     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
1848     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
1849     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
1850     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
1851     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
1852     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
1853     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
1854     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
1855     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
1856     { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
1857     { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
1858     { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
1859     { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
1860     { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
1861     { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
1862     { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
1863 
1864     // The number of shll instructions for the extension.
1865     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1866     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
1867     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
1868     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
1869     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
1870     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
1871     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
1872     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
1873     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
1874     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
1875     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
1876     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
1877     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1878     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
1879     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1880     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
1881 
1882     // LowerVectorINT_TO_FP:
1883     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1884     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1885     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1886     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
1887     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
1888     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
1889 
1890     // Complex: to v2f32
1891     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
1892     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1893     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1894     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
1895     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
1896     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
1897 
1898     // Complex: to v4f32
1899     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
1900     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1901     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
1902     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
1903 
1904     // Complex: to v8f32
1905     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
1906     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1907     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
1908     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
1909 
1910     // Complex: to v16f32
1911     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1912     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
1913 
1914     // Complex: to v2f64
1915     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
1916     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1917     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1918     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
1919     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
1920     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
1921 
1922     // Complex: to v4f64
1923     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
1924     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
1925 
1926     // LowerVectorFP_TO_INT
1927     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
1928     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
1929     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
1930     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
1931     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
1932     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
1933 
1934     // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1935     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
1936     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
1937     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
1938     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
1939     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
1940     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
1941 
1942     // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1943     { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
1944     { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
1945     { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
1946     { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
1947 
1948     // Complex, from nxv2f32.
1949     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1950     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1951     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1952     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
1953     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
1954     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
1955     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
1956     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
1957 
1958     // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1959     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
1960     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
1961     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
1962     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
1963     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
1964     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
1965 
1966     // Complex, from nxv2f64.
1967     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1968     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1969     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1970     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
1971     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
1972     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
1973     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
1974     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
1975 
1976     // Complex, from nxv4f32.
1977     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1978     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1979     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1980     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
1981     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
1982     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
1983     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
1984     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
1985 
1986     // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1987     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1988     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
1989     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
1990     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
1991 
1992     // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1993     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1994     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1995     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
1996     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
1997     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
1998     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
1999 
2000     // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2001     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2002     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2003     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2004     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2005 
2006     // Complex, from nxv8f16.
2007     { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2008     { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2009     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2010     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2011     { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2012     { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2013     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2014     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2015 
2016     // Complex, from nxv4f16.
2017     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2018     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2019     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2020     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2021     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2022     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2023     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2024     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2025 
2026     // Complex, from nxv2f16.
2027     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2028     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2029     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2030     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2031     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2032     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2033     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2034     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2035 
2036     // Truncate from nxvmf32 to nxvmf16.
2037     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2038     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2039     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2040 
2041     // Truncate from nxvmf64 to nxvmf16.
2042     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2043     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2044     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2045 
2046     // Truncate from nxvmf64 to nxvmf32.
2047     { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2048     { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2049     { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2050 
2051     // Extend from nxvmf16 to nxvmf32.
2052     { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2053     { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2054     { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2055 
2056     // Extend from nxvmf16 to nxvmf64.
2057     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2058     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2059     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2060 
2061     // Extend from nxvmf32 to nxvmf64.
2062     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2063     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2064     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2065 
2066     // Bitcasts from float to integer
2067     { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2068     { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2069     { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2070 
2071     // Bitcasts from integer to float
2072     { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2073     { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2074     { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2075   };
2076 
2077   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2078                                                  DstTy.getSimpleVT(),
2079                                                  SrcTy.getSimpleVT()))
2080     return AdjustCost(Entry->Cost);
2081 
2082   static const TypeConversionCostTblEntry FP16Tbl[] = {
2083       {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2084       {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2085       {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2086       {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2087       {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2088       {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2089       {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2090       {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2091       {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2092       {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2093       {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2094       {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2095       {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2096       {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2097       {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2098       {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2099       {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2100       {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2101       {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // ushll + ucvtf
2102       {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // sshll + scvtf
2103       {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2104       {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2105   };
2106 
2107   if (ST->hasFullFP16())
2108     if (const auto *Entry = ConvertCostTableLookup(
2109             FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2110       return AdjustCost(Entry->Cost);
2111 
2112   return AdjustCost(
2113       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2114 }
2115 
2116 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2117                                                          Type *Dst,
2118                                                          VectorType *VecTy,
2119                                                          unsigned Index) {
2120 
2121   // Make sure we were given a valid extend opcode.
2122   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2123          "Invalid opcode");
2124 
2125   // We are extending an element we extract from a vector, so the source type
2126   // of the extend is the element type of the vector.
2127   auto *Src = VecTy->getElementType();
2128 
2129   // Sign- and zero-extends are for integer types only.
2130   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2131 
2132   // Get the cost for the extract. We compute the cost (if any) for the extend
2133   // below.
2134   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2135   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2136                                             CostKind, Index, nullptr, nullptr);
2137 
2138   // Legalize the types.
2139   auto VecLT = getTypeLegalizationCost(VecTy);
2140   auto DstVT = TLI->getValueType(DL, Dst);
2141   auto SrcVT = TLI->getValueType(DL, Src);
2142 
2143   // If the resulting type is still a vector and the destination type is legal,
2144   // we may get the extension for free. If not, get the default cost for the
2145   // extend.
2146   if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2147     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2148                                    CostKind);
2149 
2150   // The destination type should be larger than the element type. If not, get
2151   // the default cost for the extend.
2152   if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2153     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2154                                    CostKind);
2155 
2156   switch (Opcode) {
2157   default:
2158     llvm_unreachable("Opcode should be either SExt or ZExt");
2159 
2160   // For sign-extends, we only need a smov, which performs the extension
2161   // automatically.
2162   case Instruction::SExt:
2163     return Cost;
2164 
2165   // For zero-extends, the extend is performed automatically by a umov unless
2166   // the destination type is i64 and the element type is i8 or i16.
2167   case Instruction::ZExt:
2168     if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2169       return Cost;
2170   }
2171 
2172   // If we are unable to perform the extend for free, get the default cost.
2173   return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2174                                  CostKind);
2175 }
2176 
2177 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2178                                                TTI::TargetCostKind CostKind,
2179                                                const Instruction *I) {
2180   if (CostKind != TTI::TCK_RecipThroughput)
2181     return Opcode == Instruction::PHI ? 0 : 1;
2182   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2183   // Branches are assumed to be predicted.
2184   return 0;
2185 }
2186 
2187 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
2188                                                          unsigned Index,
2189                                                          bool HasRealUse) {
2190   assert(Val->isVectorTy() && "This must be a vector type");
2191 
2192   if (Index != -1U) {
2193     // Legalize the type.
2194     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2195 
2196     // This type is legalized to a scalar type.
2197     if (!LT.second.isVector())
2198       return 0;
2199 
2200     // The type may be split. For fixed-width vectors we can normalize the
2201     // index to the new type.
2202     if (LT.second.isFixedLengthVector()) {
2203       unsigned Width = LT.second.getVectorNumElements();
2204       Index = Index % Width;
2205     }
2206 
2207     // The element at index zero is already inside the vector.
2208     // - For a physical (HasRealUse==true) insert-element or extract-element
2209     // instruction that extracts integers, an explicit FPR -> GPR move is
2210     // needed. So it has non-zero cost.
2211     // - For the rest of cases (virtual instruction or element type is float),
2212     // consider the instruction free.
2213     //
2214     // FIXME:
2215     // If the extract-element and insert-element instructions could be
2216     // simplified away (e.g., could be combined into users by looking at use-def
2217     // context), they have no cost. This is not done in the first place for
2218     // compile-time considerations.
2219     if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2220       return 0;
2221   }
2222 
2223   // All other insert/extracts cost this much.
2224   return ST->getVectorInsertExtractBaseCost();
2225 }
2226 
2227 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2228                                                    TTI::TargetCostKind CostKind,
2229                                                    unsigned Index, Value *Op0,
2230                                                    Value *Op1) {
2231   return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
2232 }
2233 
2234 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2235                                                    Type *Val,
2236                                                    TTI::TargetCostKind CostKind,
2237                                                    unsigned Index) {
2238   return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
2239 }
2240 
2241 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2242     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2243     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2244     ArrayRef<const Value *> Args,
2245     const Instruction *CxtI) {
2246 
2247   // TODO: Handle more cost kinds.
2248   if (CostKind != TTI::TCK_RecipThroughput)
2249     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2250                                          Op2Info, Args, CxtI);
2251 
2252   // Legalize the type.
2253   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2254   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2255 
2256   switch (ISD) {
2257   default:
2258     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2259                                          Op2Info);
2260   case ISD::SDIV:
2261     if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
      // On AArch64, scalar signed division by a power-of-two constant is
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties may not be the same as those of the
      // previous operation; conservatively assume OP_None.
2266       InstructionCost Cost = getArithmeticInstrCost(
2267           Instruction::Add, Ty, CostKind,
2268           Op1Info.getNoProps(), Op2Info.getNoProps());
2269       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2270                                      Op1Info.getNoProps(), Op2Info.getNoProps());
2271       Cost += getArithmeticInstrCost(
2272           Instruction::Select, Ty, CostKind,
2273           Op1Info.getNoProps(), Op2Info.getNoProps());
2274       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2275                                      Op1Info.getNoProps(), Op2Info.getNoProps());
2276       return Cost;
2277     }
2278     [[fallthrough]];
2279   case ISD::UDIV: {
2280     if (Op2Info.isConstant() && Op2Info.isUniform()) {
2281       auto VT = TLI->getValueType(DL, Ty);
2282       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
        // Vector signed division by a constant is expanded to the sequence
        // MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division to
        // MULHU + SUB + SRL + ADD + SRL.
2286         InstructionCost MulCost = getArithmeticInstrCost(
2287             Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2288         InstructionCost AddCost = getArithmeticInstrCost(
2289             Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2290         InstructionCost ShrCost = getArithmeticInstrCost(
2291             Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2292         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2293       }
2294     }
2295 
2296     InstructionCost Cost = BaseT::getArithmeticInstrCost(
2297         Opcode, Ty, CostKind, Op1Info, Op2Info);
2298     if (Ty->isVectorTy()) {
2299       if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
        // When SDIV/UDIV operations are lowered using SVE, the costs are
        // lower.
2302         if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2303                                                 ->getPrimitiveSizeInBits()
2304                                                 .getFixedValue() < 128) {
2305           EVT VT = TLI->getValueType(DL, Ty);
2306           static const CostTblEntry DivTbl[]{
2307               {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
2308               {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
2309               {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2310               {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
2311               {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
2312               {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2313 
2314           const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2315           if (nullptr != Entry)
2316             return Entry->Cost;
2317         }
2318         // For 8/16-bit elements, the cost is higher because the type
2319         // requires promotion and possibly splitting:
2320         if (LT.second.getScalarType() == MVT::i8)
2321           Cost *= 8;
2322         else if (LT.second.getScalarType() == MVT::i16)
2323           Cost *= 4;
2324         return Cost;
2325       } else {
        // If one of the operands is a uniform constant then the cost for each
        // element is the sum of the costs for insertion, extraction and the
        // division itself: insertion cost = 2, extraction cost = 2, and the
        // division is costed as the equivalent operation on the scalar type.
2330         if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2331             (Op2Info.isConstant() && Op2Info.isUniform())) {
2332           if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2333             InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2334                 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2335             return (4 + DivCost) * VTy->getNumElements();
2336           }
2337         }
2338         // On AArch64, without SVE, vector divisions are expanded
2339         // into scalar divisions of each pair of elements.
2340         Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2341                                        CostKind, Op1Info, Op2Info);
2342         Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2343                                        Op1Info, Op2Info);
2344       }
2345 
2346       // TODO: if one of the arguments is scalar, then it's not necessary to
2347       // double the cost of handling the vector elements.
2348       Cost += Cost;
2349     }
2350     return Cost;
2351   }
2352   case ISD::MUL:
    // When SVE is available, we can lower the v2i64 operation using the SVE
    // mul instruction, which has a lower cost.
2355     if (LT.second == MVT::v2i64 && ST->hasSVE())
2356       return LT.first;
2357 
2358     // When SVE is not available, there is no MUL.2d instruction,
2359     // which means mul <2 x i64> is expensive as elements are extracted
2360     // from the vectors and the muls scalarized.
2361     // As getScalarizationOverhead is a bit too pessimistic, we
2362     // estimate the cost for a i64 vector directly here, which is:
2363     // - four 2-cost i64 extracts,
2364     // - two 2-cost i64 inserts, and
2365     // - two 1-cost muls.
    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2367     // LT.first = 2 the cost is 28. If both operands are extensions it will not
2368     // need to scalarize so the cost can be cheaper (smull or umull).
2370     if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2371       return LT.first;
2372     return LT.first * 14;
2373   case ISD::ADD:
2374   case ISD::XOR:
2375   case ISD::OR:
2376   case ISD::AND:
2377   case ISD::SRL:
2378   case ISD::SRA:
2379   case ISD::SHL:
2380     // These nodes are marked as 'custom' for combining purposes only.
2381     // We know that they are legal. See LowerAdd in ISelLowering.
2382     return LT.first;
2383 
2384   case ISD::FADD:
2385   case ISD::FSUB:
2386   case ISD::FMUL:
2387   case ISD::FDIV:
2388   case ISD::FNEG:
2389     // These nodes are marked as 'custom' just to lower them to SVE.
2390     // We know said lowering will incur no additional cost.
2391     if (!Ty->getScalarType()->isFP128Ty())
2392       return 2 * LT.first;
2393 
2394     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2395                                          Op2Info);
2396   }
2397 }
2398 
2399 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2400                                                           ScalarEvolution *SE,
2401                                                           const SCEV *Ptr) {
2402   // Address computations in vectorized code with non-consecutive addresses will
2403   // likely result in more instructions compared to scalar code where the
2404   // computation can more often be merged into the index mode. The resulting
2405   // extra micro-ops can significantly decrease throughput.
2406   unsigned NumVectorInstToHideOverhead = 10;
2407   int MaxMergeDistance = 64;
2408 
2409   if (Ty->isVectorTy() && SE &&
2410       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2411     return NumVectorInstToHideOverhead;
2412 
2413   // In many cases the address computation is not merged into the instruction
2414   // addressing mode.
2415   return 1;
2416 }
2417 
2418 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2419                                                    Type *CondTy,
2420                                                    CmpInst::Predicate VecPred,
2421                                                    TTI::TargetCostKind CostKind,
2422                                                    const Instruction *I) {
2423   // TODO: Handle other cost kinds.
2424   if (CostKind != TTI::TCK_RecipThroughput)
2425     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2426                                      I);
2427 
2428   int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // Some vector selects that are wider than the register width are not
  // lowered well.
2431   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2432     // We would need this many instructions to hide the scalarization happening.
2433     const int AmortizationCost = 20;
2434 
2435     // If VecPred is not set, check if we can get a predicate from the context
2436     // instruction, if its type matches the requested ValTy.
2437     if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2438       CmpInst::Predicate CurrentPred;
2439       if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2440                             m_Value())))
2441         VecPred = CurrentPred;
2442     }
2443     // Check if we have a compare/select chain that can be lowered using
2444     // a (F)CMxx & BFI pair.
2445     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2446         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2447         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2448         VecPred == CmpInst::FCMP_UNE) {
2449       static const auto ValidMinMaxTys = {
2450           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2451           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
2452       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2453 
2454       auto LT = getTypeLegalizationCost(ValTy);
2455       if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2456           (ST->hasFullFP16() &&
2457            any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2458         return LT.first;
2459     }
2460 
2461     static const TypeConversionCostTblEntry
2462     VectorSelectTbl[] = {
2463       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
2464       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2465       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
2466       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2467       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2468       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2469     };
2470 
2471     EVT SelCondTy = TLI->getValueType(DL, CondTy);
2472     EVT SelValTy = TLI->getValueType(DL, ValTy);
2473     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2474       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2475                                                      SelCondTy.getSimpleVT(),
2476                                                      SelValTy.getSimpleVT()))
2477         return Entry->Cost;
2478     }
2479   }
2480   // The base case handles scalable vectors fine for now, since it treats the
2481   // cost as 1 * legalization cost.
2482   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2483 }
2484 
2485 AArch64TTIImpl::TTI::MemCmpExpansionOptions
2486 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2487   TTI::MemCmpExpansionOptions Options;
2488   if (ST->requiresStrictAlign()) {
2489     // TODO: Add cost modeling for strict align. Misaligned loads expand to
2490     // a bunch of instructions when strict align is enabled.
2491     return Options;
2492   }
2493   Options.AllowOverlappingLoads = true;
2494   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2495   Options.NumLoadsPerBlock = Options.MaxNumLoads;
2496   // TODO: Though vector loads usually perform well on AArch64, in some targets
2497   // they may wake up the FP unit, which raises the power consumption.  Perhaps
2498   // they could be used with no holds barred (-O3).
2499   Options.LoadSizes = {8, 4, 2, 1};
2500   return Options;
2501 }
2502 
2503 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
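  // SVE gathers and scatters can consume a vector of addresses or offsets
  // directly, so keeping address computations vectorized is worthwhile when
  // SVE is available.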
2504   return ST->hasSVE();
2505 }
2506 
2507 InstructionCost
2508 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2509                                       Align Alignment, unsigned AddressSpace,
2510                                       TTI::TargetCostKind CostKind) {
2511   if (useNeonVector(Src))
2512     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2513                                         CostKind);
2514   auto LT = getTypeLegalizationCost(Src);
2515   if (!LT.first.isValid())
2516     return InstructionCost::getInvalid();
2517 
2518   // The code-generator is currently not able to handle scalable vectors
2519   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2520   // it. This change will be removed when code-generation for these types is
2521   // sufficiently reliable.
2522   if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2523     return InstructionCost::getInvalid();
2524 
2525   return LT.first;
2526 }
2527 
2528 static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2529   return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2530 }
2531 
2532 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2533     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2534     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2535   if (useNeonVector(DataTy))
2536     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2537                                          Alignment, CostKind, I);
2538   auto *VT = cast<VectorType>(DataTy);
2539   auto LT = getTypeLegalizationCost(DataTy);
2540   if (!LT.first.isValid())
2541     return InstructionCost::getInvalid();
2542 
2543   // The code-generator is currently not able to handle scalable vectors
2544   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2545   // it. This change will be removed when code-generation for these types is
2546   // sufficiently reliable.
2547   if (cast<VectorType>(DataTy)->getElementCount() ==
2548       ElementCount::getScalable(1))
2549     return InstructionCost::getInvalid();
2550 
2551   ElementCount LegalVF = LT.second.getVectorElementCount();
2552   InstructionCost MemOpCost =
2553       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2554                       {TTI::OK_AnyValue, TTI::OP_None}, I);
2555   // Add on an overhead cost for using gathers/scatters.
  // TODO: At the moment this is applied uniformly to all CPUs, but at some
  // point we may want a per-CPU overhead.
2558   MemOpCost *= getSVEGatherScatterOverhead(Opcode);
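  // The total cost is the per-element cost (including the overhead above)
  // scaled by the maximum number of elements and by the number of parts the
  // legalized type is split into.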
2559   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2560 }
2561 
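// Fixed-width vectors are treated as NEON vectors unless the subtarget has
// opted to use SVE for fixed-length vectors.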
2562 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2563   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2564 }
2565 
2566 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2567                                                 MaybeAlign Alignment,
2568                                                 unsigned AddressSpace,
2569                                                 TTI::TargetCostKind CostKind,
2570                                                 TTI::OperandValueInfo OpInfo,
2571                                                 const Instruction *I) {
2572   EVT VT = TLI->getValueType(DL, Ty, true);
2573   // Type legalization can't handle structs
2574   if (VT == MVT::Other)
2575     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2576                                   CostKind);
2577 
2578   auto LT = getTypeLegalizationCost(Ty);
2579   if (!LT.first.isValid())
2580     return InstructionCost::getInvalid();
2581 
2582   // The code-generator is currently not able to handle scalable vectors
2583   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2584   // it. This change will be removed when code-generation for these types is
2585   // sufficiently reliable.
2586   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2587     if (VTy->getElementCount() == ElementCount::getScalable(1))
2588       return InstructionCost::getInvalid();
2589 
2590   // TODO: consider latency as well for TCK_SizeAndLatency.
2591   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
2592     return LT.first;
2593 
2594   if (CostKind != TTI::TCK_RecipThroughput)
2595     return 1;
2596 
2597   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2598       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has been
    // shown in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
2604     const int AmortizationCost = 6;
2605 
2606     return LT.first * 2 * AmortizationCost;
2607   }
2608 
2609   // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
2610   if (Ty->isPtrOrPtrVectorTy())
2611     return LT.first;
2612 
2613   // Check truncating stores and extending loads.
2614   if (useNeonVector(Ty) &&
2615       Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
    // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2617     if (VT == MVT::v4i8)
2618       return 2;
2619     // Otherwise we need to scalarize.
2620     return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2621   }
2622 
2623   return LT.first;
2624 }
2625 
2626 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
2627     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2628     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2629     bool UseMaskForCond, bool UseMaskForGaps) {
2630   assert(Factor >= 2 && "Invalid interleave factor");
2631   auto *VecVTy = cast<FixedVectorType>(VecTy);
2632 
2633   if (!UseMaskForCond && !UseMaskForGaps &&
2634       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2635     unsigned NumElts = VecVTy->getNumElements();
2636     auto *SubVecTy =
2637         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2638 
    // ldN/stN only support legal vector types whose size is 64 or 128 bits.
2640     // Accesses having vector types that are a multiple of 128 bits can be
2641     // matched to more than one ldN/stN instruction.
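    // For example, a <16 x i32> access with Factor == 4 uses <4 x i32>
    // sub-vectors that map to a single ld4/st4, giving a cost of
    // Factor * 1 == 4.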
2642     bool UseScalable;
2643     if (NumElts % Factor == 0 &&
2644         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2645       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2646   }
2647 
2648   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2649                                            Alignment, AddressSpace, CostKind,
2650                                            UseMaskForCond, UseMaskForGaps);
2651 }
2652 
2653 InstructionCost
2654 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
2655   InstructionCost Cost = 0;
2656   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
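  // Under AAPCS64 only the low 64 bits of the FP/SIMD callee-saved registers
  // are preserved across calls, so 128-bit vector values live over a call are
  // modelled as needing a spill (store) and a reload (load).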
2657   for (auto *I : Tys) {
2658     if (!I->isVectorTy())
2659       continue;
2660     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2661         128)
2662       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2663               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2664   }
2665   return Cost;
2666 }
2667 
2668 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
2669   return ST->getMaxInterleaveFactor();
2670 }
2671 
2672 // For Falkor, we want to avoid having too many strided loads in a loop since
2673 // that can exhaust the HW prefetcher resources.  We adjust the unroller
2674 // MaxCount preference below to attempt to ensure unrolling doesn't create too
2675 // many strided loads.
2676 static void
2677 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2678                               TargetTransformInfo::UnrollingPreferences &UP) {
2679   enum { MaxStridedLoads = 7 };
2680   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2681     int StridedLoads = 0;
2682     // FIXME? We could make this more precise by looking at the CFG and
2683     // e.g. not counting loads in each side of an if-then-else diamond.
2684     for (const auto BB : L->blocks()) {
2685       for (auto &I : *BB) {
2686         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2687         if (!LMemI)
2688           continue;
2689 
2690         Value *PtrValue = LMemI->getPointerOperand();
2691         if (L->isLoopInvariant(PtrValue))
2692           continue;
2693 
2694         const SCEV *LSCEV = SE.getSCEV(PtrValue);
2695         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2696         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2697           continue;
2698 
2699         // FIXME? We could take pairing of unrolled load copies into account
2700         // by looking at the AddRec, but we would probably have to limit this
2701         // to loops with no stores or other memory optimization barriers.
2702         ++StridedLoads;
2703         // We've seen enough strided loads that seeing more won't make a
2704         // difference.
2705         if (StridedLoads > MaxStridedLoads / 2)
2706           return StridedLoads;
2707       }
2708     }
2709     return StridedLoads;
2710   };
2711 
2712   int StridedLoads = countStridedLoads(L, SE);
2713   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2714                     << " strided loads\n");
2715   // Pick the largest power of 2 unroll count that won't result in too many
2716   // strided loads.
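  // For example, with 3 strided loads detected the cap is
  // 1 << Log2_32(7 / 3) == 2.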
2717   if (StridedLoads) {
2718     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
2719     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2720                       << UP.MaxCount << '\n');
2721   }
2722 }
2723 
2724 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2725                                              TTI::UnrollingPreferences &UP,
2726                                              OptimizationRemarkEmitter *ORE) {
2727   // Enable partial unrolling and runtime unrolling.
2728   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2729 
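  // Allow unrolling based on the trip count's upper bound when the exact
  // count is not known.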
2730   UP.UpperBound = true;
2731 
  // Inner loops are more likely to be hot, and the runtime check can be
  // hoisted out by the LICM pass, so the overhead is lower; use a larger
  // threshold to unroll more loops.
2735   if (L->getLoopDepth() > 1)
2736     UP.PartialThreshold *= 2;
2737 
2738   // Disable partial & runtime unrolling on -Os.
2739   UP.PartialOptSizeThreshold = 0;
2740 
2741   if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
2742       EnableFalkorHWPFUnrollFix)
2743     getFalkorUnrollingPreferences(L, SE, UP);
2744 
2745   // Scan the loop: don't unroll loops with calls as this could prevent
2746   // inlining. Don't unroll vector loops either, as they don't benefit much from
2747   // unrolling.
2748   for (auto *BB : L->getBlocks()) {
2749     for (auto &I : *BB) {
      // Don't unroll vectorised loops.
2751       if (I.getType()->isVectorTy())
2752         return;
2753 
2754       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2755         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2756           if (!isLoweredToCall(F))
2757             continue;
2758         }
2759         return;
2760       }
2761     }
2762   }
2763 
  // Enable runtime unrolling for in-order models.
  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
  // checking for that case we can ensure that the default behaviour is
  // unchanged.
2768   if (ST->getProcFamily() != AArch64Subtarget::Others &&
2769       !ST->getSchedModel().isOutOfOrder()) {
2770     UP.Runtime = true;
2771     UP.Partial = true;
2772     UP.UnrollRemainder = true;
2773     UP.DefaultUnrollRuntimeCount = 4;
2774 
2775     UP.UnrollAndJam = true;
2776     UP.UnrollAndJamInnerLoopThreshold = 60;
2777   }
2778 }
2779 
2780 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2781                                            TTI::PeelingPreferences &PP) {
2782   BaseT::getPeelingPreferences(L, SE, PP);
2783 }
2784 
2785 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
2786                                                          Type *ExpectedType) {
2787   switch (Inst->getIntrinsicID()) {
2788   default:
2789     return nullptr;
2790   case Intrinsic::aarch64_neon_st2:
2791   case Intrinsic::aarch64_neon_st3:
2792   case Intrinsic::aarch64_neon_st4: {
    // The stN intrinsics take N vector operands followed by the address;
    // rebuild the expected struct value from those vector operands.
2794     StructType *ST = dyn_cast<StructType>(ExpectedType);
2795     if (!ST)
2796       return nullptr;
2797     unsigned NumElts = Inst->arg_size() - 1;
2798     if (ST->getNumElements() != NumElts)
2799       return nullptr;
2800     for (unsigned i = 0, e = NumElts; i != e; ++i) {
2801       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2802         return nullptr;
2803     }
2804     Value *Res = PoisonValue::get(ExpectedType);
2805     IRBuilder<> Builder(Inst);
2806     for (unsigned i = 0, e = NumElts; i != e; ++i) {
2807       Value *L = Inst->getArgOperand(i);
2808       Res = Builder.CreateInsertValue(Res, L, i);
2809     }
2810     return Res;
2811   }
2812   case Intrinsic::aarch64_neon_ld2:
2813   case Intrinsic::aarch64_neon_ld3:
2814   case Intrinsic::aarch64_neon_ld4:
2815     if (Inst->getType() == ExpectedType)
2816       return Inst;
2817     return nullptr;
2818   }
2819 }
2820 
2821 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2822                                         MemIntrinsicInfo &Info) {
2823   switch (Inst->getIntrinsicID()) {
2824   default:
2825     break;
2826   case Intrinsic::aarch64_neon_ld2:
2827   case Intrinsic::aarch64_neon_ld3:
2828   case Intrinsic::aarch64_neon_ld4:
2829     Info.ReadMem = true;
2830     Info.WriteMem = false;
2831     Info.PtrVal = Inst->getArgOperand(0);
2832     break;
2833   case Intrinsic::aarch64_neon_st2:
2834   case Intrinsic::aarch64_neon_st3:
2835   case Intrinsic::aarch64_neon_st4:
2836     Info.ReadMem = false;
2837     Info.WriteMem = true;
2838     Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2839     break;
2840   }
2841 
2842   switch (Inst->getIntrinsicID()) {
2843   default:
2844     return false;
2845   case Intrinsic::aarch64_neon_ld2:
2846   case Intrinsic::aarch64_neon_st2:
2847     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2848     break;
2849   case Intrinsic::aarch64_neon_ld3:
2850   case Intrinsic::aarch64_neon_st3:
2851     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2852     break;
2853   case Intrinsic::aarch64_neon_ld4:
2854   case Intrinsic::aarch64_neon_st4:
2855     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2856     break;
2857   }
2858   return true;
2859 }
2860 
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with the right type and used in memory accesses. If it is used
/// in a "complex" getelementptr, we allow it to be promoted without finding
/// other sext instructions that sign extended the same initial value. A
/// getelementptr is considered "complex" if it has more than 2 operands.
2866 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
2867     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2868   bool Considerable = false;
2869   AllowPromotionWithoutCommonHeader = false;
2870   if (!isa<SExtInst>(&I))
2871     return false;
2872   Type *ConsideredSExtType =
2873       Type::getInt64Ty(I.getParent()->getParent()->getContext());
2874   if (I.getType() != ConsideredSExtType)
2875     return false;
2876   // See if the sext is the one with the right type and used in at least one
2877   // GetElementPtrInst.
2878   for (const User *U : I.users()) {
2879     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2880       Considerable = true;
      // A getelementptr is considered "complex" if it has more than 2
      // operands. We will promote a SExt used in such a complex GEP, as we
      // expect some computation to be merged if it is done on 64 bits.
2884       if (GEPInst->getNumOperands() > 2) {
2885         AllowPromotionWithoutCommonHeader = true;
2886         break;
2887       }
2888     }
2889   }
2890   return Considerable;
2891 }
2892 
2893 bool AArch64TTIImpl::isLegalToVectorizeReduction(
2894     const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2895   if (!VF.isScalable())
2896     return true;
2897 
2898   Type *Ty = RdxDesc.getRecurrenceType();
2899   if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
2900     return false;
2901 
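  // Scalable vectorization is only supported for the recurrence kinds listed
  // below.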
2902   switch (RdxDesc.getRecurrenceKind()) {
2903   case RecurKind::Add:
2904   case RecurKind::FAdd:
2905   case RecurKind::And:
2906   case RecurKind::Or:
2907   case RecurKind::Xor:
2908   case RecurKind::SMin:
2909   case RecurKind::SMax:
2910   case RecurKind::UMin:
2911   case RecurKind::UMax:
2912   case RecurKind::FMin:
2913   case RecurKind::FMax:
2914   case RecurKind::SelectICmp:
2915   case RecurKind::SelectFCmp:
2916   case RecurKind::FMulAdd:
2917     return true;
2918   default:
2919     return false;
2920   }
2921 }
2922 
2923 InstructionCost
2924 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2925                                        bool IsUnsigned,
2926                                        TTI::TargetCostKind CostKind) {
2927   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2928 
2929   if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2930     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2931 
  assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
         "Both vectors need to be equally scalable");
2934 
2935   InstructionCost LegalizationCost = 0;
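  // If the type is split during legalization, each extra part must be folded
  // into the running result with a vector min/max before the final horizontal
  // reduction.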
2936   if (LT.first > 1) {
2937     Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2938     unsigned MinMaxOpcode =
2939         Ty->isFPOrFPVectorTy()
2940             ? Intrinsic::maxnum
2941             : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2942     IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2943     LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2944   }
2945 
2946   return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2947 }
2948 
2949 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
2950     unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2951   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2952   InstructionCost LegalizationCost = 0;
2953   if (LT.first > 1) {
2954     Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2955     LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2956     LegalizationCost *= LT.first - 1;
2957   }
2958 
2959   int ISD = TLI->InstructionOpcodeToISD(Opcode);
2960   assert(ISD && "Invalid opcode");
2961   // Add the final reduction cost for the legal horizontal reduction
2962   switch (ISD) {
2963   case ISD::ADD:
2964   case ISD::AND:
2965   case ISD::OR:
2966   case ISD::XOR:
2967   case ISD::FADD:
2968     return LegalizationCost + 2;
2969   default:
2970     return InstructionCost::getInvalid();
2971   }
2972 }
2973 
2974 InstructionCost
2975 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
2976                                            std::optional<FastMathFlags> FMF,
2977                                            TTI::TargetCostKind CostKind) {
2978   if (TTI::requiresOrderedReduction(FMF)) {
2979     if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2980       InstructionCost BaseCost =
2981           BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2982       // Add on extra cost to reflect the extra overhead on some CPUs. We still
2983       // end up vectorizing for more computationally intensive loops.
2984       return BaseCost + FixedVTy->getNumElements();
2985     }
2986 
2987     if (Opcode != Instruction::FAdd)
2988       return InstructionCost::getInvalid();
2989 
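    // An ordered FADD reduction on a scalable vector is modelled as a chain
    // of scalar fadds, one per (maximum possible) element.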
2990     auto *VTy = cast<ScalableVectorType>(ValTy);
2991     InstructionCost Cost =
2992         getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2993     Cost *= getMaxNumElements(VTy->getElementCount());
2994     return Cost;
2995   }
2996 
2997   if (isa<ScalableVectorType>(ValTy))
2998     return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2999 
3000   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3001   MVT MTy = LT.second;
3002   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3003   assert(ISD && "Invalid opcode");
3004 
  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each additional
  // legalization step (LT.first - 1). This is the only arithmetic vector
  // reduction operation for which we have an instruction.
3009   // OR, XOR and AND costs should match the codegen from:
3010   // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3011   // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3012   // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3013   static const CostTblEntry CostTblNoPairwise[]{
3014       {ISD::ADD, MVT::v8i8,   2},
3015       {ISD::ADD, MVT::v16i8,  2},
3016       {ISD::ADD, MVT::v4i16,  2},
3017       {ISD::ADD, MVT::v8i16,  2},
3018       {ISD::ADD, MVT::v4i32,  2},
3019       {ISD::ADD, MVT::v2i64,  2},
3020       {ISD::OR,  MVT::v8i8,  15},
3021       {ISD::OR,  MVT::v16i8, 17},
3022       {ISD::OR,  MVT::v4i16,  7},
3023       {ISD::OR,  MVT::v8i16,  9},
3024       {ISD::OR,  MVT::v2i32,  3},
3025       {ISD::OR,  MVT::v4i32,  5},
3026       {ISD::OR,  MVT::v2i64,  3},
3027       {ISD::XOR, MVT::v8i8,  15},
3028       {ISD::XOR, MVT::v16i8, 17},
3029       {ISD::XOR, MVT::v4i16,  7},
3030       {ISD::XOR, MVT::v8i16,  9},
3031       {ISD::XOR, MVT::v2i32,  3},
3032       {ISD::XOR, MVT::v4i32,  5},
3033       {ISD::XOR, MVT::v2i64,  3},
3034       {ISD::AND, MVT::v8i8,  15},
3035       {ISD::AND, MVT::v16i8, 17},
3036       {ISD::AND, MVT::v4i16,  7},
3037       {ISD::AND, MVT::v8i16,  9},
3038       {ISD::AND, MVT::v2i32,  3},
3039       {ISD::AND, MVT::v4i32,  5},
3040       {ISD::AND, MVT::v2i64,  3},
3041   };
3042   switch (ISD) {
3043   default:
3044     break;
3045   case ISD::ADD:
3046     if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3047       return (LT.first - 1) + Entry->Cost;
3048     break;
3049   case ISD::XOR:
3050   case ISD::AND:
3051   case ISD::OR:
3052     const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3053     if (!Entry)
3054       break;
3055     auto *ValVTy = cast<FixedVectorType>(ValTy);
3056     if (!ValVTy->getElementType()->isIntegerTy(1) &&
3057         MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3058         isPowerOf2_32(ValVTy->getNumElements())) {
3059       InstructionCost ExtraCost = 0;
3060       if (LT.first != 1) {
3061         // Type needs to be split, so there is an extra cost of LT.first - 1
3062         // arithmetic ops.
3063         auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3064                                         MTy.getVectorNumElements());
3065         ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3066         ExtraCost *= LT.first - 1;
3067       }
3068       return Entry->Cost + ExtraCost;
3069     }
3070     break;
3071   }
3072   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3073 }
3074 
3075 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3076   static const CostTblEntry ShuffleTbl[] = {
3077       { TTI::SK_Splice, MVT::nxv16i8,  1 },
3078       { TTI::SK_Splice, MVT::nxv8i16,  1 },
3079       { TTI::SK_Splice, MVT::nxv4i32,  1 },
3080       { TTI::SK_Splice, MVT::nxv2i64,  1 },
3081       { TTI::SK_Splice, MVT::nxv2f16,  1 },
3082       { TTI::SK_Splice, MVT::nxv4f16,  1 },
3083       { TTI::SK_Splice, MVT::nxv8f16,  1 },
3084       { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3085       { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3086       { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3087       { TTI::SK_Splice, MVT::nxv2f32,  1 },
3088       { TTI::SK_Splice, MVT::nxv4f32,  1 },
3089       { TTI::SK_Splice, MVT::nxv2f64,  1 },
3090   };
3091 
3092   // The code-generator is currently not able to handle scalable vectors
3093   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3094   // it. This change will be removed when code-generation for these types is
3095   // sufficiently reliable.
3096   if (Tp->getElementCount() == ElementCount::getScalable(1))
3097     return InstructionCost::getInvalid();
3098 
3099   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3100   Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3101   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3102   EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3103                        ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3104                        : LT.second;
3105   Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3106   InstructionCost LegalizationCost = 0;
3107   if (Index < 0) {
3108     LegalizationCost =
3109         getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3110                            CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3111         getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3112                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
3113   }
3114 
  // Predicated splices are promoted during lowering (see
  // AArch64ISelLowering.cpp), so the cost is computed on the promoted type.
3117   if (LT.second.getScalarType() == MVT::i1) {
3118     LegalizationCost +=
3119         getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3120                          TTI::CastContextHint::None, CostKind) +
3121         getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3122                          TTI::CastContextHint::None, CostKind);
3123   }
3124   const auto *Entry =
3125       CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3126   assert(Entry && "Illegal Type for Splice");
3127   LegalizationCost += Entry->Cost;
3128   return LegalizationCost * LT.first;
3129 }
3130 
3131 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3132                                                VectorType *Tp,
3133                                                ArrayRef<int> Mask,
3134                                                TTI::TargetCostKind CostKind,
3135                                                int Index, VectorType *SubTp,
3136                                                ArrayRef<const Value *> Args) {
3137   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3138   // If we have a Mask, and the LT is being legalized somehow, split the Mask
3139   // into smaller vectors and sum the cost of each shuffle.
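  // For example, a 32-element shuffle whose legal type holds 16 elements is
  // costed as two 16-element sub-shuffles; any sub-mask needing more than two
  // input sub-vectors is charged a worst-case cost instead.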
3140   if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3141       Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3142       cast<FixedVectorType>(Tp)->getNumElements() >
3143           LT.second.getVectorNumElements() &&
3144       !Index && !SubTp) {
3145     unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3146     assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3147     unsigned LTNumElts = LT.second.getVectorNumElements();
3148     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3149     VectorType *NTp =
3150         VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3151     InstructionCost Cost;
3152     for (unsigned N = 0; N < NumVecs; N++) {
3153       SmallVector<int> NMask;
3154       // Split the existing mask into chunks of size LTNumElts. Track the source
3155       // sub-vectors to ensure the result has at most 2 inputs.
3156       unsigned Source1, Source2;
3157       unsigned NumSources = 0;
3158       for (unsigned E = 0; E < LTNumElts; E++) {
3159         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3160                                                       : UndefMaskElem;
3161         if (MaskElt < 0) {
3162           NMask.push_back(UndefMaskElem);
3163           continue;
3164         }
3165 
3166         // Calculate which source from the input this comes from and whether it
3167         // is new to us.
3168         unsigned Source = MaskElt / LTNumElts;
3169         if (NumSources == 0) {
3170           Source1 = Source;
3171           NumSources = 1;
3172         } else if (NumSources == 1 && Source != Source1) {
3173           Source2 = Source;
3174           NumSources = 2;
3175         } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3176           NumSources++;
3177         }
3178 
        // Add to the new mask. For the NumSources > 2 case these lane numbers
        // are not correct, but they are only used modulo LTNumElts below.
3181         if (Source == Source1)
3182           NMask.push_back(MaskElt % LTNumElts);
3183         else if (Source == Source2)
3184           NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3185         else
3186           NMask.push_back(MaskElt % LTNumElts);
3187       }
3188       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3189       // getShuffleCost. If not then cost it using the worst case.
3190       if (NumSources <= 2)
3191         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3192                                                : TTI::SK_PermuteTwoSrc,
3193                                NTp, NMask, CostKind, 0, nullptr, Args);
3194       else if (any_of(enumerate(NMask), [&](const auto &ME) {
3195                  return ME.value() % LTNumElts == ME.index();
3196                }))
3197         Cost += LTNumElts - 1;
3198       else
3199         Cost += LTNumElts;
3200     }
3201     return Cost;
3202   }
3203 
3204   Kind = improveShuffleKindFromMask(Kind, Mask);
3205 
3206   // Check for broadcast loads.
3207   if (Kind == TTI::SK_Broadcast) {
3208     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3209     if (IsLoad && LT.second.isVector() &&
3210         isLegalBroadcastLoad(Tp->getElementType(),
3211                              LT.second.getVectorElementCount()))
3212       return 0; // broadcast is handled by ld1r
3213   }
3214 
3215   // If we have 4 elements for the shuffle and a Mask, get the cost straight
3216   // from the perfect shuffle tables.
3217   if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3218       (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3219       all_of(Mask, [](int E) { return E < 8; }))
3220     return getPerfectShuffleCost(Mask);
3221 
3222   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3223       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3224       Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3225     static const CostTblEntry ShuffleTbl[] = {
3226         // Broadcast shuffle kinds can be performed with 'dup'.
3227         {TTI::SK_Broadcast, MVT::v8i8, 1},
3228         {TTI::SK_Broadcast, MVT::v16i8, 1},
3229         {TTI::SK_Broadcast, MVT::v4i16, 1},
3230         {TTI::SK_Broadcast, MVT::v8i16, 1},
3231         {TTI::SK_Broadcast, MVT::v2i32, 1},
3232         {TTI::SK_Broadcast, MVT::v4i32, 1},
3233         {TTI::SK_Broadcast, MVT::v2i64, 1},
3234         {TTI::SK_Broadcast, MVT::v2f32, 1},
3235         {TTI::SK_Broadcast, MVT::v4f32, 1},
3236         {TTI::SK_Broadcast, MVT::v2f64, 1},
3237         // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3238         // 'zip1/zip2' instructions.
3239         {TTI::SK_Transpose, MVT::v8i8, 1},
3240         {TTI::SK_Transpose, MVT::v16i8, 1},
3241         {TTI::SK_Transpose, MVT::v4i16, 1},
3242         {TTI::SK_Transpose, MVT::v8i16, 1},
3243         {TTI::SK_Transpose, MVT::v2i32, 1},
3244         {TTI::SK_Transpose, MVT::v4i32, 1},
3245         {TTI::SK_Transpose, MVT::v2i64, 1},
3246         {TTI::SK_Transpose, MVT::v2f32, 1},
3247         {TTI::SK_Transpose, MVT::v4f32, 1},
3248         {TTI::SK_Transpose, MVT::v2f64, 1},
3249         // Select shuffle kinds.
3250         // TODO: handle vXi8/vXi16.
3251         {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3252         {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3253         {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3254         {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3255         {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3256         {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3257         // PermuteSingleSrc shuffle kinds.
3258         {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3259         {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3260         {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3261         {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3262         {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3263         {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3264         {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3265         {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // perfectshuffle worst case
3267         {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8},  // constpool + load + tbl
3268         {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8},  // constpool + load + tbl
3269         {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3270         {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8},   // constpool + load + tbl
3271         {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8},  // constpool + load + tbl
3272         // Reverse can be lowered with `rev`.
3273         {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3274         {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3275         {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3276         {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3277         {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3278         {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3279         {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3280         {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3281         {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3282         {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3283         {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3284         {TTI::SK_Reverse, MVT::v8i8, 1},  // REV64
3285         // Splice can all be lowered as `ext`.
3286         {TTI::SK_Splice, MVT::v2i32, 1},
3287         {TTI::SK_Splice, MVT::v4i32, 1},
3288         {TTI::SK_Splice, MVT::v2i64, 1},
3289         {TTI::SK_Splice, MVT::v2f32, 1},
3290         {TTI::SK_Splice, MVT::v4f32, 1},
3291         {TTI::SK_Splice, MVT::v2f64, 1},
3292         {TTI::SK_Splice, MVT::v8f16, 1},
3293         {TTI::SK_Splice, MVT::v8bf16, 1},
3294         {TTI::SK_Splice, MVT::v8i16, 1},
3295         {TTI::SK_Splice, MVT::v16i8, 1},
3296         {TTI::SK_Splice, MVT::v4bf16, 1},
3297         {TTI::SK_Splice, MVT::v4f16, 1},
3298         {TTI::SK_Splice, MVT::v4i16, 1},
3299         {TTI::SK_Splice, MVT::v8i8, 1},
3300         // Broadcast shuffle kinds for scalable vectors
3301         {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3302         {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3303         {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3304         {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3305         {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3306         {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3307         {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3308         {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3309         {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3310         {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3311         {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3312         {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3313         {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3314         {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3315         {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3316         {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3317         {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3318         // Handle the cases for vector.reverse with scalable vectors
3319         {TTI::SK_Reverse, MVT::nxv16i8, 1},
3320         {TTI::SK_Reverse, MVT::nxv8i16, 1},
3321         {TTI::SK_Reverse, MVT::nxv4i32, 1},
3322         {TTI::SK_Reverse, MVT::nxv2i64, 1},
3323         {TTI::SK_Reverse, MVT::nxv2f16, 1},
3324         {TTI::SK_Reverse, MVT::nxv4f16, 1},
3325         {TTI::SK_Reverse, MVT::nxv8f16, 1},
3326         {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3327         {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3328         {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3329         {TTI::SK_Reverse, MVT::nxv2f32, 1},
3330         {TTI::SK_Reverse, MVT::nxv4f32, 1},
3331         {TTI::SK_Reverse, MVT::nxv2f64, 1},
3332         {TTI::SK_Reverse, MVT::nxv16i1, 1},
3333         {TTI::SK_Reverse, MVT::nxv8i1, 1},
3334         {TTI::SK_Reverse, MVT::nxv4i1, 1},
3335         {TTI::SK_Reverse, MVT::nxv2i1, 1},
3336     };
3337     if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3338       return LT.first * Entry->Cost;
3339   }
3340 
3341   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3342     return getSpliceCost(Tp, Index);
3343 
3344   // Inserting a subvector can often be done with either a D, S or H register
3345   // move, so long as the inserted vector is "aligned".
3346   if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3347       LT.second.getSizeInBits() <= 128 && SubTp) {
3348     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3349     if (SubLT.second.isVector()) {
3350       int NumElts = LT.second.getVectorNumElements();
3351       int NumSubElts = SubLT.second.getVectorNumElements();
3352       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3353         return SubLT.first;
3354     }
3355   }
3356 
3357   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3358 }
3359 
3360 bool AArch64TTIImpl::preferPredicateOverEpilogue(
3361     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
3362     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
3363     InterleavedAccessInfo *IAI) {
3364   if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3365     return false;
3366 
3367   // We don't currently support vectorisation with interleaving for SVE - with
3368   // such loops we're better off not using tail-folding. This gives us a chance
3369   // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3370   if (IAI->hasGroups())
3371     return false;
3372 
3373   TailFoldingKind Required; // Defaults to 0.
3374   if (LVL->getReductionVars().size())
3375     Required.add(TailFoldingKind::TFReductions);
3376   if (LVL->getFixedOrderRecurrences().size())
3377     Required.add(TailFoldingKind::TFRecurrences);
3378   if (!Required)
3379     Required.add(TailFoldingKind::TFSimple);
3380 
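  // Tail-folding is used only if every feature the loop requires is enabled
  // via -sve-tail-folding.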
3381   return (TailFoldingKindLoc & Required) == Required;
3382 }
3383 
3384 InstructionCost
3385 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
3386                                      int64_t BaseOffset, bool HasBaseReg,
3387                                      int64_t Scale, unsigned AddrSpace) const {
3388   // Scaling factors are not free at all.
3389   // Operands                     | Rt Latency
3390   // -------------------------------------------
3391   // Rt, [Xn, Xm]                 | 4
3392   // -------------------------------------------
3393   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
3394   // Rt, [Xn, Wm, <extend> #imm]  |
3395   TargetLoweringBase::AddrMode AM;
3396   AM.BaseGV = BaseGV;
3397   AM.BaseOffs = BaseOffset;
3398   AM.HasBaseReg = HasBaseReg;
3399   AM.Scale = Scale;
3400   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, so charge a cost of 1 if Scale is
    // neither 0 nor 1.
3403     return AM.Scale != 0 && AM.Scale != 1;
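  // A negative cost signals that the addressing mode is not supported for Ty.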
3404   return -1;
3405 }
3406