//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
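  // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to All
  // and records Reverse in DisableBits, so every legal loop type except
  // reverse-predicated loops is tail-folded.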
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

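/// Returns true if \p CI is a direct call to one of the SME ABI support
/// routines, such as __arm_tpidr2_save.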
static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.hasNewZABody())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1)  F:
  //       call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2)  F:
  //       call from F -> G (the call here is not Call)
  //      G:
  //       call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
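/// For example, 0 and logical immediates such as 0xFF00FF00FF00FF00 are free
/// here, while a value like 0x1234 expands to a single MOVZ and costs 1.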
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
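  // For example, a 128-bit constant is costed as two independent 64-bit
  // materializations.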
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif, hence the cost of 2.
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
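    // For example, a v4i8 saturating add is promoted to v4i16 and costs 4
    // instructions, whereas a legal v16i8 one costs 1.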
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidBswapTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                       MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidBswapTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
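    // For example, an illegal nxv8i64 stepvector splits into four nxv2i64
    // parts: one `index' instruction plus three vector adds.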
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, so add 1 to match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64,   4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8,  1},
        {ISD::CTPOP, MVT::i32,   5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
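    // For example, an i8 fshl is promoted to i32 and needs an extra
    // instruction, so ExtraCost below ends up as 1.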
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Removes redundant reinterpret casts in the presence of control flow
/// (PHI nodes).
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  if (!BinOp)
    return std::nullopt;

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(II, NarrowedBinOp);
}

857bdd1243dSDimitry Andric static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner & IC,IntrinsicInst & II)858bdd1243dSDimitry Andric instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
859fe6060f1SDimitry Andric   // If the reinterpret instruction operand is a PHI Node
860fe6060f1SDimitry Andric   if (isa<PHINode>(II.getArgOperand(0)))
861fe6060f1SDimitry Andric     return processPhiNode(IC, II);
862fe6060f1SDimitry Andric 
86304eeddc0SDimitry Andric   if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
86404eeddc0SDimitry Andric     return BinOpCombine;
86504eeddc0SDimitry Andric 
86606c3fb27SDimitry Andric   // Ignore converts to/from svcount_t.
86706c3fb27SDimitry Andric   if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
86806c3fb27SDimitry Andric       isa<TargetExtType>(II.getType()))
86906c3fb27SDimitry Andric     return std::nullopt;
87006c3fb27SDimitry Andric 
871fe6060f1SDimitry Andric   SmallVector<Instruction *, 32> CandidatesForRemoval;
872fe6060f1SDimitry Andric   Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
873fe6060f1SDimitry Andric 
874fe6060f1SDimitry Andric   const auto *IVTy = cast<VectorType>(II.getType());
875fe6060f1SDimitry Andric 
876fe6060f1SDimitry Andric   // Walk the chain of conversions.
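  // An illustrative chain (widths are hypothetical):
  //   %a = ...                 ; <vscale x 4 x i1>
  //   %b = to_svbool(%a)       ; <vscale x 16 x i1>
  //   %c = from_svbool(%b)     ; <vscale x 4 x i1>, same type as II
  // Walking up from II's operand, %a is the earliest value with II's type, so
  // it becomes EarliestReplacement.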
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as II, it is a viable replacement.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}

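// Return true when Pred is known to be an all-active ptrue, looking through a
// from_svbool(to_svbool(...)) round trip when the cast provably drops no
// lanes.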
static bool isAllActivePredicate(Value *Pred) {
  // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                          m_Value(UncastedPred)))))
    // If the predicate has the same number of lanes as the uncasted
    // predicate, or fewer, then we know the casting has no effect.
    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
}

static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // svsel(ptrue, x, y) => x
  auto *OpPredicate = II.getOperand(0);
  if (isAllActivePredicate(OpPredicate))
    return IC.replaceInstUsesWith(II, II.getOperand(1));

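  // Otherwise lower to a plain IR select, which generic optimizations
  // understand.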
  auto Select =
      IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
  return IC.replaceInstUsesWith(II, Select);
}

static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!Pg)
    return std::nullopt;

  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;

  // The intrinsic is inserting into lane zero so use an insert instead.
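  // That is (operand names are illustrative):
  //   sve.dup(%passthru, ptrue(vl1), %x) --> insertelement %passthru, %x, 0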
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Insert = InsertElementInst::Create(
      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);

  return IC.replaceInstUsesWith(II, Insert);
}

static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  // Replace DupX with a regular IR splat.
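  // e.g. sve.dup.x(%x) becomes the canonical insertelement + shufflevector
  // splat pattern that CreateVectorSplat emits.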
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
                                              II.getArgOperand(0));
  Splat->takeName(&II);
  return IC.replaceInstUsesWith(II, Splat);
}

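// Fold cmpne(ptrue(all), dupq(insert(undef, C, 0), 0), splat(0)), where C is
// a fixed-length constant vector, into an equivalent ptrue predicate of the
// widest element width that can represent the constant's active-lane pattern.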
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Check that the predicate is all active.
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq.
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert.
  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero.
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte-level predicate.
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero, bail early with an empty predicate.
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate the largest predicate type used (where the byte predicate is
  // the largest).
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

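  // The lowest set bit of Mask now gives the coarsest byte granularity at
  // which the active bits repeat, i.e. the widest predicate element size that
  // can still represent PredicateBits (for instance, bits only at multiples
  // of 8 permit a 64-bit element predicate).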
  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set.
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}

static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(&II);
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}

static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
  // integer variant across a variety of micro-architectures. Replace the
  // scalar integer CLAST[AB] intrinsic with the optimal SIMD&FP variant. A
  // simple bitcast-to-fp + clast[ab] + bitcast-to-int sequence costs a cycle
  // or two more, depending on the micro-architecture, but has been observed
  // to be generally faster, particularly when the CLAST[AB] op is a
  // loop-carried dependency.
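  //
  // Sketch of the rewrite for a 32-bit element (types are illustrative):
  //   i32 clastb(pg, i32 %fallback, <nxv4i32> %vec)
  //     --> bitcast the fallback and vector to float, clastb on <nxv4f32>,
  //         then bitcast the result back to i32.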
  Value *Pg = II.getArgOperand(0);
  Value *Fallback = II.getArgOperand(1);
  Value *Vec = II.getArgOperand(2);
  Type *Ty = II.getType();

  if (!Ty->isIntegerTy())
    return std::nullopt;

  Type *FPTy;
  switch (cast<IntegerType>(Ty)->getBitWidth()) {
  default:
    return std::nullopt;
  case 16:
    FPTy = IC.Builder.getHalfTy();
    break;
  case 32:
    FPTy = IC.Builder.getFloatTy();
    break;
  case 64:
    FPTy = IC.Builder.getDoubleTy();
    break;
  }

  Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
  auto *FPVTy = VectorType::get(
      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
  Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
  auto *FPII = IC.Builder.CreateIntrinsic(
      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
  Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
  return IC.replaceInstUsesWith(II, FPIItoInt);
}

static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                     IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  // Replace rdffr with the predicated rdffr.z intrinsic, so that
  // optimizePTestInstr can work with RDFFR_PP for ptest elimination.
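  // That is: rdffr() --> rdffr.z(ptrue(all)); with an all-active predicate
  // the zeroing form reads the same FFR value.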
  auto *AllPat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {II.getType()}, {AllPat});
  auto *RDFFR =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
  RDFFR->takeName(&II);
  return IC.replaceInstUsesWith(II, RDFFR);
}

static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  if (Pattern == AArch64SVEPredPattern::all) {
    Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
    auto *VScale = IC.Builder.CreateVScale(StepVal);
    VScale->takeName(&II);
    return IC.replaceInstUsesWith(II, VScale);
  }

  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
}

static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
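  // (With identical operands the "first active", "last active" and "any"
  // questions coincide: each just asks whether X has any lane set.)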
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
  // Later optimizations may rewrite the sequence to use the flag-setting
  // variant of instruction X to remove the PTEST.
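  // An illustrative case (values are hypothetical):
  //   %x = brka.z(%pg, %p)
  //   %t = ptest.any(%x, %x)
  // becomes ptest.any(%pg, %x), which codegen may then fold away entirely by
  // selecting the flag-setting form of the break instruction.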
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}

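// Fuse a predicated multiply into a predicated add/sub, e.g.
//   fadd(p, a, fmul(p, b, c)) --> fmla(p, a, b, c)
// provided the multiply has no other users and, for floating-point types,
// both operations carry matching fast-math flags that allow contraction.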
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                      m_Value(MulOp1))))
    return std::nullopt;

  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
      return std::nullopt;
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(II, Res);
}

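// Replace sve.ld1 with a plain load when the predicate is known to be all
// active, or with a generic masked load (with a zeroing passthru) otherwise,
// so that target-independent passes can reason about the memory access.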
static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();

  if (isAllActivePredicate(Pred)) {
    LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
    Load->copyMetadata(II);
    return IC.replaceInstUsesWith(II, Load);
  }

  CallInst *MaskedLoad =
      IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
                                  Pred, ConstantAggregateZero::get(VecTy));
  MaskedLoad->copyMetadata(II);
  return IC.replaceInstUsesWith(II, MaskedLoad);
}

static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);

  if (isAllActivePredicate(Pred)) {
    StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
    Store->copyMetadata(II);
    return IC.eraseInstFromFunction(II);
  }

  CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
      VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
  MaskedStore->copyMetadata(II);
  return IC.eraseInstFromFunction(II);
}

static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;
  default:
    return Instruction::BinaryOpsEnd;
  }
}

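// Lower an unpredicated (_u) floating-point binop intrinsic whose governing
// predicate is ptrue(all) to the equivalent plain IR binary operator,
// propagating the call's fast-math flags.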
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
  // Bail due to missing support for ISD::STRICT_ scalable vector operations.
  if (II.isStrictFP())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(0);
  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;
  IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
  IC.Builder.setFastMathFlags(II.getFastMathFlags());
  auto BinOp =
      IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
  return IC.replaceInstUsesWith(II, BinOp);
}

// Canonicalise operations that take an all-active predicate (e.g. sve.add ->
// sve.add_u).
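// For instance: sve.add(ptrue(all), %a, %b) --> sve.add_u(ptrue(all), %a, %b).
// When every lane is active the merging and "undef" forms are equivalent, and
// the _u form leaves later combines more freedom.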
static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
                                                            Intrinsic::ID IID) {
  auto *OpPredicate = II.getOperand(0);
  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;

  auto *Mod = II.getModule();
  auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
  II.setCalledFunction(NewDecl);

  return &II;
}

// Simplify operations where the predicate has all inactive lanes, or try to
// replace them with the _u form when all lanes are active.
static std::optional<Instruction *>
instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
                            Intrinsic::ID IID) {
  if (match(II.getOperand(0), m_ZeroInt())) {
    // The intrinsic spec says the merging (sv[func]_m) forms return op1 when
    // all lanes are inactive, so fold the call to op1 directly.
    return IC.replaceInstUsesWith(II, II.getOperand(1));
  }
  return instCombineSVEAllActive(II, IID);
}

static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
    return II_U;
  if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mla>(
          IC, II, true))
    return MLA;
  if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mad>(
          IC, II, false))
    return MAD;
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
    return II_U;
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla_u>(
              IC, II, true))
    return FMLA_U;
  return instCombineSVEVectorBinOp(IC, II);
}

static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
    return II_U;
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls_u>(
              IC, II, true))
    return FMLS_U;
  return instCombineSVEVectorBinOp(IC, II);
}

static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
    return II_U;
  if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mls>(
          IC, II, true))
    return MLS;
  return std::nullopt;
}

static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                            IntrinsicInst &II,
                                                            Intrinsic::ID IID) {
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  // Return true if a given instruction is a unit splat value, false otherwise.
  auto IsUnitSplat = [](auto *I) {
    auto *SplatValue = getSplatValue(I);
    if (!SplatValue)
      return false;
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
  // with a unit splat value, false otherwise.
  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
      return false;

    auto *SplatValue = IntrI->getOperand(2);
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  if (IsUnitSplat(OpMultiplier)) {
    // [f]mul pg %n, (dupx 1) => %n
    OpMultiplicand->takeName(&II);
    return IC.replaceInstUsesWith(II, OpMultiplicand);
  } else if (IsUnitDup(OpMultiplier)) {
    // [f]mul pg %n, (dup pg 1) => %n
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);
    // TODO: this is naive. The optimization is still valid if DupPg
    // 'encompasses' OpPredicate, not only if they're the same predicate.
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
      return IC.replaceInstUsesWith(II, OpMultiplicand);
    }
  }

  return instCombineSVEVectorBinOp(IC, II);
}

static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
                                                         IntrinsicInst &II) {
  Value *UnpackArg = II.getArgOperand(0);
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
                  II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;

  // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
  // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
  if (auto *ScalarArg = getSplatValue(UnpackArg)) {
    ScalarArg =
        IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
    Value *NewVal =
        IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
    NewVal->takeName(&II);
    return IC.replaceInstUsesWith(II, NewVal);
  }

  return std::nullopt;
}
static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  auto *OpVal = II.getOperand(0);
  auto *OpIndices = II.getOperand(1);
  VectorType *VTy = cast<VectorType>(II.getType());

  // Check whether OpIndices is a constant splat value < minimal element count
  // of result.
  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
  if (!SplatValue ||
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return std::nullopt;

  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
  auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
  auto *VectorSplat =
      IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);

  VectorSplat->takeName(&II);
  return IC.replaceInstUsesWith(II, VectorSplat);
}

static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // zip1(uzp1(A, B), uzp2(A, B)) --> A
  // zip2(uzp1(A, B), uzp2(A, B)) --> B
  Value *A, *B;
  if (match(II.getArgOperand(0),
            m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
      match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
                                     m_Specific(A), m_Specific(B))))
    return IC.replaceInstUsesWith(
        II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));

  return std::nullopt;
}

1648bdd1243dSDimitry Andric static std::optional<Instruction *>
instCombineLD1GatherIndex(InstCombiner & IC,IntrinsicInst & II)1649bdd1243dSDimitry Andric instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1650349cc55cSDimitry Andric   Value *Mask = II.getOperand(0);
1651349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(1);
1652349cc55cSDimitry Andric   Value *Index = II.getOperand(2);
1653349cc55cSDimitry Andric   Type *Ty = II.getType();
1654349cc55cSDimitry Andric   Value *PassThru = ConstantAggregateZero::get(Ty);
1655349cc55cSDimitry Andric 
1656349cc55cSDimitry Andric   // Contiguous gather => masked load.
1657349cc55cSDimitry Andric   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1658349cc55cSDimitry Andric   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1659349cc55cSDimitry Andric   Value *IndexBase;
1660349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1661349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1662349cc55cSDimitry Andric     Align Alignment =
1663349cc55cSDimitry Andric         BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1664349cc55cSDimitry Andric 
1665349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
166606c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1667bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
166806c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1669349cc55cSDimitry Andric     CallInst *MaskedLoad =
167006c3fb27SDimitry Andric         IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1671349cc55cSDimitry Andric     MaskedLoad->takeName(&II);
1672349cc55cSDimitry Andric     return IC.replaceInstUsesWith(II, MaskedLoad);
1673349cc55cSDimitry Andric   }
1674349cc55cSDimitry Andric 
1675bdd1243dSDimitry Andric   return std::nullopt;
1676349cc55cSDimitry Andric }
1677349cc55cSDimitry Andric 
1678bdd1243dSDimitry Andric static std::optional<Instruction *>
1679bdd1243dSDimitry Andric instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1680349cc55cSDimitry Andric   Value *Val = II.getOperand(0);
1681349cc55cSDimitry Andric   Value *Mask = II.getOperand(1);
1682349cc55cSDimitry Andric   Value *BasePtr = II.getOperand(2);
1683349cc55cSDimitry Andric   Value *Index = II.getOperand(3);
1684349cc55cSDimitry Andric   Type *Ty = Val->getType();
1685349cc55cSDimitry Andric 
1686349cc55cSDimitry Andric   // Contiguous scatter => masked store.
168781ad6265SDimitry Andric   // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1688349cc55cSDimitry Andric   // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
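  // This mirrors the gather case above: a unit-stride sve.index makes the
  // scatter contiguous, so (sketch)
  //   st1.scatter.index(%val, %pg, %base, index(%ib, 1))
  // stores the active lanes of %val to consecutive slots at &BasePtr[%ib].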
1689349cc55cSDimitry Andric   Value *IndexBase;
1690349cc55cSDimitry Andric   if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1691349cc55cSDimitry Andric                        m_Value(IndexBase), m_SpecificInt(1)))) {
1692349cc55cSDimitry Andric     Align Alignment =
1693349cc55cSDimitry Andric         BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1694349cc55cSDimitry Andric 
169506c3fb27SDimitry Andric     Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1696bdd1243dSDimitry Andric                                       BasePtr, IndexBase);
1697349cc55cSDimitry Andric     Type *VecPtrTy = PointerType::getUnqual(Ty);
169806c3fb27SDimitry Andric     Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1699349cc55cSDimitry Andric 
170006c3fb27SDimitry Andric     (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1701349cc55cSDimitry Andric 
1702349cc55cSDimitry Andric     return IC.eraseInstFromFunction(II);
1703349cc55cSDimitry Andric   }
1704349cc55cSDimitry Andric 
1705bdd1243dSDimitry Andric   return std::nullopt;
1706349cc55cSDimitry Andric }
1707349cc55cSDimitry Andric 
1708bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
17090eae32dcSDimitry Andric                                                        IntrinsicInst &II) {
171006c3fb27SDimitry Andric   Type *Int32Ty = IC.Builder.getInt32Ty();
17110eae32dcSDimitry Andric   Value *Pred = II.getOperand(0);
17120eae32dcSDimitry Andric   Value *Vec = II.getOperand(1);
17130eae32dcSDimitry Andric   Value *DivVec = II.getOperand(2);
17140eae32dcSDimitry Andric 
17150eae32dcSDimitry Andric   Value *SplatValue = getSplatValue(DivVec);
17160eae32dcSDimitry Andric   ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
17170eae32dcSDimitry Andric   if (!SplatConstantInt)
1718bdd1243dSDimitry Andric     return std::nullopt;
17190eae32dcSDimitry Andric   APInt Divisor = SplatConstantInt->getValue();
17200eae32dcSDimitry Andric 
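  // Power-of-two divisors map directly onto ASRD (arithmetic shift right for
  // divide, which rounds towards zero like sdiv), e.g.
  //   sdiv(%pg, %x, splat(8))  --> asrd(%pg, %x, 3)
  //   sdiv(%pg, %x, splat(-8)) --> neg(asrd(%pg, %x, 3))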
17210eae32dcSDimitry Andric   if (Divisor.isPowerOf2()) {
17220eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
172306c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
17240eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
17250eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, ASRD);
17260eae32dcSDimitry Andric   }
17270eae32dcSDimitry Andric   if (Divisor.isNegatedPowerOf2()) {
17280eae32dcSDimitry Andric     Divisor.negate();
17290eae32dcSDimitry Andric     Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
173006c3fb27SDimitry Andric     auto ASRD = IC.Builder.CreateIntrinsic(
17310eae32dcSDimitry Andric         Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
173206c3fb27SDimitry Andric     auto NEG = IC.Builder.CreateIntrinsic(
173306c3fb27SDimitry Andric         Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
17340eae32dcSDimitry Andric     return IC.replaceInstUsesWith(II, NEG);
17350eae32dcSDimitry Andric   }
17360eae32dcSDimitry Andric 
1737bdd1243dSDimitry Andric   return std::nullopt;
17380eae32dcSDimitry Andric }
17390eae32dcSDimitry Andric 
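// Collapse a repeating value pattern to its repeating unit, e.g.
// (a, b, a, b) -> (a, b), halving the vector while the two halves agree.
// A nullptr entry denotes a lane that was never written (poison); when
// AllowPoison is set such lanes match anything and are backfilled from the
// other half.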
1740bdd1243dSDimitry Andric static bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1741bdd1243dSDimitry Andric   size_t VecSize = Vec.size();
1742bdd1243dSDimitry Andric   if (VecSize == 1)
1743bdd1243dSDimitry Andric     return true;
1744bdd1243dSDimitry Andric   if (!isPowerOf2_64(VecSize))
1745bdd1243dSDimitry Andric     return false;
1746bdd1243dSDimitry Andric   size_t HalfVecSize = VecSize / 2;
1747bdd1243dSDimitry Andric 
1748bdd1243dSDimitry Andric   for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1749bdd1243dSDimitry Andric        RHS != Vec.end(); LHS++, RHS++) {
1750bdd1243dSDimitry Andric     if (*LHS != nullptr && *RHS != nullptr) {
1751bdd1243dSDimitry Andric       if (*LHS == *RHS)
1752bdd1243dSDimitry Andric         continue;
1753bdd1243dSDimitry Andric       else
1754bdd1243dSDimitry Andric         return false;
1755bdd1243dSDimitry Andric     }
1756bdd1243dSDimitry Andric     if (!AllowPoison)
1757bdd1243dSDimitry Andric       return false;
1758bdd1243dSDimitry Andric     if (*LHS == nullptr && *RHS != nullptr)
1759bdd1243dSDimitry Andric       *LHS = *RHS;
1760bdd1243dSDimitry Andric   }
1761bdd1243dSDimitry Andric 
1762bdd1243dSDimitry Andric   Vec.resize(HalfVecSize);
1763bdd1243dSDimitry Andric   SimplifyValuePattern(Vec, AllowPoison);
1764bdd1243dSDimitry Andric   return true;
1765bdd1243dSDimitry Andric }
1766bdd1243dSDimitry Andric 
1767bdd1243dSDimitry Andric // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1768bdd1243dSDimitry Andric // to dupqlane(f64(C)) where C is A concatenated with B
1769bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1770bdd1243dSDimitry Andric                                                            IntrinsicInst &II) {
1771bdd1243dSDimitry Andric   Value *CurrentInsertElt = nullptr, *Default = nullptr;
1772bdd1243dSDimitry Andric   if (!match(II.getOperand(0),
1773bdd1243dSDimitry Andric              m_Intrinsic<Intrinsic::vector_insert>(
1774bdd1243dSDimitry Andric                  m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1775bdd1243dSDimitry Andric       !isa<FixedVectorType>(CurrentInsertElt->getType()))
1776bdd1243dSDimitry Andric     return std::nullopt;
1777bdd1243dSDimitry Andric   auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1778bdd1243dSDimitry Andric 
1779bdd1243dSDimitry Andric   // Insert the scalars into a container ordered by InsertElement index
1780bdd1243dSDimitry Andric   SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1781bdd1243dSDimitry Andric   while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1782bdd1243dSDimitry Andric     auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1783bdd1243dSDimitry Andric     Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1784bdd1243dSDimitry Andric     CurrentInsertElt = InsertElt->getOperand(0);
1785bdd1243dSDimitry Andric   }
1786bdd1243dSDimitry Andric 
1787bdd1243dSDimitry Andric   bool AllowPoison =
1788bdd1243dSDimitry Andric       isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1789bdd1243dSDimitry Andric   if (!SimplifyValuePattern(Elts, AllowPoison))
1790bdd1243dSDimitry Andric     return std::nullopt;
1791bdd1243dSDimitry Andric 
1792bdd1243dSDimitry Andric   // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1793bdd1243dSDimitry Andric   Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1794bdd1243dSDimitry Andric   for (size_t I = 0; I < Elts.size(); I++) {
1795bdd1243dSDimitry Andric     if (Elts[I] == nullptr)
1796bdd1243dSDimitry Andric       continue;
179706c3fb27SDimitry Andric     InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
179806c3fb27SDimitry Andric                                                     IC.Builder.getInt64(I));
1799bdd1243dSDimitry Andric   }
1800bdd1243dSDimitry Andric   if (InsertEltChain == nullptr)
1801bdd1243dSDimitry Andric     return std::nullopt;
1802bdd1243dSDimitry Andric 
1803bdd1243dSDimitry Andric   // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1804bdd1243dSDimitry Andric   // value or (f16 a, f16 b) as one i32 value. This requires the InsertSubvector
1805bdd1243dSDimitry Andric   // to be bitcast to a type wide enough to fit the sequence, splatted, and
1806bdd1243dSDimitry Andric   // then narrowed back to the original type.
1807bdd1243dSDimitry Andric   unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1808bdd1243dSDimitry Andric   unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1809bdd1243dSDimitry Andric                                  IIScalableTy->getMinNumElements() /
1810bdd1243dSDimitry Andric                                  PatternWidth;
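  // For example, an nxv8f16 dupq whose pattern simplified to (a, b) gives
  //   PatternWidth        = 16 * 2      = 32 bits (one i32 holds the pair),
  //   PatternElementCount = 16 * 8 / 32 = 4,
  // i.e. the splat is done as nxv4i32 and bitcast back to nxv8f16.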
1811bdd1243dSDimitry Andric 
181206c3fb27SDimitry Andric   IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1813bdd1243dSDimitry Andric   auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1814bdd1243dSDimitry Andric   auto *WideShuffleMaskTy =
181506c3fb27SDimitry Andric       ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1816bdd1243dSDimitry Andric 
181706c3fb27SDimitry Andric   auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
181806c3fb27SDimitry Andric   auto InsertSubvector = IC.Builder.CreateInsertVector(
1819bdd1243dSDimitry Andric       II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1820bdd1243dSDimitry Andric   auto WideBitcast =
182106c3fb27SDimitry Andric       IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1822bdd1243dSDimitry Andric   auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
182306c3fb27SDimitry Andric   auto WideShuffle = IC.Builder.CreateShuffleVector(
1824bdd1243dSDimitry Andric       WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1825bdd1243dSDimitry Andric   auto NarrowBitcast =
182606c3fb27SDimitry Andric       IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1827bdd1243dSDimitry Andric 
1828bdd1243dSDimitry Andric   return IC.replaceInstUsesWith(II, NarrowBitcast);
1829bdd1243dSDimitry Andric }
1830bdd1243dSDimitry Andric 
1831bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
183281ad6265SDimitry Andric                                                         IntrinsicInst &II) {
183381ad6265SDimitry Andric   Value *A = II.getArgOperand(0);
183481ad6265SDimitry Andric   Value *B = II.getArgOperand(1);
183581ad6265SDimitry Andric   if (A == B)
183681ad6265SDimitry Andric     return IC.replaceInstUsesWith(II, A);
183781ad6265SDimitry Andric 
1838bdd1243dSDimitry Andric   return std::nullopt;
183981ad6265SDimitry Andric }
184081ad6265SDimitry Andric 
1841bdd1243dSDimitry Andric static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
184281ad6265SDimitry Andric                                                         IntrinsicInst &II) {
184381ad6265SDimitry Andric   Value *Pred = II.getOperand(0);
184481ad6265SDimitry Andric   Value *Vec = II.getOperand(1);
184581ad6265SDimitry Andric   Value *Shift = II.getOperand(2);
184681ad6265SDimitry Andric 
184781ad6265SDimitry Andric   // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
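  // Sketch of the intended rewrite:
  //   %a = sve.abs(%merge, %pg, %x)
  //   %r = sve.srshl(%pg, %a, splat(2))  -->  %r = sve.lsl(%pg, %a, splat(2))
  // With a non-negative shift amount SRSHL is a plain left shift, so no
  // rounding is involved, subject to the checks below.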
184881ad6265SDimitry Andric   Value *AbsPred, *MergedValue;
184981ad6265SDimitry Andric   if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
185081ad6265SDimitry Andric                       m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
185181ad6265SDimitry Andric       !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
185281ad6265SDimitry Andric                       m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1854bdd1243dSDimitry Andric     return std::nullopt;
185581ad6265SDimitry Andric 
185681ad6265SDimitry Andric   // Transform is valid if any of the following are true:
185781ad6265SDimitry Andric   // * The ABS merge value is an undef or non-negative
185881ad6265SDimitry Andric   // * The ABS predicate is all active
185981ad6265SDimitry Andric   // * The ABS predicate and the SRSHL predicates are the same
1860bdd1243dSDimitry Andric   if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
186181ad6265SDimitry Andric       AbsPred != Pred && !isAllActivePredicate(AbsPred))
1862bdd1243dSDimitry Andric     return std::nullopt;
186381ad6265SDimitry Andric 
186481ad6265SDimitry Andric   // Only valid when the shift amount is non-negative, otherwise the rounding
186581ad6265SDimitry Andric   // behaviour of SRSHL cannot be ignored.
186681ad6265SDimitry Andric   if (!match(Shift, m_NonNegative()))
1867bdd1243dSDimitry Andric     return std::nullopt;
186881ad6265SDimitry Andric 
186906c3fb27SDimitry Andric   auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
187006c3fb27SDimitry Andric                                         {II.getType()}, {Pred, Vec, Shift});
187181ad6265SDimitry Andric 
187281ad6265SDimitry Andric   return IC.replaceInstUsesWith(II, LSL);
187381ad6265SDimitry Andric }
187481ad6265SDimitry Andric 
1875bdd1243dSDimitry Andric std::optional<Instruction *>
1876fe6060f1SDimitry Andric AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1877fe6060f1SDimitry Andric                                      IntrinsicInst &II) const {
1878fe6060f1SDimitry Andric   Intrinsic::ID IID = II.getIntrinsicID();
1879fe6060f1SDimitry Andric   switch (IID) {
1880fe6060f1SDimitry Andric   default:
1881fe6060f1SDimitry Andric     break;
188281ad6265SDimitry Andric   case Intrinsic::aarch64_neon_fmaxnm:
188381ad6265SDimitry Andric   case Intrinsic::aarch64_neon_fminnm:
188481ad6265SDimitry Andric     return instCombineMaxMinNM(IC, II);
1885fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_convert_from_svbool:
1886fe6060f1SDimitry Andric     return instCombineConvertFromSVBool(IC, II);
1887fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_dup:
1888fe6060f1SDimitry Andric     return instCombineSVEDup(IC, II);
1889349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_dup_x:
1890349cc55cSDimitry Andric     return instCombineSVEDupX(IC, II);
1891fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cmpne:
1892fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cmpne_wide:
1893fe6060f1SDimitry Andric     return instCombineSVECmpNE(IC, II);
1894fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_rdffr:
1895fe6060f1SDimitry Andric     return instCombineRDFFR(IC, II);
1896fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_lasta:
1897fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_lastb:
1898fe6060f1SDimitry Andric     return instCombineSVELast(IC, II);
1899753f127fSDimitry Andric   case Intrinsic::aarch64_sve_clasta_n:
1900753f127fSDimitry Andric   case Intrinsic::aarch64_sve_clastb_n:
1901753f127fSDimitry Andric     return instCombineSVECondLast(IC, II);
1902fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntd:
1903fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 2);
1904fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntw:
1905fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 4);
1906fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cnth:
1907fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 8);
1908fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_cntb:
1909fe6060f1SDimitry Andric     return instCombineSVECntElts(IC, II, 16);
1910fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_any:
1911fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_first:
1912fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_ptest_last:
1913fe6060f1SDimitry Andric     return instCombineSVEPTest(IC, II);
191406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fabd:
1915297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
1916349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_fadd:
191706c3fb27SDimitry Andric     return instCombineSVEVectorFAdd(IC, II);
191806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fadd_u:
191906c3fb27SDimitry Andric     return instCombineSVEVectorFAddU(IC, II);
192006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fdiv:
1921297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
192206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmax:
1923297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
192406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmaxnm:
1925297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
192606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmin:
1927297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
192806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fminnm:
1929297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
193006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmla:
1931297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
193206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmls:
1933297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
193406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul:
1935297eecfbSDimitry Andric     if (auto II_U =
1936297eecfbSDimitry Andric             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
1937297eecfbSDimitry Andric       return II_U;
1938297eecfbSDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
193906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmul_u:
194006c3fb27SDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
194106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fmulx:
1942297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
194306c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fnmla:
1944297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
194506c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fnmls:
1946297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
194706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub:
194806c3fb27SDimitry Andric     return instCombineSVEVectorFSub(IC, II);
194906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_fsub_u:
195006c3fb27SDimitry Andric     return instCombineSVEVectorFSubU(IC, II);
1951bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_add:
1952bdd1243dSDimitry Andric     return instCombineSVEVectorAdd(IC, II);
195306c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_add_u:
195406c3fb27SDimitry Andric     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
195506c3fb27SDimitry Andric                                              Intrinsic::aarch64_sve_mla_u>(
195606c3fb27SDimitry Andric         IC, II, true);
195706c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mla:
1958297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
195906c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mls:
1960297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
196106c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mul:
1962297eecfbSDimitry Andric     if (auto II_U =
1963297eecfbSDimitry Andric             instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
1964297eecfbSDimitry Andric       return II_U;
1965297eecfbSDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
196606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_mul_u:
196706c3fb27SDimitry Andric     return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
196806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sabd:
1969297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
197006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smax:
1971297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
197206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smin:
1973297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
197406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_smulh:
1975297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
1976bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_sub:
1977bdd1243dSDimitry Andric     return instCombineSVEVectorSub(IC, II);
197806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sub_u:
197906c3fb27SDimitry Andric     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
198006c3fb27SDimitry Andric                                              Intrinsic::aarch64_sve_mls_u>(
198106c3fb27SDimitry Andric         IC, II, true);
198206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_uabd:
1983297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
198406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umax:
1985297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
198606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umin:
1987297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
198806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_umulh:
1989297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
199006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_asr:
1991297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
199206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_lsl:
1993297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
199406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_lsr:
1995297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
199606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_and:
1997297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
199806c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_bic:
1999297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
200006c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_eor:
2001297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
200206c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_orr:
2003297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
200406c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_sqsub:
2005297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
200606c3fb27SDimitry Andric   case Intrinsic::aarch64_sve_uqsub:
2007297eecfbSDimitry Andric     return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2008fe6060f1SDimitry Andric   case Intrinsic::aarch64_sve_tbl:
2009fe6060f1SDimitry Andric     return instCombineSVETBL(IC, II);
2010349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_uunpkhi:
2011349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_uunpklo:
2012349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_sunpkhi:
2013349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_sunpklo:
2014349cc55cSDimitry Andric     return instCombineSVEUnpack(IC, II);
2015349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_zip1:
2016349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_zip2:
2017349cc55cSDimitry Andric     return instCombineSVEZip(IC, II);
2018349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_ld1_gather_index:
2019349cc55cSDimitry Andric     return instCombineLD1GatherIndex(IC, II);
2020349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_st1_scatter_index:
2021349cc55cSDimitry Andric     return instCombineST1ScatterIndex(IC, II);
2022349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_ld1:
2023349cc55cSDimitry Andric     return instCombineSVELD1(IC, II, DL);
2024349cc55cSDimitry Andric   case Intrinsic::aarch64_sve_st1:
2025349cc55cSDimitry Andric     return instCombineSVEST1(IC, II, DL);
20260eae32dcSDimitry Andric   case Intrinsic::aarch64_sve_sdiv:
20270eae32dcSDimitry Andric     return instCombineSVESDIV(IC, II);
202881ad6265SDimitry Andric   case Intrinsic::aarch64_sve_sel:
202981ad6265SDimitry Andric     return instCombineSVESel(IC, II);
203081ad6265SDimitry Andric   case Intrinsic::aarch64_sve_srshl:
203181ad6265SDimitry Andric     return instCombineSVESrshl(IC, II);
2032bdd1243dSDimitry Andric   case Intrinsic::aarch64_sve_dupq_lane:
2033bdd1243dSDimitry Andric     return instCombineSVEDupqLane(IC, II);
2034fe6060f1SDimitry Andric   }
2035fe6060f1SDimitry Andric 
2036bdd1243dSDimitry Andric   return std::nullopt;
2037fe6060f1SDimitry Andric }
2038fe6060f1SDimitry Andric 
2039bdd1243dSDimitry Andric std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
204004eeddc0SDimitry Andric     InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
204104eeddc0SDimitry Andric     APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
204204eeddc0SDimitry Andric     std::function<void(Instruction *, unsigned, APInt, APInt &)>
204304eeddc0SDimitry Andric         SimplifyAndSetOp) const {
204404eeddc0SDimitry Andric   switch (II.getIntrinsicID()) {
204504eeddc0SDimitry Andric   default:
204604eeddc0SDimitry Andric     break;
204704eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_fcvtxn:
204804eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_rshrn:
204904eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqrshrn:
205004eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqrshrun:
205104eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqshrn:
205204eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqshrun:
205304eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqxtn:
205404eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_sqxtun:
205504eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqrshrn:
205604eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqshrn:
205704eeddc0SDimitry Andric   case Intrinsic::aarch64_neon_uqxtn:
205804eeddc0SDimitry Andric     SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
205904eeddc0SDimitry Andric     break;
206004eeddc0SDimitry Andric   }
206104eeddc0SDimitry Andric 
2062bdd1243dSDimitry Andric   return std::nullopt;
2063bdd1243dSDimitry Andric }
2064bdd1243dSDimitry Andric 
2065bdd1243dSDimitry Andric TypeSize
2066bdd1243dSDimitry Andric AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2067bdd1243dSDimitry Andric   switch (K) {
2068bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_Scalar:
2069bdd1243dSDimitry Andric     return TypeSize::getFixed(64);
2070bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_FixedWidthVector:
207106c3fb27SDimitry Andric     if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
2072bdd1243dSDimitry Andric       return TypeSize::getFixed(0);
2073bdd1243dSDimitry Andric 
2074bdd1243dSDimitry Andric     if (ST->hasSVE())
2075bdd1243dSDimitry Andric       return TypeSize::getFixed(
2076bdd1243dSDimitry Andric           std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2077bdd1243dSDimitry Andric 
2078bdd1243dSDimitry Andric     return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
2079bdd1243dSDimitry Andric   case TargetTransformInfo::RGK_ScalableVector:
20805f757f3fSDimitry Andric     if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
2081bdd1243dSDimitry Andric       return TypeSize::getScalable(0);
2082bdd1243dSDimitry Andric 
2083bdd1243dSDimitry Andric     return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
2084bdd1243dSDimitry Andric   }
2085bdd1243dSDimitry Andric   llvm_unreachable("Unsupported register kind");
208604eeddc0SDimitry Andric }
208704eeddc0SDimitry Andric 
20880b57cec5SDimitry Andric bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
208906c3fb27SDimitry Andric                                            ArrayRef<const Value *> Args,
209006c3fb27SDimitry Andric                                            Type *SrcOverrideTy) {
20910b57cec5SDimitry Andric   // A helper that returns a vector type from the given type. The number of
209281ad6265SDimitry Andric   // elements in type Ty determines the vector width.
20930b57cec5SDimitry Andric   auto toVectorTy = [&](Type *ArgTy) {
2094e8d8bef9SDimitry Andric     return VectorType::get(ArgTy->getScalarType(),
2095e8d8bef9SDimitry Andric                            cast<VectorType>(DstTy)->getElementCount());
20960b57cec5SDimitry Andric   };
20970b57cec5SDimitry Andric 
209806c3fb27SDimitry Andric   // Exit early if DstTy is not a vector type whose elements are one of [i16,
209906c3fb27SDimitry Andric   // i32, i64]. SVE doesn't generally have the same set of instructions to
2100bdd1243dSDimitry Andric   // perform an extend with the add/sub/mul. There are SMULLB style
2101bdd1243dSDimitry Andric   // instructions, but they operate on top/bottom, requiring some sort of lane
2102bdd1243dSDimitry Andric   // interleaving to be used with zext/sext.
210306c3fb27SDimitry Andric   unsigned DstEltSize = DstTy->getScalarSizeInBits();
210406c3fb27SDimitry Andric   if (!useNeonVector(DstTy) || Args.size() != 2 ||
210506c3fb27SDimitry Andric       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
21060b57cec5SDimitry Andric     return false;
21070b57cec5SDimitry Andric 
21080b57cec5SDimitry Andric   // Determine if the operation has a widening variant. We consider both the
21090b57cec5SDimitry Andric   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
21100b57cec5SDimitry Andric   // instructions.
21110b57cec5SDimitry Andric   //
211281ad6265SDimitry Andric   // TODO: Add additional widening operations (e.g., shl, etc.) once we
21130b57cec5SDimitry Andric   //       verify that their extending operands are eliminated during code
21140b57cec5SDimitry Andric   //       generation.
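  // For example, add(zext <8 x i8> %a to <8 x i16>, zext <8 x i8> %b to
  // <8 x i16>) can be selected as a single "uaddl v0.8h, v1.8b, v2.8b",
  // making both extends free.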
211506c3fb27SDimitry Andric   Type *SrcTy = SrcOverrideTy;
21160b57cec5SDimitry Andric   switch (Opcode) {
21170b57cec5SDimitry Andric   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
21180b57cec5SDimitry Andric   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
211906c3fb27SDimitry Andric     // The second operand needs to be an extend
212006c3fb27SDimitry Andric     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
212106c3fb27SDimitry Andric       if (!SrcTy)
212206c3fb27SDimitry Andric         SrcTy =
212306c3fb27SDimitry Andric             toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
212406c3fb27SDimitry Andric     } else
212506c3fb27SDimitry Andric       return false;
21260b57cec5SDimitry Andric     break;
212706c3fb27SDimitry Andric   case Instruction::Mul: { // SMULL(2), UMULL(2)
212806c3fb27SDimitry Andric     // Both operands need to be extends of the same type.
212906c3fb27SDimitry Andric     if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
213006c3fb27SDimitry Andric         (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
213106c3fb27SDimitry Andric       if (!SrcTy)
213206c3fb27SDimitry Andric         SrcTy =
213306c3fb27SDimitry Andric             toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
213406c3fb27SDimitry Andric     } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
213506c3fb27SDimitry Andric       // If one of the operands is a Zext and the other has enough zero bits
213606c3fb27SDimitry Andric       // to be treated as unsigned, we can still generate a umull, meaning the
213706c3fb27SDimitry Andric       // zext is free.
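      // e.g. in mul(zext <8 x i8> %a to <8 x i16>, %b), if the top 8 bits of
      // %b are known to be zero then a umull still works and the zext costs
      // nothing.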
213806c3fb27SDimitry Andric       KnownBits Known =
213906c3fb27SDimitry Andric           computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
214006c3fb27SDimitry Andric       if (Args[0]->getType()->getScalarSizeInBits() -
214106c3fb27SDimitry Andric               Known.Zero.countLeadingOnes() >
214206c3fb27SDimitry Andric           DstTy->getScalarSizeInBits() / 2)
214306c3fb27SDimitry Andric         return false;
214406c3fb27SDimitry Andric       if (!SrcTy)
214506c3fb27SDimitry Andric         SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
214606c3fb27SDimitry Andric                                            DstTy->getScalarSizeInBits() / 2));
214706c3fb27SDimitry Andric     } else
214806c3fb27SDimitry Andric       return false;
214906c3fb27SDimitry Andric     break;
215006c3fb27SDimitry Andric   }
21510b57cec5SDimitry Andric   default:
21520b57cec5SDimitry Andric     return false;
21530b57cec5SDimitry Andric   }
21540b57cec5SDimitry Andric 
21550b57cec5SDimitry Andric   // Legalize the destination type and ensure it can be used in a widening
21560b57cec5SDimitry Andric   // operation.
2157bdd1243dSDimitry Andric   auto DstTyL = getTypeLegalizationCost(DstTy);
215806c3fb27SDimitry Andric   if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
21590b57cec5SDimitry Andric     return false;
21600b57cec5SDimitry Andric 
21610b57cec5SDimitry Andric   // Legalize the source type and ensure it can be used in a widening
21620b57cec5SDimitry Andric   // operation.
216306c3fb27SDimitry Andric   assert(SrcTy && "Expected some SrcTy");
2164bdd1243dSDimitry Andric   auto SrcTyL = getTypeLegalizationCost(SrcTy);
21650b57cec5SDimitry Andric   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
21660b57cec5SDimitry Andric   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
21670b57cec5SDimitry Andric     return false;
21680b57cec5SDimitry Andric 
21690b57cec5SDimitry Andric   // Get the total number of vector elements in the legalized types.
2170fe6060f1SDimitry Andric   InstructionCost NumDstEls =
2171fe6060f1SDimitry Andric       DstTyL.first * DstTyL.second.getVectorMinNumElements();
2172fe6060f1SDimitry Andric   InstructionCost NumSrcEls =
2173fe6060f1SDimitry Andric       SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
21740b57cec5SDimitry Andric 
21750b57cec5SDimitry Andric   // Return true if the legalized types have the same number of vector elements
21760b57cec5SDimitry Andric   // and the destination element type size is twice that of the source type.
217706c3fb27SDimitry Andric   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
21780b57cec5SDimitry Andric }
21790b57cec5SDimitry Andric 
21805f757f3fSDimitry Andric // s/urhadd instructions implement the following pattern, making the
21815f757f3fSDimitry Andric // extends free:
21825f757f3fSDimitry Andric //   %x = add ((zext i8 -> i16), 1)
21835f757f3fSDimitry Andric //   %y = (zext i8 -> i16)
21845f757f3fSDimitry Andric //   trunc i16 (lshr (add %x, %y), 1) -> i8
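// i.e. the rounding halving add (x + y + 1) >> 1, which s/urhadd computes in
// a single instruction, so the extends feeding this pattern are free.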
21855f757f3fSDimitry Andric //
21865f757f3fSDimitry Andric bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
21875f757f3fSDimitry Andric                                         Type *Src) {
21885f757f3fSDimitry Andric   // The source should be a legal vector type.
21895f757f3fSDimitry Andric   if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
21905f757f3fSDimitry Andric       (Src->isScalableTy() && !ST->hasSVE2()))
21915f757f3fSDimitry Andric     return false;
21925f757f3fSDimitry Andric 
21935f757f3fSDimitry Andric   if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
21945f757f3fSDimitry Andric     return false;
21955f757f3fSDimitry Andric 
21965f757f3fSDimitry Andric   // Look for trunc/shl/add before trying to match the pattern.
21975f757f3fSDimitry Andric   const Instruction *Add = ExtUser;
21985f757f3fSDimitry Andric   auto *AddUser =
21995f757f3fSDimitry Andric       dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
22005f757f3fSDimitry Andric   if (AddUser && AddUser->getOpcode() == Instruction::Add)
22015f757f3fSDimitry Andric     Add = AddUser;
22025f757f3fSDimitry Andric 
22035f757f3fSDimitry Andric   auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
22045f757f3fSDimitry Andric   if (!Shr || Shr->getOpcode() != Instruction::LShr)
22055f757f3fSDimitry Andric     return false;
22065f757f3fSDimitry Andric 
22075f757f3fSDimitry Andric   auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
22085f757f3fSDimitry Andric   if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
22095f757f3fSDimitry Andric       Src->getScalarSizeInBits() !=
22105f757f3fSDimitry Andric           cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
22115f757f3fSDimitry Andric     return false;
22125f757f3fSDimitry Andric 
22135f757f3fSDimitry Andric   // Try to match the whole pattern. Ext could be either the first or second
22145f757f3fSDimitry Andric   // m_ZExtOrSExt matched.
22155f757f3fSDimitry Andric   Instruction *Ex1, *Ex2;
22165f757f3fSDimitry Andric   if (!(match(Add, m_c_Add(m_Instruction(Ex1),
22175f757f3fSDimitry Andric                            m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
22185f757f3fSDimitry Andric     return false;
22195f757f3fSDimitry Andric 
22205f757f3fSDimitry Andric   // Ensure both extends are of the same type
22215f757f3fSDimitry Andric   if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
22225f757f3fSDimitry Andric       Ex1->getOpcode() == Ex2->getOpcode())
22235f757f3fSDimitry Andric     return true;
22245f757f3fSDimitry Andric 
22255f757f3fSDimitry Andric   return false;
22265f757f3fSDimitry Andric }
22275f757f3fSDimitry Andric 
2228fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2229fe6060f1SDimitry Andric                                                  Type *Src,
2230e8d8bef9SDimitry Andric                                                  TTI::CastContextHint CCH,
22315ffd83dbSDimitry Andric                                                  TTI::TargetCostKind CostKind,
22320b57cec5SDimitry Andric                                                  const Instruction *I) {
22330b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
22340b57cec5SDimitry Andric   assert(ISD && "Invalid opcode");
22350b57cec5SDimitry Andric   // If the cast is observable, and it is used by a widening instruction (e.g.,
22360b57cec5SDimitry Andric   // uaddl, saddw, etc.), it may be free.
223781ad6265SDimitry Andric   if (I && I->hasOneUser()) {
22380b57cec5SDimitry Andric     auto *SingleUser = cast<Instruction>(*I->user_begin());
22390b57cec5SDimitry Andric     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
224006c3fb27SDimitry Andric     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
224106c3fb27SDimitry Andric       // For adds, only count the cast as free when it is the second operand,
224206c3fb27SDimitry Andric       // or when both operands are extends of the same kind (i.e. in
224306c3fb27SDimitry Andric       // add(sext, zext) only one of the two casts is free).
224406c3fb27SDimitry Andric       if (SingleUser->getOpcode() == Instruction::Add) {
224506c3fb27SDimitry Andric         if (I == SingleUser->getOperand(1) ||
224606c3fb27SDimitry Andric             (isa<CastInst>(SingleUser->getOperand(1)) &&
224706c3fb27SDimitry Andric              cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
22480b57cec5SDimitry Andric           return 0;
224906c3fb27SDimitry Andric       } else // Others are free so long as isWideningInstruction returned true.
22500b57cec5SDimitry Andric         return 0;
22510b57cec5SDimitry Andric     }
22525f757f3fSDimitry Andric 
22535f757f3fSDimitry Andric     // The cast will be free for the s/urhadd instructions
22545f757f3fSDimitry Andric     if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
22555f757f3fSDimitry Andric         isExtPartOfAvgExpr(SingleUser, Dst, Src))
22565f757f3fSDimitry Andric       return 0;
22570b57cec5SDimitry Andric   }
22580b57cec5SDimitry Andric 
22595ffd83dbSDimitry Andric   // TODO: Allow non-throughput costs that aren't binary.
2260fe6060f1SDimitry Andric   auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
22615ffd83dbSDimitry Andric     if (CostKind != TTI::TCK_RecipThroughput)
22625ffd83dbSDimitry Andric       return Cost == 0 ? 0 : 1;
22635ffd83dbSDimitry Andric     return Cost;
22645ffd83dbSDimitry Andric   };
22655ffd83dbSDimitry Andric 
22660b57cec5SDimitry Andric   EVT SrcTy = TLI->getValueType(DL, Src);
22670b57cec5SDimitry Andric   EVT DstTy = TLI->getValueType(DL, Dst);
22680b57cec5SDimitry Andric 
22690b57cec5SDimitry Andric   if (!SrcTy.isSimple() || !DstTy.isSimple())
2270e8d8bef9SDimitry Andric     return AdjustCost(
2271e8d8bef9SDimitry Andric         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
22720b57cec5SDimitry Andric 
22730b57cec5SDimitry Andric   static const TypeConversionCostTblEntry
22740b57cec5SDimitry Andric   ConversionTbl[] = {
2275bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i8,   MVT::v2i64,  1},  // xtn
2276bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i16,  MVT::v2i64,  1},  // xtn
2277bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v2i32,  MVT::v2i64,  1},  // xtn
2278bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i32,  1},  // xtn
2279bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i64,  3},  // 2 xtn + 1 uzp1
2280bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i32,  1},  // xtn
2281bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i64,  2},  // 1 uzp1 + 1 xtn
2282bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v4i32,  MVT::v4i64,  1},  // 1 uzp1
2283bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i16,  1},  // 1 xtn
2284bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i32,  2},  // 1 uzp1 + 1 xtn
2285bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i64,  4},  // 3 x uzp1 + xtn
2286bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i32,  1},  // 1 uzp1
2287bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i64,  3},  // 3 x uzp1
2288bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v8i32,  MVT::v8i64,  2},  // 2 x uzp1
2289bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i16, 1},  // uzp1
2290bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i32, 3},  // (2 + 1) x uzp1
2291bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
2292bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2},  // 2 x uzp1
2293bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6},  // (4 + 2) x uzp1
2294bdd1243dSDimitry Andric     { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4},  // 4 x uzp1
22950b57cec5SDimitry Andric 
2296fe6060f1SDimitry Andric     // Truncations on nxvmiN
2297fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2298fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2299fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2300fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2301fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2302fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2303fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2304fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2305fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2306fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2307fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2308fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2309fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2310fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2311fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2312fe6060f1SDimitry Andric     { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2313fe6060f1SDimitry Andric 
23140b57cec5SDimitry Andric     // The number of shll instructions for the extension.
23150b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
23160b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
23170b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
23180b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
23190b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
23200b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
23210b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
23220b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
23230b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
23240b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
23250b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
23260b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
23270b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
23280b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
23290b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
23300b57cec5SDimitry Andric     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
23310b57cec5SDimitry Andric 
23320b57cec5SDimitry Andric     // LowerVectorINT_TO_FP:
23330b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
23340b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
23350b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
23360b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
23370b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
23380b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
23390b57cec5SDimitry Andric 
23400b57cec5SDimitry Andric     // Complex: to v2f32
23410b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
23420b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
23430b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
23440b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
23450b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
23460b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
23470b57cec5SDimitry Andric 
23480b57cec5SDimitry Andric     // Complex: to v4f32
23490b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
23500b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
23510b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
23520b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
23530b57cec5SDimitry Andric 
23540b57cec5SDimitry Andric     // Complex: to v8f32
23550b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
23560b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
23570b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
23580b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
23590b57cec5SDimitry Andric 
23600b57cec5SDimitry Andric     // Complex: to v16f32
23610b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
23620b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
23630b57cec5SDimitry Andric 
23640b57cec5SDimitry Andric     // Complex: to v2f64
23650b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
23660b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
23670b57cec5SDimitry Andric     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
23680b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
23690b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
23700b57cec5SDimitry Andric     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
23710b57cec5SDimitry Andric 
2372bdd1243dSDimitry Andric     // Complex: to v4f64
2373bdd1243dSDimitry Andric     { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
2374bdd1243dSDimitry Andric     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32,  4 },
23750b57cec5SDimitry Andric 
23760b57cec5SDimitry Andric     // LowerVectorFP_TO_INT
23770b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
23780b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
23790b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
23800b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
23810b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
23820b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
23830b57cec5SDimitry Andric 
23840b57cec5SDimitry Andric     // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
23850b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
23860b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
23870b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
23880b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
23890b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
23900b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
23910b57cec5SDimitry Andric 
23920b57cec5SDimitry Andric     // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
23930b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
23940b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
23950b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
23960b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
23970b57cec5SDimitry Andric 
2398fe6060f1SDimitry Andric     // Complex, from nxv2f32.
2399fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2400fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2401fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2402fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
2403fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2404fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2405fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2406fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f32, 1 },
2407fe6060f1SDimitry Andric 
24080b57cec5SDimitry Andric     // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
24090b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
24100b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
24110b57cec5SDimitry Andric     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
24120b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
24130b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
24140b57cec5SDimitry Andric     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
2415fe6060f1SDimitry Andric 
2416fe6060f1SDimitry Andric     // Complex, from nxv2f64.
2417fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2418fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2419fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2420fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
2421fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2422fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2423fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2424fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f64, 1 },
2425fe6060f1SDimitry Andric 
2426fe6060f1SDimitry Andric     // Complex, from nxv4f32.
2427fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2428fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2429fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2430fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
2431fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2432fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2433fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2434fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f32, 1 },
2435fe6060f1SDimitry Andric 
2436fe6060f1SDimitry Andric     // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2437fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2438fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
2439fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2440fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f64, 7 },
2441fe6060f1SDimitry Andric 
2442fe6060f1SDimitry Andric     // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2443fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2444fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2445fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
2446fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2447fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2448fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f64, 3 },
2449fe6060f1SDimitry Andric 
2450fe6060f1SDimitry Andric     // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2451fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2452fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2453fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2454fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f32, 3 },
2455fe6060f1SDimitry Andric 
2456fe6060f1SDimitry Andric     // Complex, from nxv8f16.
2457fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2458fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2459fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2460fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2461fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2462fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2463fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2464fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv8i8,  MVT::nxv8f16, 1 },
2465fe6060f1SDimitry Andric 
2466fe6060f1SDimitry Andric     // Complex, from nxv4f16.
2467fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2468fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2469fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2470fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2471fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2472fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2473fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2474fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv4i8,  MVT::nxv4f16, 1 },
2475fe6060f1SDimitry Andric 
2476fe6060f1SDimitry Andric     // Complex, from nxv2f16.
2477fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2478fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2479fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2480fe6060f1SDimitry Andric     { ISD::FP_TO_SINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2481fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2482fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2483fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2484fe6060f1SDimitry Andric     { ISD::FP_TO_UINT, MVT::nxv2i8,  MVT::nxv2f16, 1 },
2485fe6060f1SDimitry Andric 
2486fe6060f1SDimitry Andric     // Truncate from nxvmf32 to nxvmf16.
2487fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2488fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2489fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2490fe6060f1SDimitry Andric 
2491fe6060f1SDimitry Andric     // Truncate from nxvmf64 to nxvmf16.
2492fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2493fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2494fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2495fe6060f1SDimitry Andric 
2496fe6060f1SDimitry Andric     // Truncate from nxvmf64 to nxvmf32.
2497fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2498fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2499fe6060f1SDimitry Andric     { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2500fe6060f1SDimitry Andric 
2501fe6060f1SDimitry Andric     // Extend from nxvmf16 to nxvmf32.
2502fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2503fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2504fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2505fe6060f1SDimitry Andric 
2506fe6060f1SDimitry Andric     // Extend from nxvmf16 to nxvmf64.
2507fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2508fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2509fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2510fe6060f1SDimitry Andric 
2511fe6060f1SDimitry Andric     // Extend from nxvmf32 to nxvmf64.
2512fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2513fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2514fe6060f1SDimitry Andric     { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2515fe6060f1SDimitry Andric 
251604eeddc0SDimitry Andric     // Bitcasts from integer to float
251704eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
251804eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
251904eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
252004eeddc0SDimitry Andric 
252104eeddc0SDimitry Andric     // Bitcasts from float to integer
252204eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
252304eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
252404eeddc0SDimitry Andric     { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
252506c3fb27SDimitry Andric 
252606c3fb27SDimitry Andric     // Add cost for extending to illegal (too wide) scalable vectors.
252706c3fb27SDimitry Andric     // Zero/sign extends are implemented by multiple unpack operations,
252806c3fb27SDimitry Andric     // where each operation has a cost of 1.
252906c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
253006c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
253106c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
253206c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
253306c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
253406c3fb27SDimitry Andric     { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
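    // For example, a zero-extend from nxv16i8 to nxv16i32 is expected to be
    // lowered as two uunpklo/uunpkhi steps to 2 x nxv8i16 followed by four
    // more unpacks to 4 x nxv4i32, i.e. 2 + 4 = 6 unpacks, matching the
    // cost of 6 used above.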
253506c3fb27SDimitry Andric 
253606c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
253706c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
253806c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
253906c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
254006c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
254106c3fb27SDimitry Andric     { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
25420b57cec5SDimitry Andric   };
25430b57cec5SDimitry Andric 
254406c3fb27SDimitry Andric   // We have to estimate the cost of a fixed-length operation that is
254506c3fb27SDimitry Andric   // performed on SVE registers by scaling with the number of SVE
254606c3fb27SDimitry Andric   // registers required to represent the fixed-width type.
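  // For example, assuming 256-bit SVE registers are used for fixed-length
  // vectors, a v8f32 -> v8i32 fptosi fits in a single register, so it is
  // costed as LT.first (1) times the equivalent full-register scalable cast
  // nxv4f32 -> nxv4i32, which the table above prices at 1.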
254706c3fb27SDimitry Andric   EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
254806c3fb27SDimitry Andric   if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
254906c3fb27SDimitry Andric       SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
255006c3fb27SDimitry Andric       ST->useSVEForFixedLengthVectors(WiderTy)) {
255106c3fb27SDimitry Andric     std::pair<InstructionCost, MVT> LT =
255206c3fb27SDimitry Andric         getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
255306c3fb27SDimitry Andric     unsigned NumElements = AArch64::SVEBitsPerBlock /
255406c3fb27SDimitry Andric                            LT.second.getVectorElementType().getSizeInBits();
255506c3fb27SDimitry Andric     return AdjustCost(
255606c3fb27SDimitry Andric         LT.first *
255706c3fb27SDimitry Andric         getCastInstrCost(
255806c3fb27SDimitry Andric             Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
255906c3fb27SDimitry Andric             ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
256006c3fb27SDimitry Andric             CostKind, I));
256106c3fb27SDimitry Andric   }
256206c3fb27SDimitry Andric 
25630b57cec5SDimitry Andric   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
25640b57cec5SDimitry Andric                                                  DstTy.getSimpleVT(),
25650b57cec5SDimitry Andric                                                  SrcTy.getSimpleVT()))
25665ffd83dbSDimitry Andric     return AdjustCost(Entry->Cost);
25670b57cec5SDimitry Andric 
256881ad6265SDimitry Andric   static const TypeConversionCostTblEntry FP16Tbl[] = {
256981ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
257081ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
257181ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
257281ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
257381ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
257481ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
257581ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
257681ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
257781ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
257881ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
257981ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
258081ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
258181ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
258281ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
258381ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
258481ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
258581ad6265SDimitry Andric       {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
258681ad6265SDimitry Andric       {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
258781ad6265SDimitry Andric       {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // ushll + ucvtf
258881ad6265SDimitry Andric       {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2},   // sshll + scvtf
258981ad6265SDimitry Andric       {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
259081ad6265SDimitry Andric       {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
259181ad6265SDimitry Andric   };
259281ad6265SDimitry Andric 
259381ad6265SDimitry Andric   if (ST->hasFullFP16())
259481ad6265SDimitry Andric     if (const auto *Entry = ConvertCostTableLookup(
259581ad6265SDimitry Andric             FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
259681ad6265SDimitry Andric       return AdjustCost(Entry->Cost);
259781ad6265SDimitry Andric 
25985f757f3fSDimitry Andric   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
25995f757f3fSDimitry Andric       CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
26005f757f3fSDimitry Andric       TLI->getTypeAction(Src->getContext(), SrcTy) ==
26015f757f3fSDimitry Andric           TargetLowering::TypePromoteInteger &&
26025f757f3fSDimitry Andric       TLI->getTypeAction(Dst->getContext(), DstTy) ==
26035f757f3fSDimitry Andric           TargetLowering::TypeSplitVector) {
26045f757f3fSDimitry Andric     // The standard behaviour in the backend for these cases is to split the
26055f757f3fSDimitry Andric     // extend up into two parts:
26065f757f3fSDimitry Andric     //  1. Perform an extending load or masked load up to the legal type.
26075f757f3fSDimitry Andric     //  2. Extend the loaded data to the final type.
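    // For example, a masked zero-extend from nxv8i8 to nxv8i64 is costed as
    // an extending masked load up to the legal type nxv8i16, plus an
    // nxv8i16 -> nxv8i64 zero-extend, which the table above prices at 6.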
26085f757f3fSDimitry Andric     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
26095f757f3fSDimitry Andric     Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
26105f757f3fSDimitry Andric     InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
26115f757f3fSDimitry Andric         Opcode, LegalTy, Src, CCH, CostKind, I);
26125f757f3fSDimitry Andric     InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
26135f757f3fSDimitry Andric         Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
26145f757f3fSDimitry Andric     return Part1 + Part2;
26155f757f3fSDimitry Andric   }
26165f757f3fSDimitry Andric 
261706c3fb27SDimitry Andric   // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
261806c3fb27SDimitry Andric   // but we also want to handle the TTI::CastContextHint::Masked case.
261906c3fb27SDimitry Andric   if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
262006c3fb27SDimitry Andric       CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
262106c3fb27SDimitry Andric       TLI->isTypeLegal(DstTy))
262206c3fb27SDimitry Andric     CCH = TTI::CastContextHint::Normal;
262306c3fb27SDimitry Andric 
2624e8d8bef9SDimitry Andric   return AdjustCost(
2625e8d8bef9SDimitry Andric       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
26260b57cec5SDimitry Andric }
26270b57cec5SDimitry Andric 
2628fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2629fe6060f1SDimitry Andric                                                          Type *Dst,
26300b57cec5SDimitry Andric                                                          VectorType *VecTy,
26310b57cec5SDimitry Andric                                                          unsigned Index) {
26320b57cec5SDimitry Andric 
26330b57cec5SDimitry Andric   // Make sure we were given a valid extend opcode.
26340b57cec5SDimitry Andric   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
26350b57cec5SDimitry Andric          "Invalid opcode");
26360b57cec5SDimitry Andric 
26370b57cec5SDimitry Andric   // We are extending an element we extract from a vector, so the source type
26380b57cec5SDimitry Andric   // of the extend is the element type of the vector.
26390b57cec5SDimitry Andric   auto *Src = VecTy->getElementType();
26400b57cec5SDimitry Andric 
26410b57cec5SDimitry Andric   // Sign- and zero-extends are for integer types only.
26420b57cec5SDimitry Andric   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
26430b57cec5SDimitry Andric 
26440b57cec5SDimitry Andric   // Get the cost for the extract. We compute the cost (if any) for the extend
26450b57cec5SDimitry Andric   // below.
2646bdd1243dSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2647bdd1243dSDimitry Andric   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2648bdd1243dSDimitry Andric                                             CostKind, Index, nullptr, nullptr);
26490b57cec5SDimitry Andric 
26500b57cec5SDimitry Andric   // Legalize the types.
2651bdd1243dSDimitry Andric   auto VecLT = getTypeLegalizationCost(VecTy);
26520b57cec5SDimitry Andric   auto DstVT = TLI->getValueType(DL, Dst);
26530b57cec5SDimitry Andric   auto SrcVT = TLI->getValueType(DL, Src);
26540b57cec5SDimitry Andric 
26550b57cec5SDimitry Andric   // If the resulting type is still a vector and the destination type is legal,
26560b57cec5SDimitry Andric   // we may get the extension for free. If not, get the default cost for the
26570b57cec5SDimitry Andric   // extend.
26580b57cec5SDimitry Andric   if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2659e8d8bef9SDimitry Andric     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2660e8d8bef9SDimitry Andric                                    CostKind);
26610b57cec5SDimitry Andric 
26620b57cec5SDimitry Andric   // The destination type should be larger than the element type. If not, get
26630b57cec5SDimitry Andric   // the default cost for the extend.
2664e8d8bef9SDimitry Andric   if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2665e8d8bef9SDimitry Andric     return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2666e8d8bef9SDimitry Andric                                    CostKind);
26670b57cec5SDimitry Andric 
26680b57cec5SDimitry Andric   switch (Opcode) {
26690b57cec5SDimitry Andric   default:
26700b57cec5SDimitry Andric     llvm_unreachable("Opcode should be either SExt or ZExt");
26710b57cec5SDimitry Andric 
26720b57cec5SDimitry Andric   // For sign-extends, we only need a smov, which performs the extension
26730b57cec5SDimitry Andric   // automatically.
26740b57cec5SDimitry Andric   case Instruction::SExt:
26750b57cec5SDimitry Andric     return Cost;
26760b57cec5SDimitry Andric 
26770b57cec5SDimitry Andric   // For zero-extends, the extend is performed automatically by a umov unless
26780b57cec5SDimitry Andric   // the destination type is i64 and the element type is i8 or i16.
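  // For example, "smov x0, v0.h[1]" sign-extends the lane all the way to
  // 64 bits in one instruction, while "umov w0, v0.h[1]" produces a 32-bit
  // result, so only the i8/i16 element to i64 zero-extend case is charged
  // an extra extend below.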
26790b57cec5SDimitry Andric   case Instruction::ZExt:
26800b57cec5SDimitry Andric     if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
26810b57cec5SDimitry Andric       return Cost;
26820b57cec5SDimitry Andric   }
26830b57cec5SDimitry Andric 
26840b57cec5SDimitry Andric   // If we are unable to perform the extend for free, get the default cost.
2685e8d8bef9SDimitry Andric   return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2686e8d8bef9SDimitry Andric                                  CostKind);
26875ffd83dbSDimitry Andric }
26885ffd83dbSDimitry Andric 
2689fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2690fe6060f1SDimitry Andric                                                TTI::TargetCostKind CostKind,
2691fe6060f1SDimitry Andric                                                const Instruction *I) {
26925ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
26935ffd83dbSDimitry Andric     return Opcode == Instruction::PHI ? 0 : 1;
26945ffd83dbSDimitry Andric   assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
26955ffd83dbSDimitry Andric   // Branches are assumed to be predicted.
26965ffd83dbSDimitry Andric   return 0;
26970b57cec5SDimitry Andric }
26980b57cec5SDimitry Andric 
269906c3fb27SDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
270006c3fb27SDimitry Andric                                                          Type *Val,
2701bdd1243dSDimitry Andric                                                          unsigned Index,
2702bdd1243dSDimitry Andric                                                          bool HasRealUse) {
27030b57cec5SDimitry Andric   assert(Val->isVectorTy() && "This must be a vector type");
27040b57cec5SDimitry Andric 
27050b57cec5SDimitry Andric   if (Index != -1U) {
27060b57cec5SDimitry Andric     // Legalize the type.
2707bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
27080b57cec5SDimitry Andric 
27090b57cec5SDimitry Andric     // This type is legalized to a scalar type.
27100b57cec5SDimitry Andric     if (!LT.second.isVector())
27110b57cec5SDimitry Andric       return 0;
27120b57cec5SDimitry Andric 
271304eeddc0SDimitry Andric     // The type may be split. For fixed-width vectors we can normalize the
271404eeddc0SDimitry Andric     // index to the new type.
271504eeddc0SDimitry Andric     if (LT.second.isFixedLengthVector()) {
27160b57cec5SDimitry Andric       unsigned Width = LT.second.getVectorNumElements();
27170b57cec5SDimitry Andric       Index = Index % Width;
271804eeddc0SDimitry Andric     }
27190b57cec5SDimitry Andric 
27200b57cec5SDimitry Andric     // The element at index zero is already inside the vector.
2721bdd1243dSDimitry Andric     // - For a physical (HasRealUse==true) insert-element or extract-element
2722bdd1243dSDimitry Andric     // instruction that extracts integers, an explicit FPR -> GPR move is
2723bdd1243dSDimitry Andric     // needed. So it has non-zero cost.
2724bdd1243dSDimitry Andric     // - For the remaining cases (a virtual instruction, or a floating-point
2725bdd1243dSDimitry Andric     // element type), consider the instruction free.
272606c3fb27SDimitry Andric     if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
272706c3fb27SDimitry Andric       return 0;
272806c3fb27SDimitry Andric 
272906c3fb27SDimitry Andric     // This recognises an LD1 (load one single-element structure to one lane
273006c3fb27SDimitry Andric     // of one register) instruction. I.e., if this is an `insertelement`
273106c3fb27SDimitry Andric     // instruction and its second operand is a load, then we will generate
273206c3fb27SDimitry Andric     // an LD1, which is an expensive instruction.
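    // For example, IR such as:
    //   %ld = load i16, ptr %p
    //   %v1 = insertelement <4 x i16> %v0, i16 %ld, i32 1
    // is expected to be matched to "ld1 { v0.h }[1], [x0]".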
273306c3fb27SDimitry Andric     if (I && isa<LoadInst>(I->getOperand(1)))
273406c3fb27SDimitry Andric       return ST->getVectorInsertExtractBaseCost() + 1;
273506c3fb27SDimitry Andric 
273606c3fb27SDimitry Andric     // i1 inserts and extracts will include an extra cset or cmp of the vector
273706c3fb27SDimitry Andric     // value. Increase the cost by 1 to account for this.
273806c3fb27SDimitry Andric     if (Val->getScalarSizeInBits() == 1)
273906c3fb27SDimitry Andric       return ST->getVectorInsertExtractBaseCost() + 1;
274006c3fb27SDimitry Andric 
2741bdd1243dSDimitry Andric     // FIXME:
2742bdd1243dSDimitry Andric     // If the extract-element and insert-element instructions could be
2743bdd1243dSDimitry Andric     // simplified away (e.g., could be combined into users by looking at use-def
2744bdd1243dSDimitry Andric     // context), they have no cost. This is not done in the first place for
2745bdd1243dSDimitry Andric     // compile-time considerations.
27460b57cec5SDimitry Andric   }
27470b57cec5SDimitry Andric 
27480b57cec5SDimitry Andric   // All other insert/extracts cost this much.
27490b57cec5SDimitry Andric   return ST->getVectorInsertExtractBaseCost();
27500b57cec5SDimitry Andric }
27510b57cec5SDimitry Andric 
2752bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2753bdd1243dSDimitry Andric                                                    TTI::TargetCostKind CostKind,
2754bdd1243dSDimitry Andric                                                    unsigned Index, Value *Op0,
2755bdd1243dSDimitry Andric                                                    Value *Op1) {
275606c3fb27SDimitry Andric   bool HasRealUse =
275706c3fb27SDimitry Andric       Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
275806c3fb27SDimitry Andric   return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2759bdd1243dSDimitry Andric }
2760bdd1243dSDimitry Andric 
2761bdd1243dSDimitry Andric InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2762bdd1243dSDimitry Andric                                                    Type *Val,
2763bdd1243dSDimitry Andric                                                    TTI::TargetCostKind CostKind,
2764bdd1243dSDimitry Andric                                                    unsigned Index) {
276506c3fb27SDimitry Andric   return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2766bdd1243dSDimitry Andric }
2767bdd1243dSDimitry Andric 
27685f757f3fSDimitry Andric InstructionCost AArch64TTIImpl::getScalarizationOverhead(
27695f757f3fSDimitry Andric     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
27705f757f3fSDimitry Andric     TTI::TargetCostKind CostKind) {
27715f757f3fSDimitry Andric   if (isa<ScalableVectorType>(Ty))
27725f757f3fSDimitry Andric     return InstructionCost::getInvalid();
27735f757f3fSDimitry Andric   if (Ty->getElementType()->isFloatingPointTy())
27745f757f3fSDimitry Andric     return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
27755f757f3fSDimitry Andric                                            CostKind);
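  // For example, if the subtarget's base insert/extract cost is 2, then four
  // demanded lanes with Insert set and Extract clear cost 4 * (1 + 0) * 2 = 8.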
27765f757f3fSDimitry Andric   return DemandedElts.popcount() * (Insert + Extract) *
27775f757f3fSDimitry Andric          ST->getVectorInsertExtractBaseCost();
27785f757f3fSDimitry Andric }
27795f757f3fSDimitry Andric 
2780fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
27815ffd83dbSDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2782bdd1243dSDimitry Andric     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2783bdd1243dSDimitry Andric     ArrayRef<const Value *> Args,
2784480093f4SDimitry Andric     const Instruction *CxtI) {
2785bdd1243dSDimitry Andric 
27865ffd83dbSDimitry Andric   // TODO: Handle more cost kinds.
27875ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
2788bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2789bdd1243dSDimitry Andric                                          Op2Info, Args, CxtI);
27905ffd83dbSDimitry Andric 
27910b57cec5SDimitry Andric   // Legalize the type.
2792bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
27930b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
27940b57cec5SDimitry Andric 
27950b57cec5SDimitry Andric   switch (ISD) {
27960b57cec5SDimitry Andric   default:
2797bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2798bdd1243dSDimitry Andric                                          Op2Info);
27990b57cec5SDimitry Andric   case ISD::SDIV:
2800bdd1243dSDimitry Andric     if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
28010b57cec5SDimitry Andric       // On AArch64, scalar signed division by constants power-of-two are
28020b57cec5SDimitry Andric       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
28030b57cec5SDimitry Andric       // The OperandValue properties may not be the same as those of the
28040b57cec5SDimitry Andric       // previous operation; conservatively assume OP_None.
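      // For example, a scalar sdiv by 8 is expected to expand to roughly:
      //   add  w8, w0, #7
      //   cmp  w0, #0
      //   csel w8, w8, w0, lt
      //   asr  w0, w8, #3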
280581ad6265SDimitry Andric       InstructionCost Cost = getArithmeticInstrCost(
2806bdd1243dSDimitry Andric           Instruction::Add, Ty, CostKind,
2807bdd1243dSDimitry Andric           Op1Info.getNoProps(), Op2Info.getNoProps());
2808bdd1243dSDimitry Andric       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2809bdd1243dSDimitry Andric                                      Op1Info.getNoProps(), Op2Info.getNoProps());
281081ad6265SDimitry Andric       Cost += getArithmeticInstrCost(
2811bdd1243dSDimitry Andric           Instruction::Select, Ty, CostKind,
2812bdd1243dSDimitry Andric           Op1Info.getNoProps(), Op2Info.getNoProps());
2813bdd1243dSDimitry Andric       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2814bdd1243dSDimitry Andric                                      Op1Info.getNoProps(), Op2Info.getNoProps());
28150b57cec5SDimitry Andric       return Cost;
28160b57cec5SDimitry Andric     }
2817bdd1243dSDimitry Andric     [[fallthrough]];
281881ad6265SDimitry Andric   case ISD::UDIV: {
2819bdd1243dSDimitry Andric     if (Op2Info.isConstant() && Op2Info.isUniform()) {
28200b57cec5SDimitry Andric       auto VT = TLI->getValueType(DL, Ty);
28210b57cec5SDimitry Andric       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
28220b57cec5SDimitry Andric         // Vector signed division by a constant is expanded to the
28230b57cec5SDimitry Andric         // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
28240b57cec5SDimitry Andric         // to MULHS + SUB + SRL + ADD + SRL.
2825fe6060f1SDimitry Andric         InstructionCost MulCost = getArithmeticInstrCost(
2826bdd1243dSDimitry Andric             Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2827fe6060f1SDimitry Andric         InstructionCost AddCost = getArithmeticInstrCost(
2828bdd1243dSDimitry Andric             Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2829fe6060f1SDimitry Andric         InstructionCost ShrCost = getArithmeticInstrCost(
2830bdd1243dSDimitry Andric             Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
28310b57cec5SDimitry Andric         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
28320b57cec5SDimitry Andric       }
28330b57cec5SDimitry Andric     }
28340b57cec5SDimitry Andric 
283581ad6265SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
2836bdd1243dSDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info);
28370b57cec5SDimitry Andric     if (Ty->isVectorTy()) {
2838bdd1243dSDimitry Andric       if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2839bdd1243dSDimitry Andric         // If SDIV/UDIV operations are lowered using SVE, then we can have
2840bdd1243dSDimitry Andric         // lower costs.
2841bdd1243dSDimitry Andric         if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2842bdd1243dSDimitry Andric                                                 ->getPrimitiveSizeInBits()
2843bdd1243dSDimitry Andric                                                 .getFixedValue() < 128) {
2844bdd1243dSDimitry Andric           EVT VT = TLI->getValueType(DL, Ty);
2845bdd1243dSDimitry Andric           static const CostTblEntry DivTbl[]{
2846bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
2847bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
2848bdd1243dSDimitry Andric               {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2849bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
2850bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
2851bdd1243dSDimitry Andric               {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2852bdd1243dSDimitry Andric 
2853bdd1243dSDimitry Andric           const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2854bdd1243dSDimitry Andric           if (nullptr != Entry)
2855bdd1243dSDimitry Andric             return Entry->Cost;
2856bdd1243dSDimitry Andric         }
2857bdd1243dSDimitry Andric         // For 8/16-bit elements, the cost is higher because the type
2858bdd1243dSDimitry Andric         // requires promotion and possibly splitting:
2859bdd1243dSDimitry Andric         if (LT.second.getScalarType() == MVT::i8)
2860bdd1243dSDimitry Andric           Cost *= 8;
2861bdd1243dSDimitry Andric         else if (LT.second.getScalarType() == MVT::i16)
2862bdd1243dSDimitry Andric           Cost *= 4;
2863bdd1243dSDimitry Andric         return Cost;
2864bdd1243dSDimitry Andric       } else {
2865bdd1243dSDimitry Andric         // If one of the operands is a uniform constant, then the cost for
2866bdd1243dSDimitry Andric         // each element is the sum of the costs for insertion, extraction and
2867bdd1243dSDimitry Andric         // division: insertion cost = 2, extraction cost = 2, and division =
2868bdd1243dSDimitry Andric         // the cost of the operation on the scalar type.
2869bdd1243dSDimitry Andric         if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2870bdd1243dSDimitry Andric             (Op2Info.isConstant() && Op2Info.isUniform())) {
2871bdd1243dSDimitry Andric           if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2872bdd1243dSDimitry Andric             InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2873bdd1243dSDimitry Andric                 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2874bdd1243dSDimitry Andric             return (4 + DivCost) * VTy->getNumElements();
2875bdd1243dSDimitry Andric           }
2876bdd1243dSDimitry Andric         }
2877bdd1243dSDimitry Andric         // On AArch64, without SVE, vector divisions are expanded
2878bdd1243dSDimitry Andric         // into scalar divisions of each pair of elements.
2879bdd1243dSDimitry Andric         Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2880bdd1243dSDimitry Andric                                        CostKind, Op1Info, Op2Info);
28815ffd83dbSDimitry Andric         Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2882bdd1243dSDimitry Andric                                        Op1Info, Op2Info);
2883bdd1243dSDimitry Andric       }
2884bdd1243dSDimitry Andric 
28850b57cec5SDimitry Andric       // TODO: if one of the arguments is scalar, then it's not necessary to
28860b57cec5SDimitry Andric       // double the cost of handling the vector elements.
28870b57cec5SDimitry Andric       Cost += Cost;
28880b57cec5SDimitry Andric     }
28890b57cec5SDimitry Andric     return Cost;
289081ad6265SDimitry Andric   }
28910b57cec5SDimitry Andric   case ISD::MUL:
2892bdd1243dSDimitry Andric     // When SVE is available, we can lower the v2i64 operation using
2893bdd1243dSDimitry Andric     // the SVE mul instruction, which has a lower cost.
2894bdd1243dSDimitry Andric     if (LT.second == MVT::v2i64 && ST->hasSVE())
2895bdd1243dSDimitry Andric       return LT.first;
2896bdd1243dSDimitry Andric 
2897bdd1243dSDimitry Andric     // When SVE is not available, there is no MUL.2d instruction,
2898bdd1243dSDimitry Andric     // which means mul <2 x i64> is expensive as elements are extracted
2899bdd1243dSDimitry Andric     // from the vectors and the muls scalarized.
2900bdd1243dSDimitry Andric     // As getScalarizationOverhead is a bit too pessimistic, we
2901bdd1243dSDimitry Andric     // estimate the cost for an i64 vector directly here, which is:
290281ad6265SDimitry Andric     // - four 2-cost i64 extracts,
290381ad6265SDimitry Andric     // - two 2-cost i64 inserts, and
290481ad6265SDimitry Andric     // - two 1-cost muls.
290581ad6265SDimitry Andric     // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
290681ad6265SDimitry Andric     // LT.first = 2 the cost is 28. If both operands are extensions it will not
290781ad6265SDimitry Andric     // need to scalarize so the cost can be cheaper (smull or umull).
290981ad6265SDimitry Andric     if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
291081ad6265SDimitry Andric       return LT.first;
291181ad6265SDimitry Andric     return LT.first * 14;
2912e8d8bef9SDimitry Andric   case ISD::ADD:
29130b57cec5SDimitry Andric   case ISD::XOR:
29140b57cec5SDimitry Andric   case ISD::OR:
29150b57cec5SDimitry Andric   case ISD::AND:
291681ad6265SDimitry Andric   case ISD::SRL:
291781ad6265SDimitry Andric   case ISD::SRA:
291881ad6265SDimitry Andric   case ISD::SHL:
29190b57cec5SDimitry Andric     // These nodes are marked as 'custom' for combining purposes only.
29200b57cec5SDimitry Andric     // We know that they are legal. See LowerAdd in ISelLowering.
292181ad6265SDimitry Andric     return LT.first;
29225ffd83dbSDimitry Andric 
292306c3fb27SDimitry Andric   case ISD::FNEG:
29245ffd83dbSDimitry Andric   case ISD::FADD:
2925349cc55cSDimitry Andric   case ISD::FSUB:
292606c3fb27SDimitry Andric     // Increase the cost for half and bfloat types if not architecturally
292706c3fb27SDimitry Andric     // supported.
292806c3fb27SDimitry Andric     if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
292906c3fb27SDimitry Andric         (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
293006c3fb27SDimitry Andric       return 2 * LT.first;
293106c3fb27SDimitry Andric     if (!Ty->getScalarType()->isFP128Ty())
293206c3fb27SDimitry Andric       return LT.first;
293306c3fb27SDimitry Andric     [[fallthrough]];
2934349cc55cSDimitry Andric   case ISD::FMUL:
2935349cc55cSDimitry Andric   case ISD::FDIV:
29365ffd83dbSDimitry Andric     // These nodes are marked as 'custom' just to lower them to SVE.
29375ffd83dbSDimitry Andric     // We know said lowering will incur no additional cost.
2938349cc55cSDimitry Andric     if (!Ty->getScalarType()->isFP128Ty())
293981ad6265SDimitry Andric       return 2 * LT.first;
29405ffd83dbSDimitry Andric 
2941bdd1243dSDimitry Andric     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2942bdd1243dSDimitry Andric                                          Op2Info);
29430b57cec5SDimitry Andric   }
29440b57cec5SDimitry Andric }
29450b57cec5SDimitry Andric 
2946fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2947fe6060f1SDimitry Andric                                                           ScalarEvolution *SE,
29480b57cec5SDimitry Andric                                                           const SCEV *Ptr) {
29490b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
29500b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
29510b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
29520b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
295306c3fb27SDimitry Andric   unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
29540b57cec5SDimitry Andric   int MaxMergeDistance = 64;
29550b57cec5SDimitry Andric 
29560b57cec5SDimitry Andric   if (Ty->isVectorTy() && SE &&
29570b57cec5SDimitry Andric       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
29580b57cec5SDimitry Andric     return NumVectorInstToHideOverhead;
29590b57cec5SDimitry Andric 
29600b57cec5SDimitry Andric   // In many cases the address computation is not merged into the instruction
29610b57cec5SDimitry Andric   // addressing mode.
29620b57cec5SDimitry Andric   return 1;
29630b57cec5SDimitry Andric }
29640b57cec5SDimitry Andric 
2965fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2966fe6060f1SDimitry Andric                                                    Type *CondTy,
2967fe6060f1SDimitry Andric                                                    CmpInst::Predicate VecPred,
29685ffd83dbSDimitry Andric                                                    TTI::TargetCostKind CostKind,
29695ffd83dbSDimitry Andric                                                    const Instruction *I) {
29705ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
29715ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
2972e8d8bef9SDimitry Andric     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2973e8d8bef9SDimitry Andric                                      I);
29740b57cec5SDimitry Andric 
29750b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
29760b57cec5SDimitry Andric   // We don't lower vector selects well when they are wider than the register
29770b57cec5SDimitry Andric   // width.
2978e8d8bef9SDimitry Andric   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
29790b57cec5SDimitry Andric     // We would need this many instructions to hide the cost of scalarization.
29800b57cec5SDimitry Andric     const int AmortizationCost = 20;
2981e8d8bef9SDimitry Andric 
2982e8d8bef9SDimitry Andric     // If VecPred is not set, check if we can get a predicate from the context
2983e8d8bef9SDimitry Andric     // instruction, if its type matches the requested ValTy.
2984e8d8bef9SDimitry Andric     if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2985e8d8bef9SDimitry Andric       CmpInst::Predicate CurrentPred;
2986e8d8bef9SDimitry Andric       if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2987e8d8bef9SDimitry Andric                             m_Value())))
2988e8d8bef9SDimitry Andric         VecPred = CurrentPred;
2989e8d8bef9SDimitry Andric     }
29901fd87a68SDimitry Andric     // Check if we have a compare/select chain that can be lowered using
29911fd87a68SDimitry Andric     // a (F)CMxx & BFI pair.
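    // For example, "select (fcmp ogt %a, %b), %x, %y" on v4f32 can be lowered
    // to an fcmgt followed by a bsl/bif, so such a chain is costed below as a
    // single legalized operation (LT.first).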
29921fd87a68SDimitry Andric     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
29931fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
29941fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
29951fd87a68SDimitry Andric         VecPred == CmpInst::FCMP_UNE) {
29961fd87a68SDimitry Andric       static const auto ValidMinMaxTys = {
29971fd87a68SDimitry Andric           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
29981fd87a68SDimitry Andric           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
29991fd87a68SDimitry Andric       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
30001fd87a68SDimitry Andric 
3001bdd1243dSDimitry Andric       auto LT = getTypeLegalizationCost(ValTy);
30021fd87a68SDimitry Andric       if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
30031fd87a68SDimitry Andric           (ST->hasFullFP16() &&
30041fd87a68SDimitry Andric            any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3005e8d8bef9SDimitry Andric         return LT.first;
3006e8d8bef9SDimitry Andric     }
3007e8d8bef9SDimitry Andric 
30080b57cec5SDimitry Andric     static const TypeConversionCostTblEntry
30090b57cec5SDimitry Andric     VectorSelectTbl[] = {
301006c3fb27SDimitry Andric       { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
301106c3fb27SDimitry Andric       { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
301206c3fb27SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
301306c3fb27SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
301406c3fb27SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
30150b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
30160b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
30170b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
30180b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
30190b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
30200b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
30210b57cec5SDimitry Andric     };
30220b57cec5SDimitry Andric 
30230b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
30240b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
30250b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
30260b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
30270b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
30280b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
30290b57cec5SDimitry Andric         return Entry->Cost;
30300b57cec5SDimitry Andric     }
30310b57cec5SDimitry Andric   }
303206c3fb27SDimitry Andric 
303306c3fb27SDimitry Andric   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
303406c3fb27SDimitry Andric     auto LT = getTypeLegalizationCost(ValTy);
303506c3fb27SDimitry Andric     // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
303606c3fb27SDimitry Andric     if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
303706c3fb27SDimitry Andric       return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
303806c3fb27SDimitry Andric   }
303906c3fb27SDimitry Andric 
304006c3fb27SDimitry Andric   // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
304106c3fb27SDimitry Andric   // FIXME: This can apply to more conditions and add/sub if it can be shown to
304206c3fb27SDimitry Andric   // be profitable.
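  // For example, "icmp eq (and i64 %x, %y), 0" feeding a branch is expected
  // to become a single "tst x0, x1" (an ands that only sets flags), so the
  // icmp itself is costed as free.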
304306c3fb27SDimitry Andric   if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
304406c3fb27SDimitry Andric       ICmpInst::isEquality(VecPred) &&
304506c3fb27SDimitry Andric       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
304606c3fb27SDimitry Andric       match(I->getOperand(1), m_Zero()) &&
304706c3fb27SDimitry Andric       match(I->getOperand(0), m_And(m_Value(), m_Value())))
304806c3fb27SDimitry Andric     return 0;
304906c3fb27SDimitry Andric 
3050e8d8bef9SDimitry Andric   // The base case handles scalable vectors fine for now, since it treats the
3051e8d8bef9SDimitry Andric   // cost as 1 * legalization cost.
3052e8d8bef9SDimitry Andric   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
30530b57cec5SDimitry Andric }
30540b57cec5SDimitry Andric 
30550b57cec5SDimitry Andric AArch64TTIImpl::TTI::MemCmpExpansionOptions
30560b57cec5SDimitry Andric AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
30570b57cec5SDimitry Andric   TTI::MemCmpExpansionOptions Options;
30585ffd83dbSDimitry Andric   if (ST->requiresStrictAlign()) {
30595ffd83dbSDimitry Andric     // TODO: Add cost modeling for strict align. Misaligned loads expand to
30605ffd83dbSDimitry Andric     // a bunch of instructions when strict align is enabled.
30615ffd83dbSDimitry Andric     return Options;
30625ffd83dbSDimitry Andric   }
30635ffd83dbSDimitry Andric   Options.AllowOverlappingLoads = true;
30640b57cec5SDimitry Andric   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
30650b57cec5SDimitry Andric   Options.NumLoadsPerBlock = Options.MaxNumLoads;
30660b57cec5SDimitry Andric   // TODO: Though vector loads usually perform well on AArch64, on some targets
30670b57cec5SDimitry Andric   // they may wake up the FP unit, which raises the power consumption.  Perhaps
30680b57cec5SDimitry Andric   // they could be used with no holds barred (-O3).
30690b57cec5SDimitry Andric   Options.LoadSizes = {8, 4, 2, 1};
30705f757f3fSDimitry Andric   Options.AllowedTailExpansions = {3, 5, 6};
30710b57cec5SDimitry Andric   return Options;
30720b57cec5SDimitry Andric }
30730b57cec5SDimitry Andric 
307481ad6265SDimitry Andric bool AArch64TTIImpl::prefersVectorizedAddressing() const {
307581ad6265SDimitry Andric   return ST->hasSVE();
307681ad6265SDimitry Andric }
307781ad6265SDimitry Andric 
3078fe6060f1SDimitry Andric InstructionCost
3079fe6060f1SDimitry Andric AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3080fe6060f1SDimitry Andric                                       Align Alignment, unsigned AddressSpace,
3081fe6060f1SDimitry Andric                                       TTI::TargetCostKind CostKind) {
30820eae32dcSDimitry Andric   if (useNeonVector(Src))
3083fe6060f1SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3084fe6060f1SDimitry Andric                                         CostKind);
3085bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(Src);
3086fe6060f1SDimitry Andric   if (!LT.first.isValid())
3087fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3088fe6060f1SDimitry Andric 
3089fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3090fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3091fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3092fe6060f1SDimitry Andric   // sufficiently reliable.
3093fe6060f1SDimitry Andric   if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3094fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3095fe6060f1SDimitry Andric 
3096bdd1243dSDimitry Andric   return LT.first;
3097fe6060f1SDimitry Andric }
3098fe6060f1SDimitry Andric 
30990eae32dcSDimitry Andric static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
31000eae32dcSDimitry Andric   return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
31010eae32dcSDimitry Andric }
31020eae32dcSDimitry Andric 
3103fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3104e8d8bef9SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3105e8d8bef9SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
31065f757f3fSDimitry Andric   if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3107e8d8bef9SDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3108e8d8bef9SDimitry Andric                                          Alignment, CostKind, I);
3109e8d8bef9SDimitry Andric   auto *VT = cast<VectorType>(DataTy);
3110bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(DataTy);
3111fe6060f1SDimitry Andric   if (!LT.first.isValid())
3112fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3113e8d8bef9SDimitry Andric 
31145f757f3fSDimitry Andric   if (!LT.second.isVector() ||
31155f757f3fSDimitry Andric       !isElementTypeLegalForScalableVector(VT->getElementType()))
31165f757f3fSDimitry Andric     return InstructionCost::getInvalid();
31175f757f3fSDimitry Andric 
3118fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3119fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3120fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3121fe6060f1SDimitry Andric   // sufficiently reliable.
3122fe6060f1SDimitry Andric   if (cast<VectorType>(DataTy)->getElementCount() ==
3123fe6060f1SDimitry Andric       ElementCount::getScalable(1))
3124fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3125fe6060f1SDimitry Andric 
3126fe6060f1SDimitry Andric   ElementCount LegalVF = LT.second.getVectorElementCount();
3127fe6060f1SDimitry Andric   InstructionCost MemOpCost =
3128bdd1243dSDimitry Andric       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3129bdd1243dSDimitry Andric                       {TTI::OK_AnyValue, TTI::OP_None}, I);
31300eae32dcSDimitry Andric   // Add on an overhead cost for using gathers/scatters.
31310eae32dcSDimitry Andric   // TODO: At the moment this is applied unilaterally for all CPUs, but at some
31320eae32dcSDimitry Andric   // point we may want a per-CPU overhead.
31330eae32dcSDimitry Andric   MemOpCost *= getSVEGatherScatterOverhead(Opcode);
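  // For example, with the default sve-gather-overhead of 10, a gather of
  // <vscale x 4 x i32> is costed as LT.first * (scalar i32 load cost * 10),
  // scaled by the maximum number of elements the legal VF may represent.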
3134fe6060f1SDimitry Andric   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3135e8d8bef9SDimitry Andric }
3136e8d8bef9SDimitry Andric 
3137e8d8bef9SDimitry Andric bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3138e8d8bef9SDimitry Andric   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3139e8d8bef9SDimitry Andric }
3140e8d8bef9SDimitry Andric 
3141fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3142fe6060f1SDimitry Andric                                                 MaybeAlign Alignment,
3143fe6060f1SDimitry Andric                                                 unsigned AddressSpace,
31445ffd83dbSDimitry Andric                                                 TTI::TargetCostKind CostKind,
3145bdd1243dSDimitry Andric                                                 TTI::OperandValueInfo OpInfo,
31460b57cec5SDimitry Andric                                                 const Instruction *I) {
3147fe6060f1SDimitry Andric   EVT VT = TLI->getValueType(DL, Ty, true);
31485ffd83dbSDimitry Andric   // Type legalization can't handle structs.
3149fe6060f1SDimitry Andric   if (VT == MVT::Other)
31505ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
31515ffd83dbSDimitry Andric                                   CostKind);
31525ffd83dbSDimitry Andric 
3153bdd1243dSDimitry Andric   auto LT = getTypeLegalizationCost(Ty);
3154fe6060f1SDimitry Andric   if (!LT.first.isValid())
3155fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3156fe6060f1SDimitry Andric 
3157fe6060f1SDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3158fe6060f1SDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3159fe6060f1SDimitry Andric   // it. This change will be removed when code-generation for these types is
3160fe6060f1SDimitry Andric   // sufficiently reliable.
3161fe6060f1SDimitry Andric   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3162fe6060f1SDimitry Andric     if (VTy->getElementCount() == ElementCount::getScalable(1))
3163fe6060f1SDimitry Andric       return InstructionCost::getInvalid();
3164fe6060f1SDimitry Andric 
3165fe6060f1SDimitry Andric   // TODO: consider latency as well for TCK_SizeAndLatency.
3166fe6060f1SDimitry Andric   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3167fe6060f1SDimitry Andric     return LT.first;
3168fe6060f1SDimitry Andric 
3169fe6060f1SDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
3170fe6060f1SDimitry Andric     return 1;
31710b57cec5SDimitry Andric 
31720b57cec5SDimitry Andric   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3173480093f4SDimitry Andric       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
31740b57cec5SDimitry Andric     // Unaligned stores are extremely inefficient. We don't split all
31750b57cec5SDimitry Andric     // unaligned 128-bit stores because of the negative impact that has been
31760b57cec5SDimitry Andric     // shown in practice on inlined block copy code.
31770b57cec5SDimitry Andric     // We make such stores expensive so that we will only vectorize if there
31780b57cec5SDimitry Andric     // are 6 other instructions getting vectorized.
31790b57cec5SDimitry Andric     const int AmortizationCost = 6;
31800b57cec5SDimitry Andric 
31810b57cec5SDimitry Andric     return LT.first * 2 * AmortizationCost;
31820b57cec5SDimitry Andric   }
31830b57cec5SDimitry Andric 
3184bdd1243dSDimitry Andric   // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3185bdd1243dSDimitry Andric   if (Ty->isPtrOrPtrVectorTy())
3186bdd1243dSDimitry Andric     return LT.first;
3187bdd1243dSDimitry Andric 
31887a6dacacSDimitry Andric   if (useNeonVector(Ty)) {
3189fe6060f1SDimitry Andric     // Check truncating stores and extending loads.
31907a6dacacSDimitry Andric     if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3191fe6060f1SDimitry Andric       // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3192fe6060f1SDimitry Andric       if (VT == MVT::v4i8)
3193fe6060f1SDimitry Andric         return 2;
3194fe6060f1SDimitry Andric       // Otherwise we need to scalarize.
3195fe6060f1SDimitry Andric       return cast<FixedVectorType>(Ty)->getNumElements() * 2;
31960b57cec5SDimitry Andric     }
31977a6dacacSDimitry Andric     EVT EltVT = VT.getVectorElementType();
31987a6dacacSDimitry Andric     unsigned EltSize = EltVT.getScalarSizeInBits();
31997a6dacacSDimitry Andric     if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
32007a6dacacSDimitry Andric         VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
32017a6dacacSDimitry Andric         *Alignment != Align(1))
32027a6dacacSDimitry Andric       return LT.first;
32037a6dacacSDimitry Andric     // FIXME: v3i8 lowering is currently very inefficient, due to automatic
32047a6dacacSDimitry Andric     // widening to v4i8, which produces suboptimal results.
32057a6dacacSDimitry Andric     if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
32067a6dacacSDimitry Andric       return LT.first;
32077a6dacacSDimitry Andric 
32087a6dacacSDimitry Andric     // Check non-power-of-2 loads/stores for legal vector element types with
32097a6dacacSDimitry Andric     // NEON. Non-power-of-2 memory ops will get broken down into a set of
32107a6dacacSDimitry Andric     // smaller power-of-2 operations, including ld1/st1.
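    // For example (a sketch): an align-1 v7i16 op decomposes below into
    // v4i16 + v3i16, and v3i16 into v2i16 + v1i16, for a total cost of 3.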
32117a6dacacSDimitry Andric     LLVMContext &C = Ty->getContext();
32127a6dacacSDimitry Andric     InstructionCost Cost(0);
32137a6dacacSDimitry Andric     SmallVector<EVT> TypeWorklist;
32147a6dacacSDimitry Andric     TypeWorklist.push_back(VT);
32157a6dacacSDimitry Andric     while (!TypeWorklist.empty()) {
32167a6dacacSDimitry Andric       EVT CurrVT = TypeWorklist.pop_back_val();
32177a6dacacSDimitry Andric       unsigned CurrNumElements = CurrVT.getVectorNumElements();
32187a6dacacSDimitry Andric       if (isPowerOf2_32(CurrNumElements)) {
32197a6dacacSDimitry Andric         Cost += 1;
32207a6dacacSDimitry Andric         continue;
32217a6dacacSDimitry Andric       }
32227a6dacacSDimitry Andric 
32237a6dacacSDimitry Andric       unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
32247a6dacacSDimitry Andric       TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
32257a6dacacSDimitry Andric       TypeWorklist.push_back(
32267a6dacacSDimitry Andric           EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
32277a6dacacSDimitry Andric     }
32287a6dacacSDimitry Andric     return Cost;
32297a6dacacSDimitry Andric   }
32300b57cec5SDimitry Andric 
32310b57cec5SDimitry Andric   return LT.first;
32320b57cec5SDimitry Andric }
32330b57cec5SDimitry Andric 
3234fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
32355ffd83dbSDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
32365ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
32375ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
32380b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
323906c3fb27SDimitry Andric   auto *VecVTy = cast<VectorType>(VecTy);
32400b57cec5SDimitry Andric 
324106c3fb27SDimitry Andric   if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
324206c3fb27SDimitry Andric     return InstructionCost::getInvalid();
324306c3fb27SDimitry Andric 
324406c3fb27SDimitry Andric   // Vectorization for masked interleaved accesses is only enabled for scalable
324506c3fb27SDimitry Andric   // VF.
324606c3fb27SDimitry Andric   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
324706c3fb27SDimitry Andric     return InstructionCost::getInvalid();
324806c3fb27SDimitry Andric 
324906c3fb27SDimitry Andric   if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
325006c3fb27SDimitry Andric     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
32515ffd83dbSDimitry Andric     auto *SubVecTy =
325206c3fb27SDimitry Andric         VectorType::get(VecVTy->getElementType(),
325306c3fb27SDimitry Andric                         VecVTy->getElementCount().divideCoefficientBy(Factor));
32540b57cec5SDimitry Andric 
32550b57cec5SDimitry Andric     // ldN/stN only support legal vector types of size 64 or 128 in bits.
32560b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
32570b57cec5SDimitry Andric     // matched to more than one ldN/stN instruction.
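    // For example (a sketch): a <8 x i32> access with Factor == 2 gives a
    // <4 x i32> SubVecTy, a single legal 128-bit ldN/stN access, so the
    // returned cost is Factor * 1 == 2.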
3258349cc55cSDimitry Andric     bool UseScalable;
325906c3fb27SDimitry Andric     if (MinElts % Factor == 0 &&
3260349cc55cSDimitry Andric         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3261349cc55cSDimitry Andric       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
32620b57cec5SDimitry Andric   }
32630b57cec5SDimitry Andric 
32640b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
32655ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
32660b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
32670b57cec5SDimitry Andric }
32680b57cec5SDimitry Andric 
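// A sketch of the rationale: under AAPCS64 only the low 64 bits of the
// callee-saved SIMD registers v8-v15 are preserved across a call, so a
// 128-bit vector value that is live over a call is modeled as a store plus a
// reload below.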
3269fe6060f1SDimitry Andric InstructionCost
3270fe6060f1SDimitry Andric AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3271fe6060f1SDimitry Andric   InstructionCost Cost = 0;
32725ffd83dbSDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
32730b57cec5SDimitry Andric   for (auto *I : Tys) {
32740b57cec5SDimitry Andric     if (!I->isVectorTy())
32750b57cec5SDimitry Andric       continue;
32765ffd83dbSDimitry Andric     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
32775ffd83dbSDimitry Andric         128)
32785ffd83dbSDimitry Andric       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
32795ffd83dbSDimitry Andric               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
32800b57cec5SDimitry Andric   }
32810b57cec5SDimitry Andric   return Cost;
32820b57cec5SDimitry Andric }
32830b57cec5SDimitry Andric 
328406c3fb27SDimitry Andric unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
32850b57cec5SDimitry Andric   return ST->getMaxInterleaveFactor();
32860b57cec5SDimitry Andric }
32870b57cec5SDimitry Andric 
32880b57cec5SDimitry Andric // For Falkor, we want to avoid having too many strided loads in a loop since
32890b57cec5SDimitry Andric // that can exhaust the HW prefetcher resources.  We adjust the unroller
32900b57cec5SDimitry Andric // MaxCount preference below to attempt to ensure unrolling doesn't create too
32910b57cec5SDimitry Andric // many strided loads.
32920b57cec5SDimitry Andric static void
32930b57cec5SDimitry Andric getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
32940b57cec5SDimitry Andric                               TargetTransformInfo::UnrollingPreferences &UP) {
32950b57cec5SDimitry Andric   enum { MaxStridedLoads = 7 };
32960b57cec5SDimitry Andric   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
32970b57cec5SDimitry Andric     int StridedLoads = 0;
32980b57cec5SDimitry Andric     // FIXME? We could make this more precise by looking at the CFG and
32990b57cec5SDimitry Andric     // e.g. not counting loads in each side of an if-then-else diamond.
33000b57cec5SDimitry Andric     for (const auto BB : L->blocks()) {
33010b57cec5SDimitry Andric       for (auto &I : *BB) {
33020b57cec5SDimitry Andric         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
33030b57cec5SDimitry Andric         if (!LMemI)
33040b57cec5SDimitry Andric           continue;
33050b57cec5SDimitry Andric 
33060b57cec5SDimitry Andric         Value *PtrValue = LMemI->getPointerOperand();
33070b57cec5SDimitry Andric         if (L->isLoopInvariant(PtrValue))
33080b57cec5SDimitry Andric           continue;
33090b57cec5SDimitry Andric 
33100b57cec5SDimitry Andric         const SCEV *LSCEV = SE.getSCEV(PtrValue);
33110b57cec5SDimitry Andric         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
33120b57cec5SDimitry Andric         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
33130b57cec5SDimitry Andric           continue;
33140b57cec5SDimitry Andric 
33150b57cec5SDimitry Andric         // FIXME? We could take pairing of unrolled load copies into account
33160b57cec5SDimitry Andric         // by looking at the AddRec, but we would probably have to limit this
33170b57cec5SDimitry Andric         // to loops with no stores or other memory optimization barriers.
33180b57cec5SDimitry Andric         ++StridedLoads;
33190b57cec5SDimitry Andric         // We've seen enough strided loads that seeing more won't make a
33200b57cec5SDimitry Andric         // difference.
33210b57cec5SDimitry Andric         if (StridedLoads > MaxStridedLoads / 2)
33220b57cec5SDimitry Andric           return StridedLoads;
33230b57cec5SDimitry Andric       }
33240b57cec5SDimitry Andric     }
33250b57cec5SDimitry Andric     return StridedLoads;
33260b57cec5SDimitry Andric   };
33270b57cec5SDimitry Andric 
33280b57cec5SDimitry Andric   int StridedLoads = countStridedLoads(L, SE);
33290b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
33300b57cec5SDimitry Andric                     << " strided loads\n");
33310b57cec5SDimitry Andric   // Pick the largest power of 2 unroll count that won't result in too many
33320b57cec5SDimitry Andric   // strided loads.
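  // For example (a sketch): with MaxStridedLoads == 7 and 3 strided loads
  // detected, MaxCount becomes 1 << Log2_32(7 / 3) == 2.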
33330b57cec5SDimitry Andric   if (StridedLoads) {
33340b57cec5SDimitry Andric     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
33350b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
33360b57cec5SDimitry Andric                       << UP.MaxCount << '\n');
33370b57cec5SDimitry Andric   }
33380b57cec5SDimitry Andric }
33390b57cec5SDimitry Andric 
33400b57cec5SDimitry Andric void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3341349cc55cSDimitry Andric                                              TTI::UnrollingPreferences &UP,
3342349cc55cSDimitry Andric                                              OptimizationRemarkEmitter *ORE) {
33430b57cec5SDimitry Andric   // Enable partial unrolling and runtime unrolling.
3344349cc55cSDimitry Andric   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3345349cc55cSDimitry Andric 
3346349cc55cSDimitry Andric   UP.UpperBound = true;
33470b57cec5SDimitry Andric 
33480b57cec5SDimitry Andric   // An inner loop is more likely to be hot, and the runtime check can be
33490b57cec5SDimitry Andric   // promoted out of it by the LICM pass, so the overhead is smaller; try a
33500b57cec5SDimitry Andric   // larger threshold to unroll more loops.
33510b57cec5SDimitry Andric   if (L->getLoopDepth() > 1)
33520b57cec5SDimitry Andric     UP.PartialThreshold *= 2;
33530b57cec5SDimitry Andric 
33540b57cec5SDimitry Andric   // Disable partial & runtime unrolling on -Os.
33550b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
33560b57cec5SDimitry Andric 
33570b57cec5SDimitry Andric   if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
33580b57cec5SDimitry Andric       EnableFalkorHWPFUnrollFix)
33590b57cec5SDimitry Andric     getFalkorUnrollingPreferences(L, SE, UP);
3360fe6060f1SDimitry Andric 
3361fe6060f1SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
3362fe6060f1SDimitry Andric   // inlining. Don't unroll vector loops either, as they don't benefit much from
3363fe6060f1SDimitry Andric   // unrolling.
3364fe6060f1SDimitry Andric   for (auto *BB : L->getBlocks()) {
3365fe6060f1SDimitry Andric     for (auto &I : *BB) {
3366fe6060f1SDimitry Andric       // Don't unroll vectorised loops.
3367fe6060f1SDimitry Andric       if (I.getType()->isVectorTy())
3368fe6060f1SDimitry Andric         return;
3369fe6060f1SDimitry Andric 
3370fe6060f1SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3371fe6060f1SDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3372fe6060f1SDimitry Andric           if (!isLoweredToCall(F))
3373fe6060f1SDimitry Andric             continue;
3374fe6060f1SDimitry Andric         }
3375fe6060f1SDimitry Andric         return;
3376fe6060f1SDimitry Andric       }
3377fe6060f1SDimitry Andric     }
3378fe6060f1SDimitry Andric   }
3379fe6060f1SDimitry Andric 
3380fe6060f1SDimitry Andric   // Enable runtime unrolling for in-order models.
3381fe6060f1SDimitry Andric   // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
3382fe6060f1SDimitry Andric   // by checking for that case we can ensure that the default behaviour is
3383fe6060f1SDimitry Andric   // unchanged.
3384fe6060f1SDimitry Andric   if (ST->getProcFamily() != AArch64Subtarget::Others &&
3385fe6060f1SDimitry Andric       !ST->getSchedModel().isOutOfOrder()) {
3386fe6060f1SDimitry Andric     UP.Runtime = true;
3387fe6060f1SDimitry Andric     UP.Partial = true;
3388fe6060f1SDimitry Andric     UP.UnrollRemainder = true;
3389fe6060f1SDimitry Andric     UP.DefaultUnrollRuntimeCount = 4;
3390fe6060f1SDimitry Andric 
3391fe6060f1SDimitry Andric     UP.UnrollAndJam = true;
3392fe6060f1SDimitry Andric     UP.UnrollAndJamInnerLoopThreshold = 60;
3393fe6060f1SDimitry Andric   }
33940b57cec5SDimitry Andric }
33950b57cec5SDimitry Andric 
33965ffd83dbSDimitry Andric void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
33975ffd83dbSDimitry Andric                                            TTI::PeelingPreferences &PP) {
33985ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
33995ffd83dbSDimitry Andric }
34005ffd83dbSDimitry Andric 
34010b57cec5SDimitry Andric Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
34020b57cec5SDimitry Andric                                                          Type *ExpectedType) {
34030b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
34040b57cec5SDimitry Andric   default:
34050b57cec5SDimitry Andric     return nullptr;
34060b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
34070b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
34080b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4: {
34090b57cec5SDimitry Andric     // Create a struct type
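    // For example (a sketch, with assumed type mangling):
    //   call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %a, <4 x i32> %b,
    //                                             <4 x i32> %c, ptr %p)
    // with ExpectedType { <4 x i32>, <4 x i32>, <4 x i32> } rebuilds the
    // stored value as a struct from %a, %b and %c.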
34100b57cec5SDimitry Andric     StructType *ST = dyn_cast<StructType>(ExpectedType);
34110b57cec5SDimitry Andric     if (!ST)
34120b57cec5SDimitry Andric       return nullptr;
3413349cc55cSDimitry Andric     unsigned NumElts = Inst->arg_size() - 1;
34140b57cec5SDimitry Andric     if (ST->getNumElements() != NumElts)
34150b57cec5SDimitry Andric       return nullptr;
34160b57cec5SDimitry Andric     for (unsigned i = 0, e = NumElts; i != e; ++i) {
34170b57cec5SDimitry Andric       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
34180b57cec5SDimitry Andric         return nullptr;
34190b57cec5SDimitry Andric     }
3420bdd1243dSDimitry Andric     Value *Res = PoisonValue::get(ExpectedType);
34210b57cec5SDimitry Andric     IRBuilder<> Builder(Inst);
34220b57cec5SDimitry Andric     for (unsigned i = 0, e = NumElts; i != e; ++i) {
34230b57cec5SDimitry Andric       Value *L = Inst->getArgOperand(i);
34240b57cec5SDimitry Andric       Res = Builder.CreateInsertValue(Res, L, i);
34250b57cec5SDimitry Andric     }
34260b57cec5SDimitry Andric     return Res;
34270b57cec5SDimitry Andric   }
34280b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
34290b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
34300b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
34310b57cec5SDimitry Andric     if (Inst->getType() == ExpectedType)
34320b57cec5SDimitry Andric       return Inst;
34330b57cec5SDimitry Andric     return nullptr;
34340b57cec5SDimitry Andric   }
34350b57cec5SDimitry Andric }
34360b57cec5SDimitry Andric 
34370b57cec5SDimitry Andric bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
34380b57cec5SDimitry Andric                                         MemIntrinsicInfo &Info) {
34390b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
34400b57cec5SDimitry Andric   default:
34410b57cec5SDimitry Andric     break;
34420b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
34430b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
34440b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
34450b57cec5SDimitry Andric     Info.ReadMem = true;
34460b57cec5SDimitry Andric     Info.WriteMem = false;
34470b57cec5SDimitry Andric     Info.PtrVal = Inst->getArgOperand(0);
34480b57cec5SDimitry Andric     break;
34490b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
34500b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
34510b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4:
34520b57cec5SDimitry Andric     Info.ReadMem = false;
34530b57cec5SDimitry Andric     Info.WriteMem = true;
3454349cc55cSDimitry Andric     Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
34550b57cec5SDimitry Andric     break;
34560b57cec5SDimitry Andric   }
34570b57cec5SDimitry Andric 
34580b57cec5SDimitry Andric   switch (Inst->getIntrinsicID()) {
34590b57cec5SDimitry Andric   default:
34600b57cec5SDimitry Andric     return false;
34610b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld2:
34620b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st2:
34630b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
34640b57cec5SDimitry Andric     break;
34650b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld3:
34660b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st3:
34670b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
34680b57cec5SDimitry Andric     break;
34690b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_ld4:
34700b57cec5SDimitry Andric   case Intrinsic::aarch64_neon_st4:
34710b57cec5SDimitry Andric     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
34720b57cec5SDimitry Andric     break;
34730b57cec5SDimitry Andric   }
34740b57cec5SDimitry Andric   return true;
34750b57cec5SDimitry Andric }
34760b57cec5SDimitry Andric 
34770b57cec5SDimitry Andric /// See if \p I should be considered for address type promotion. We check if
34780b57cec5SDimitry Andric /// \p I is a sext with the right type and used in memory accesses. If it is
34790b57cec5SDimitry Andric /// used in a "complex" getelementptr, we allow it to be promoted without
34800b57cec5SDimitry Andric /// finding other sext instructions that sign extended the same initial value.
34810b57cec5SDimitry Andric /// A getelementptr is considered "complex" if it has more than 2 operands.
34820b57cec5SDimitry Andric bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
34830b57cec5SDimitry Andric     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
34840b57cec5SDimitry Andric   bool Considerable = false;
34850b57cec5SDimitry Andric   AllowPromotionWithoutCommonHeader = false;
34860b57cec5SDimitry Andric   if (!isa<SExtInst>(&I))
34870b57cec5SDimitry Andric     return false;
34880b57cec5SDimitry Andric   Type *ConsideredSExtType =
34890b57cec5SDimitry Andric       Type::getInt64Ty(I.getParent()->getParent()->getContext());
34900b57cec5SDimitry Andric   if (I.getType() != ConsideredSExtType)
34910b57cec5SDimitry Andric     return false;
34920b57cec5SDimitry Andric   // See if the sext is the one with the right type and used in at least one
34930b57cec5SDimitry Andric   // GetElementPtrInst.
34940b57cec5SDimitry Andric   for (const User *U : I.users()) {
34950b57cec5SDimitry Andric     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
34960b57cec5SDimitry Andric       Considerable = true;
34970b57cec5SDimitry Andric       // A getelementptr is considered as "complex" if it has more than 2
34980b57cec5SDimitry Andric       // operands. We will promote a SExt used in such complex GEP as we
34990b57cec5SDimitry Andric       // expect some computation to be merged if they are done on 64 bits.
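      // For example (a sketch): a GEP such as
      //   getelementptr [16 x i32], ptr %p, i64 %sext, i64 %i
      // has more than 2 operands and so counts as complex.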
35000b57cec5SDimitry Andric       if (GEPInst->getNumOperands() > 2) {
35010b57cec5SDimitry Andric         AllowPromotionWithoutCommonHeader = true;
35020b57cec5SDimitry Andric         break;
35030b57cec5SDimitry Andric       }
35040b57cec5SDimitry Andric     }
35050b57cec5SDimitry Andric   }
35060b57cec5SDimitry Andric   return Considerable;
35070b57cec5SDimitry Andric }
35080b57cec5SDimitry Andric 
3509fe6060f1SDimitry Andric bool AArch64TTIImpl::isLegalToVectorizeReduction(
3510fe6060f1SDimitry Andric     const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3511fe6060f1SDimitry Andric   if (!VF.isScalable())
3512fe6060f1SDimitry Andric     return true;
3513fe6060f1SDimitry Andric 
3514fe6060f1SDimitry Andric   Type *Ty = RdxDesc.getRecurrenceType();
3515fe6060f1SDimitry Andric   if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
35160b57cec5SDimitry Andric     return false;
3517fe6060f1SDimitry Andric 
3518fe6060f1SDimitry Andric   switch (RdxDesc.getRecurrenceKind()) {
3519fe6060f1SDimitry Andric   case RecurKind::Add:
3520fe6060f1SDimitry Andric   case RecurKind::FAdd:
3521fe6060f1SDimitry Andric   case RecurKind::And:
3522fe6060f1SDimitry Andric   case RecurKind::Or:
3523fe6060f1SDimitry Andric   case RecurKind::Xor:
3524fe6060f1SDimitry Andric   case RecurKind::SMin:
3525fe6060f1SDimitry Andric   case RecurKind::SMax:
3526fe6060f1SDimitry Andric   case RecurKind::UMin:
3527fe6060f1SDimitry Andric   case RecurKind::UMax:
3528fe6060f1SDimitry Andric   case RecurKind::FMin:
3529fe6060f1SDimitry Andric   case RecurKind::FMax:
35304824e7fdSDimitry Andric   case RecurKind::FMulAdd:
35315f757f3fSDimitry Andric   case RecurKind::IAnyOf:
35325f757f3fSDimitry Andric   case RecurKind::FAnyOf:
3533fe6060f1SDimitry Andric     return true;
35340b57cec5SDimitry Andric   default:
35350b57cec5SDimitry Andric     return false;
35360b57cec5SDimitry Andric   }
3537fe6060f1SDimitry Andric }
35380b57cec5SDimitry Andric 
3539fe6060f1SDimitry Andric InstructionCost
354006c3fb27SDimitry Andric AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
354106c3fb27SDimitry Andric                                        FastMathFlags FMF,
3542e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
3543bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3544349cc55cSDimitry Andric 
3545349cc55cSDimitry Andric   if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
354606c3fb27SDimitry Andric     return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3547349cc55cSDimitry Andric 
3548fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
3549e8d8bef9SDimitry Andric   if (LT.first > 1) {
3550e8d8bef9SDimitry Andric     Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
355106c3fb27SDimitry Andric     IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3552349cc55cSDimitry Andric     LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3553e8d8bef9SDimitry Andric   }
3554e8d8bef9SDimitry Andric 
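  // For example (a rough sketch): an smax reduction over <8 x i32> splits
  // into two v4i32 halves (LT.first == 2); one vector smax merges them and
  // the final horizontal reduction adds 2, for a total of about 3.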
3555e8d8bef9SDimitry Andric   return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3556e8d8bef9SDimitry Andric }
3557e8d8bef9SDimitry Andric 
3558fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3559fe6060f1SDimitry Andric     unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3560bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3561fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
3562e8d8bef9SDimitry Andric   if (LT.first > 1) {
3563e8d8bef9SDimitry Andric     Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3564e8d8bef9SDimitry Andric     LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3565e8d8bef9SDimitry Andric     LegalizationCost *= LT.first - 1;
3566e8d8bef9SDimitry Andric   }
3567e8d8bef9SDimitry Andric 
3568e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
3569e8d8bef9SDimitry Andric   assert(ISD && "Invalid opcode");
3570e8d8bef9SDimitry Andric   // Add the final reduction cost for the legal horizontal reduction
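  // For example (a rough sketch): an ADD reduction over nxv8i32 legalizes to
  // two nxv4i32 halves (LT.first == 2); one vector add merges them and the
  // final horizontal step adds 2, giving about 3.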
3571e8d8bef9SDimitry Andric   switch (ISD) {
3572e8d8bef9SDimitry Andric   case ISD::ADD:
3573e8d8bef9SDimitry Andric   case ISD::AND:
3574e8d8bef9SDimitry Andric   case ISD::OR:
3575e8d8bef9SDimitry Andric   case ISD::XOR:
3576e8d8bef9SDimitry Andric   case ISD::FADD:
3577e8d8bef9SDimitry Andric     return LegalizationCost + 2;
3578e8d8bef9SDimitry Andric   default:
3579fe6060f1SDimitry Andric     return InstructionCost::getInvalid();
3580e8d8bef9SDimitry Andric   }
3581e8d8bef9SDimitry Andric }
3582e8d8bef9SDimitry Andric 
3583fe6060f1SDimitry Andric InstructionCost
3584fe6060f1SDimitry Andric AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3585bdd1243dSDimitry Andric                                            std::optional<FastMathFlags> FMF,
35865ffd83dbSDimitry Andric                                            TTI::TargetCostKind CostKind) {
3587fe6060f1SDimitry Andric   if (TTI::requiresOrderedReduction(FMF)) {
3588349cc55cSDimitry Andric     if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3589349cc55cSDimitry Andric       InstructionCost BaseCost =
3590349cc55cSDimitry Andric           BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3591349cc55cSDimitry Andric       // Add on extra cost to reflect the extra overhead on some CPUs. We still
3592349cc55cSDimitry Andric       // end up vectorizing for more computationally intensive loops.
3593349cc55cSDimitry Andric       return BaseCost + FixedVTy->getNumElements();
3594349cc55cSDimitry Andric     }
3595fe6060f1SDimitry Andric 
3596fe6060f1SDimitry Andric     if (Opcode != Instruction::FAdd)
3597fe6060f1SDimitry Andric       return InstructionCost::getInvalid();
3598fe6060f1SDimitry Andric 
3599fe6060f1SDimitry Andric     auto *VTy = cast<ScalableVectorType>(ValTy);
3600fe6060f1SDimitry Andric     InstructionCost Cost =
3601fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3602fe6060f1SDimitry Andric     Cost *= getMaxNumElements(VTy->getElementCount());
3603fe6060f1SDimitry Andric     return Cost;
3604fe6060f1SDimitry Andric   }
36050b57cec5SDimitry Andric 
3606e8d8bef9SDimitry Andric   if (isa<ScalableVectorType>(ValTy))
3607fe6060f1SDimitry Andric     return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
36080b57cec5SDimitry Andric 
3609bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
36100b57cec5SDimitry Andric   MVT MTy = LT.second;
36110b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
36120b57cec5SDimitry Andric   assert(ISD && "Invalid opcode");
36130b57cec5SDimitry Andric 
36140b57cec5SDimitry Andric   // Horizontal adds can use the 'addv' instruction. We model the cost of these
3615fe6060f1SDimitry Andric   // instructions as twice a normal vector add, plus 1 for each legalization
3616fe6060f1SDimitry Andric   // step (LT.first). This is the only arithmetic vector reduction operation for
3617fe6060f1SDimitry Andric   // which we have an instruction.
3618fe6060f1SDimitry Andric   // OR, XOR and AND costs should match the codegen from:
3619fe6060f1SDimitry Andric   // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3620fe6060f1SDimitry Andric   // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3621fe6060f1SDimitry Andric   // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
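  // For example (a sketch): a v16i8 add reduction costs 2 below, and a
  // v32i8 one adds LT.first - 1 == 1 extra vector op, for a total of 3.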
36220b57cec5SDimitry Andric   static const CostTblEntry CostTblNoPairwise[]{
3623fe6060f1SDimitry Andric       {ISD::ADD, MVT::v8i8,   2},
3624fe6060f1SDimitry Andric       {ISD::ADD, MVT::v16i8,  2},
3625fe6060f1SDimitry Andric       {ISD::ADD, MVT::v4i16,  2},
3626fe6060f1SDimitry Andric       {ISD::ADD, MVT::v8i16,  2},
3627fe6060f1SDimitry Andric       {ISD::ADD, MVT::v4i32,  2},
3628bdd1243dSDimitry Andric       {ISD::ADD, MVT::v2i64,  2},
3629fe6060f1SDimitry Andric       {ISD::OR,  MVT::v8i8,  15},
3630fe6060f1SDimitry Andric       {ISD::OR,  MVT::v16i8, 17},
3631fe6060f1SDimitry Andric       {ISD::OR,  MVT::v4i16,  7},
3632fe6060f1SDimitry Andric       {ISD::OR,  MVT::v8i16,  9},
3633fe6060f1SDimitry Andric       {ISD::OR,  MVT::v2i32,  3},
3634fe6060f1SDimitry Andric       {ISD::OR,  MVT::v4i32,  5},
3635fe6060f1SDimitry Andric       {ISD::OR,  MVT::v2i64,  3},
3636fe6060f1SDimitry Andric       {ISD::XOR, MVT::v8i8,  15},
3637fe6060f1SDimitry Andric       {ISD::XOR, MVT::v16i8, 17},
3638fe6060f1SDimitry Andric       {ISD::XOR, MVT::v4i16,  7},
3639fe6060f1SDimitry Andric       {ISD::XOR, MVT::v8i16,  9},
3640fe6060f1SDimitry Andric       {ISD::XOR, MVT::v2i32,  3},
3641fe6060f1SDimitry Andric       {ISD::XOR, MVT::v4i32,  5},
3642fe6060f1SDimitry Andric       {ISD::XOR, MVT::v2i64,  3},
3643fe6060f1SDimitry Andric       {ISD::AND, MVT::v8i8,  15},
3644fe6060f1SDimitry Andric       {ISD::AND, MVT::v16i8, 17},
3645fe6060f1SDimitry Andric       {ISD::AND, MVT::v4i16,  7},
3646fe6060f1SDimitry Andric       {ISD::AND, MVT::v8i16,  9},
3647fe6060f1SDimitry Andric       {ISD::AND, MVT::v2i32,  3},
3648fe6060f1SDimitry Andric       {ISD::AND, MVT::v4i32,  5},
3649fe6060f1SDimitry Andric       {ISD::AND, MVT::v2i64,  3},
36500b57cec5SDimitry Andric   };
3651fe6060f1SDimitry Andric   switch (ISD) {
3652fe6060f1SDimitry Andric   default:
3653fe6060f1SDimitry Andric     break;
3654fe6060f1SDimitry Andric   case ISD::ADD:
36550b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3656fe6060f1SDimitry Andric       return (LT.first - 1) + Entry->Cost;
3657fe6060f1SDimitry Andric     break;
3658fe6060f1SDimitry Andric   case ISD::XOR:
3659fe6060f1SDimitry Andric   case ISD::AND:
3660fe6060f1SDimitry Andric   case ISD::OR:
3661fe6060f1SDimitry Andric     const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3662fe6060f1SDimitry Andric     if (!Entry)
3663fe6060f1SDimitry Andric       break;
3664fe6060f1SDimitry Andric     auto *ValVTy = cast<FixedVectorType>(ValTy);
366506c3fb27SDimitry Andric     if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3666fe6060f1SDimitry Andric         isPowerOf2_32(ValVTy->getNumElements())) {
3667fe6060f1SDimitry Andric       InstructionCost ExtraCost = 0;
3668fe6060f1SDimitry Andric       if (LT.first != 1) {
3669fe6060f1SDimitry Andric         // Type needs to be split, so there is an extra cost of LT.first - 1
3670fe6060f1SDimitry Andric         // arithmetic ops.
3671fe6060f1SDimitry Andric         auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3672fe6060f1SDimitry Andric                                         MTy.getVectorNumElements());
3673fe6060f1SDimitry Andric         ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3674fe6060f1SDimitry Andric         ExtraCost *= LT.first - 1;
3675fe6060f1SDimitry Andric       }
367606c3fb27SDimitry Andric       // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
367706c3fb27SDimitry Andric       auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
367806c3fb27SDimitry Andric       return Cost + ExtraCost;
3679fe6060f1SDimitry Andric     }
3680fe6060f1SDimitry Andric     break;
3681fe6060f1SDimitry Andric   }
3682fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
36830b57cec5SDimitry Andric }
36840b57cec5SDimitry Andric 
3685fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3686fe6060f1SDimitry Andric   static const CostTblEntry ShuffleTbl[] = {
3687fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv16i8,  1 },
3688fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8i16,  1 },
3689fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4i32,  1 },
3690fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2i64,  1 },
3691fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f16,  1 },
3692fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4f16,  1 },
3693fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8f16,  1 },
3694fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3695fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3696fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3697fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f32,  1 },
3698fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv4f32,  1 },
3699fe6060f1SDimitry Andric       { TTI::SK_Splice, MVT::nxv2f64,  1 },
3700fe6060f1SDimitry Andric   };
3701fe6060f1SDimitry Andric 
3702bdd1243dSDimitry Andric   // The code-generator is currently not able to handle scalable vectors
3703bdd1243dSDimitry Andric   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3704bdd1243dSDimitry Andric   // it. This change will be removed when code-generation for these types is
3705bdd1243dSDimitry Andric   // sufficiently reliable.
3706bdd1243dSDimitry Andric   if (Tp->getElementCount() == ElementCount::getScalable(1))
3707bdd1243dSDimitry Andric     return InstructionCost::getInvalid();
3708bdd1243dSDimitry Andric 
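  // For example (a sketch): splicing two nxv4i32 vectors at a non-negative
  // index legalizes directly (LT.first == 1) and maps onto a single SVE
  // splice, for a cost of 1.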
3709bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3710fe6060f1SDimitry Andric   Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3711fe6060f1SDimitry Andric   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3712fe6060f1SDimitry Andric   EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3713fe6060f1SDimitry Andric                        ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3714fe6060f1SDimitry Andric                        : LT.second;
3715fe6060f1SDimitry Andric   Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3716fe6060f1SDimitry Andric   InstructionCost LegalizationCost = 0;
3717fe6060f1SDimitry Andric   if (Index < 0) {
3718fe6060f1SDimitry Andric     LegalizationCost =
3719fe6060f1SDimitry Andric         getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3720fe6060f1SDimitry Andric                            CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3721fe6060f1SDimitry Andric         getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3722fe6060f1SDimitry Andric                            CmpInst::BAD_ICMP_PREDICATE, CostKind);
3723fe6060f1SDimitry Andric   }
3724fe6060f1SDimitry Andric 
3725fe6060f1SDimitry Andric   // Predicated splices are promoted during lowering (see
3726fe6060f1SDimitry Andric   // AArch64ISelLowering.cpp), so the cost is computed on the promoted type.
3727fe6060f1SDimitry Andric   if (LT.second.getScalarType() == MVT::i1) {
3728fe6060f1SDimitry Andric     LegalizationCost +=
3729fe6060f1SDimitry Andric         getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3730fe6060f1SDimitry Andric                          TTI::CastContextHint::None, CostKind) +
3731fe6060f1SDimitry Andric         getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3732fe6060f1SDimitry Andric                          TTI::CastContextHint::None, CostKind);
3733fe6060f1SDimitry Andric   }
3734fe6060f1SDimitry Andric   const auto *Entry =
3735fe6060f1SDimitry Andric       CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3736fe6060f1SDimitry Andric   assert(Entry && "Illegal Type for Splice");
3737fe6060f1SDimitry Andric   LegalizationCost += Entry->Cost;
3738fe6060f1SDimitry Andric   return LegalizationCost * LT.first;
3739fe6060f1SDimitry Andric }
3740fe6060f1SDimitry Andric 
3741fe6060f1SDimitry Andric InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3742fe6060f1SDimitry Andric                                                VectorType *Tp,
3743bdd1243dSDimitry Andric                                                ArrayRef<int> Mask,
3744bdd1243dSDimitry Andric                                                TTI::TargetCostKind CostKind,
3745bdd1243dSDimitry Andric                                                int Index, VectorType *SubTp,
374681ad6265SDimitry Andric                                                ArrayRef<const Value *> Args) {
3747bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
374881ad6265SDimitry Andric   // If we have a Mask, and the LT is being split during legalization, break
374981ad6265SDimitry Andric   // the Mask into smaller vectors and sum the cost of each shuffle.
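  // For example (a sketch): a v16i32 shuffle legalizes to v4i32, so its
  // 16-element mask is split into four 4-element sub-masks that are each
  // costed separately below.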
375081ad6265SDimitry Andric   if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
375181ad6265SDimitry Andric       Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
37525f757f3fSDimitry Andric       Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
37535f757f3fSDimitry Andric     unsigned TpNumElts = Mask.size();
375481ad6265SDimitry Andric     unsigned LTNumElts = LT.second.getVectorNumElements();
375581ad6265SDimitry Andric     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
375681ad6265SDimitry Andric     VectorType *NTp =
375781ad6265SDimitry Andric         VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
375881ad6265SDimitry Andric     InstructionCost Cost;
375981ad6265SDimitry Andric     for (unsigned N = 0; N < NumVecs; N++) {
376081ad6265SDimitry Andric       SmallVector<int> NMask;
376181ad6265SDimitry Andric       // Split the existing mask into chunks of size LTNumElts. Track the source
376281ad6265SDimitry Andric       // sub-vectors to ensure the result has at most 2 inputs.
376381ad6265SDimitry Andric       unsigned Source1, Source2;
376481ad6265SDimitry Andric       unsigned NumSources = 0;
376581ad6265SDimitry Andric       for (unsigned E = 0; E < LTNumElts; E++) {
376681ad6265SDimitry Andric         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
376706c3fb27SDimitry Andric                                                       : PoisonMaskElem;
376881ad6265SDimitry Andric         if (MaskElt < 0) {
376906c3fb27SDimitry Andric           NMask.push_back(PoisonMaskElem);
377081ad6265SDimitry Andric           continue;
377181ad6265SDimitry Andric         }
377281ad6265SDimitry Andric 
377381ad6265SDimitry Andric         // Calculate which source from the input this comes from and whether it
377481ad6265SDimitry Andric         // is new to us.
377581ad6265SDimitry Andric         unsigned Source = MaskElt / LTNumElts;
377681ad6265SDimitry Andric         if (NumSources == 0) {
377781ad6265SDimitry Andric           Source1 = Source;
377881ad6265SDimitry Andric           NumSources = 1;
377981ad6265SDimitry Andric         } else if (NumSources == 1 && Source != Source1) {
378081ad6265SDimitry Andric           Source2 = Source;
378181ad6265SDimitry Andric           NumSources = 2;
378281ad6265SDimitry Andric         } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
378381ad6265SDimitry Andric           NumSources++;
378481ad6265SDimitry Andric         }
378581ad6265SDimitry Andric 
378681ad6265SDimitry Andric         // Add the element to the new mask. For the NumSources > 2 case these
378781ad6265SDimitry Andric         // values are not correct, but are only used for the modular lane number.
378881ad6265SDimitry Andric         if (Source == Source1)
378981ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts);
379081ad6265SDimitry Andric         else if (Source == Source2)
379181ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts + LTNumElts);
379281ad6265SDimitry Andric         else
379381ad6265SDimitry Andric           NMask.push_back(MaskElt % LTNumElts);
379481ad6265SDimitry Andric       }
379581ad6265SDimitry Andric       // If the sub-mask has at most 2 input sub-vectors then re-cost it using
379681ad6265SDimitry Andric       // getShuffleCost. If not then cost it using the worst case.
379781ad6265SDimitry Andric       if (NumSources <= 2)
379881ad6265SDimitry Andric         Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
379981ad6265SDimitry Andric                                                : TTI::SK_PermuteTwoSrc,
3800bdd1243dSDimitry Andric                                NTp, NMask, CostKind, 0, nullptr, Args);
380181ad6265SDimitry Andric       else if (any_of(enumerate(NMask), [&](const auto &ME) {
380281ad6265SDimitry Andric                  return ME.value() % LTNumElts == ME.index();
380381ad6265SDimitry Andric                }))
380481ad6265SDimitry Andric         Cost += LTNumElts - 1;
380581ad6265SDimitry Andric       else
380681ad6265SDimitry Andric         Cost += LTNumElts;
380781ad6265SDimitry Andric     }
380881ad6265SDimitry Andric     return Cost;
380981ad6265SDimitry Andric   }
381081ad6265SDimitry Andric 
38115f757f3fSDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
381281ad6265SDimitry Andric 
381306c3fb27SDimitry Andric   // Check for broadcast loads, which are supported by the LD1R instruction.
381406c3fb27SDimitry Andric   // In terms of code-size, the shuffle vector is free when a load + dup get
381506c3fb27SDimitry Andric   // folded into a LD1R. That's what we check and return here. For performance
381606c3fb27SDimitry Andric   // and reciprocal throughput, a LD1R is not completely free. In this case, we
381706c3fb27SDimitry Andric   // return the cost for the broadcast below (i.e. 1 for most/all types), so
381806c3fb27SDimitry Andric   // that we model the load + dup sequence slightly higher because LD1R is a
381906c3fb27SDimitry Andric   // high latency instruction.
382006c3fb27SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
382181ad6265SDimitry Andric     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
382281ad6265SDimitry Andric     if (IsLoad && LT.second.isVector() &&
382381ad6265SDimitry Andric         isLegalBroadcastLoad(Tp->getElementType(),
382481ad6265SDimitry Andric                              LT.second.getVectorElementCount()))
382506c3fb27SDimitry Andric       return 0;
382681ad6265SDimitry Andric   }
382781ad6265SDimitry Andric 
382881ad6265SDimitry Andric   // If we have 4 elements for the shuffle and a Mask, get the cost straight
382981ad6265SDimitry Andric   // from the perfect shuffle tables.
383081ad6265SDimitry Andric   if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
383181ad6265SDimitry Andric       (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
383281ad6265SDimitry Andric       all_of(Mask, [](int E) { return E < 8; }))
383381ad6265SDimitry Andric     return getPerfectShuffleCost(Mask);
383481ad6265SDimitry Andric 
38350b57cec5SDimitry Andric   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3836fe6060f1SDimitry Andric       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3837bdd1243dSDimitry Andric       Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
38380b57cec5SDimitry Andric     static const CostTblEntry ShuffleTbl[] = {
38390b57cec5SDimitry Andric         // Broadcast shuffle kinds can be performed with 'dup'.
38400b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v8i8, 1},
38410b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v16i8, 1},
38420b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4i16, 1},
38430b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v8i16, 1},
38440b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2i32, 1},
38450b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4i32, 1},
38460b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2i64, 1},
384706c3fb27SDimitry Andric         {TTI::SK_Broadcast, MVT::v4f16, 1},
384806c3fb27SDimitry Andric         {TTI::SK_Broadcast, MVT::v8f16, 1},
38490b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2f32, 1},
38500b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v4f32, 1},
38510b57cec5SDimitry Andric         {TTI::SK_Broadcast, MVT::v2f64, 1},
38520b57cec5SDimitry Andric         // Transpose shuffle kinds can be performed with 'trn1/trn2' and
38530b57cec5SDimitry Andric         // 'zip1/zip2' instructions.
38540b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v8i8, 1},
38550b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v16i8, 1},
38560b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4i16, 1},
38570b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v8i16, 1},
38580b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2i32, 1},
38590b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4i32, 1},
38600b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2i64, 1},
386106c3fb27SDimitry Andric         {TTI::SK_Transpose, MVT::v4f16, 1},
386206c3fb27SDimitry Andric         {TTI::SK_Transpose, MVT::v8f16, 1},
38630b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2f32, 1},
38640b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v4f32, 1},
38650b57cec5SDimitry Andric         {TTI::SK_Transpose, MVT::v2f64, 1},
38660b57cec5SDimitry Andric         // Select shuffle kinds.
38670b57cec5SDimitry Andric         // TODO: handle vXi8/vXi16.
38680b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2i32, 1}, // mov.
38690b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
38700b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2i64, 1}, // mov.
38710b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2f32, 1}, // mov.
38720b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
38730b57cec5SDimitry Andric         {TTI::SK_Select, MVT::v2f64, 1}, // mov.
38740b57cec5SDimitry Andric         // PermuteSingleSrc shuffle kinds.
38750b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
38760b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
38770b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
38780b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
38790b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
38800b57cec5SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3881fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3882fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3883bdd1243dSDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3884fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8},  // constpool + load + tbl
3885fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8},  // constpool + load + tbl
3886fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3887fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8},   // constpool + load + tbl
3888fe6060f1SDimitry Andric         {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8},  // constpool + load + tbl
3889fe6060f1SDimitry Andric         // Reverse can be lowered with `rev`.
3890bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3891fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3892bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3893bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3894fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3895bdd1243dSDimitry Andric         {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
389681ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
389781ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
389881ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
389981ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
390081ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
390181ad6265SDimitry Andric         {TTI::SK_Reverse, MVT::v8i8, 1},  // REV64
3902bdd1243dSDimitry Andric         // Splice can all be lowered as `ext`.
3903bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2i32, 1},
3904bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4i32, 1},
3905bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2i64, 1},
3906bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2f32, 1},
3907bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4f32, 1},
3908bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v2f64, 1},
3909bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8f16, 1},
3910bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8bf16, 1},
3911bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8i16, 1},
3912bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v16i8, 1},
3913bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4bf16, 1},
3914bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4f16, 1},
3915bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v4i16, 1},
3916bdd1243dSDimitry Andric         {TTI::SK_Splice, MVT::v8i8, 1},
3917fe6060f1SDimitry Andric         // Broadcast shuffle kinds for scalable vectors
3918fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3919fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3920fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3921fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3922fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3923fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3924fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3925fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3926fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3927fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3928fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3929fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3930fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3931fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3932fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3933fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3934fe6060f1SDimitry Andric         {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3935fe6060f1SDimitry Andric         // Handle the cases for vector.reverse with scalable vectors
3936fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv16i8, 1},
3937fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8i16, 1},
3938fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4i32, 1},
3939fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2i64, 1},
3940fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f16, 1},
3941fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4f16, 1},
3942fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8f16, 1},
3943fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3944fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3945fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3946fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f32, 1},
3947fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4f32, 1},
3948fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2f64, 1},
3949fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv16i1, 1},
3950fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv8i1, 1},
3951fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv4i1, 1},
3952fe6060f1SDimitry Andric         {TTI::SK_Reverse, MVT::nxv2i1, 1},
39530b57cec5SDimitry Andric     };
39540b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
39550b57cec5SDimitry Andric       return LT.first * Entry->Cost;
39560b57cec5SDimitry Andric   }
395781ad6265SDimitry Andric 
3958fe6060f1SDimitry Andric   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3959fe6060f1SDimitry Andric     return getSpliceCost(Tp, Index);
396081ad6265SDimitry Andric 
396181ad6265SDimitry Andric   // Inserting a subvector can often be done with either a D, S or H register
396281ad6265SDimitry Andric   // move, so long as the inserted vector is "aligned".
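  // For example (a sketch): inserting a v2f32 subvector at index 0 or 2 of a
  // v4f32 passes the checks below and costs SubLT.first (typically 1).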
396381ad6265SDimitry Andric   if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
396481ad6265SDimitry Andric       LT.second.getSizeInBits() <= 128 && SubTp) {
3965bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
396681ad6265SDimitry Andric     if (SubLT.second.isVector()) {
396781ad6265SDimitry Andric       int NumElts = LT.second.getVectorNumElements();
396881ad6265SDimitry Andric       int NumSubElts = SubLT.second.getVectorNumElements();
396981ad6265SDimitry Andric       if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
397081ad6265SDimitry Andric         return SubLT.first;
397181ad6265SDimitry Andric     }
397281ad6265SDimitry Andric   }
397381ad6265SDimitry Andric 
3974bdd1243dSDimitry Andric   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
39750b57cec5SDimitry Andric }
3976fcaf7f86SDimitry Andric 
397706c3fb27SDimitry Andric static bool containsDecreasingPointers(Loop *TheLoop,
397806c3fb27SDimitry Andric                                        PredicatedScalarEvolution *PSE) {
397906c3fb27SDimitry Andric   const DenseMap<Value *, const SCEV *> Strides;
398006c3fb27SDimitry Andric   for (BasicBlock *BB : TheLoop->blocks()) {
398106c3fb27SDimitry Andric     // Scan the instructions in the block and look for addresses that are
398206c3fb27SDimitry Andric     // consecutive and decreasing.
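    // For example (a sketch): an access such as A[N - i] in a loop over i has
    // a negative stride and makes this function return true.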
398306c3fb27SDimitry Andric     for (Instruction &I : *BB) {
398406c3fb27SDimitry Andric       if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
398506c3fb27SDimitry Andric         Value *Ptr = getLoadStorePointerOperand(&I);
398606c3fb27SDimitry Andric         Type *AccessTy = getLoadStoreType(&I);
398706c3fb27SDimitry Andric         if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
398806c3fb27SDimitry Andric                          /*ShouldCheckWrap=*/false)
398906c3fb27SDimitry Andric                 .value_or(0) < 0)
399006c3fb27SDimitry Andric           return true;
399106c3fb27SDimitry Andric       }
399206c3fb27SDimitry Andric     }
399306c3fb27SDimitry Andric   }
399406c3fb27SDimitry Andric   return false;
399506c3fb27SDimitry Andric }
399606c3fb27SDimitry Andric 
399706c3fb27SDimitry Andric bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
399806c3fb27SDimitry Andric   if (!ST->hasSVE())
3999fcaf7f86SDimitry Andric     return false;
4000fcaf7f86SDimitry Andric 
4001bdd1243dSDimitry Andric   // We don't currently support vectorisation with interleaving for SVE - with
4002bdd1243dSDimitry Andric   // such loops we're better off not using tail-folding. This gives us a chance
4003bdd1243dSDimitry Andric   // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
400406c3fb27SDimitry Andric   if (TFI->IAI->hasGroups())
4005bdd1243dSDimitry Andric     return false;
4006bdd1243dSDimitry Andric 
400706c3fb27SDimitry Andric   TailFoldingOpts Required = TailFoldingOpts::Disabled;
400806c3fb27SDimitry Andric   if (TFI->LVL->getReductionVars().size())
400906c3fb27SDimitry Andric     Required |= TailFoldingOpts::Reductions;
401006c3fb27SDimitry Andric   if (TFI->LVL->getFixedOrderRecurrences().size())
401106c3fb27SDimitry Andric     Required |= TailFoldingOpts::Recurrences;
4012fcaf7f86SDimitry Andric 
401306c3fb27SDimitry Andric   // We call this to discover whether any load/store pointers in the loop have
401406c3fb27SDimitry Andric   // negative strides. This will require extra work to reverse the loop
401506c3fb27SDimitry Andric   // predicate, which may be expensive.
401606c3fb27SDimitry Andric   if (containsDecreasingPointers(TFI->LVL->getLoop(),
401706c3fb27SDimitry Andric                                  TFI->LVL->getPredicatedScalarEvolution()))
401806c3fb27SDimitry Andric     Required |= TailFoldingOpts::Reverse;
401906c3fb27SDimitry Andric   if (Required == TailFoldingOpts::Disabled)
402006c3fb27SDimitry Andric     Required |= TailFoldingOpts::Simple;
402106c3fb27SDimitry Andric 
402206c3fb27SDimitry Andric   if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
402306c3fb27SDimitry Andric                                       Required))
402406c3fb27SDimitry Andric     return false;
402506c3fb27SDimitry Andric 
402606c3fb27SDimitry Andric   // Don't tail-fold for tight loops where we would be better off interleaving
402706c3fb27SDimitry Andric   // with an unpredicated loop.
402806c3fb27SDimitry Andric   unsigned NumInsns = 0;
402906c3fb27SDimitry Andric   for (BasicBlock *BB : TFI->LVL->getLoop()->blocks())
403006c3fb27SDimitry Andric     NumInsns += BB->sizeWithoutDebug();
403206c3fb27SDimitry Andric 
403306c3fb27SDimitry Andric   // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a branch.
403406c3fb27SDimitry Andric   return NumInsns >= SVETailFoldInsnThreshold;
4035fcaf7f86SDimitry Andric }
4036bdd1243dSDimitry Andric 
4037bdd1243dSDimitry Andric InstructionCost
4038bdd1243dSDimitry Andric AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
4039bdd1243dSDimitry Andric                                      int64_t BaseOffset, bool HasBaseReg,
4040bdd1243dSDimitry Andric                                      int64_t Scale, unsigned AddrSpace) const {
4041bdd1243dSDimitry Andric   // Scaling factors are not free at all.
4042bdd1243dSDimitry Andric   // Operands                     | Rt Latency
4043bdd1243dSDimitry Andric   // -------------------------------------------
4044bdd1243dSDimitry Andric   // Rt, [Xn, Xm]                 | 4
4045bdd1243dSDimitry Andric   // -------------------------------------------
4046bdd1243dSDimitry Andric   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
4047bdd1243dSDimitry Andric   // Rt, [Xn, Wm, <extend> #imm]  |
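  // For example (illustrative): for an i64 access, [Xn, Xm, lsl #3]
  // (Scale == 8) is a legal mode charged a cost of 1 below, [Xn, Xm]
  // (Scale == 1) is free, and an unsupported scale is reported as -1.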
4048bdd1243dSDimitry Andric   TargetLoweringBase::AddrMode AM;
4049bdd1243dSDimitry Andric   AM.BaseGV = BaseGV;
4050bdd1243dSDimitry Andric   AM.BaseOffs = BaseOffset;
4051bdd1243dSDimitry Andric   AM.HasBaseReg = HasBaseReg;
4052bdd1243dSDimitry Andric   AM.Scale = Scale;
4053bdd1243dSDimitry Andric   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4054bdd1243dSDimitry Andric     // Scale represents reg2 * scale, so charge a cost of 1 unless the scale
4055bdd1243dSDimitry Andric     // is 0 (no scaled register) or 1 (register with no shift applied).
4056bdd1243dSDimitry Andric     return AM.Scale != 0 && AM.Scale != 1;
4057bdd1243dSDimitry Andric   return -1;
4058bdd1243dSDimitry Andric }
40597a6dacacSDimitry Andric 
40607a6dacacSDimitry Andric bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
40617a6dacacSDimitry Andric   // For the binary operators (e.g. or) we need to be more careful than
40627a6dacacSDimitry Andric   // selects, here we only transform them if they are already at a natural
40637a6dacacSDimitry Andric   // break point in the code - the end of a block with an unconditional
40647a6dacacSDimitry Andric   // terminator.
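  // E.g. (illustrative IR), the 'or' below is treated like a select because
  // it is immediately followed by an unconditional branch:
  //   %or = or i1 %a, %b
  //   br label %next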
40657a6dacacSDimitry Andric   if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
40667a6dacacSDimitry Andric       isa<BranchInst>(I->getNextNode()) &&
40677a6dacacSDimitry Andric       cast<BranchInst>(I->getNextNode())->isUnconditional())
40687a6dacacSDimitry Andric     return true;
40697a6dacacSDimitry Andric   return BaseT::shouldTreatInstructionLikeSelect(I);
40707a6dacacSDimitry Andric }
4071