//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
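/// For example (an illustrative IR sketch; the exact intrinsic mangling
/// depends on the vector type and pointer address space):
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
/// becomes a plain `load <4 x i32>, ptr %p, align 4`, which can then be
/// constant-folded when %p addresses a constant.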
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
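  // For example (illustrative): a caller compiled with extra allowed features
  // can inline a callee built for the plain base architecture, but a callee
  // requiring a feature the caller lacks cannot be inlined into it.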
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
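    // The result reinterprets a 16-lane MVE predicate as an integer, so only
    // the low 16 bits can ever be set; record that as a [0, 0x10000) range
    // (plus noundef) so later passes can make use of it.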
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      II.setMetadata(LLVMContext::MD_noundef,
                     MDNode::get(II.getContext(), std::nullopt));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return std::nullopt;
  }
  }
  return std::nullopt;
}

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded elements for a narrowing MVE intrinsic. TopOpc is
  // the index of the operand specifying whether this is a Top or Bottom
  // instruction, which can differ between intrinsics.
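  // For example (illustrative): a "top" narrowing operation writes only the
  // odd lanes of the result, so only the even lanes of operand 0 (the
  // passthrough vector) are demanded; a "bottom" operation is the reverse.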
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd/even lanes of operand 0 will be demanded, depending on
    // whether this is a top/bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                                : APInt::getHighBitsSet(2, 1));
    return std::nullopt;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return std::nullopt;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
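// For example (illustrative), with Imm == -128:
//   smax(smin(%x, 127), -128)
// clamps %x to the signed 8-bit range and can lower to `ssat #8`.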
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
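// For example (illustrative, f32 -> i32): an fptosi to i64 whose result is
// clamped to [-2147483648, 2147483647] with smin/smax can become a single
// llvm.fptosi.sat.i32.f32 call.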
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
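  // For example (illustrative): `x / 10` lowers to a multiply-by-magic-number
  // sequence, which is far cheaper than a runtime division, so the constant
  // should not be hoisted away from the divide.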
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
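  // For example (illustrative): `icmp sle i32 %x, -1` becomes
  // `icmp slt i32 %x, 0`, where the #0 immediate is free, so cost the
  // immediate as the cheaper of Imm and Imm + 1.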
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
            (EltVT == MVT::f64 && ST->hasFP64()) ||
            (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads/truncating masked stores is expensive because we
  // currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extension of other kinds of loads is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following entries extend from a legal type to an illegal type,
        // so the load needs to be split. This introduces an extra load
        // operation, but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

8245ffd83dbSDimitry Andric   if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
8255ffd83dbSDimitry Andric     // As a general rule, fp converts that were not matched above are
8265ffd83dbSDimitry Andric     // scalarized and cost 1 vcvt for each lane, so long as the instruction is
8275ffd83dbSDimitry Andric     // available. If not, they become a series of function calls.
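    // For example (a sketch, assuming the hardware vcvt is available), an
    //   fptrunc <4 x double> to <4 x float>
    // is costed as 4 below - one vcvt per lane - whereas a conversion with no
    // available instruction would be costed as 4 * CallCost.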
828fe6060f1SDimitry Andric     const InstructionCost CallCost =
829fe6060f1SDimitry Andric         getCallInstrCost(nullptr, Dst, {Src}, CostKind);
8305ffd83dbSDimitry Andric     int Lanes = 1;
8315ffd83dbSDimitry Andric     if (SrcTy.isFixedLengthVector())
8325ffd83dbSDimitry Andric       Lanes = SrcTy.getVectorNumElements();
8335ffd83dbSDimitry Andric 
834e8d8bef9SDimitry Andric     if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
8355ffd83dbSDimitry Andric       return Lanes;
8365ffd83dbSDimitry Andric     else
8375ffd83dbSDimitry Andric       return Lanes * CallCost;
8388bcb0991SDimitry Andric   }
8398bcb0991SDimitry Andric 
840e8d8bef9SDimitry Andric   if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
841e8d8bef9SDimitry Andric       SrcTy.isFixedLengthVector()) {
842e8d8bef9SDimitry Andric     // Treat a truncate with a larger-than-legal source (legal is 128 bits for
843e8d8bef9SDimitry Andric     // MVE) as expensive: 2 instructions per lane.
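    // E.g. a trunc of <8 x i32> (256 bits, wider than the 128-bit legal MVE
    // vector) to <8 x i16> is costed here as 8 lanes * 2 = 16.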
844e8d8bef9SDimitry Andric     if ((SrcTy.getScalarType() == MVT::i8 ||
845e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i16 ||
846e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i32) &&
847e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > 128 &&
848e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > DstTy.getSizeInBits())
849e8d8bef9SDimitry Andric       return SrcTy.getVectorNumElements() * 2;
850e8d8bef9SDimitry Andric   }
851e8d8bef9SDimitry Andric 
8520b57cec5SDimitry Andric   // Scalar integer conversion costs.
8530b57cec5SDimitry Andric   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
8540b57cec5SDimitry Andric     // i16 -> i64 requires two dependent operations.
8550b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
8560b57cec5SDimitry Andric 
8570b57cec5SDimitry Andric     // Truncates on i64 are assumed to be free.
8580b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
8590b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
8600b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
8610b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
8620b57cec5SDimitry Andric   };
8630b57cec5SDimitry Andric 
8640b57cec5SDimitry Andric   if (SrcTy.isInteger()) {
8650b57cec5SDimitry Andric     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
8660b57cec5SDimitry Andric                                                    DstTy.getSimpleVT(),
8670b57cec5SDimitry Andric                                                    SrcTy.getSimpleVT()))
8685ffd83dbSDimitry Andric       return AdjustCost(Entry->Cost);
8690b57cec5SDimitry Andric   }
8700b57cec5SDimitry Andric 
8718bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
872fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
8738bcb0991SDimitry Andric                      : 1;
8745ffd83dbSDimitry Andric   return AdjustCost(
875e8d8bef9SDimitry Andric       BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
8760b57cec5SDimitry Andric }
8770b57cec5SDimitry Andric 
878fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
879bdd1243dSDimitry Andric                                                TTI::TargetCostKind CostKind,
880bdd1243dSDimitry Andric                                                unsigned Index, Value *Op0,
881bdd1243dSDimitry Andric                                                Value *Op1) {
8820b57cec5SDimitry Andric   // Penalize inserting into a D-subregister. We end up with a three times
8830b57cec5SDimitry Andric   // lower estimated throughput on Swift.
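  // A sketch of a query taking this path on such subtargets:
  //   insertelement <4 x i32> %v, i32 %x, i32 0   ; costed as 3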
8840b57cec5SDimitry Andric   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
8850b57cec5SDimitry Andric       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
8860b57cec5SDimitry Andric     return 3;
8870b57cec5SDimitry Andric 
8888bcb0991SDimitry Andric   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
8890b57cec5SDimitry Andric                         Opcode == Instruction::ExtractElement)) {
8900b57cec5SDimitry Andric     // Cross-class copies are expensive on many microarchitectures,
8910b57cec5SDimitry Andric     // so assume they are expensive by default.
8925ffd83dbSDimitry Andric     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
8930b57cec5SDimitry Andric       return 3;
8940b57cec5SDimitry Andric 
8950b57cec5SDimitry Andric     // Even if it's not a cross-class copy, this likely leads to mixing
8960b57cec5SDimitry Andric     // of NEON and VFP code and should therefore be penalized.
8970b57cec5SDimitry Andric     if (ValTy->isVectorTy() &&
8980b57cec5SDimitry Andric         ValTy->getScalarSizeInBits() <= 32)
899fe6060f1SDimitry Andric       return std::max<InstructionCost>(
900bdd1243dSDimitry Andric           BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
901bdd1243dSDimitry Andric           2U);
9020b57cec5SDimitry Andric   }
9030b57cec5SDimitry Andric 
9048bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
9058bcb0991SDimitry Andric                                  Opcode == Instruction::ExtractElement)) {
906fe6060f1SDimitry Andric     // Integer cross-lane moves are more expensive than float ones, which can
907fe6060f1SDimitry Andric     // sometimes just be vmovs. Integer moves involve being passed through GPR
908fe6060f1SDimitry Andric     // registers, causing more of a delay.
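    // For example, with a legal scalar type (LT.first == 1):
    //   extractelement <4 x i32> %v, i32 1    ; costed as 4 (via a GPR)
    //   extractelement <4 x float> %v, i32 1  ; costed as 1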
909fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
910bdd1243dSDimitry Andric         getTypeLegalizationCost(ValTy->getScalarType());
911fe6060f1SDimitry Andric     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
9128bcb0991SDimitry Andric   }
9138bcb0991SDimitry Andric 
914bdd1243dSDimitry Andric   return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
9150b57cec5SDimitry Andric }
9160b57cec5SDimitry Andric 
917fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
918fe6060f1SDimitry Andric                                                Type *CondTy,
919e8d8bef9SDimitry Andric                                                CmpInst::Predicate VecPred,
9205ffd83dbSDimitry Andric                                                TTI::TargetCostKind CostKind,
9210b57cec5SDimitry Andric                                                const Instruction *I) {
9220b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
923e8d8bef9SDimitry Andric 
924e8d8bef9SDimitry Andric   // Thumb scalar code size cost for select.
925e8d8bef9SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
926e8d8bef9SDimitry Andric       ST->isThumb() && !ValTy->isVectorTy()) {
927e8d8bef9SDimitry Andric     // Assume expensive structs.
928e8d8bef9SDimitry Andric     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
929e8d8bef9SDimitry Andric       return TTI::TCC_Expensive;
930e8d8bef9SDimitry Andric 
931e8d8bef9SDimitry Andric     // Select costs can vary because they:
932e8d8bef9SDimitry Andric     // - may require one or more conditional mov (including an IT),
933e8d8bef9SDimitry Andric     // - can't operate directly on immediates,
934e8d8bef9SDimitry Andric     // - require live flags, which we can't copy around easily.
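    // Worked example (a sketch): an i1 select is costed as 1 (legalization)
    // + 1 (possible IT) + 1 (i1 rematerialisation) = 3, while a plain i32
    // select comes to 1 + 1 = 2.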
935bdd1243dSDimitry Andric     InstructionCost Cost = getTypeLegalizationCost(ValTy).first;
936e8d8bef9SDimitry Andric 
937e8d8bef9SDimitry Andric     // Possible IT instruction for Thumb2, or more for Thumb1.
938e8d8bef9SDimitry Andric     ++Cost;
939e8d8bef9SDimitry Andric 
940e8d8bef9SDimitry Andric     // i1 values may need rematerialising by using mov immediates and/or
941e8d8bef9SDimitry Andric     // flag setting instructions.
942e8d8bef9SDimitry Andric     if (ValTy->isIntegerTy(1))
943e8d8bef9SDimitry Andric       ++Cost;
944e8d8bef9SDimitry Andric 
945e8d8bef9SDimitry Andric     return Cost;
946e8d8bef9SDimitry Andric   }
947e8d8bef9SDimitry Andric 
948fe6060f1SDimitry Andric   // If this is a vector min/max/abs, use the cost of that intrinsic directly
949fe6060f1SDimitry Andric   // instead. Hopefully when min/max intrinsics are more prevalent this code
950fe6060f1SDimitry Andric   // will not be needed.
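  // For example (a sketch), given
  //   %c = icmp slt <4 x i32> %a, %b
  //   %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
  // the select is costed as the llvm.smin.v4i32 intrinsic, and querying the
  // icmp on its own returns 0 below.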
951fe6060f1SDimitry Andric   const Instruction *Sel = I;
952fe6060f1SDimitry Andric   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
953fe6060f1SDimitry Andric       Sel->hasOneUse())
954fe6060f1SDimitry Andric     Sel = cast<Instruction>(Sel->user_back());
955fe6060f1SDimitry Andric   if (Sel && ValTy->isVectorTy() &&
956fe6060f1SDimitry Andric       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
957fe6060f1SDimitry Andric     const Value *LHS, *RHS;
958fe6060f1SDimitry Andric     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
959fe6060f1SDimitry Andric     unsigned IID = 0;
960fe6060f1SDimitry Andric     switch (SPF) {
961fe6060f1SDimitry Andric     case SPF_ABS:
962fe6060f1SDimitry Andric       IID = Intrinsic::abs;
963fe6060f1SDimitry Andric       break;
964fe6060f1SDimitry Andric     case SPF_SMIN:
965fe6060f1SDimitry Andric       IID = Intrinsic::smin;
966fe6060f1SDimitry Andric       break;
967fe6060f1SDimitry Andric     case SPF_SMAX:
968fe6060f1SDimitry Andric       IID = Intrinsic::smax;
969fe6060f1SDimitry Andric       break;
970fe6060f1SDimitry Andric     case SPF_UMIN:
971fe6060f1SDimitry Andric       IID = Intrinsic::umin;
972fe6060f1SDimitry Andric       break;
973fe6060f1SDimitry Andric     case SPF_UMAX:
974fe6060f1SDimitry Andric       IID = Intrinsic::umax;
975fe6060f1SDimitry Andric       break;
976fe6060f1SDimitry Andric     case SPF_FMINNUM:
977fe6060f1SDimitry Andric       IID = Intrinsic::minnum;
978fe6060f1SDimitry Andric       break;
979fe6060f1SDimitry Andric     case SPF_FMAXNUM:
980fe6060f1SDimitry Andric       IID = Intrinsic::maxnum;
981fe6060f1SDimitry Andric       break;
982fe6060f1SDimitry Andric     default:
983fe6060f1SDimitry Andric       break;
984fe6060f1SDimitry Andric     }
985fe6060f1SDimitry Andric     if (IID) {
986fe6060f1SDimitry Andric       // The ICmp is free, the select gets the cost of the min/max/etc
987fe6060f1SDimitry Andric       if (Sel != I)
988fe6060f1SDimitry Andric         return 0;
989fe6060f1SDimitry Andric       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
990fe6060f1SDimitry Andric       return getIntrinsicInstrCost(CostAttrs, CostKind);
991fe6060f1SDimitry Andric     }
992fe6060f1SDimitry Andric   }
993fe6060f1SDimitry Andric 
9940b57cec5SDimitry Andric   // On NEON a vector select gets lowered to vbsl.
995e8d8bef9SDimitry Andric   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
9960b57cec5SDimitry Andric     // Lowering of some vector selects is currently far from perfect.
9970b57cec5SDimitry Andric     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
9980b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
9990b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
10000b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
10010b57cec5SDimitry Andric     };
10020b57cec5SDimitry Andric 
10030b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
10040b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
10050b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
10060b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
10070b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
10080b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
10090b57cec5SDimitry Andric         return Entry->Cost;
10100b57cec5SDimitry Andric     }
10110b57cec5SDimitry Andric 
1012bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
10130b57cec5SDimitry Andric     return LT.first;
10140b57cec5SDimitry Andric   }
10150b57cec5SDimitry Andric 
1016fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1017fe6060f1SDimitry Andric       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1018fe6060f1SDimitry Andric       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1019fe6060f1SDimitry Andric     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1020fe6060f1SDimitry Andric     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1021fe6060f1SDimitry Andric     if (!VecCondTy)
1022fe6060f1SDimitry Andric       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1023fe6060f1SDimitry Andric 
1024fe6060f1SDimitry Andric     // If we don't have mve.fp, any fp operations will need to be scalarized.
1025fe6060f1SDimitry Andric     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1026fe6060f1SDimitry Andric       // One scalarization insert, one scalarization extract, and the cost of
1027fe6060f1SDimitry Andric       // the fcmps.
1028bdd1243dSDimitry Andric       return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1029bdd1243dSDimitry Andric                                              /*Extract*/ true, CostKind) +
1030bdd1243dSDimitry Andric              BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1031bdd1243dSDimitry Andric                                              /*Extract*/ false, CostKind) +
1032fe6060f1SDimitry Andric              VecValTy->getNumElements() *
1033fe6060f1SDimitry Andric                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1034bdd1243dSDimitry Andric                                     VecCondTy->getScalarType(), VecPred,
1035bdd1243dSDimitry Andric                                     CostKind, I);
1036fe6060f1SDimitry Andric     }
1037fe6060f1SDimitry Andric 
1038bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1039fe6060f1SDimitry Andric     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1040fe6060f1SDimitry Andric     // There are two types here - the input that specifies the type of the
1041fe6060f1SDimitry Andric     // compare and the output vXi1 type. Because we don't know how the output
1042fe6060f1SDimitry Andric     // will be split, we may need an expensive shuffle to get the two in sync.
1043fe6060f1SDimitry Andric     // This has the effect of making larger-than-legal compares (v8i32 for
1044fe6060f1SDimitry Andric     // example) expensive.
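    // E.g. a v8i32 icmp on MVE splits into two v4i32 compares (LT.first == 2)
    // and is costed below as 2 * BaseCost plus the overhead of inserting the
    // v8i1 result lanes.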
1045f3fd488fSDimitry Andric     if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1046fe6060f1SDimitry Andric       if (LT.first > 1)
1047fe6060f1SDimitry Andric         return LT.first * BaseCost +
1048bdd1243dSDimitry Andric                BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1049bdd1243dSDimitry Andric                                                /*Extract*/ false, CostKind);
1050fe6060f1SDimitry Andric       return BaseCost;
1051fe6060f1SDimitry Andric     }
1052fe6060f1SDimitry Andric   }
1053fe6060f1SDimitry Andric 
1054e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1055e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1056e8d8bef9SDimitry Andric   int BaseCost = 1;
1057fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1058fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1059e8d8bef9SDimitry Andric 
1060e8d8bef9SDimitry Andric   return BaseCost *
1061e8d8bef9SDimitry Andric          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
10620b57cec5SDimitry Andric }
10630b57cec5SDimitry Andric 
1064fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1065fe6060f1SDimitry Andric                                                       ScalarEvolution *SE,
10660b57cec5SDimitry Andric                                                       const SCEV *Ptr) {
10670b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
10680b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
10690b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
10700b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
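  // E.g. a vector access whose stride is not a provably small constant is
  // given an address computation cost of 10 (NumVectorInstToHideOverhead),
  // so vectorization only looks profitable when there is enough other work
  // to hide that overhead.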
10710b57cec5SDimitry Andric   unsigned NumVectorInstToHideOverhead = 10;
10720b57cec5SDimitry Andric   int MaxMergeDistance = 64;
10730b57cec5SDimitry Andric 
10748bcb0991SDimitry Andric   if (ST->hasNEON()) {
10750b57cec5SDimitry Andric     if (Ty->isVectorTy() && SE &&
10760b57cec5SDimitry Andric         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
10770b57cec5SDimitry Andric       return NumVectorInstToHideOverhead;
10780b57cec5SDimitry Andric 
10790b57cec5SDimitry Andric     // In many cases the address computation is not merged into the instruction
10800b57cec5SDimitry Andric     // addressing mode.
10810b57cec5SDimitry Andric     return 1;
10820b57cec5SDimitry Andric   }
10838bcb0991SDimitry Andric   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
10848bcb0991SDimitry Andric }
10858bcb0991SDimitry Andric 
10865ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
10875ffd83dbSDimitry Andric   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10885ffd83dbSDimitry Andric     // If a VCTP is part of a chain, it's already profitable and shouldn't be
10895ffd83dbSDimitry Andric     // optimized, else LSR may block tail-predication.
10905ffd83dbSDimitry Andric     switch (II->getIntrinsicID()) {
10915ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp8:
10925ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp16:
10935ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp32:
10945ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp64:
10955ffd83dbSDimitry Andric       return true;
10965ffd83dbSDimitry Andric     default:
10975ffd83dbSDimitry Andric       break;
10985ffd83dbSDimitry Andric     }
10995ffd83dbSDimitry Andric   }
11005ffd83dbSDimitry Andric   return false;
11015ffd83dbSDimitry Andric }
11025ffd83dbSDimitry Andric 
11035ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
11048bcb0991SDimitry Andric   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
11058bcb0991SDimitry Andric     return false;
11068bcb0991SDimitry Andric 
11075ffd83dbSDimitry Andric   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
11088bcb0991SDimitry Andric     // Don't support v2i1 yet.
11098bcb0991SDimitry Andric     if (VecTy->getNumElements() == 2)
11108bcb0991SDimitry Andric       return false;
11118bcb0991SDimitry Andric 
11128bcb0991SDimitry Andric     // We don't support extending fp types.
11138bcb0991SDimitry Andric     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
11148bcb0991SDimitry Andric     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
11158bcb0991SDimitry Andric       return false;
11168bcb0991SDimitry Andric   }
11178bcb0991SDimitry Andric 
11188bcb0991SDimitry Andric   unsigned EltWidth = DataTy->getScalarSizeInBits();
11195ffd83dbSDimitry Andric   return (EltWidth == 32 && Alignment >= 4) ||
11205ffd83dbSDimitry Andric          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
11218bcb0991SDimitry Andric }
11220b57cec5SDimitry Andric 
11235ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1124480093f4SDimitry Andric   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1125480093f4SDimitry Andric     return false;
1126480093f4SDimitry Andric 
1127480093f4SDimitry Andric   unsigned EltWidth = Ty->getScalarSizeInBits();
11285ffd83dbSDimitry Andric   return ((EltWidth == 32 && Alignment >= 4) ||
11295ffd83dbSDimitry Andric           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1130480093f4SDimitry Andric }
1131480093f4SDimitry Andric 
1132e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory
1133e8d8bef9SDimitry Andric /// operations performed by querying findOptimalMemOpLowering. Returns -1 if a
1134e8d8bef9SDimitry Andric /// library call would be used instead.
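/// For example (a sketch; the lowering is subtarget dependent): a 16-byte
/// memcpy that findOptimalMemOpLowering covers with a single 128-bit type
/// would return MemOps.size() * Factor = 1 * 2, i.e. one load plus one store.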
1135e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1136e8d8bef9SDimitry Andric   MemOp MOp;
1137e8d8bef9SDimitry Andric   unsigned DstAddrSpace = ~0u;
1138e8d8bef9SDimitry Andric   unsigned SrcAddrSpace = ~0u;
1139e8d8bef9SDimitry Andric   const Function *F = I->getParent()->getParent();
11400b57cec5SDimitry Andric 
1141e8d8bef9SDimitry Andric   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1142e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
11430b57cec5SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
11440b57cec5SDimitry Andric     if (!C)
1145e8d8bef9SDimitry Andric       return -1;
11460b57cec5SDimitry Andric 
11470b57cec5SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1148e8d8bef9SDimitry Andric     const Align DstAlign = *MC->getDestAlign();
1149e8d8bef9SDimitry Andric     const Align SrcAlign = *MC->getSourceAlign();
1150e8d8bef9SDimitry Andric 
1151e8d8bef9SDimitry Andric     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1152e8d8bef9SDimitry Andric                       /*IsVolatile*/ false);
1153e8d8bef9SDimitry Andric     DstAddrSpace = MC->getDestAddressSpace();
1154e8d8bef9SDimitry Andric     SrcAddrSpace = MC->getSourceAddressSpace();
1155e8d8bef9SDimitry Andric   }
1156e8d8bef9SDimitry Andric   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1157e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1158e8d8bef9SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
1159e8d8bef9SDimitry Andric     if (!C)
1160e8d8bef9SDimitry Andric       return -1;
1161e8d8bef9SDimitry Andric 
1162e8d8bef9SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1163e8d8bef9SDimitry Andric     const Align DstAlign = *MS->getDestAlign();
1164e8d8bef9SDimitry Andric 
1165e8d8bef9SDimitry Andric     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1166e8d8bef9SDimitry Andric                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1167e8d8bef9SDimitry Andric     DstAddrSpace = MS->getDestAddressSpace();
1168e8d8bef9SDimitry Andric   }
1169e8d8bef9SDimitry Andric   else
1170e8d8bef9SDimitry Andric     llvm_unreachable("Expected a memcpy/move or memset!");
1171e8d8bef9SDimitry Andric 
1172e8d8bef9SDimitry Andric   unsigned Limit, Factor = 2;
1173e8d8bef9SDimitry Andric   switch(I->getIntrinsicID()) {
1174e8d8bef9SDimitry Andric     case Intrinsic::memcpy:
1175e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1176e8d8bef9SDimitry Andric       break;
1177e8d8bef9SDimitry Andric     case Intrinsic::memmove:
1178e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1179e8d8bef9SDimitry Andric       break;
1180e8d8bef9SDimitry Andric     case Intrinsic::memset:
1181e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1182e8d8bef9SDimitry Andric       Factor = 1;
1183e8d8bef9SDimitry Andric       break;
1184e8d8bef9SDimitry Andric     default:
1185e8d8bef9SDimitry Andric       llvm_unreachable("Expected a memcpy/move or memset!");
1186e8d8bef9SDimitry Andric   }
11870b57cec5SDimitry Andric 
11880b57cec5SDimitry Andric   // MemOps will be populated with a list of data types that need to be
11890b57cec5SDimitry Andric   // loaded and stored. That's why we multiply the number of elements by
11900b57cec5SDimitry Andric   // Factor (2 for a copy - a load plus a store per type - 1 for a memset).
1191e8d8bef9SDimitry Andric   std::vector<EVT> MemOps;
11920b57cec5SDimitry Andric   if (getTLI()->findOptimalMemOpLowering(
1193e8d8bef9SDimitry Andric           MemOps, Limit, MOp, DstAddrSpace,
1194e8d8bef9SDimitry Andric           SrcAddrSpace, F->getAttributes()))
1195e8d8bef9SDimitry Andric     return MemOps.size() * Factor;
11960b57cec5SDimitry Andric 
11970b57cec5SDimitry Andric   // If we can't find an optimal memop lowering, return the default cost
1198e8d8bef9SDimitry Andric   return -1;
1199e8d8bef9SDimitry Andric }
1200e8d8bef9SDimitry Andric 
1201fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1202e8d8bef9SDimitry Andric   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1203e8d8bef9SDimitry Andric 
1204e8d8bef9SDimitry Andric   // To model the cost of a library call, we assume 1 for the call, and
1205e8d8bef9SDimitry Andric   // 3 for the argument setup.
1206e8d8bef9SDimitry Andric   if (NumOps == -1)
1207e8d8bef9SDimitry Andric     return 4;
1208e8d8bef9SDimitry Andric   return NumOps;
12090b57cec5SDimitry Andric }
12100b57cec5SDimitry Andric 
1211fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1212fe6060f1SDimitry Andric                                            VectorType *Tp, ArrayRef<int> Mask,
1213bdd1243dSDimitry Andric                                            TTI::TargetCostKind CostKind,
121481ad6265SDimitry Andric                                            int Index, VectorType *SubTp,
121581ad6265SDimitry Andric                                            ArrayRef<const Value *> Args) {
12165f757f3fSDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
12178bcb0991SDimitry Andric   if (ST->hasNEON()) {
12180b57cec5SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12190b57cec5SDimitry Andric       static const CostTblEntry NEONDupTbl[] = {
12200b57cec5SDimitry Andric           // VDUP handles these cases.
12210b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12220b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12230b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12240b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12250b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12260b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12270b57cec5SDimitry Andric 
12280b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12290b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12300b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12310b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
12320b57cec5SDimitry Andric 
1233bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12348bcb0991SDimitry Andric       if (const auto *Entry =
12358bcb0991SDimitry Andric               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
12360b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12370b57cec5SDimitry Andric     }
12380b57cec5SDimitry Andric     if (Kind == TTI::SK_Reverse) {
12390b57cec5SDimitry Andric       static const CostTblEntry NEONShuffleTbl[] = {
12400b57cec5SDimitry Andric           // Reverse shuffle cost one instruction if we are shuffling within a
12410b57cec5SDimitry Andric           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
12420b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12430b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12440b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12450b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12460b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12470b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12480b57cec5SDimitry Andric 
12490b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12500b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12510b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
12520b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
12530b57cec5SDimitry Andric 
1254bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12558bcb0991SDimitry Andric       if (const auto *Entry =
12568bcb0991SDimitry Andric               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
12570b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12580b57cec5SDimitry Andric     }
12590b57cec5SDimitry Andric     if (Kind == TTI::SK_Select) {
12600b57cec5SDimitry Andric       static const CostTblEntry NEONSelShuffleTbl[] = {
12618bcb0991SDimitry Andric           // Select shuffle cost table for ARM. Cost is the number of
12628bcb0991SDimitry Andric           // instructions required to create the shuffled vector.
12640b57cec5SDimitry Andric 
12650b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12660b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12670b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12680b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12690b57cec5SDimitry Andric 
12700b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12710b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12720b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
12730b57cec5SDimitry Andric 
12740b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
12750b57cec5SDimitry Andric 
12760b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
12770b57cec5SDimitry Andric 
1278bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12790b57cec5SDimitry Andric       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
12800b57cec5SDimitry Andric                                               ISD::VECTOR_SHUFFLE, LT.second))
12810b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12820b57cec5SDimitry Andric     }
12838bcb0991SDimitry Andric   }
12848bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps()) {
12858bcb0991SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12868bcb0991SDimitry Andric       static const CostTblEntry MVEDupTbl[] = {
12878bcb0991SDimitry Andric           // VDUP handles these cases.
12888bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12898bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12908bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
12918bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12928bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
12938bcb0991SDimitry Andric 
1294bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
12958bcb0991SDimitry Andric       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
12968bcb0991SDimitry Andric                                               LT.second))
1297fe6060f1SDimitry Andric         return LT.first * Entry->Cost *
1298fe6060f1SDimitry Andric                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
12990b57cec5SDimitry Andric     }
13000b57cec5SDimitry Andric 
1301fe6060f1SDimitry Andric     if (!Mask.empty()) {
1302bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
130356f451bbSDimitry Andric       if (LT.second.isVector() &&
130456f451bbSDimitry Andric           Mask.size() <= LT.second.getVectorNumElements() &&
1305fe6060f1SDimitry Andric           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1306fe6060f1SDimitry Andric            isVREVMask(Mask, LT.second, 64)))
1307fe6060f1SDimitry Andric         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1308fe6060f1SDimitry Andric     }
1309fe6060f1SDimitry Andric   }
1310fe6060f1SDimitry Andric 
1311fe6060f1SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1312fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1313fe6060f1SDimitry Andric                      : 1;
1314bdd1243dSDimitry Andric   return BaseCost *
1315bdd1243dSDimitry Andric          BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
1316fe6060f1SDimitry Andric }
1317fe6060f1SDimitry Andric 
1318fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1319fe6060f1SDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1320bdd1243dSDimitry Andric     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1321bdd1243dSDimitry Andric     ArrayRef<const Value *> Args,
1322480093f4SDimitry Andric     const Instruction *CxtI) {
13230b57cec5SDimitry Andric   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1324e8d8bef9SDimitry Andric   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1325e8d8bef9SDimitry Andric     // Make operations on i1 relatively expensive as this often involves
1326e8d8bef9SDimitry Andric     // combining predicates. AND and XOR should be easier to handle with IT
1327e8d8bef9SDimitry Andric     // blocks.
1328e8d8bef9SDimitry Andric     switch (ISDOpcode) {
1329e8d8bef9SDimitry Andric     default:
1330e8d8bef9SDimitry Andric       break;
1331e8d8bef9SDimitry Andric     case ISD::AND:
1332e8d8bef9SDimitry Andric     case ISD::XOR:
1333e8d8bef9SDimitry Andric       return 2;
1334e8d8bef9SDimitry Andric     case ISD::OR:
1335e8d8bef9SDimitry Andric       return 3;
1336e8d8bef9SDimitry Andric     }
1337e8d8bef9SDimitry Andric   }
1338e8d8bef9SDimitry Andric 
1339bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
13400b57cec5SDimitry Andric 
1341480093f4SDimitry Andric   if (ST->hasNEON()) {
13420b57cec5SDimitry Andric     const unsigned FunctionCallDivCost = 20;
13430b57cec5SDimitry Andric     const unsigned ReciprocalDivCost = 10;
13440b57cec5SDimitry Andric     static const CostTblEntry CostTbl[] = {
13450b57cec5SDimitry Andric       // Division.
13460b57cec5SDimitry Andric       // These costs are somewhat random. Choose a cost of 20 to indicate that
13470b57cec5SDimitry Andric       // vectorizing division (added function call) is going to be very expensive.
13480b57cec5SDimitry Andric       // Double registers types.
13490b57cec5SDimitry Andric       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13500b57cec5SDimitry Andric       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13510b57cec5SDimitry Andric       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
13520b57cec5SDimitry Andric       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
13530b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13540b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13550b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
13560b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
13570b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
13580b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
13590b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
13600b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
13610b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
13620b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
13630b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
13640b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
13650b57cec5SDimitry Andric       // Quad register types.
13660b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13670b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13680b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
13690b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
13700b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13710b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13720b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
13730b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
13740b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13750b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13760b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
13770b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
13780b57cec5SDimitry Andric       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13790b57cec5SDimitry Andric       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13800b57cec5SDimitry Andric       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
13810b57cec5SDimitry Andric       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
13820b57cec5SDimitry Andric       // Multiplication.
13830b57cec5SDimitry Andric     };
13840b57cec5SDimitry Andric 
13850b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
13860b57cec5SDimitry Andric       return LT.first * Entry->Cost;
13870b57cec5SDimitry Andric 
1388fe6060f1SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1389bdd1243dSDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info);
13900b57cec5SDimitry Andric 
13910b57cec5SDimitry Andric     // This is somewhat of a hack. The problem that we are facing is that SROA
13920b57cec5SDimitry Andric     // creates a sequence of shift, and, or instructions to construct values.
13930b57cec5SDimitry Andric     // These sequences are recognized by the ISel and have zero-cost. Not so for
13940b57cec5SDimitry Andric     // the vectorized code. Because we have support for v2i64 but not i64, those
13950b57cec5SDimitry Andric     // sequences look particularly beneficial to vectorize.
13960b57cec5SDimitry Andric     // To work around this we increase the cost of v2i64 operations to make them
13970b57cec5SDimitry Andric     // seem less beneficial.
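    // E.g. SROA may build a 64-bit value from two 32-bit halves with
    // something like:
    //   %h = shl i64 %a, 32
    //   %v = or i64 %h, %b
    // Scalar ISel matches such sequences for free, but the equivalent v2i64
    // operations are real instructions, hence the +4 below.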
1398bdd1243dSDimitry Andric     if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
13990b57cec5SDimitry Andric       Cost += 4;
14000b57cec5SDimitry Andric 
14010b57cec5SDimitry Andric     return Cost;
14020b57cec5SDimitry Andric   }
14030b57cec5SDimitry Andric 
1404480093f4SDimitry Andric   // If this operation is a shift on arm/thumb2, it might well be folded into
1405480093f4SDimitry Andric   // the following instruction, hence having a cost of 0.
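  // E.g. on Thumb2 the pair
  //   %s = shl i32 %x, 2
  //   %r = add i32 %y, %s
  // can fold to a single "add r0, r1, r2, lsl #2", so the shift is free when
  // its only user is one of the instructions handled below.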
1406480093f4SDimitry Andric   auto LooksLikeAFreeShift = [&]() {
1407480093f4SDimitry Andric     if (ST->isThumb1Only() || Ty->isVectorTy())
1408480093f4SDimitry Andric       return false;
1409480093f4SDimitry Andric 
1410480093f4SDimitry Andric     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1411480093f4SDimitry Andric       return false;
1412bdd1243dSDimitry Andric     if (!Op2Info.isUniform() || !Op2Info.isConstant())
1413480093f4SDimitry Andric       return false;
1414480093f4SDimitry Andric 
1415480093f4SDimitry Andric     // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1416480093f4SDimitry Andric     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1417480093f4SDimitry Andric     case Instruction::Add:
1418480093f4SDimitry Andric     case Instruction::Sub:
1419480093f4SDimitry Andric     case Instruction::And:
1420480093f4SDimitry Andric     case Instruction::Xor:
1421480093f4SDimitry Andric     case Instruction::Or:
1422480093f4SDimitry Andric     case Instruction::ICmp:
1423480093f4SDimitry Andric       return true;
1424480093f4SDimitry Andric     default:
1425480093f4SDimitry Andric       return false;
1426480093f4SDimitry Andric     }
1427480093f4SDimitry Andric   };
1428480093f4SDimitry Andric   if (LooksLikeAFreeShift())
1429480093f4SDimitry Andric     return 0;
1430480093f4SDimitry Andric 
1431e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1432e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1433e8d8bef9SDimitry Andric   int BaseCost = 1;
1434fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1435fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
14368bcb0991SDimitry Andric 
14378bcb0991SDimitry Andric   // The rest of this mostly follows what is done in
14388bcb0991SDimitry Andric   // BaseT::getArithmeticInstrCost, without treating floats as more expensive
14398bcb0991SDimitry Andric   // than scalars or increasing the costs for custom operations. The result
14408bcb0991SDimitry Andric   // is also multiplied by the MVEVectorCostFactor where appropriate.
14418bcb0991SDimitry Andric   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14428bcb0991SDimitry Andric     return LT.first * BaseCost;
14438bcb0991SDimitry Andric 
14448bcb0991SDimitry Andric   // Else this is expand, assume that we need to scalarize this op.
14455ffd83dbSDimitry Andric   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
14465ffd83dbSDimitry Andric     unsigned Num = VTy->getNumElements();
1447fe6060f1SDimitry Andric     InstructionCost Cost =
1448fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
14498bcb0991SDimitry Andric     // Return the cost of multiple scalar invocation plus the cost of
14508bcb0991SDimitry Andric     // inserting and extracting the values.
1451fe6060f1SDimitry Andric     SmallVector<Type *> Tys(Args.size(), Ty);
1452bdd1243dSDimitry Andric     return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1453bdd1243dSDimitry Andric            Num * Cost;
14548bcb0991SDimitry Andric   }
14558bcb0991SDimitry Andric 
14568bcb0991SDimitry Andric   return BaseCost;
14578bcb0991SDimitry Andric }
14588bcb0991SDimitry Andric 
1459fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1460fe6060f1SDimitry Andric                                             MaybeAlign Alignment,
1461fe6060f1SDimitry Andric                                             unsigned AddressSpace,
14625ffd83dbSDimitry Andric                                             TTI::TargetCostKind CostKind,
1463bdd1243dSDimitry Andric                                             TTI::OperandValueInfo OpInfo,
1464480093f4SDimitry Andric                                             const Instruction *I) {
14655ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
14665ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
14675ffd83dbSDimitry Andric     return 1;
14685ffd83dbSDimitry Andric 
14695ffd83dbSDimitry Andric   // Type legalization can't handle structs
14705ffd83dbSDimitry Andric   if (TLI->getValueType(DL, Src, true) == MVT::Other)
14715ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14725ffd83dbSDimitry Andric                                   CostKind);
14730b57cec5SDimitry Andric 
1474480093f4SDimitry Andric   if (ST->hasNEON() && Src->isVectorTy() &&
1475480093f4SDimitry Andric       (Alignment && *Alignment != Align(16)) &&
14765ffd83dbSDimitry Andric       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
14770b57cec5SDimitry Andric     // Unaligned loads/stores are extremely inefficient.
14780b57cec5SDimitry Andric     // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1479bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
14800b57cec5SDimitry Andric     return LT.first * 4;
14810b57cec5SDimitry Andric   }
14825ffd83dbSDimitry Andric 
14835ffd83dbSDimitry Andric   // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
14845ffd83dbSDimitry Andric   // Same for stores.
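  // E.g. (a sketch) the pair
  //   %l = load <4 x half>, ptr %p
  //   %e = fpext <4 x half> %l to <4 x float>
  // is costed below as a single factor-adjusted operation rather than a load
  // plus a separate extend.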
14855ffd83dbSDimitry Andric   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
14865ffd83dbSDimitry Andric       ((Opcode == Instruction::Load && I->hasOneUse() &&
14875ffd83dbSDimitry Andric         isa<FPExtInst>(*I->user_begin())) ||
14885ffd83dbSDimitry Andric        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
14895ffd83dbSDimitry Andric     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
14905ffd83dbSDimitry Andric     Type *DstTy =
14915ffd83dbSDimitry Andric         Opcode == Instruction::Load
14925ffd83dbSDimitry Andric             ? (*I->user_begin())->getType()
14935ffd83dbSDimitry Andric             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
14945ffd83dbSDimitry Andric     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
14955ffd83dbSDimitry Andric         DstTy->getScalarType()->isFloatTy())
1496fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
14975ffd83dbSDimitry Andric   }
14985ffd83dbSDimitry Andric 
14998bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1500fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
15018bcb0991SDimitry Andric                      : 1;
15025ffd83dbSDimitry Andric   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1503bdd1243dSDimitry Andric                                            CostKind, OpInfo, I);
15040b57cec5SDimitry Andric }
15050b57cec5SDimitry Andric 
1506fe6060f1SDimitry Andric InstructionCost
1507fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1508e8d8bef9SDimitry Andric                                   unsigned AddressSpace,
1509e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1510e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps()) {
1511e8d8bef9SDimitry Andric     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1512fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1513e8d8bef9SDimitry Andric     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1514fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1515e8d8bef9SDimitry Andric   }
1516e8d8bef9SDimitry Andric   if (!isa<FixedVectorType>(Src))
1517e8d8bef9SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1518e8d8bef9SDimitry Andric                                         CostKind);
1519e8d8bef9SDimitry Andric   // Scalar cost, which is currently very high due to the inefficiency of the
1520e8d8bef9SDimitry Andric   // generated code.
1521e8d8bef9SDimitry Andric   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1522e8d8bef9SDimitry Andric }
1523e8d8bef9SDimitry Andric 
1524fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1525480093f4SDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
15265ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
15275ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
15280b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
15290b57cec5SDimitry Andric   assert(isa<VectorType>(VecTy) && "Expect a vector type");
15300b57cec5SDimitry Andric 
15310b57cec5SDimitry Andric   // vldN/vstN don't support vector types with i64/f64 elements.
15320b57cec5SDimitry Andric   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
15330b57cec5SDimitry Andric 
15340b57cec5SDimitry Andric   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
15350b57cec5SDimitry Andric       !UseMaskForCond && !UseMaskForGaps) {
15365ffd83dbSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
15375ffd83dbSDimitry Andric     auto *SubVecTy =
15385ffd83dbSDimitry Andric         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
15390b57cec5SDimitry Andric 
15400b57cec5SDimitry Andric     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
15410b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
15420b57cec5SDimitry Andric     // matched to more than one vldN/vstN instruction.
1543fe6060f1SDimitry Andric     int BaseCost =
1544fe6060f1SDimitry Andric         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
15450b57cec5SDimitry Andric     if (NumElts % Factor == 0 &&
1546fe6060f1SDimitry Andric         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1547480093f4SDimitry Andric       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1548480093f4SDimitry Andric 
1549480093f4SDimitry Andric     // Some smaller than legal interleaved patterns are cheap as we can make
1550480093f4SDimitry Andric     // use of the vmovn or vrev patterns to interleave a standard load. This is
1551480093f4SDimitry Andric     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1552480093f4SDimitry Andric     // promoted differently). The cost of 2 here is then a load and vrev or
1553480093f4SDimitry Andric     // vmovn.
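    // E.g. (a sketch) a stride-2 deinterleave of a v8i8 load - two v4i8
    // sequences, with the 32-bit subvector below the 64-bit limit - is costed
    // here as 2 * BaseCost: one load plus one vrev/vmovn style rearrangement.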
1554480093f4SDimitry Andric     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1555e8d8bef9SDimitry Andric         VecTy->isIntOrIntVectorTy() &&
1556bdd1243dSDimitry Andric         DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1557480093f4SDimitry Andric       return 2 * BaseCost;
15580b57cec5SDimitry Andric   }
15590b57cec5SDimitry Andric 
15600b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
15615ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
15620b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
15630b57cec5SDimitry Andric }
15640b57cec5SDimitry Andric 
1565fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1566fe6060f1SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1567fe6060f1SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
15685ffd83dbSDimitry Andric   using namespace PatternMatch;
15695ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
15705ffd83dbSDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
15715ffd83dbSDimitry Andric                                          Alignment, CostKind, I);
15725ffd83dbSDimitry Andric 
15735ffd83dbSDimitry Andric   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
15745ffd83dbSDimitry Andric   auto *VTy = cast<FixedVectorType>(DataTy);
15755ffd83dbSDimitry Andric 
15765ffd83dbSDimitry Andric   // TODO: Splitting, once we do that.
15775ffd83dbSDimitry Andric 
15785ffd83dbSDimitry Andric   unsigned NumElems = VTy->getNumElements();
15795ffd83dbSDimitry Andric   unsigned EltSize = VTy->getScalarSizeInBits();
1580bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
15815ffd83dbSDimitry Andric 
15825ffd83dbSDimitry Andric   // For now, it is assumed that for the MVE gather instructions the loads are
15835ffd83dbSDimitry Andric   // all effectively serialised. This means the cost is the scalar cost
15845ffd83dbSDimitry Andric   // multiplied by the number of elements being loaded. This is possibly very
15855ffd83dbSDimitry Andric   // conservative, but even so we still end up vectorising loops because the
15865ffd83dbSDimitry Andric   // cost per iteration for many loops is lower than for scalar loops.
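  // E.g. (a sketch) an aligned gather of <4 x i32> gives VectorCost =
  // 4 * 1 * MVEVectorCostFactor, reflecting four effectively serialised
  // lane loads.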
1587fe6060f1SDimitry Andric   InstructionCost VectorCost =
1588fe6060f1SDimitry Andric       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
15895ffd83dbSDimitry Andric   // The scalarization cost should be a lot higher. We use the number of vector
159006c3fb27SDimitry Andric   // elements plus the scalarization overhead. If masking is required then a lot
159106c3fb27SDimitry Andric   // of little blocks will be needed and potentially a scalarized p0 mask,
159206c3fb27SDimitry Andric   // greatly increasing the cost.
1593fe6060f1SDimitry Andric   InstructionCost ScalarCost =
159406c3fb27SDimitry Andric       NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1595bdd1243dSDimitry Andric       BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1596bdd1243dSDimitry Andric                                       CostKind) +
1597bdd1243dSDimitry Andric       BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1598bdd1243dSDimitry Andric                                       CostKind);
15995ffd83dbSDimitry Andric 
1600e8d8bef9SDimitry Andric   if (EltSize < 8 || Alignment < EltSize / 8)
16015ffd83dbSDimitry Andric     return ScalarCost;
16025ffd83dbSDimitry Andric 
16035ffd83dbSDimitry Andric   unsigned ExtSize = EltSize;
16045ffd83dbSDimitry Andric   // Check whether there's a single user that asks for an extended type
16055ffd83dbSDimitry Andric   if (I != nullptr) {
16065ffd83dbSDimitry Andric     // Depending on the caller of this function, a gather instruction will
16075ffd83dbSDimitry Andric     // either have opcode Instruction::Load or be a call to the masked_gather
16085ffd83dbSDimitry Andric     // intrinsic.
16095ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Load ||
16105ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
16115ffd83dbSDimitry Andric         I->hasOneUse()) {
16125ffd83dbSDimitry Andric       const User *Us = *I->users().begin();
16135ffd83dbSDimitry Andric       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
16145ffd83dbSDimitry Andric         // Only allow valid type combinations
16155ffd83dbSDimitry Andric         unsigned TypeSize =
16165ffd83dbSDimitry Andric             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
16175ffd83dbSDimitry Andric         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
16185ffd83dbSDimitry Andric              (TypeSize == 16 && EltSize == 8)) &&
16195ffd83dbSDimitry Andric             TypeSize * NumElems == 128) {
16205ffd83dbSDimitry Andric           ExtSize = TypeSize;
16215ffd83dbSDimitry Andric         }
16225ffd83dbSDimitry Andric       }
16235ffd83dbSDimitry Andric     }
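    // For instance (illustrative IR), an <8 x i8> gather used only by
    //   %z = zext <8 x i8> %g to <8 x i16>
    // has TypeSize * NumElems == 16 * 8 == 128, so ExtSize becomes 16 and the
    // access can be treated as an extending gather.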
16245ffd83dbSDimitry Andric     // Check whether the input data needs to be truncated
16255ffd83dbSDimitry Andric     TruncInst *T;
16265ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Store ||
16275ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
16285ffd83dbSDimitry Andric         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
16295ffd83dbSDimitry Andric       // Only allow valid type combinations
16305ffd83dbSDimitry Andric       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
16315ffd83dbSDimitry Andric       if (((EltSize == 16 && TypeSize == 32) ||
16325ffd83dbSDimitry Andric            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
16335ffd83dbSDimitry Andric           TypeSize * NumElems == 128)
16345ffd83dbSDimitry Andric         ExtSize = TypeSize;
16355ffd83dbSDimitry Andric     }
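    // Similarly on the store side (illustrative IR), scattering
    //   %t = trunc <8 x i16> %x to <8 x i8>
    // has EltSize == 8 and TypeSize == 16 with 16 * 8 == 128, matching a
    // truncating store.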
16365ffd83dbSDimitry Andric   }
16375ffd83dbSDimitry Andric 
16385ffd83dbSDimitry Andric   if (ExtSize * NumElems != 128 || NumElems < 4)
16395ffd83dbSDimitry Andric     return ScalarCost;
16405ffd83dbSDimitry Andric 
16415ffd83dbSDimitry Andric   // Any (aligned) i32 gather will not need to be scalarised.
16425ffd83dbSDimitry Andric   if (ExtSize == 32)
16435ffd83dbSDimitry Andric     return VectorCost;
16445ffd83dbSDimitry Andric   // For smaller types, we need to ensure that the gep's inputs are correctly
16455ffd83dbSDimitry Andric   // extended from a small enough value. Other sizes (including i64) are
16465ffd83dbSDimitry Andric   // scalarized for now.
16475ffd83dbSDimitry Andric   if (ExtSize != 8 && ExtSize != 16)
16485ffd83dbSDimitry Andric     return ScalarCost;
16495ffd83dbSDimitry Andric 
16505ffd83dbSDimitry Andric   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
16515ffd83dbSDimitry Andric     Ptr = BC->getOperand(0);
16525ffd83dbSDimitry Andric   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
16535ffd83dbSDimitry Andric     if (GEP->getNumOperands() != 2)
16545ffd83dbSDimitry Andric       return ScalarCost;
16555ffd83dbSDimitry Andric     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
16565ffd83dbSDimitry Andric     // Scale needs to be correct (which is only relevant for i16s).
16575ffd83dbSDimitry Andric     if (Scale != 1 && Scale * 8 != ExtSize)
16585ffd83dbSDimitry Andric       return ScalarCost;
16595ffd83dbSDimitry Andric     // And we need to zext (not sext) the indexes from a small enough type.
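    // An illustrative pattern that stays on the vector path for an i16 gather:
    //   %ext  = zext <8 x i8> %idx to <8 x i32>
    //   %ptrs = getelementptr i16, ptr %base, <8 x i32> %ext
    // The zext source (8 bits) is <= ExtSize (16), so the offsets are known
    // to fit the scaled MVE addressing mode.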
16605ffd83dbSDimitry Andric     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
16615ffd83dbSDimitry Andric       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
16625ffd83dbSDimitry Andric         return VectorCost;
16635ffd83dbSDimitry Andric     }
16645ffd83dbSDimitry Andric     return ScalarCost;
16655ffd83dbSDimitry Andric   }
16665ffd83dbSDimitry Andric   return ScalarCost;
16675ffd83dbSDimitry Andric }
16685ffd83dbSDimitry Andric 
1669fe6060f1SDimitry Andric InstructionCost
1670fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1671bdd1243dSDimitry Andric                                        std::optional<FastMathFlags> FMF,
1672e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
1673fe6060f1SDimitry Andric 
1674e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1675e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
16765f757f3fSDimitry Andric   unsigned EltSize = ValVT.getScalarSizeInBits();
16775f757f3fSDimitry Andric 
16785f757f3fSDimitry Andric   // In general floating point reductions are a series of elementwise
16795f757f3fSDimitry Andric   // operations, with free extracts on each step. These are either in-order or
16805f757f3fSDimitry Andric   // treewise depending on whether that is allowed by the fast math flags.
16815f757f3fSDimitry Andric   if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
16825f757f3fSDimitry Andric       ((EltSize == 32 && ST->hasVFP2Base()) ||
16835f757f3fSDimitry Andric        (EltSize == 64 && ST->hasFP64()) ||
16845f757f3fSDimitry Andric        (EltSize == 16 && ST->hasFullFP16()))) {
16855f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
16865f757f3fSDimitry Andric     unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
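    // Note that without MVE or NEON the -1 wraps to UINT_MAX, so the halving
    // loop below never runs and the whole reduction is costed as scalar ops.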
16875f757f3fSDimitry Andric     InstructionCost VecCost = 0;
16885f757f3fSDimitry Andric     while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
16895f757f3fSDimitry Andric            NumElts * EltSize > VecLimit) {
16905f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
16915f757f3fSDimitry Andric       VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
16925f757f3fSDimitry Andric       NumElts /= 2;
16935f757f3fSDimitry Andric     }
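    // As a sketch (unit vector-op costs assumed): a fast-math fadd reduction
    // of <8 x float> with only NEON (VecLimit == 64) takes two halving steps
    // (v4f32, then v2f32), leaving NumElts == 2 scalar fadds to finish off.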
16945f757f3fSDimitry Andric 
16955f757f3fSDimitry Andric     // For fp16 we need to extract the upper lane elements. MVE can add a
16965f757f3fSDimitry Andric     // VREV+FMIN/MAX to perform another vector step instead.
16975f757f3fSDimitry Andric     InstructionCost ExtractCost = 0;
16985f757f3fSDimitry Andric     if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
16995f757f3fSDimitry Andric         ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
17005f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
17015f757f3fSDimitry Andric       NumElts /= 2;
17025f757f3fSDimitry Andric     } else if (ValVT.getVectorElementType() == MVT::f16)
17035f757f3fSDimitry Andric       ExtractCost = NumElts / 2;
17045f757f3fSDimitry Andric 
17055f757f3fSDimitry Andric     return VecCost + ExtractCost +
17065f757f3fSDimitry Andric            NumElts *
17075f757f3fSDimitry Andric                getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
17085f757f3fSDimitry Andric   }
17095f757f3fSDimitry Andric 
17105f757f3fSDimitry Andric   if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
17115f757f3fSDimitry Andric       (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
17125f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
17135f757f3fSDimitry Andric     unsigned VecLimit =
17145f757f3fSDimitry Andric         ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
17155f757f3fSDimitry Andric     InstructionCost VecCost = 0;
17165f757f3fSDimitry Andric     while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
17175f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
17185f757f3fSDimitry Andric       VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
17195f757f3fSDimitry Andric       NumElts /= 2;
17205f757f3fSDimitry Andric     }
17215f757f3fSDimitry Andric     // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
17225f757f3fSDimitry Andric     // step.
17235f757f3fSDimitry Andric     if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
17245f757f3fSDimitry Andric         NumElts * EltSize == 64) {
17255f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
17265f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) +
17275f757f3fSDimitry Andric                  getArithmeticInstrCost(Opcode, VecTy, CostKind);
17285f757f3fSDimitry Andric       NumElts /= 2;
17295f757f3fSDimitry Andric     }
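    // For example, a v8i8 xor reduction under MVE arrives here at 8 * 8 == 64
    // bits: the step above adds one VREV + VEOR, halving NumElts to 4, after
    // which 4 extracts and 3 scalar xors complete the reduction.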
17305f757f3fSDimitry Andric 
17315f757f3fSDimitry Andric     // From here we extract the elements and perform the and/or/xor.
17325f757f3fSDimitry Andric     InstructionCost ExtractCost = NumElts;
17335f757f3fSDimitry Andric     return VecCost + ExtractCost +
17345f757f3fSDimitry Andric            (NumElts - 1) * getArithmeticInstrCost(
17355f757f3fSDimitry Andric                                Opcode, ValTy->getElementType(), CostKind);
17365f757f3fSDimitry Andric   }
17375f757f3fSDimitry Andric 
17385f757f3fSDimitry Andric   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
17395f757f3fSDimitry Andric       TTI::requiresOrderedReduction(FMF))
1740fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1741e8d8bef9SDimitry Andric 
1742bdd1243dSDimitry Andric   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1743e8d8bef9SDimitry Andric 
1744e8d8bef9SDimitry Andric   static const CostTblEntry CostTblAdd[]{
1745e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v16i8, 1},
1746e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v8i16, 1},
1747e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v4i32, 1},
1748e8d8bef9SDimitry Andric   };
1749e8d8bef9SDimitry Andric   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1750fe6060f1SDimitry Andric     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1751e8d8bef9SDimitry Andric 
1752fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1753e8d8bef9SDimitry Andric }
1754e8d8bef9SDimitry Andric 
1755bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getExtendedReductionCost(
1756bdd1243dSDimitry Andric     unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
175706c3fb27SDimitry Andric     FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1758bdd1243dSDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1759bdd1243dSDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1760bdd1243dSDimitry Andric 
1761bdd1243dSDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1762bdd1243dSDimitry Andric 
1763bdd1243dSDimitry Andric   switch (ISD) {
1764bdd1243dSDimitry Andric   case ISD::ADD:
1765bdd1243dSDimitry Andric     if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1766bdd1243dSDimitry Andric       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1767bdd1243dSDimitry Andric 
1768bdd1243dSDimitry Andric       // The legal cases are:
1769bdd1243dSDimitry Andric       //   VADDV u/s 8/16/32
1770bdd1243dSDimitry Andric       //   VADDLV u/s 32
1771bdd1243dSDimitry Andric       // Codegen currently cannot always handle larger than legal vectors very
1772bdd1243dSDimitry Andric       // well, especially for predicated reductions where the mask needs to be
1773bdd1243dSDimitry Andric       // split, so restrict to 128bit or smaller input types.
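      // e.g. (illustrative IR) a zero-extending add reduction such as
      //   %e = zext <16 x i8> %x to <16 x i32>
      //   %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
      // legalizes to v16i8 with a 32-bit result and maps onto a single
      // VADDV.u8, hence the single cost-factor charge below.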
1774bdd1243dSDimitry Andric       unsigned RevVTSize = ResVT.getSizeInBits();
1775bdd1243dSDimitry Andric       if (ValVT.getSizeInBits() <= 128 &&
1776bdd1243dSDimitry Andric           ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1777bdd1243dSDimitry Andric            (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1778bdd1243dSDimitry Andric            (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1779bdd1243dSDimitry Andric         return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1780bdd1243dSDimitry Andric     }
1781bdd1243dSDimitry Andric     break;
1782bdd1243dSDimitry Andric   default:
1783bdd1243dSDimitry Andric     break;
1784bdd1243dSDimitry Andric   }
1785bdd1243dSDimitry Andric   return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1786bdd1243dSDimitry Andric                                          CostKind);
1787bdd1243dSDimitry Andric }
1788bdd1243dSDimitry Andric 
1789e8d8bef9SDimitry Andric InstructionCost
1790bdd1243dSDimitry Andric ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1791bdd1243dSDimitry Andric                                    VectorType *ValTy,
1792e8d8bef9SDimitry Andric                                    TTI::TargetCostKind CostKind) {
1793e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1794e8d8bef9SDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1795349cc55cSDimitry Andric 
1796e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1797bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1798349cc55cSDimitry Andric 
1799349cc55cSDimitry Andric     // The legal cases are:
1800349cc55cSDimitry Andric     //   VMLAV u/s 8/16/32
1801349cc55cSDimitry Andric     //   VMLALV u/s 16/32
1802349cc55cSDimitry Andric     // Codegen currently cannot always handle larger than legal vectors very
1803349cc55cSDimitry Andric     // well, especially for predicated reductions where the mask needs to be
1804349cc55cSDimitry Andric     // split, so restrict to 128bit or smaller input types.
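    // e.g. (illustrative) an i8 dot product, where two <16 x i8> inputs are
    // extended to i32, multiplied and reduce-added into an i32 accumulator,
    // legalizes to v16i8 with a 32-bit result and maps onto a single
    // VMLAV.s8/VMLAV.u8.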
1805349cc55cSDimitry Andric     unsigned RevVTSize = ResVT.getSizeInBits();
1806349cc55cSDimitry Andric     if (ValVT.getSizeInBits() <= 128 &&
1807349cc55cSDimitry Andric         ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1808bdd1243dSDimitry Andric          (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1809349cc55cSDimitry Andric          (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1810fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1811e8d8bef9SDimitry Andric   }
1812e8d8bef9SDimitry Andric 
1813bdd1243dSDimitry Andric   return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1814e8d8bef9SDimitry Andric }
1815e8d8bef9SDimitry Andric 
1816fe6060f1SDimitry Andric InstructionCost
18175f757f3fSDimitry Andric ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
18185f757f3fSDimitry Andric                                    FastMathFlags FMF,
18195f757f3fSDimitry Andric                                    TTI::TargetCostKind CostKind) {
18205f757f3fSDimitry Andric   EVT ValVT = TLI->getValueType(DL, Ty);
18215f757f3fSDimitry Andric 
18225f757f3fSDimitry Andric   // In general floating point reductions are a series of elementwise
18235f757f3fSDimitry Andric   // operations, with free extracts on each step. These are either in-order or
18245f757f3fSDimitry Andric   // treewise depending on whether that is allowed by the fast math flags.
18255f757f3fSDimitry Andric   if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
18265f757f3fSDimitry Andric       ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
18275f757f3fSDimitry Andric        (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
18285f757f3fSDimitry Andric        (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
18295f757f3fSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
18305f757f3fSDimitry Andric     unsigned EltSize = ValVT.getScalarSizeInBits();
18315f757f3fSDimitry Andric     unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
18325f757f3fSDimitry Andric     InstructionCost VecCost;
18335f757f3fSDimitry Andric     while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
18345f757f3fSDimitry Andric       Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts / 2);
18355f757f3fSDimitry Andric       IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
18365f757f3fSDimitry Andric       VecCost += getIntrinsicInstrCost(ICA, CostKind);
18375f757f3fSDimitry Andric       NumElts /= 2;
18385f757f3fSDimitry Andric     }
18395f757f3fSDimitry Andric 
18405f757f3fSDimitry Andric     // For fp16 we need to extract the upper lane elements. MVE can add a
18415f757f3fSDimitry Andric     // VREV+FMIN/MAX to perform another vector step instead.
18425f757f3fSDimitry Andric     InstructionCost ExtractCost = 0;
18435f757f3fSDimitry Andric     if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
18445f757f3fSDimitry Andric         NumElts == 8) {
18455f757f3fSDimitry Andric       VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
18465f757f3fSDimitry Andric       NumElts /= 2;
18475f757f3fSDimitry Andric     } else if (ValVT.getVectorElementType() == MVT::f16)
18485f757f3fSDimitry Andric       ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
18495f757f3fSDimitry Andric 
18505f757f3fSDimitry Andric     IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
18515f757f3fSDimitry Andric                                 {Ty->getElementType(), Ty->getElementType()},
18525f757f3fSDimitry Andric                                 FMF);
18535f757f3fSDimitry Andric     return VecCost + ExtractCost +
18545f757f3fSDimitry Andric            (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
18555f757f3fSDimitry Andric   }
18565f757f3fSDimitry Andric 
18575f757f3fSDimitry Andric   if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
18585f757f3fSDimitry Andric       IID == Intrinsic::umin || IID == Intrinsic::umax) {
18595f757f3fSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
18605f757f3fSDimitry Andric 
18615f757f3fSDimitry Andric     // All costs are the same for u/s min/max.  These lower to vminv, which are
18625f757f3fSDimitry Andric     // given a slightly higher cost as they tend to take multiple cycles for
18635f757f3fSDimitry Andric     // smaller type sizes.
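    // For example, an smin reduction over <8 x i16> hits the v8i16 entry
    // below and is costed as 3 * MVEVectorCostFactor * LT.first, roughly one
    // VMINV.s16 with its extra cycles accounted for.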
18645f757f3fSDimitry Andric     static const CostTblEntry CostTblMinMax[]{
18655f757f3fSDimitry Andric         {ISD::SMIN, MVT::v16i8, 4},
18665f757f3fSDimitry Andric         {ISD::SMIN, MVT::v8i16, 3},
18675f757f3fSDimitry Andric         {ISD::SMIN, MVT::v4i32, 2},
18685f757f3fSDimitry Andric     };
18695f757f3fSDimitry Andric     if (const auto *Entry = CostTableLookup(CostTblMinMax, ISD::SMIN, LT.second))
18705f757f3fSDimitry Andric       return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
18715f757f3fSDimitry Andric   }
18725f757f3fSDimitry Andric 
18735f757f3fSDimitry Andric   return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
18745f757f3fSDimitry Andric }
18755f757f3fSDimitry Andric 
18765f757f3fSDimitry Andric InstructionCost
1877fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1878e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1879e8d8bef9SDimitry Andric   switch (ICA.getID()) {
1880e8d8bef9SDimitry Andric   case Intrinsic::get_active_lane_mask:
1881e8d8bef9SDimitry Andric     // Currently we make a somewhat optimistic assumption that
1882e8d8bef9SDimitry Andric     // active_lane_mask intrinsics are always free. In reality one may be
1883e8d8bef9SDimitry Andric     // freely folded into a tail-predicated loop, expanded into a VCTP or a lot
1884e8d8bef9SDimitry Andric     // of add/icmp code. We may need to improve this in the future, but being
1885e8d8bef9SDimitry Andric     // able to detect if it is free or not involves looking at a lot of other
1886e8d8bef9SDimitry Andric     // code. We currently assume that the vectorizer inserted these, and knew
1887e8d8bef9SDimitry Andric     // what it was doing in adding one.
1888e8d8bef9SDimitry Andric     if (ST->hasMVEIntegerOps())
1889e8d8bef9SDimitry Andric       return 0;
1890e8d8bef9SDimitry Andric     break;
1891e8d8bef9SDimitry Andric   case Intrinsic::sadd_sat:
1892e8d8bef9SDimitry Andric   case Intrinsic::ssub_sat:
1893e8d8bef9SDimitry Andric   case Intrinsic::uadd_sat:
1894e8d8bef9SDimitry Andric   case Intrinsic::usub_sat: {
1895e8d8bef9SDimitry Andric     if (!ST->hasMVEIntegerOps())
1896e8d8bef9SDimitry Andric       break;
1897e8d8bef9SDimitry Andric     Type *VT = ICA.getReturnType();
1898e8d8bef9SDimitry Andric 
1899bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1900e8d8bef9SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1901e8d8bef9SDimitry Andric         LT.second == MVT::v16i8) {
1902fe6060f1SDimitry Andric       // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1903e8d8bef9SDimitry Andric       // need to extend the type, as it uses shr(qadd(shl, shl)).
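      // A sketch (assuming v8i8 promotes to v8i16 here): sadd.sat on
      // <8 x i8> then needs Instrs == 4 (shl, shl, vqadd, shr), while a
      // native <16 x i8> saturating add is a single VQADD.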
1904fe6060f1SDimitry Andric       unsigned Instrs =
1905fe6060f1SDimitry Andric           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1906fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1907e8d8bef9SDimitry Andric     }
1908e8d8bef9SDimitry Andric     break;
1909e8d8bef9SDimitry Andric   }
1910fe6060f1SDimitry Andric   case Intrinsic::abs:
1911fe6060f1SDimitry Andric   case Intrinsic::smin:
1912fe6060f1SDimitry Andric   case Intrinsic::smax:
1913fe6060f1SDimitry Andric   case Intrinsic::umin:
1914fe6060f1SDimitry Andric   case Intrinsic::umax: {
1915fe6060f1SDimitry Andric     if (!ST->hasMVEIntegerOps())
1916fe6060f1SDimitry Andric       break;
1917fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1918fe6060f1SDimitry Andric 
1919bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1920fe6060f1SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1921fe6060f1SDimitry Andric         LT.second == MVT::v16i8)
1922fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1923fe6060f1SDimitry Andric     break;
1924fe6060f1SDimitry Andric   }
1925fe6060f1SDimitry Andric   case Intrinsic::minnum:
1926fe6060f1SDimitry Andric   case Intrinsic::maxnum: {
1927fe6060f1SDimitry Andric     if (!ST->hasMVEFloatOps())
1928fe6060f1SDimitry Andric       break;
1929fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1930bdd1243dSDimitry Andric     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1931fe6060f1SDimitry Andric     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1932fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1933fe6060f1SDimitry Andric     break;
1934fe6060f1SDimitry Andric   }
193581ad6265SDimitry Andric   case Intrinsic::fptosi_sat:
193681ad6265SDimitry Andric   case Intrinsic::fptoui_sat: {
193781ad6265SDimitry Andric     if (ICA.getArgTypes().empty())
193881ad6265SDimitry Andric       break;
193981ad6265SDimitry Andric     bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1940bdd1243dSDimitry Andric     auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
194181ad6265SDimitry Andric     EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
194281ad6265SDimitry Andric     // Check for the legal types, with the correct subtarget features.
194381ad6265SDimitry Andric     if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
194481ad6265SDimitry Andric         (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
194581ad6265SDimitry Andric         (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
194681ad6265SDimitry Andric       return LT.first;
194781ad6265SDimitry Andric 
194881ad6265SDimitry Andric     // Equally for MVE vector types
194981ad6265SDimitry Andric     if (ST->hasMVEFloatOps() &&
195081ad6265SDimitry Andric         (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
195181ad6265SDimitry Andric         LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
195281ad6265SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
195381ad6265SDimitry Andric 
195481ad6265SDimitry Andric     // Otherwise we use a legal convert followed by a min+max
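    // As a sketch, llvm.fptosi.sat.i16.f32 on a scalar uses the legal
    // f32->i32 convert (cost 1) plus an i32 smin/smax pair to clamp to
    // [-32768, 32767]; the recursive calls below price the min and max.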
195581ad6265SDimitry Andric     if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
195681ad6265SDimitry Andric          (ST->hasFP64() && LT.second == MVT::f64) ||
195781ad6265SDimitry Andric          (ST->hasFullFP16() && LT.second == MVT::f16) ||
195881ad6265SDimitry Andric          (ST->hasMVEFloatOps() &&
195981ad6265SDimitry Andric           (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
196081ad6265SDimitry Andric         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
196181ad6265SDimitry Andric       Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
196281ad6265SDimitry Andric                                       LT.second.getScalarSizeInBits());
196381ad6265SDimitry Andric       InstructionCost Cost =
196481ad6265SDimitry Andric           LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
196581ad6265SDimitry Andric       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
196681ad6265SDimitry Andric                                               : Intrinsic::umin,
196781ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
196881ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
196981ad6265SDimitry Andric       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
197081ad6265SDimitry Andric                                               : Intrinsic::umax,
197181ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
197281ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
197381ad6265SDimitry Andric       return LT.first * Cost;
197481ad6265SDimitry Andric     }
197581ad6265SDimitry Andric     break;
197681ad6265SDimitry Andric   }
1977e8d8bef9SDimitry Andric   }
1978e8d8bef9SDimitry Andric 
1979e8d8bef9SDimitry Andric   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1980e8d8bef9SDimitry Andric }
1981e8d8bef9SDimitry Andric 
19820b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) {
19830b57cec5SDimitry Andric   if (!F->isIntrinsic())
198481ad6265SDimitry Andric     return BaseT::isLoweredToCall(F);
19850b57cec5SDimitry Andric 
19860b57cec5SDimitry Andric   // Assume all Arm-specific intrinsics map to an instruction.
19875f757f3fSDimitry Andric   if (F->getName().starts_with("llvm.arm"))
19880b57cec5SDimitry Andric     return false;
19890b57cec5SDimitry Andric 
19900b57cec5SDimitry Andric   switch (F->getIntrinsicID()) {
19910b57cec5SDimitry Andric   default: break;
19920b57cec5SDimitry Andric   case Intrinsic::powi:
19930b57cec5SDimitry Andric   case Intrinsic::sin:
19940b57cec5SDimitry Andric   case Intrinsic::cos:
19950b57cec5SDimitry Andric   case Intrinsic::pow:
19960b57cec5SDimitry Andric   case Intrinsic::log:
19970b57cec5SDimitry Andric   case Intrinsic::log10:
19980b57cec5SDimitry Andric   case Intrinsic::log2:
19990b57cec5SDimitry Andric   case Intrinsic::exp:
20000b57cec5SDimitry Andric   case Intrinsic::exp2:
20010b57cec5SDimitry Andric     return true;
20020b57cec5SDimitry Andric   case Intrinsic::sqrt:
20030b57cec5SDimitry Andric   case Intrinsic::fabs:
20040b57cec5SDimitry Andric   case Intrinsic::copysign:
20050b57cec5SDimitry Andric   case Intrinsic::floor:
20060b57cec5SDimitry Andric   case Intrinsic::ceil:
20070b57cec5SDimitry Andric   case Intrinsic::trunc:
20080b57cec5SDimitry Andric   case Intrinsic::rint:
20090b57cec5SDimitry Andric   case Intrinsic::nearbyint:
20100b57cec5SDimitry Andric   case Intrinsic::round:
20110b57cec5SDimitry Andric   case Intrinsic::canonicalize:
20120b57cec5SDimitry Andric   case Intrinsic::lround:
20130b57cec5SDimitry Andric   case Intrinsic::llround:
20140b57cec5SDimitry Andric   case Intrinsic::lrint:
20150b57cec5SDimitry Andric   case Intrinsic::llrint:
20160b57cec5SDimitry Andric     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
20170b57cec5SDimitry Andric       return true;
20180b57cec5SDimitry Andric     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
20190b57cec5SDimitry Andric       return true;
20200b57cec5SDimitry Andric     // Some operations can be handled by vector instructions and assume
20210b57cec5SDimitry Andric     // unsupported vectors will be expanded into supported scalar ones.
20220b57cec5SDimitry Andric     // TODO Handle scalar operations properly.
20230b57cec5SDimitry Andric     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
20240b57cec5SDimitry Andric   case Intrinsic::masked_store:
20250b57cec5SDimitry Andric   case Intrinsic::masked_load:
20260b57cec5SDimitry Andric   case Intrinsic::masked_gather:
20270b57cec5SDimitry Andric   case Intrinsic::masked_scatter:
20280b57cec5SDimitry Andric     return !ST->hasMVEIntegerOps();
20290b57cec5SDimitry Andric   case Intrinsic::sadd_with_overflow:
20300b57cec5SDimitry Andric   case Intrinsic::uadd_with_overflow:
20310b57cec5SDimitry Andric   case Intrinsic::ssub_with_overflow:
20320b57cec5SDimitry Andric   case Intrinsic::usub_with_overflow:
20330b57cec5SDimitry Andric   case Intrinsic::sadd_sat:
20340b57cec5SDimitry Andric   case Intrinsic::uadd_sat:
20350b57cec5SDimitry Andric   case Intrinsic::ssub_sat:
20360b57cec5SDimitry Andric   case Intrinsic::usub_sat:
20370b57cec5SDimitry Andric     return false;
20380b57cec5SDimitry Andric   }
20390b57cec5SDimitry Andric 
20400b57cec5SDimitry Andric   return BaseT::isLoweredToCall(F);
20410b57cec5SDimitry Andric }
20420b57cec5SDimitry Andric 
2043e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
20440b57cec5SDimitry Andric   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
20450b57cec5SDimitry Andric   EVT VT = TLI->getValueType(DL, I.getType(), true);
20460b57cec5SDimitry Andric   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
20470b57cec5SDimitry Andric     return true;
20480b57cec5SDimitry Andric 
20490b57cec5SDimitry Andric   // Check if an intrinsic will be lowered to a call and assume that any
20500b57cec5SDimitry Andric   // other CallInst will generate a bl.
20510b57cec5SDimitry Andric   if (auto *Call = dyn_cast<CallInst>(&I)) {
2052e8d8bef9SDimitry Andric     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2053e8d8bef9SDimitry Andric       switch(II->getIntrinsicID()) {
2054e8d8bef9SDimitry Andric         case Intrinsic::memcpy:
2055e8d8bef9SDimitry Andric         case Intrinsic::memset:
2056e8d8bef9SDimitry Andric         case Intrinsic::memmove:
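          // getNumMemOps returns how many load/store ops the transfer can be
          // inlined into, or -1 when it must be lowered as a real libcall;
          // only the latter counts as a call here.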
2057e8d8bef9SDimitry Andric           return getNumMemOps(II) == -1;
2058e8d8bef9SDimitry Andric         default:
20590b57cec5SDimitry Andric           if (const Function *F = Call->getCalledFunction())
20600b57cec5SDimitry Andric             return isLoweredToCall(F);
20610b57cec5SDimitry Andric       }
2062e8d8bef9SDimitry Andric     }
20630b57cec5SDimitry Andric     return true;
20640b57cec5SDimitry Andric   }
20650b57cec5SDimitry Andric 
20660b57cec5SDimitry Andric   // FPv5 provides conversions between integer, double-precision,
20670b57cec5SDimitry Andric   // single-precision, and half-precision formats.
20680b57cec5SDimitry Andric   switch (I.getOpcode()) {
20690b57cec5SDimitry Andric   default:
20700b57cec5SDimitry Andric     break;
20710b57cec5SDimitry Andric   case Instruction::FPToSI:
20720b57cec5SDimitry Andric   case Instruction::FPToUI:
20730b57cec5SDimitry Andric   case Instruction::SIToFP:
20740b57cec5SDimitry Andric   case Instruction::UIToFP:
20750b57cec5SDimitry Andric   case Instruction::FPTrunc:
20760b57cec5SDimitry Andric   case Instruction::FPExt:
20770b57cec5SDimitry Andric     return !ST->hasFPARMv8Base();
20780b57cec5SDimitry Andric   }
20790b57cec5SDimitry Andric 
20800b57cec5SDimitry Andric   // FIXME: Unfortunately the approach of checking the Operation Action does
20810b57cec5SDimitry Andric   // not catch all cases of Legalization that use library calls. Our
20820b57cec5SDimitry Andric   // Legalization step categorizes some transformations into library calls as
20830b57cec5SDimitry Andric   // Custom, Expand or even Legal when doing type legalization. So for now
20840b57cec5SDimitry Andric   // we have to special case for instance the SDIV of 64bit integers and the
20850b57cec5SDimitry Andric   // use of floating point emulation.
20860b57cec5SDimitry Andric   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
20870b57cec5SDimitry Andric     switch (ISD) {
20880b57cec5SDimitry Andric     default:
20890b57cec5SDimitry Andric       break;
20900b57cec5SDimitry Andric     case ISD::SDIV:
20910b57cec5SDimitry Andric     case ISD::UDIV:
20920b57cec5SDimitry Andric     case ISD::SREM:
20930b57cec5SDimitry Andric     case ISD::UREM:
20940b57cec5SDimitry Andric     case ISD::SDIVREM:
20950b57cec5SDimitry Andric     case ISD::UDIVREM:
20960b57cec5SDimitry Andric       return true;
20970b57cec5SDimitry Andric     }
20980b57cec5SDimitry Andric   }
20990b57cec5SDimitry Andric 
21000b57cec5SDimitry Andric   // Assume all other non-float operations are supported.
21010b57cec5SDimitry Andric   if (!VT.isFloatingPoint())
21020b57cec5SDimitry Andric     return false;
21030b57cec5SDimitry Andric 
21040b57cec5SDimitry Andric   // We'll need a library call to handle most floats when using soft float.
21050b57cec5SDimitry Andric   if (TLI->useSoftFloat()) {
21060b57cec5SDimitry Andric     switch (I.getOpcode()) {
21070b57cec5SDimitry Andric     default:
21080b57cec5SDimitry Andric       return true;
21090b57cec5SDimitry Andric     case Instruction::Alloca:
21100b57cec5SDimitry Andric     case Instruction::Load:
21110b57cec5SDimitry Andric     case Instruction::Store:
21120b57cec5SDimitry Andric     case Instruction::Select:
21130b57cec5SDimitry Andric     case Instruction::PHI:
21140b57cec5SDimitry Andric       return false;
21150b57cec5SDimitry Andric     }
21160b57cec5SDimitry Andric   }
21170b57cec5SDimitry Andric 
21180b57cec5SDimitry Andric   // We'll need a libcall to perform double precision operations on a single
21190b57cec5SDimitry Andric   // precision only FPU.
21200b57cec5SDimitry Andric   if (I.getType()->isDoubleTy() && !ST->hasFP64())
21210b57cec5SDimitry Andric     return true;
21220b57cec5SDimitry Andric 
21230b57cec5SDimitry Andric   // Likewise for half precision arithmetic.
21240b57cec5SDimitry Andric   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
21250b57cec5SDimitry Andric     return true;
21260b57cec5SDimitry Andric 
21270b57cec5SDimitry Andric   return false;
2128e8d8bef9SDimitry Andric }
2129e8d8bef9SDimitry Andric 
2130e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2131e8d8bef9SDimitry Andric                                           AssumptionCache &AC,
2132e8d8bef9SDimitry Andric                                           TargetLibraryInfo *LibInfo,
2133e8d8bef9SDimitry Andric                                           HardwareLoopInfo &HWLoopInfo) {
2134e8d8bef9SDimitry Andric   // Low-overhead branches are only supported in the 'low-overhead branch'
2135e8d8bef9SDimitry Andric   // extension of v8.1-m.
2136e8d8bef9SDimitry Andric   if (!ST->hasLOB() || DisableLowOverheadLoops) {
2137e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2138e8d8bef9SDimitry Andric     return false;
2139e8d8bef9SDimitry Andric   }
2140e8d8bef9SDimitry Andric 
2141e8d8bef9SDimitry Andric   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2142e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2143e8d8bef9SDimitry Andric     return false;
2144e8d8bef9SDimitry Andric   }
2145e8d8bef9SDimitry Andric 
2146e8d8bef9SDimitry Andric   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2147e8d8bef9SDimitry Andric   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2148e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2149e8d8bef9SDimitry Andric     return false;
2150e8d8bef9SDimitry Andric   }
2151e8d8bef9SDimitry Andric 
2152e8d8bef9SDimitry Andric   const SCEV *TripCountSCEV =
2153e8d8bef9SDimitry Andric     SE.getAddExpr(BackedgeTakenCount,
2154e8d8bef9SDimitry Andric                   SE.getOne(BackedgeTakenCount->getType()));
2155e8d8bef9SDimitry Andric 
2156e8d8bef9SDimitry Andric   // We need to store the trip count in LR, a 32-bit register.
2157e8d8bef9SDimitry Andric   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2158e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2159e8d8bef9SDimitry Andric     return false;
2160e8d8bef9SDimitry Andric   }
2161e8d8bef9SDimitry Andric 
2162e8d8bef9SDimitry Andric   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2163e8d8bef9SDimitry Andric   // point in generating a hardware loop if that's going to happen.
21640b57cec5SDimitry Andric 
21650b57cec5SDimitry Andric   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
21660b57cec5SDimitry Andric     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
21670b57cec5SDimitry Andric       switch (Call->getIntrinsicID()) {
21680b57cec5SDimitry Andric       default:
21690b57cec5SDimitry Andric         break;
2170e8d8bef9SDimitry Andric       case Intrinsic::start_loop_iterations:
2171fe6060f1SDimitry Andric       case Intrinsic::test_start_loop_iterations:
21720b57cec5SDimitry Andric       case Intrinsic::loop_decrement:
21730b57cec5SDimitry Andric       case Intrinsic::loop_decrement_reg:
21740b57cec5SDimitry Andric         return true;
21750b57cec5SDimitry Andric       }
21760b57cec5SDimitry Andric     }
21770b57cec5SDimitry Andric     return false;
21780b57cec5SDimitry Andric   };
21790b57cec5SDimitry Andric 
21800b57cec5SDimitry Andric   // Scan the instructions to see if there are any that we know will turn into a
2181e8d8bef9SDimitry Andric   // call or if this loop is already a low-overhead loop or will become a tail
2182e8d8bef9SDimitry Andric   // predicated loop.
2183e8d8bef9SDimitry Andric   bool IsTailPredLoop = false;
21840b57cec5SDimitry Andric   auto ScanLoop = [&](Loop *L) {
21850b57cec5SDimitry Andric     for (auto *BB : L->getBlocks()) {
21860b57cec5SDimitry Andric       for (auto &I : *BB) {
2187e8d8bef9SDimitry Andric         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2188e8d8bef9SDimitry Andric             isa<InlineAsm>(I)) {
21895ffd83dbSDimitry Andric           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
21900b57cec5SDimitry Andric           return false;
21910b57cec5SDimitry Andric         }
2192e8d8bef9SDimitry Andric         if (auto *II = dyn_cast<IntrinsicInst>(&I))
2193e8d8bef9SDimitry Andric           IsTailPredLoop |=
2194e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2195e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2196e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2197e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2198e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
21990b57cec5SDimitry Andric       }
22005ffd83dbSDimitry Andric     }
22010b57cec5SDimitry Andric     return true;
22020b57cec5SDimitry Andric   };
22030b57cec5SDimitry Andric 
22040b57cec5SDimitry Andric   // Visit inner loops.
2205bdd1243dSDimitry Andric   for (auto *Inner : *L)
22060b57cec5SDimitry Andric     if (!ScanLoop(Inner))
22070b57cec5SDimitry Andric       return false;
22080b57cec5SDimitry Andric 
22090b57cec5SDimitry Andric   if (!ScanLoop(L))
22100b57cec5SDimitry Andric     return false;
22110b57cec5SDimitry Andric 
22120b57cec5SDimitry Andric   // TODO: Check whether the trip count calculation is expensive. If L is the
22130b57cec5SDimitry Andric   // inner loop but we know it has a low trip count, calculating that trip
22140b57cec5SDimitry Andric   // count (in the parent loop) may be detrimental.
22150b57cec5SDimitry Andric 
22160b57cec5SDimitry Andric   LLVMContext &C = L->getHeader()->getContext();
22170b57cec5SDimitry Andric   HWLoopInfo.CounterInReg = true;
22180b57cec5SDimitry Andric   HWLoopInfo.IsNestingLegal = false;
2219e8d8bef9SDimitry Andric   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
22200b57cec5SDimitry Andric   HWLoopInfo.CountType = Type::getInt32Ty(C);
22210b57cec5SDimitry Andric   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
22220b57cec5SDimitry Andric   return true;
22230b57cec5SDimitry Andric }
22240b57cec5SDimitry Andric 
2225480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2226480093f4SDimitry Andric   // We don't allow icmps, and because we only look at single block loops,
2227480093f4SDimitry Andric   // we simply count the icmps, i.e. there should only be 1 for the backedge.
2228480093f4SDimitry Andric   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2229480093f4SDimitry Andric     return false;
2230349cc55cSDimitry Andric   // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2231349cc55cSDimitry Andric   // not currently canonical, but soon will be. Code without them uses icmp, and
2232349cc55cSDimitry Andric   // so is not tail predicated as per the condition above. In order to get the
2233349cc55cSDimitry Andric   // same performance we treat min and max the same as an icmp for tailpred
2234349cc55cSDimitry Andric   // purposes for the moment (we often rely on non-tailpred and higher VF's to
2235349cc55cSDimitry Andric   // pick more optimal instructions like VQDMULH. They need to be recognized
2236349cc55cSDimitry Andric   // directly by the vectorizer).
2237349cc55cSDimitry Andric   if (auto *II = dyn_cast<IntrinsicInst>(&I))
2238349cc55cSDimitry Andric     if ((II->getIntrinsicID() == Intrinsic::smin ||
2239349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::smax ||
2240349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umin ||
2241349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umax) &&
2242349cc55cSDimitry Andric         ++ICmpCount > 1)
2243349cc55cSDimitry Andric       return false;
2244480093f4SDimitry Andric 
2245480093f4SDimitry Andric   if (isa<FCmpInst>(&I))
2246480093f4SDimitry Andric     return false;
2247480093f4SDimitry Andric 
2248480093f4SDimitry Andric   // We could allow extending/narrowing FP loads/stores, but codegen is
2249480093f4SDimitry Andric   // too inefficient so reject this for now.
2250480093f4SDimitry Andric   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2251480093f4SDimitry Andric     return false;
2252480093f4SDimitry Andric 
2253480093f4SDimitry Andric   // Extends have to be extending-loads
2254480093f4SDimitry Andric   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2255480093f4SDimitry Andric     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2256480093f4SDimitry Andric       return false;
2257480093f4SDimitry Andric 
2258480093f4SDimitry Andric   // Truncs have to be narrowing-stores
2259480093f4SDimitry Andric   if (isa<TruncInst>(&I))
2260480093f4SDimitry Andric     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2261480093f4SDimitry Andric       return false;
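  // Illustrative IR: "%l = load <8 x i16>" followed by
  // "%e = sext <8 x i16> %l to <8 x i32>" is fine, as the extend folds into
  // the load; the same sext of an arbitrary value, or a trunc that is not
  // solely feeding a store, is rejected.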
2262480093f4SDimitry Andric 
2263480093f4SDimitry Andric   return true;
2264480093f4SDimitry Andric }
2265480093f4SDimitry Andric 
2266480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of
2267480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element
2268480093f4SDimitry Andric // size and:
2269480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we
2270480093f4SDimitry Andric //    e.g. don't want any widening/narrowing operations.
2271480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations
2272480093f4SDimitry Andric //    that work on i64s.
2273480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the
2274480093f4SDimitry Andric //    tail-predication masks/predicates the right lanes.
2275480093f4SDimitry Andric //
2276480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2277480093f4SDimitry Andric                                  const DataLayout &DL,
2278480093f4SDimitry Andric                                  const LoopAccessInfo *LAI) {
22795ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
22805ffd83dbSDimitry Andric 
2281e8d8bef9SDimitry Andric   // If there are live-out values, it is probably a reduction. We can predicate
2282e8d8bef9SDimitry Andric   // most reduction operations freely under MVE using a combination of
2283e8d8bef9SDimitry Andric   // prefer-predicated-reduction-select and inloop reductions. We limit this to
2284e8d8bef9SDimitry Andric   // floating point and integer reductions, but don't check for operators
2285e8d8bef9SDimitry Andric   // specifically here. If the value ends up not being a reduction (and so the
2286e8d8bef9SDimitry Andric   // vectorizer cannot tailfold the loop), we should fall back to standard
2287e8d8bef9SDimitry Andric   // vectorization automatically.
22885ffd83dbSDimitry Andric   SmallVector<Instruction *, 8> LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2290e8d8bef9SDimitry Andric   bool ReductionsDisabled =
22915ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::EnabledNoReductions ||
22925ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
22935ffd83dbSDimitry Andric 
22945ffd83dbSDimitry Andric   for (auto *I : LiveOuts) {
2295e8d8bef9SDimitry Andric     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2296e8d8bef9SDimitry Andric         !I->getType()->isHalfTy()) {
2297e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
22985ffd83dbSDimitry Andric                            "live-out value\n");
22995ffd83dbSDimitry Andric       return false;
23005ffd83dbSDimitry Andric     }
2301e8d8bef9SDimitry Andric     if (ReductionsDisabled) {
2302e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
23035ffd83dbSDimitry Andric       return false;
23045ffd83dbSDimitry Andric     }
23055ffd83dbSDimitry Andric   }
23065ffd83dbSDimitry Andric 
23075ffd83dbSDimitry Andric   // Next, check that all instructions can be tail-predicated.
2308480093f4SDimitry Andric   PredicatedScalarEvolution PSE = LAI->getPSE();
23095ffd83dbSDimitry Andric   SmallVector<Instruction *, 16> LoadStores;
2310480093f4SDimitry Andric   int ICmpCount = 0;
2311480093f4SDimitry Andric 
2312480093f4SDimitry Andric   for (BasicBlock *BB : L->blocks()) {
2313480093f4SDimitry Andric     for (Instruction &I : BB->instructionsWithoutDebug()) {
2314480093f4SDimitry Andric       if (isa<PHINode>(&I))
2315480093f4SDimitry Andric         continue;
2316480093f4SDimitry Andric       if (!canTailPredicateInstruction(I, ICmpCount)) {
2317480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2318480093f4SDimitry Andric         return false;
2319480093f4SDimitry Andric       }
2320480093f4SDimitry Andric 
2321480093f4SDimitry Andric       Type *T  = I.getType();
2322480093f4SDimitry Andric       if (T->getScalarSizeInBits() > 32) {
2323480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2324480093f4SDimitry Andric         return false;
2325480093f4SDimitry Andric       }
2326480093f4SDimitry Andric       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2327349cc55cSDimitry Andric         Value *Ptr = getLoadStorePointerOperand(&I);
2328349cc55cSDimitry Andric         Type *AccessTy = getLoadStoreType(&I);
2329bdd1243dSDimitry Andric         int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
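        // e.g. for i32 elements: a[i] has stride 1 (accepted just below),
        // a[2*i] has stride 2 (an interleaved vld2, rejected), and a[i*n]
        // with loop-invariant n may still be allowed once gathers/scatters
        // are enabled.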
2330e8d8bef9SDimitry Andric         if (NextStride == 1) {
2331480093f4SDimitry Andric           // TODO: for now only allow consecutive strides of 1. We could support
2332e8d8bef9SDimitry Andric           // other strides as long as they are uniform, but let's keep it simple
2333e8d8bef9SDimitry Andric           // for now.
2334e8d8bef9SDimitry Andric           continue;
2335e8d8bef9SDimitry Andric         } else if (NextStride == -1 ||
2336e8d8bef9SDimitry Andric                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2337e8d8bef9SDimitry Andric                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2338e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
2339e8d8bef9SDimitry Andric                      << "Strides of -1, 2 or 4 found; vld2/vld4 or reversed "
2340e8d8bef9SDimitry Andric                         "accesses can't be tail-predicated.\n");
2341e8d8bef9SDimitry Andric           return false;
2342e8d8bef9SDimitry Andric           // TODO: don't tail predicate if there is a reversed load?
2343e8d8bef9SDimitry Andric         } else if (EnableMaskedGatherScatters) {
2344e8d8bef9SDimitry Andric           // Gather/scatters do allow loading from arbitrary strides, at
2345e8d8bef9SDimitry Andric           // least if they are loop invariant.
2346e8d8bef9SDimitry Andric           // TODO: Loop variant strides should in theory work, too, but
2347e8d8bef9SDimitry Andric           // this requires further testing.
2348349cc55cSDimitry Andric           const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2349e8d8bef9SDimitry Andric           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2350e8d8bef9SDimitry Andric             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2351e8d8bef9SDimitry Andric             if (PSE.getSE()->isLoopInvariant(Step, L))
2352480093f4SDimitry Andric               continue;
2353480093f4SDimitry Andric           }
2354e8d8bef9SDimitry Andric         }
2355e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2356480093f4SDimitry Andric                              "tail-predicate.\n");
2357480093f4SDimitry Andric         return false;
2358480093f4SDimitry Andric       }
2359480093f4SDimitry Andric     }
2360480093f4SDimitry Andric   }
2361480093f4SDimitry Andric 
2362480093f4SDimitry Andric   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2363480093f4SDimitry Andric   return true;
2364480093f4SDimitry Andric }
2365480093f4SDimitry Andric 
236606c3fb27SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
23675ffd83dbSDimitry Andric   if (!EnableTailPredication) {
23685ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2369480093f4SDimitry Andric     return false;
23705ffd83dbSDimitry Andric   }
2371480093f4SDimitry Andric 
2372480093f4SDimitry Andric   // Creating a predicated vector loop is the first step for generating a
2373480093f4SDimitry Andric   // tail-predicated hardware loop, for which we need the MVE masked
2374480093f4SDimitry Andric   // load/stores instructions:
2375480093f4SDimitry Andric   if (!ST->hasMVEIntegerOps())
2376480093f4SDimitry Andric     return false;
2377480093f4SDimitry Andric 
237806c3fb27SDimitry Andric   LoopVectorizationLegality *LVL = TFI->LVL;
237906c3fb27SDimitry Andric   Loop *L = LVL->getLoop();
238006c3fb27SDimitry Andric 
2381480093f4SDimitry Andric   // For now, restrict this to single block loops.
2382480093f4SDimitry Andric   if (L->getNumBlocks() > 1) {
2383480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2384480093f4SDimitry Andric                          "loop.\n");
2385480093f4SDimitry Andric     return false;
2386480093f4SDimitry Andric   }
2387480093f4SDimitry Andric 
2388e8d8bef9SDimitry Andric   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2389480093f4SDimitry Andric 
239006c3fb27SDimitry Andric   LoopInfo *LI = LVL->getLoopInfo();
2391480093f4SDimitry Andric   HardwareLoopInfo HWLoopInfo(L);
2392480093f4SDimitry Andric   if (!HWLoopInfo.canAnalyze(*LI)) {
2393480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2394480093f4SDimitry Andric                          "analyzable.\n");
2395480093f4SDimitry Andric     return false;
2396480093f4SDimitry Andric   }
2397480093f4SDimitry Andric 
239806c3fb27SDimitry Andric   AssumptionCache *AC = LVL->getAssumptionCache();
239906c3fb27SDimitry Andric   ScalarEvolution *SE = LVL->getScalarEvolution();
240006c3fb27SDimitry Andric 
2401480093f4SDimitry Andric   // This checks if we have the low-overhead branch architecture
2402480093f4SDimitry Andric   // extension, and if we will create a hardware-loop:
240306c3fb27SDimitry Andric   if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2404480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2405480093f4SDimitry Andric                          "profitable.\n");
2406480093f4SDimitry Andric     return false;
2407480093f4SDimitry Andric   }
2408480093f4SDimitry Andric 
240906c3fb27SDimitry Andric   DominatorTree *DT = LVL->getDominatorTree();
241006c3fb27SDimitry Andric   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2411480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2412480093f4SDimitry Andric                          "a candidate.\n");
2413480093f4SDimitry Andric     return false;
2414480093f4SDimitry Andric   }
2415480093f4SDimitry Andric 
241606c3fb27SDimitry Andric   return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2417480093f4SDimitry Andric }
2418480093f4SDimitry Andric 
241906c3fb27SDimitry Andric TailFoldingStyle
242006c3fb27SDimitry Andric ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
24215ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
242206c3fb27SDimitry Andric     return TailFoldingStyle::DataWithoutLaneMask;
2423480093f4SDimitry Andric 
24245ffd83dbSDimitry Andric   // Intrinsic @llvm.get.active.lane.mask is supported.
24255ffd83dbSDimitry Andric   // It is used in the MVETailPredication pass, which requires the number of
24265ffd83dbSDimitry Andric   // elements processed by this vector loop to setup the tail-predicated
24275ffd83dbSDimitry Andric   // loop.
242806c3fb27SDimitry Andric   return TailFoldingStyle::Data;
24295ffd83dbSDimitry Andric }
24300b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2431349cc55cSDimitry Andric                                          TTI::UnrollingPreferences &UP,
2432349cc55cSDimitry Andric                                          OptimizationRemarkEmitter *ORE) {
24335f757f3fSDimitry Andric   // Enable upper-bound unrolling universally, provided that we do not see an
24345f757f3fSDimitry Andric   // active lane mask; such a loop is better kept whole to become tail
24355f757f3fSDimitry Andric   // predicated than conditionally unrolled.
24365f757f3fSDimitry Andric   UP.UpperBound =
24375f757f3fSDimitry Andric       !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
24385f757f3fSDimitry Andric         return isa<IntrinsicInst>(I) &&
24395f757f3fSDimitry Andric                cast<IntrinsicInst>(I).getIntrinsicID() ==
24405f757f3fSDimitry Andric                    Intrinsic::get_active_lane_mask;
24415f757f3fSDimitry Andric       });
2442fe6060f1SDimitry Andric 
24430b57cec5SDimitry Andric   // Only currently enable these preferences for M-Class cores.
24440b57cec5SDimitry Andric   if (!ST->isMClass())
2445349cc55cSDimitry Andric     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
24460b57cec5SDimitry Andric 
24470b57cec5SDimitry Andric   // Disable loop unrolling for Oz and Os.
24480b57cec5SDimitry Andric   UP.OptSizeThreshold = 0;
24490b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
24500b57cec5SDimitry Andric   if (L->getHeader()->getParent()->hasOptSize())
24510b57cec5SDimitry Andric     return;
24520b57cec5SDimitry Andric 
24530b57cec5SDimitry Andric   SmallVector<BasicBlock*, 4> ExitingBlocks;
24540b57cec5SDimitry Andric   L->getExitingBlocks(ExitingBlocks);
24550b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Loop has:\n"
24560b57cec5SDimitry Andric                     << "Blocks: " << L->getNumBlocks() << "\n"
24570b57cec5SDimitry Andric                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
24580b57cec5SDimitry Andric 
24590b57cec5SDimitry Andric   // Allow at most one exiting block other than the latch. This acts as an
24600b57cec5SDimitry Andric   // early exit, mirroring the profitability calculation of the runtime unroller.
24610b57cec5SDimitry Andric   if (ExitingBlocks.size() > 2)
24620b57cec5SDimitry Andric     return;
24630b57cec5SDimitry Andric 
24640b57cec5SDimitry Andric   // Limit the CFG of the loop body for targets with a branch predictor.
24650b57cec5SDimitry Andric   // Allowing 4 blocks permits if-then-else diamonds in the body.
24660b57cec5SDimitry Andric   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
24670b57cec5SDimitry Andric     return;
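  // Editor's note: four blocks admit a body such as
  //   if (c) { x(); } else { y(); }
  // i.e. a header with the compare-and-branch, a then-block, an else-block,
  // and the latch where the paths rejoin.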
24680b57cec5SDimitry Andric 
2469e8d8bef9SDimitry Andric   // Don't unroll vectorized loops, including the remainder loop.
2470e8d8bef9SDimitry Andric   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2471e8d8bef9SDimitry Andric     return;
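  // Editor's sketch: this is loop metadata left behind by the vectorizer,
  // attached as e.g.
  //   br i1 %done, label %exit, label %body, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.isvectorized", i32 1}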
2472e8d8bef9SDimitry Andric 
24730b57cec5SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
24740b57cec5SDimitry Andric   // inlining.
2475fe6060f1SDimitry Andric   InstructionCost Cost = 0;
24760b57cec5SDimitry Andric   for (auto *BB : L->getBlocks()) {
24770b57cec5SDimitry Andric     for (auto &I : *BB) {
2478480093f4SDimitry Andric       // Don't unroll vectorised loops. MVE does not benefit from unrolling as
2479480093f4SDimitry Andric       // much as scalar code does.
2480480093f4SDimitry Andric       if (I.getType()->isVectorTy())
2481480093f4SDimitry Andric         return;
2482480093f4SDimitry Andric 
24830b57cec5SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
24845ffd83dbSDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
24850b57cec5SDimitry Andric           if (!isLoweredToCall(F))
24860b57cec5SDimitry Andric             continue;
24870b57cec5SDimitry Andric         }
24880b57cec5SDimitry Andric         return;
24890b57cec5SDimitry Andric       }
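      // Editor's note: direct calls to functions that are not lowered to real
      // calls (isLoweredToCall returns false for most intrinsics, e.g.
      // llvm.fabs becoming a VABS instruction) hit the 'continue' above and
      // do not block unrolling.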
24908bcb0991SDimitry Andric 
2491e8d8bef9SDimitry Andric       SmallVector<const Value*, 4> Operands(I.operand_values());
2492bdd1243dSDimitry Andric       Cost += getInstructionCost(&I, Operands,
2493bdd1243dSDimitry Andric                                  TargetTransformInfo::TCK_SizeAndLatency);
24940b57cec5SDimitry Andric     }
24950b57cec5SDimitry Andric   }
24960b57cec5SDimitry Andric 
2497fe6060f1SDimitry Andric   // On v6m cores, there are very few registers available. We can easily end up
2498fe6060f1SDimitry Andric   // spilling and reloading more registers in an unrolled loop. Look at the
2499fe6060f1SDimitry Andric   // number of LCSSA phis as a rough measure of how many registers will need to
2500fe6060f1SDimitry Andric   // be live out of the loop, reducing the default unroll count if more than 1
2501fe6060f1SDimitry Andric   // value is needed. In the long run, all of this should be learnt by a
2502fe6060f1SDimitry Andric   // machine.
2503fe6060f1SDimitry Andric   unsigned UnrollCount = 4;
2504fe6060f1SDimitry Andric   if (ST->isThumb1Only()) {
2505fe6060f1SDimitry Andric     unsigned ExitingValues = 0;
2506fe6060f1SDimitry Andric     SmallVector<BasicBlock *, 4> ExitBlocks;
2507fe6060f1SDimitry Andric     L->getExitBlocks(ExitBlocks);
2508fe6060f1SDimitry Andric     for (auto *Exit : ExitBlocks) {
2509fe6060f1SDimitry Andric       // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2510fe6060f1SDimitry Andric       // only the last is expected to be needed for address operands.
2511fe6060f1SDimitry Andric       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2512fe6060f1SDimitry Andric         return PH.getNumOperands() != 1 ||
2513fe6060f1SDimitry Andric                !isa<GetElementPtrInst>(PH.getOperand(0));
2514fe6060f1SDimitry Andric       });
2515fe6060f1SDimitry Andric       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2516fe6060f1SDimitry Andric     }
2517fe6060f1SDimitry Andric     if (ExitingValues)
2518fe6060f1SDimitry Andric       UnrollCount /= ExitingValues;
2519fe6060f1SDimitry Andric     if (UnrollCount <= 1)
2520fe6060f1SDimitry Andric       return;
2521fe6060f1SDimitry Andric   }
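  // Editor's example of the arithmetic above: starting from UnrollCount = 4,
  // two live-out LCSSA phis give 4 / 2 = 2, while three or more give
  // 4 / 3 == 1 and runtime unrolling is skipped via the early return.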
2522fe6060f1SDimitry Andric 
25230b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2524fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
25250b57cec5SDimitry Andric 
25260b57cec5SDimitry Andric   UP.Partial = true;
25270b57cec5SDimitry Andric   UP.Runtime = true;
25280b57cec5SDimitry Andric   UP.UnrollRemainder = true;
2529fe6060f1SDimitry Andric   UP.DefaultUnrollRuntimeCount = UnrollCount;
25300b57cec5SDimitry Andric   UP.UnrollAndJam = true;
25310b57cec5SDimitry Andric   UP.UnrollAndJamInnerLoopThreshold = 60;
25320b57cec5SDimitry Andric 
25330b57cec5SDimitry Andric   // Force-unrolling small loops can be very useful because of the
25340b57cec5SDimitry Andric   // branch-taken cost of the backedge.
25350b57cec5SDimitry Andric   if (Cost < 12)
25360b57cec5SDimitry Andric     UP.Force = true;
25370b57cec5SDimitry Andric }
25388bcb0991SDimitry Andric 
25395ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
25405ffd83dbSDimitry Andric                                        TTI::PeelingPreferences &PP) {
25415ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
25425ffd83dbSDimitry Andric }
25435ffd83dbSDimitry Andric 
2544e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2545e8d8bef9SDimitry Andric                                        TTI::ReductionFlags Flags) const {
2546e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2547e8d8bef9SDimitry Andric     return false;
2548e8d8bef9SDimitry Andric 
2549e8d8bef9SDimitry Andric   unsigned ScalarBits = Ty->getScalarSizeInBits();
2550e8d8bef9SDimitry Andric   switch (Opcode) {
2551e8d8bef9SDimitry Andric   case Instruction::Add:
2552e8d8bef9SDimitry Andric     return ScalarBits <= 64;
2553e8d8bef9SDimitry Andric   default:
2554e8d8bef9SDimitry Andric     return false;
2555e8d8bef9SDimitry Andric   }
2556e8d8bef9SDimitry Andric }
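
// Editor's illustration for the Add case above (an assumption about typical
// lowering): an in-loop reduction such as
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// can map onto MVE's across-vector VADDV/VADDVA instructions, avoiding a
// vector partial sum that must be reduced again after the loop.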
2557e8d8bef9SDimitry Andric 
2558e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect(
2559e8d8bef9SDimitry Andric     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2560e8d8bef9SDimitry Andric   return ST->hasMVEIntegerOps();
2563e8d8bef9SDimitry Andric }
2564bdd1243dSDimitry Andric 
2565bdd1243dSDimitry Andric InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2566bdd1243dSDimitry Andric                                                  int64_t BaseOffset,
2567bdd1243dSDimitry Andric                                                  bool HasBaseReg, int64_t Scale,
2568bdd1243dSDimitry Andric                                                  unsigned AddrSpace) const {
2569bdd1243dSDimitry Andric   TargetLoweringBase::AddrMode AM;
2570bdd1243dSDimitry Andric   AM.BaseGV = BaseGV;
2571bdd1243dSDimitry Andric   AM.BaseOffs = BaseOffset;
2572bdd1243dSDimitry Andric   AM.HasBaseReg = HasBaseReg;
2573bdd1243dSDimitry Andric   AM.Scale = Scale;
2574bdd1243dSDimitry Andric   if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2575bdd1243dSDimitry Andric     if (ST->hasFPAO())
2576bdd1243dSDimitry Andric       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2577bdd1243dSDimitry Andric     return 0;
2578bdd1243dSDimitry Andric   }
2579bdd1243dSDimitry Andric   return -1;
2580bdd1243dSDimitry Andric }
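
// Editor's sketch for getScalingFactorCost above: a legal scaled mode such as
//   ldr r0, [r1, r2, lsl #2]   ; base plus index shifted left by 2
// costs 0; on cores with FPAO a legal negative scale, e.g. [r1, -r2], costs 1
// because positive offsets execute faster; and -1 marks the addressing mode
// as unsupported for this type and address space.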
258106c3fb27SDimitry Andric 
258206c3fb27SDimitry Andric bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
258306c3fb27SDimitry Andric   if (Thumb) {
258406c3fb27SDimitry Andric     // B.W is available in any Thumb2-supporting target, and also in every
258506c3fb27SDimitry Andric     // version of Armv8-M, even Baseline, which does not include the rest of
258606c3fb27SDimitry Andric     // Thumb2.
258706c3fb27SDimitry Andric     return ST->isThumb2() || ST->hasV8MBaselineOps();
258806c3fb27SDimitry Andric   } else {
258906c3fb27SDimitry Andric     // B is available in all versions of the Arm ISA, so the only question is
259006c3fb27SDimitry Andric     // whether that ISA is available at all.
259106c3fb27SDimitry Andric     return ST->hasARMOps();
259206c3fb27SDimitry Andric   }
259306c3fb27SDimitry Andric }
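
// Editor's note on why the distinction matters (ranges are an assumption from
// the architecture manuals): the Thumb2 B.W encoding reaches roughly +/-16 MiB,
// whereas the narrow 16-bit Thumb B reaches only about +/-2 KiB, so targets
// without a wide branch may need indirect branches for far jumps.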