//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
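/// For example (a sketch; the pointer and alignment are illustrative), a call
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 16)
/// becomes a plain load through a bitcast pointer:
///   %b = bitcast i8* %p to <4 x i32>*
///   %v = load <4 x i32>, <4 x i32>* %b, align 16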
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

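// Prefer post-indexed addressing when MVE is available, no particular
// addressing mode when optimising for size, and pre-indexed addressing for
// simple single-block loops on Thumb2 M-class cores.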
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

Optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
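    // If the pointer is known to be better aligned than the intrinsic's
    // alignment argument claims, raise the argument to match.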
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.getNumArgOperands() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

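  // arm_mve_pred_i2v(arm_mve_pred_v2i(x)) cancels out to x, and xor'ing the
  // intermediate scalar predicate with 0xffff is equivalent to complementing
  // the original vector predicate.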
  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnesValue()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
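  // The scalar result of an arm_mve_pred_v2i only ever carries 16 meaningful
  // bits, so attach !range [0, 0x10000) metadata if it is not already there.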
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      return &II;
    }
    break;
  }
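  // Only bit 29 of the carry operand is consumed; it corresponds to the
  // position of the C flag within FPSCR.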
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
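  // An add of a vmldava result with a zero accumulator folds into the
  // accumulator operand: add(vmldava(.., 0, x, y), z) -> vmldava(.., z, x, y).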
  case Intrinsic::arm_mve_vmldava: {
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return None;
  }
  }
  return None;
}

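// Estimate the cost of materialising an integer immediate, roughly as the
// number of instructions needed: 1 for a single mov/mvn or an encodable
// shifter-operand value, 2 for a movw/movt pair on v6t2, and 3 for a load
// from the constant pool.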
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1: any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero; otherwise we return 1.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match an SSAT instruction.
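// For example, with Imm == -128, smax(smin(x, 127), -128) clamps x to the
// range of an i8, which a single "ssat #8" instruction can perform.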
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)) ||
        (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
                               isSSatMin(*(++Inst->user_begin())))))
      return true;
  }
  return false;
}

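// Cost of an immediate when its use context is known: immediates that fold
// directly into the using instruction (or a cheap variant of it) are free.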
InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded into an MVN.
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
            (EltVT == MVT::f64 && ST->hasFP64()) ||
            (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads and truncating masked stores are expensive because
  // we currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so the
        // load needs to be split. This introduces an extra load operation,
        // but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of the extends
    // above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised, so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp conversions that were not matched above are
    // scalarized and cost one vcvt per lane, so long as the instruction is
    // available. If not, they will become a series of function calls.
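    // For example (an illustration of the logic below, not a measured cost),
    // a v4f32->v4f64 fpext costs roughly 4 vcvts when the target has FP64,
    // and 4 times the cost of a call when it does not.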
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }

  if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
      SrcTy.isFixedLengthVector()) {
    // Treat a truncate with a larger-than-legal source (128 bits for MVE) as
    // expensive: 2 instructions per lane.
    if ((SrcTy.getScalarType() == MVT::i8 ||
         SrcTy.getScalarType() == MVT::i16 ||
         SrcTy.getScalarType() == MVT::i32) &&
        SrcTy.getSizeInBits() > 128 &&
        SrcTy.getSizeInBits() > DstTy.getSizeInBits())
      return SrcTy.getVectorNumElements() * 2;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  // Penalize inserting into a D-subregister. We end up with an estimated
  // throughput that is three times lower on Swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross-class copy, this likely leads to mixing
    // of NEON and VFP code and should therefore be penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integers involve being passed through GPR
    // registers, causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
8400b57cec5SDimitry Andric 
841fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
842fe6060f1SDimitry Andric                                                Type *CondTy,
843e8d8bef9SDimitry Andric                                                CmpInst::Predicate VecPred,
8445ffd83dbSDimitry Andric                                                TTI::TargetCostKind CostKind,
8450b57cec5SDimitry Andric                                                const Instruction *I) {
8460b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
847e8d8bef9SDimitry Andric 
848e8d8bef9SDimitry Andric   // Thumb scalar code size cost for select.
849e8d8bef9SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
850e8d8bef9SDimitry Andric       ST->isThumb() && !ValTy->isVectorTy()) {
851e8d8bef9SDimitry Andric     // Assume expensive structs.
852e8d8bef9SDimitry Andric     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
853e8d8bef9SDimitry Andric       return TTI::TCC_Expensive;
854e8d8bef9SDimitry Andric 
855e8d8bef9SDimitry Andric     // Select costs can vary because they:
856e8d8bef9SDimitry Andric     // - may require one or more conditional mov (including an IT),
857e8d8bef9SDimitry Andric     // - can't operate directly on immediates,
858e8d8bef9SDimitry Andric     // - require live flags, which we can't copy around easily.
859fe6060f1SDimitry Andric     InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
860e8d8bef9SDimitry Andric 
861e8d8bef9SDimitry Andric     // Possible IT instruction for Thumb2, or more for Thumb1.
862e8d8bef9SDimitry Andric     ++Cost;
863e8d8bef9SDimitry Andric 
864e8d8bef9SDimitry Andric     // i1 values may need rematerialising by using mov immediates and/or
865e8d8bef9SDimitry Andric     // flag setting instructions.
866e8d8bef9SDimitry Andric     if (ValTy->isIntegerTy(1))
867e8d8bef9SDimitry Andric       ++Cost;
868e8d8bef9SDimitry Andric 
869e8d8bef9SDimitry Andric     return Cost;
870e8d8bef9SDimitry Andric   }
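
  // A minimal sketch of the accounting above, assuming a type that legalizes
  // in one step (LT.first == 1): a scalar i32 select is costed
  // 1 + 1 (possible IT) = 2, and an i1 select is costed 2 + 1 for
  // rematerializing the i1 = 3.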
871e8d8bef9SDimitry Andric 
872fe6060f1SDimitry Andric   // If this is a vector min/max/abs, use the cost of that intrinsic directly
873fe6060f1SDimitry Andric   // instead. Hopefully when min/max intrinsics are more prevalent this code
874fe6060f1SDimitry Andric   // will not be needed.
875fe6060f1SDimitry Andric   const Instruction *Sel = I;
876fe6060f1SDimitry Andric   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
877fe6060f1SDimitry Andric       Sel->hasOneUse())
878fe6060f1SDimitry Andric     Sel = cast<Instruction>(Sel->user_back());
879fe6060f1SDimitry Andric   if (Sel && ValTy->isVectorTy() &&
880fe6060f1SDimitry Andric       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
881fe6060f1SDimitry Andric     const Value *LHS, *RHS;
882fe6060f1SDimitry Andric     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
883fe6060f1SDimitry Andric     unsigned IID = 0;
884fe6060f1SDimitry Andric     switch (SPF) {
885fe6060f1SDimitry Andric     case SPF_ABS:
886fe6060f1SDimitry Andric       IID = Intrinsic::abs;
887fe6060f1SDimitry Andric       break;
888fe6060f1SDimitry Andric     case SPF_SMIN:
889fe6060f1SDimitry Andric       IID = Intrinsic::smin;
890fe6060f1SDimitry Andric       break;
891fe6060f1SDimitry Andric     case SPF_SMAX:
892fe6060f1SDimitry Andric       IID = Intrinsic::smax;
893fe6060f1SDimitry Andric       break;
894fe6060f1SDimitry Andric     case SPF_UMIN:
895fe6060f1SDimitry Andric       IID = Intrinsic::umin;
896fe6060f1SDimitry Andric       break;
897fe6060f1SDimitry Andric     case SPF_UMAX:
898fe6060f1SDimitry Andric       IID = Intrinsic::umax;
899fe6060f1SDimitry Andric       break;
900fe6060f1SDimitry Andric     case SPF_FMINNUM:
901fe6060f1SDimitry Andric       IID = Intrinsic::minnum;
902fe6060f1SDimitry Andric       break;
903fe6060f1SDimitry Andric     case SPF_FMAXNUM:
904fe6060f1SDimitry Andric       IID = Intrinsic::maxnum;
905fe6060f1SDimitry Andric       break;
906fe6060f1SDimitry Andric     default:
907fe6060f1SDimitry Andric       break;
908fe6060f1SDimitry Andric     }
909fe6060f1SDimitry Andric     if (IID) {
910fe6060f1SDimitry Andric       // The ICmp is free; the select gets the cost of the min/max/etc.
911fe6060f1SDimitry Andric       if (Sel != I)
912fe6060f1SDimitry Andric         return 0;
913fe6060f1SDimitry Andric       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
914fe6060f1SDimitry Andric       return getIntrinsicInstrCost(CostAttrs, CostKind);
915fe6060f1SDimitry Andric     }
916fe6060f1SDimitry Andric   }
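
  // E.g. an icmp/select pair that matchSelectPattern recognises as a signed
  // minimum is costed as a single llvm.smin intrinsic, and the icmp itself
  // reports a cost of 0:
  //   %c = icmp slt <4 x i32> %a, %b
  //   %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b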
917fe6060f1SDimitry Andric 
9180b57cec5SDimitry Andric   // On NEON a vector select gets lowered to vbsl.
919e8d8bef9SDimitry Andric   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
9200b57cec5SDimitry Andric     // Lowering of some vector selects is currently far from perfect.
9210b57cec5SDimitry Andric     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
9220b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
9230b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
9240b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
9250b57cec5SDimitry Andric     };
9260b57cec5SDimitry Andric 
9270b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
9280b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
9290b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
9300b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
9310b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
9320b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
9330b57cec5SDimitry Andric         return Entry->Cost;
9340b57cec5SDimitry Andric     }
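
    // E.g. from the table above, a select between <16 x i64> vectors with a
    // <16 x i1> condition is deliberately priced at 100, steering the
    // vectorizer away from such wide selects.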
9350b57cec5SDimitry Andric 
936fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
937fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
9380b57cec5SDimitry Andric     return LT.first;
9390b57cec5SDimitry Andric   }
9400b57cec5SDimitry Andric 
941fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
942fe6060f1SDimitry Andric       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
943fe6060f1SDimitry Andric       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
944fe6060f1SDimitry Andric     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
945fe6060f1SDimitry Andric     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
946fe6060f1SDimitry Andric     if (!VecCondTy)
947fe6060f1SDimitry Andric       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
948fe6060f1SDimitry Andric 
949fe6060f1SDimitry Andric     // If we don't have mve.fp, any fp operations will need to be scalarized.
950fe6060f1SDimitry Andric     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
951fe6060f1SDimitry Andric       // One scalarization insert, one scalarization extract and the cost of
952fe6060f1SDimitry Andric       // the fcmps.
953fe6060f1SDimitry Andric       return BaseT::getScalarizationOverhead(VecValTy, false, true) +
954fe6060f1SDimitry Andric              BaseT::getScalarizationOverhead(VecCondTy, true, false) +
955fe6060f1SDimitry Andric              VecValTy->getNumElements() *
956fe6060f1SDimitry Andric                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
957fe6060f1SDimitry Andric                                     VecCondTy->getScalarType(), VecPred, CostKind,
958fe6060f1SDimitry Andric                                     I);
959fe6060f1SDimitry Andric     }
960fe6060f1SDimitry Andric 
961fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
962fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
963fe6060f1SDimitry Andric     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
964fe6060f1SDimitry Andric     // There are two types - the input that specifies the type of the compare
965fe6060f1SDimitry Andric     // and the output vXi1 type. Because we don't know how the output will be
966fe6060f1SDimitry Andric     // split, we may need an expensive shuffle to get the two in sync. This
967fe6060f1SDimitry Andric     // has the effect of making larger-than-legal compares (v8i32 for example)
968fe6060f1SDimitry Andric     // expensive.
969fe6060f1SDimitry Andric     if (LT.second.getVectorNumElements() > 2) {
970fe6060f1SDimitry Andric       if (LT.first > 1)
971fe6060f1SDimitry Andric         return LT.first * BaseCost +
972fe6060f1SDimitry Andric                BaseT::getScalarizationOverhead(VecCondTy, true, false);
973fe6060f1SDimitry Andric       return BaseCost;
974fe6060f1SDimitry Andric     }
975fe6060f1SDimitry Andric   }
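
  // E.g. a v8i32 icmp under MVE legalizes to two v4i32 compares
  // (LT.first == 2), so it is costed 2 * BaseCost plus the scalarization
  // overhead of keeping the two halves of the v8i1 result in sync.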
976fe6060f1SDimitry Andric 
977e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
978e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
979e8d8bef9SDimitry Andric   int BaseCost = 1;
980fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
981fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
982e8d8bef9SDimitry Andric 
983e8d8bef9SDimitry Andric   return BaseCost *
984e8d8bef9SDimitry Andric          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
9850b57cec5SDimitry Andric }
9860b57cec5SDimitry Andric 
987fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
988fe6060f1SDimitry Andric                                                       ScalarEvolution *SE,
9890b57cec5SDimitry Andric                                                       const SCEV *Ptr) {
9900b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
9910b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
9920b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
9930b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
9940b57cec5SDimitry Andric   unsigned NumVectorInstToHideOverhead = 10;
9950b57cec5SDimitry Andric   int MaxMergeDistance = 64;
9960b57cec5SDimitry Andric 
9978bcb0991SDimitry Andric   if (ST->hasNEON()) {
9980b57cec5SDimitry Andric     if (Ty->isVectorTy() && SE &&
9990b57cec5SDimitry Andric         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
10000b57cec5SDimitry Andric       return NumVectorInstToHideOverhead;
10010b57cec5SDimitry Andric 
10020b57cec5SDimitry Andric     // In many cases the address computation is not merged into the instruction
10030b57cec5SDimitry Andric     // addressing mode.
10040b57cec5SDimitry Andric     return 1;
10050b57cec5SDimitry Andric   }
10068bcb0991SDimitry Andric   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
10078bcb0991SDimitry Andric }
10088bcb0991SDimitry Andric 
10095ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
10105ffd83dbSDimitry Andric   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10115ffd83dbSDimitry Andric     // If a VCTP is part of a chain, it's already profitable and shouldn't be
10125ffd83dbSDimitry Andric     // optimized; otherwise LSR may block tail-predication.
10135ffd83dbSDimitry Andric     switch (II->getIntrinsicID()) {
10145ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp8:
10155ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp16:
10165ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp32:
10175ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp64:
10185ffd83dbSDimitry Andric       return true;
10195ffd83dbSDimitry Andric     default:
10205ffd83dbSDimitry Andric       break;
10215ffd83dbSDimitry Andric     }
10225ffd83dbSDimitry Andric   }
10235ffd83dbSDimitry Andric   return false;
10245ffd83dbSDimitry Andric }
10255ffd83dbSDimitry Andric 
10265ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
10278bcb0991SDimitry Andric   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
10288bcb0991SDimitry Andric     return false;
10298bcb0991SDimitry Andric 
10305ffd83dbSDimitry Andric   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
10318bcb0991SDimitry Andric     // Don't support v2i1 yet.
10328bcb0991SDimitry Andric     if (VecTy->getNumElements() == 2)
10338bcb0991SDimitry Andric       return false;
10348bcb0991SDimitry Andric 
10358bcb0991SDimitry Andric     // We don't support extending fp types.
10368bcb0991SDimitry Andric     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
10378bcb0991SDimitry Andric     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
10388bcb0991SDimitry Andric       return false;
10398bcb0991SDimitry Andric   }
10408bcb0991SDimitry Andric 
10418bcb0991SDimitry Andric   unsigned EltWidth = DataTy->getScalarSizeInBits();
10425ffd83dbSDimitry Andric   return (EltWidth == 32 && Alignment >= 4) ||
10435ffd83dbSDimitry Andric          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
10448bcb0991SDimitry Andric }
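
// E.g. a masked load of <8 x i16> with alignment 2 satisfies the check above
// and is legal, while a <4 x i32> load with alignment 2 is not, since 32-bit
// elements require 4-byte alignment.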
10450b57cec5SDimitry Andric 
10465ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1047480093f4SDimitry Andric   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1048480093f4SDimitry Andric     return false;
1049480093f4SDimitry Andric 
1050480093f4SDimitry Andric   // This method is called in 2 places:
1051480093f4SDimitry Andric   //  - from the vectorizer with a scalar type, in which case we need to get
1052480093f4SDimitry Andric   //  this as good as we can with the limited info we have (and rely on the cost
1053480093f4SDimitry Andric   //  model for the rest).
1054480093f4SDimitry Andric   //  - from the masked intrinsic lowering pass with the actual vector type.
1055480093f4SDimitry Andric   // For MVE, we have a custom lowering pass that will already have custom
1056480093f4SDimitry Andric   // legalised any gathers that we can to MVE intrinsics, and want to expand all
1057480093f4SDimitry Andric   // the rest. The pass runs before the masked intrinsic lowering pass, so if we
1058480093f4SDimitry Andric   // are here, we know we want to expand.
1059480093f4SDimitry Andric   if (isa<VectorType>(Ty))
1060480093f4SDimitry Andric     return false;
1061480093f4SDimitry Andric 
1062480093f4SDimitry Andric   unsigned EltWidth = Ty->getScalarSizeInBits();
10635ffd83dbSDimitry Andric   return ((EltWidth == 32 && Alignment >= 4) ||
10645ffd83dbSDimitry Andric           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1065480093f4SDimitry Andric }
1066480093f4SDimitry Andric 
1067e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory
1068e8d8bef9SDimitry Andric /// operations performed by querying findOptimalMemOpLowering. Returns -1 if a
1069e8d8bef9SDimitry Andric /// library call is used instead.
1070e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1071e8d8bef9SDimitry Andric   MemOp MOp;
1072e8d8bef9SDimitry Andric   unsigned DstAddrSpace = ~0u;
1073e8d8bef9SDimitry Andric   unsigned SrcAddrSpace = ~0u;
1074e8d8bef9SDimitry Andric   const Function *F = I->getParent()->getParent();
10750b57cec5SDimitry Andric 
1076e8d8bef9SDimitry Andric   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1077e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
10780b57cec5SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
10790b57cec5SDimitry Andric     if (!C)
1080e8d8bef9SDimitry Andric       return -1;
10810b57cec5SDimitry Andric 
10820b57cec5SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1083e8d8bef9SDimitry Andric     const Align DstAlign = *MC->getDestAlign();
1084e8d8bef9SDimitry Andric     const Align SrcAlign = *MC->getSourceAlign();
1085e8d8bef9SDimitry Andric 
1086e8d8bef9SDimitry Andric     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1087e8d8bef9SDimitry Andric                       /*IsVolatile*/ false);
1088e8d8bef9SDimitry Andric     DstAddrSpace = MC->getDestAddressSpace();
1089e8d8bef9SDimitry Andric     SrcAddrSpace = MC->getSourceAddressSpace();
1090e8d8bef9SDimitry Andric   }
1091e8d8bef9SDimitry Andric   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1092e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1093e8d8bef9SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
1094e8d8bef9SDimitry Andric     if (!C)
1095e8d8bef9SDimitry Andric       return -1;
1096e8d8bef9SDimitry Andric 
1097e8d8bef9SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1098e8d8bef9SDimitry Andric     const Align DstAlign = *MS->getDestAlign();
1099e8d8bef9SDimitry Andric 
1100e8d8bef9SDimitry Andric     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1101e8d8bef9SDimitry Andric                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1102e8d8bef9SDimitry Andric     DstAddrSpace = MS->getDestAddressSpace();
1103e8d8bef9SDimitry Andric   }
1104e8d8bef9SDimitry Andric   else
1105e8d8bef9SDimitry Andric     llvm_unreachable("Expected a memcpy/move or memset!");
1106e8d8bef9SDimitry Andric 
1107e8d8bef9SDimitry Andric   unsigned Limit, Factor = 2;
1108e8d8bef9SDimitry Andric   switch(I->getIntrinsicID()) {
1109e8d8bef9SDimitry Andric     case Intrinsic::memcpy:
1110e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1111e8d8bef9SDimitry Andric       break;
1112e8d8bef9SDimitry Andric     case Intrinsic::memmove:
1113e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1114e8d8bef9SDimitry Andric       break;
1115e8d8bef9SDimitry Andric     case Intrinsic::memset:
1116e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1117e8d8bef9SDimitry Andric       Factor = 1;
1118e8d8bef9SDimitry Andric       break;
1119e8d8bef9SDimitry Andric     default:
1120e8d8bef9SDimitry Andric       llvm_unreachable("Expected a memcpy/move or memset!");
1121e8d8bef9SDimitry Andric   }
11220b57cec5SDimitry Andric 
11230b57cec5SDimitry Andric   // MemOps will be populated with a list of data types that need to be
11240b57cec5SDimitry Andric   // loaded and stored. That's why we multiply the number of elements by 2 to
11250b57cec5SDimitry Andric   // get the cost for this memcpy.
1126e8d8bef9SDimitry Andric   std::vector<EVT> MemOps;
11270b57cec5SDimitry Andric   if (getTLI()->findOptimalMemOpLowering(
1128e8d8bef9SDimitry Andric           MemOps, Limit, MOp, DstAddrSpace,
1129e8d8bef9SDimitry Andric           SrcAddrSpace, F->getAttributes()))
1130e8d8bef9SDimitry Andric     return MemOps.size() * Factor;
11310b57cec5SDimitry Andric 
11320b57cec5SDimitry Andric   // If we can't find an optimal memop lowering, return the default cost.
1133e8d8bef9SDimitry Andric   return -1;
1134e8d8bef9SDimitry Andric }
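
// A hedged illustration: a 16-byte memcpy with 4-byte-aligned operands would
// typically be lowered to four i32 loads and four i32 stores, so
// findOptimalMemOpLowering fills MemOps with four EVTs and this returns
// 4 * Factor(2) = 8 memory operations.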
1135e8d8bef9SDimitry Andric 
1136fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1137e8d8bef9SDimitry Andric   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1138e8d8bef9SDimitry Andric 
1139e8d8bef9SDimitry Andric   // To model the cost of a library call, we assume 1 for the call, and
1140e8d8bef9SDimitry Andric   // 3 for the argument setup.
1141e8d8bef9SDimitry Andric   if (NumOps == -1)
1142e8d8bef9SDimitry Andric     return 4;
1143e8d8bef9SDimitry Andric   return NumOps;
11440b57cec5SDimitry Andric }
11450b57cec5SDimitry Andric 
1146fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1147fe6060f1SDimitry Andric                                            VectorType *Tp, ArrayRef<int> Mask,
11485ffd83dbSDimitry Andric                                            int Index, VectorType *SubTp) {
1149fe6060f1SDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask);
11508bcb0991SDimitry Andric   if (ST->hasNEON()) {
11510b57cec5SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
11520b57cec5SDimitry Andric       static const CostTblEntry NEONDupTbl[] = {
11530b57cec5SDimitry Andric           // VDUP handles these cases.
11540b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
11550b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
11560b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
11570b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
11580b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
11590b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
11600b57cec5SDimitry Andric 
11610b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
11620b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
11630b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
11640b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
11650b57cec5SDimitry Andric 
1166fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
11678bcb0991SDimitry Andric       if (const auto *Entry =
11688bcb0991SDimitry Andric               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
11690b57cec5SDimitry Andric         return LT.first * Entry->Cost;
11700b57cec5SDimitry Andric     }
11710b57cec5SDimitry Andric     if (Kind == TTI::SK_Reverse) {
11720b57cec5SDimitry Andric       static const CostTblEntry NEONShuffleTbl[] = {
11730b57cec5SDimitry Andric           // Reverse shuffles cost one instruction if we are shuffling within a
11740b57cec5SDimitry Andric           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
11750b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
11760b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
11770b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
11780b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
11790b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
11800b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
11810b57cec5SDimitry Andric 
11820b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
11830b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
11840b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
11850b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
11860b57cec5SDimitry Andric 
1187fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
11888bcb0991SDimitry Andric       if (const auto *Entry =
11898bcb0991SDimitry Andric               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
11900b57cec5SDimitry Andric         return LT.first * Entry->Cost;
11910b57cec5SDimitry Andric     }
11920b57cec5SDimitry Andric     if (Kind == TTI::SK_Select) {
11930b57cec5SDimitry Andric       static const CostTblEntry NEONSelShuffleTbl[] = {
11948bcb0991SDimitry Andric           // Select shuffle cost table for ARM. Cost is the number of
11958bcb0991SDimitry Andric           // instructions required to create the shuffled vector.
11970b57cec5SDimitry Andric 
11980b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
11990b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12000b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12010b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12020b57cec5SDimitry Andric 
12030b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12040b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12050b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
12060b57cec5SDimitry Andric 
12070b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
12080b57cec5SDimitry Andric 
12090b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
12100b57cec5SDimitry Andric 
1211fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12120b57cec5SDimitry Andric       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
12130b57cec5SDimitry Andric                                               ISD::VECTOR_SHUFFLE, LT.second))
12140b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12150b57cec5SDimitry Andric     }
12168bcb0991SDimitry Andric   }
12178bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps()) {
12188bcb0991SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12198bcb0991SDimitry Andric       static const CostTblEntry MVEDupTbl[] = {
12208bcb0991SDimitry Andric           // VDUP handles these cases.
12218bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12228bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12238bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
12248bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12258bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
12268bcb0991SDimitry Andric 
1227fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12288bcb0991SDimitry Andric       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
12298bcb0991SDimitry Andric                                               LT.second))
1230fe6060f1SDimitry Andric         return LT.first * Entry->Cost *
1231fe6060f1SDimitry Andric                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
12320b57cec5SDimitry Andric     }
12330b57cec5SDimitry Andric 
1234fe6060f1SDimitry Andric     if (!Mask.empty()) {
1235fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1236fe6060f1SDimitry Andric       if (Mask.size() <= LT.second.getVectorNumElements() &&
1237fe6060f1SDimitry Andric           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1238fe6060f1SDimitry Andric            isVREVMask(Mask, LT.second, 64)))
1239fe6060f1SDimitry Andric         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1240fe6060f1SDimitry Andric     }
1241fe6060f1SDimitry Andric   }
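
  // E.g. the mask <1,0,3,2> on a v4i32 shuffle matches the 64-bit VREV check
  // above, so the whole shuffle is priced as a single beat-adjusted MVE
  // instruction.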
1242fe6060f1SDimitry Andric 
1243fe6060f1SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1244fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1245fe6060f1SDimitry Andric                      : 1;
1246fe6060f1SDimitry Andric   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1247fe6060f1SDimitry Andric }
1248fe6060f1SDimitry Andric 
1249fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1250fe6060f1SDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1251fe6060f1SDimitry Andric     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1252480093f4SDimitry Andric     TTI::OperandValueProperties Opd1PropInfo,
1253fe6060f1SDimitry Andric     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1254480093f4SDimitry Andric     const Instruction *CxtI) {
12550b57cec5SDimitry Andric   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1256e8d8bef9SDimitry Andric   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1257e8d8bef9SDimitry Andric     // Make operations on i1 relatively expensive as this often involves
1258e8d8bef9SDimitry Andric     // combining predicates. AND and XOR should be easier to handle with IT
1259e8d8bef9SDimitry Andric     // blocks.
1260e8d8bef9SDimitry Andric     switch (ISDOpcode) {
1261e8d8bef9SDimitry Andric     default:
1262e8d8bef9SDimitry Andric       break;
1263e8d8bef9SDimitry Andric     case ISD::AND:
1264e8d8bef9SDimitry Andric     case ISD::XOR:
1265e8d8bef9SDimitry Andric       return 2;
1266e8d8bef9SDimitry Andric     case ISD::OR:
1267e8d8bef9SDimitry Andric       return 3;
1268e8d8bef9SDimitry Andric     }
1269e8d8bef9SDimitry Andric   }
1270e8d8bef9SDimitry Andric 
1271fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
12720b57cec5SDimitry Andric 
1273480093f4SDimitry Andric   if (ST->hasNEON()) {
12740b57cec5SDimitry Andric     const unsigned FunctionCallDivCost = 20;
12750b57cec5SDimitry Andric     const unsigned ReciprocalDivCost = 10;
12760b57cec5SDimitry Andric     static const CostTblEntry CostTbl[] = {
12770b57cec5SDimitry Andric       // Division.
12780b57cec5SDimitry Andric       // These costs are somewhat random. Choose a cost of 20 to indicate that
12790b57cec5SDimitry Andric       // vectorizing division (an added function call) is going to be very expensive.
12800b57cec5SDimitry Andric       // Double registers types.
12810b57cec5SDimitry Andric       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
12820b57cec5SDimitry Andric       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
12830b57cec5SDimitry Andric       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
12840b57cec5SDimitry Andric       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
12850b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
12860b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
12870b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
12880b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
12890b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
12900b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
12910b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
12920b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
12930b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
12940b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
12950b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
12960b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
12970b57cec5SDimitry Andric       // Quad register types.
12980b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
12990b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13000b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
13010b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
13020b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13030b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13040b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
13050b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
13060b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13070b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13080b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
13090b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
13100b57cec5SDimitry Andric       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13110b57cec5SDimitry Andric       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13120b57cec5SDimitry Andric       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
13130b57cec5SDimitry Andric       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
13140b57cec5SDimitry Andric       // Multiplication.
13150b57cec5SDimitry Andric     };
13160b57cec5SDimitry Andric 
13170b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
13180b57cec5SDimitry Andric       return LT.first * Entry->Cost;
13190b57cec5SDimitry Andric 
1320fe6060f1SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1321fe6060f1SDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
13220b57cec5SDimitry Andric 
13230b57cec5SDimitry Andric     // This is somewhat of a hack. The problem that we are facing is that SROA
13240b57cec5SDimitry Andric     // creates sequences of shift/and/or instructions to construct values.
13250b57cec5SDimitry Andric     // These sequences are recognized by ISel and have zero cost. Not so for
13260b57cec5SDimitry Andric     // the vectorized code. Because we have support for v2i64 but not i64 those
13270b57cec5SDimitry Andric     // sequences look particularly beneficial to vectorize.
13280b57cec5SDimitry Andric     // To work around this we increase the cost of v2i64 operations to make them
13290b57cec5SDimitry Andric     // seem less beneficial.
13300b57cec5SDimitry Andric     if (LT.second == MVT::v2i64 &&
13310b57cec5SDimitry Andric         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
13320b57cec5SDimitry Andric       Cost += 4;
13330b57cec5SDimitry Andric 
13340b57cec5SDimitry Andric     return Cost;
13350b57cec5SDimitry Andric   }
13360b57cec5SDimitry Andric 
1337480093f4SDimitry Andric   // If this operation is a shift on arm/thumb2, it might well be folded into
1338480093f4SDimitry Andric   // the following instruction, hence having a cost of 0.
1339480093f4SDimitry Andric   auto LooksLikeAFreeShift = [&]() {
1340480093f4SDimitry Andric     if (ST->isThumb1Only() || Ty->isVectorTy())
1341480093f4SDimitry Andric       return false;
1342480093f4SDimitry Andric 
1343480093f4SDimitry Andric     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1344480093f4SDimitry Andric       return false;
1345480093f4SDimitry Andric     if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1346480093f4SDimitry Andric       return false;
1347480093f4SDimitry Andric 
1348480093f4SDimitry Andric     // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1349480093f4SDimitry Andric     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1350480093f4SDimitry Andric     case Instruction::Add:
1351480093f4SDimitry Andric     case Instruction::Sub:
1352480093f4SDimitry Andric     case Instruction::And:
1353480093f4SDimitry Andric     case Instruction::Xor:
1354480093f4SDimitry Andric     case Instruction::Or:
1355480093f4SDimitry Andric     case Instruction::ICmp:
1356480093f4SDimitry Andric       return true;
1357480093f4SDimitry Andric     default:
1358480093f4SDimitry Andric       return false;
1359480093f4SDimitry Andric     }
1360480093f4SDimitry Andric   };
1361480093f4SDimitry Andric   if (LooksLikeAFreeShift())
1362480093f4SDimitry Andric     return 0;
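
  // E.g. in
  //   %s = shl i32 %a, 2
  //   %r = add i32 %b, %s
  // the shift folds into the add's shifted-operand form
  // (add r0, r1, r2, lsl #2), so the shl itself is costed as free.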
1363480093f4SDimitry Andric 
1364e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1365e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1366e8d8bef9SDimitry Andric   int BaseCost = 1;
1367fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1368fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
13698bcb0991SDimitry Andric 
13708bcb0991SDimitry Andric   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
13718bcb0991SDimitry Andric   // without treating floats as more expensive than scalars or increasing the
13728bcb0991SDimitry Andric   // costs for custom operations. The result is also multiplied by the
13738bcb0991SDimitry Andric   // MVEVectorCostFactor where appropriate.
13748bcb0991SDimitry Andric   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
13758bcb0991SDimitry Andric     return LT.first * BaseCost;
13768bcb0991SDimitry Andric 
13778bcb0991SDimitry Andric   // Else this is expand, assume that we need to scalarize this op.
13785ffd83dbSDimitry Andric   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
13795ffd83dbSDimitry Andric     unsigned Num = VTy->getNumElements();
1380fe6060f1SDimitry Andric     InstructionCost Cost =
1381fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
13828bcb0991SDimitry Andric     // Return the cost of multiple scalar invocation plus the cost of
13838bcb0991SDimitry Andric     // inserting and extracting the values.
1384fe6060f1SDimitry Andric     SmallVector<Type *> Tys(Args.size(), Ty);
1385fe6060f1SDimitry Andric     return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
13868bcb0991SDimitry Andric   }
13878bcb0991SDimitry Andric 
13888bcb0991SDimitry Andric   return BaseCost;
13898bcb0991SDimitry Andric }
13908bcb0991SDimitry Andric 
1391fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1392fe6060f1SDimitry Andric                                             MaybeAlign Alignment,
1393fe6060f1SDimitry Andric                                             unsigned AddressSpace,
13945ffd83dbSDimitry Andric                                             TTI::TargetCostKind CostKind,
1395480093f4SDimitry Andric                                             const Instruction *I) {
13965ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
13975ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
13985ffd83dbSDimitry Andric     return 1;
13995ffd83dbSDimitry Andric 
14005ffd83dbSDimitry Andric   // Type legalization can't handle structs.
14015ffd83dbSDimitry Andric   if (TLI->getValueType(DL, Src, true) == MVT::Other)
14025ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14035ffd83dbSDimitry Andric                                   CostKind);
14040b57cec5SDimitry Andric 
1405480093f4SDimitry Andric   if (ST->hasNEON() && Src->isVectorTy() &&
1406480093f4SDimitry Andric       (Alignment && *Alignment != Align(16)) &&
14075ffd83dbSDimitry Andric       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
14080b57cec5SDimitry Andric     // Unaligned loads/stores are extremely inefficient.
14090b57cec5SDimitry Andric     // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1410fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
14110b57cec5SDimitry Andric     return LT.first * 4;
14120b57cec5SDimitry Andric   }
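
  // E.g. a <2 x double> store with only 8-byte alignment on NEON hits the
  // case above and is priced at 4, modelling the vst1 expansion rather than a
  // single aligned vstr.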
14135ffd83dbSDimitry Andric 
14145ffd83dbSDimitry Andric   // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
14155ffd83dbSDimitry Andric   // Same for stores.
14165ffd83dbSDimitry Andric   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
14175ffd83dbSDimitry Andric       ((Opcode == Instruction::Load && I->hasOneUse() &&
14185ffd83dbSDimitry Andric         isa<FPExtInst>(*I->user_begin())) ||
14195ffd83dbSDimitry Andric        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
14205ffd83dbSDimitry Andric     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
14215ffd83dbSDimitry Andric     Type *DstTy =
14225ffd83dbSDimitry Andric         Opcode == Instruction::Load
14235ffd83dbSDimitry Andric             ? (*I->user_begin())->getType()
14245ffd83dbSDimitry Andric             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
14255ffd83dbSDimitry Andric     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
14265ffd83dbSDimitry Andric         DstTy->getScalarType()->isFloatTy())
1427fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
14285ffd83dbSDimitry Andric   }
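
  // E.g. the pair
  //   %h = load <4 x half>, <4 x half>* %p
  //   %f = fpext <4 x half> %h to <4 x float>
  // is costed as one beat-adjusted extending load rather than a load plus a
  // separate conversion.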
14295ffd83dbSDimitry Andric 
14308bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1431fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
14328bcb0991SDimitry Andric                      : 1;
14335ffd83dbSDimitry Andric   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14345ffd83dbSDimitry Andric                                            CostKind, I);
14350b57cec5SDimitry Andric }
14360b57cec5SDimitry Andric 
1437fe6060f1SDimitry Andric InstructionCost
1438fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1439e8d8bef9SDimitry Andric                                   unsigned AddressSpace,
1440e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1441e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps()) {
1442e8d8bef9SDimitry Andric     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1443fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1444e8d8bef9SDimitry Andric     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1445fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1446e8d8bef9SDimitry Andric   }
1447e8d8bef9SDimitry Andric   if (!isa<FixedVectorType>(Src))
1448e8d8bef9SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1449e8d8bef9SDimitry Andric                                         CostKind);
1450e8d8bef9SDimitry Andric   // Scalar cost, which is currently very high due to the inefficiency of the
1451e8d8bef9SDimitry Andric   // generated code.
1452e8d8bef9SDimitry Andric   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1453e8d8bef9SDimitry Andric }
1454e8d8bef9SDimitry Andric 
1455fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1456480093f4SDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
14575ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
14585ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
14590b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
14600b57cec5SDimitry Andric   assert(isa<VectorType>(VecTy) && "Expect a vector type");
14610b57cec5SDimitry Andric 
14620b57cec5SDimitry Andric   // vldN/vstN don't support vector types with i64/f64 elements.
14630b57cec5SDimitry Andric   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
14640b57cec5SDimitry Andric 
14650b57cec5SDimitry Andric   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
14660b57cec5SDimitry Andric       !UseMaskForCond && !UseMaskForGaps) {
14675ffd83dbSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
14685ffd83dbSDimitry Andric     auto *SubVecTy =
14695ffd83dbSDimitry Andric         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
14700b57cec5SDimitry Andric 
14710b57cec5SDimitry Andric     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
14720b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
14730b57cec5SDimitry Andric     // matched to more than one vldN/vstN instruction.
1474fe6060f1SDimitry Andric     int BaseCost =
1475fe6060f1SDimitry Andric         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
14760b57cec5SDimitry Andric     if (NumElts % Factor == 0 &&
1477fe6060f1SDimitry Andric         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1478480093f4SDimitry Andric       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1479480093f4SDimitry Andric 
1480480093f4SDimitry Andric     // Some smaller-than-legal interleaved patterns are cheap, as we can make
1481480093f4SDimitry Andric     // use of the vmovn or vrev patterns to interleave a standard load. This is
1482480093f4SDimitry Andric     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1483480093f4SDimitry Andric     // promoted differently). The cost of 2 here is then a load and vrev or
1484480093f4SDimitry Andric     // vmovn.
1485480093f4SDimitry Andric     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1486e8d8bef9SDimitry Andric         VecTy->isIntOrIntVectorTy() &&
1487e8d8bef9SDimitry Andric         DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1488480093f4SDimitry Andric       return 2 * BaseCost;
14890b57cec5SDimitry Andric   }
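
  // E.g. under MVE a factor-2 interleaved load of <8 x i8> data (sub-vector
  // v4i8, 32 bits) returns 2 * BaseCost above: one wider load plus a vrev or
  // vmovn to deinterleave.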
14900b57cec5SDimitry Andric 
14910b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
14925ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
14930b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
14940b57cec5SDimitry Andric }
14950b57cec5SDimitry Andric 
1496fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1497fe6060f1SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1498fe6060f1SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
14995ffd83dbSDimitry Andric   using namespace PatternMatch;
15005ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
15015ffd83dbSDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
15025ffd83dbSDimitry Andric                                          Alignment, CostKind, I);
15035ffd83dbSDimitry Andric 
15045ffd83dbSDimitry Andric   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
15055ffd83dbSDimitry Andric   auto *VTy = cast<FixedVectorType>(DataTy);
15065ffd83dbSDimitry Andric 
15075ffd83dbSDimitry Andric   // TODO: Splitting, once we do that.
15085ffd83dbSDimitry Andric 
15095ffd83dbSDimitry Andric   unsigned NumElems = VTy->getNumElements();
15105ffd83dbSDimitry Andric   unsigned EltSize = VTy->getScalarSizeInBits();
1511fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
15125ffd83dbSDimitry Andric 
15135ffd83dbSDimitry Andric   // For now, it is assumed that for the MVE gather instructions the loads are
15145ffd83dbSDimitry Andric   // all effectively serialised. This means the cost is the scalar cost
15155ffd83dbSDimitry Andric   // multiplied by the number of elements being loaded. This is possibly very
15165ffd83dbSDimitry Andric   // conservative, but even so we still end up vectorising loops because the
15175ffd83dbSDimitry Andric   // cost per iteration for many loops is lower than for scalar loops.
1518fe6060f1SDimitry Andric   InstructionCost VectorCost =
1519fe6060f1SDimitry Andric       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
15205ffd83dbSDimitry Andric   // The scalarization cost should be a lot higher. We use the number of vector
15215ffd83dbSDimitry Andric   // elements plus the scalarization overhead.
1522fe6060f1SDimitry Andric   InstructionCost ScalarCost =
1523fe6060f1SDimitry Andric       NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1524fe6060f1SDimitry Andric       BaseT::getScalarizationOverhead(VTy, false, true);
15255ffd83dbSDimitry Andric 
1526e8d8bef9SDimitry Andric   if (EltSize < 8 || Alignment < EltSize / 8)
15275ffd83dbSDimitry Andric     return ScalarCost;
15285ffd83dbSDimitry Andric 
15295ffd83dbSDimitry Andric   unsigned ExtSize = EltSize;
15305ffd83dbSDimitry Andric   // Check whether there's a single user that asks for an extended type
15315ffd83dbSDimitry Andric   if (I != nullptr) {
15325ffd83dbSDimitry Andric     // Depending on the caller of this function, a gather instruction will
15335ffd83dbSDimitry Andric     // either have opcode Instruction::Load or be a call to the masked_gather
15345ffd83dbSDimitry Andric     // intrinsic.
15355ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Load ||
15365ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
15375ffd83dbSDimitry Andric         I->hasOneUse()) {
15385ffd83dbSDimitry Andric       const User *Us = *I->users().begin();
15395ffd83dbSDimitry Andric       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
15405ffd83dbSDimitry Andric         // Only allow valid type combinations.
15415ffd83dbSDimitry Andric         unsigned TypeSize =
15425ffd83dbSDimitry Andric             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
15435ffd83dbSDimitry Andric         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
15445ffd83dbSDimitry Andric              (TypeSize == 16 && EltSize == 8)) &&
15455ffd83dbSDimitry Andric             TypeSize * NumElems == 128) {
15465ffd83dbSDimitry Andric           ExtSize = TypeSize;
15475ffd83dbSDimitry Andric         }
15485ffd83dbSDimitry Andric       }
15495ffd83dbSDimitry Andric     }
15505ffd83dbSDimitry Andric     // Check whether the input data needs to be truncated
15515ffd83dbSDimitry Andric     TruncInst *T;
15525ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Store ||
15535ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
15545ffd83dbSDimitry Andric         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
15555ffd83dbSDimitry Andric       // Only allow valid type combinations
15565ffd83dbSDimitry Andric       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
15575ffd83dbSDimitry Andric       if (((EltSize == 16 && TypeSize == 32) ||
15585ffd83dbSDimitry Andric            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
15595ffd83dbSDimitry Andric           TypeSize * NumElems == 128)
15605ffd83dbSDimitry Andric         ExtSize = TypeSize;
15615ffd83dbSDimitry Andric     }
15625ffd83dbSDimitry Andric   }
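
  // E.g. a gather of <8 x i8> whose single user is a zext to <8 x i16> passes
  // the check above (16 bits * 8 lanes == 128), so ExtSize becomes 16 and the
  // access can be treated as a native extending gather below.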
15635ffd83dbSDimitry Andric 
15645ffd83dbSDimitry Andric   if (ExtSize * NumElems != 128 || NumElems < 4)
15655ffd83dbSDimitry Andric     return ScalarCost;
15665ffd83dbSDimitry Andric 
15675ffd83dbSDimitry Andric   // Any (aligned) i32 gather will not need to be scalarised.
15685ffd83dbSDimitry Andric   if (ExtSize == 32)
15695ffd83dbSDimitry Andric     return VectorCost;
15705ffd83dbSDimitry Andric   // For smaller types, we need to ensure that the gep's inputs are correctly
15715ffd83dbSDimitry Andric   // extended from a small enough value. Other sizes (including i64) are
15725ffd83dbSDimitry Andric   // scalarized for now.
15735ffd83dbSDimitry Andric   if (ExtSize != 8 && ExtSize != 16)
15745ffd83dbSDimitry Andric     return ScalarCost;
15755ffd83dbSDimitry Andric 
15765ffd83dbSDimitry Andric   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
15775ffd83dbSDimitry Andric     Ptr = BC->getOperand(0);
15785ffd83dbSDimitry Andric   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
15795ffd83dbSDimitry Andric     if (GEP->getNumOperands() != 2)
15805ffd83dbSDimitry Andric       return ScalarCost;
15815ffd83dbSDimitry Andric     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
15825ffd83dbSDimitry Andric     // Scale needs to be correct (which is only relevant for i16s).
15835ffd83dbSDimitry Andric     if (Scale != 1 && Scale * 8 != ExtSize)
15845ffd83dbSDimitry Andric       return ScalarCost;
15855ffd83dbSDimitry Andric     // And we need to zext (not sext) the indexes from a small enough type.
15865ffd83dbSDimitry Andric     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
15875ffd83dbSDimitry Andric       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
15885ffd83dbSDimitry Andric         return VectorCost;
15895ffd83dbSDimitry Andric     }
15905ffd83dbSDimitry Andric     return ScalarCost;
15915ffd83dbSDimitry Andric   }
15925ffd83dbSDimitry Andric   return ScalarCost;
15935ffd83dbSDimitry Andric }
15945ffd83dbSDimitry Andric 
1595fe6060f1SDimitry Andric InstructionCost
1596fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1597fe6060f1SDimitry Andric                                        Optional<FastMathFlags> FMF,
1598e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
1599fe6060f1SDimitry Andric   if (TTI::requiresOrderedReduction(FMF))
1600fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1601fe6060f1SDimitry Andric 
1602e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1603e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1604e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1605fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1606e8d8bef9SDimitry Andric 
1607fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1608e8d8bef9SDimitry Andric 
1609e8d8bef9SDimitry Andric   static const CostTblEntry CostTblAdd[]{
1610e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v16i8, 1},
1611e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v8i16, 1},
1612e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v4i32, 1},
1613e8d8bef9SDimitry Andric   };
1614e8d8bef9SDimitry Andric   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1615fe6060f1SDimitry Andric     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1616e8d8bef9SDimitry Andric 
1617fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1618e8d8bef9SDimitry Andric }
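
// E.g. a vecreduce.add of <8 x i16> hits the v8i16 entry in the table above
// and is priced as a single beat-adjusted VADDV-style reduction.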
1619e8d8bef9SDimitry Andric 
1620e8d8bef9SDimitry Andric InstructionCost
1621e8d8bef9SDimitry Andric ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1622e8d8bef9SDimitry Andric                                         Type *ResTy, VectorType *ValTy,
1623e8d8bef9SDimitry Andric                                         TTI::TargetCostKind CostKind) {
1624e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1625e8d8bef9SDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1626e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1627fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
1628fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
1629e8d8bef9SDimitry Andric     if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
1630e8d8bef9SDimitry Andric         (LT.second == MVT::v8i16 &&
1631e8d8bef9SDimitry Andric          ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
1632e8d8bef9SDimitry Andric         (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
1633fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1634e8d8bef9SDimitry Andric   }
1635e8d8bef9SDimitry Andric 
1636e8d8bef9SDimitry Andric   return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1637e8d8bef9SDimitry Andric                                             CostKind);
1638e8d8bef9SDimitry Andric }
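
// E.g. reducing <4 x i32> into an i64 result fits the v4i32 case above
// (result width <= 64) and maps onto a single VADDLV-style instruction.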
1639e8d8bef9SDimitry Andric 
1640fe6060f1SDimitry Andric InstructionCost
1641fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1642e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1643e8d8bef9SDimitry Andric   switch (ICA.getID()) {
1644e8d8bef9SDimitry Andric   case Intrinsic::get_active_lane_mask:
1645e8d8bef9SDimitry Andric     // Currently we make a somewhat optimistic assumption that
1646e8d8bef9SDimitry Andric     // active_lane_masks are always free. In reality one may be freely folded
1647e8d8bef9SDimitry Andric     // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1648e8d8bef9SDimitry Andric     // of add/icmp code. We may need to improve this in the future, but being
1649e8d8bef9SDimitry Andric     // able to detect if it is free or not involves looking at a lot of other
1650e8d8bef9SDimitry Andric     // code. We currently assume that the vectorizer inserted these, and knew
1651e8d8bef9SDimitry Andric     // what it was doing in adding one.
1652e8d8bef9SDimitry Andric     if (ST->hasMVEIntegerOps())
1653e8d8bef9SDimitry Andric       return 0;
1654e8d8bef9SDimitry Andric     break;
1655e8d8bef9SDimitry Andric   case Intrinsic::sadd_sat:
1656e8d8bef9SDimitry Andric   case Intrinsic::ssub_sat:
1657e8d8bef9SDimitry Andric   case Intrinsic::uadd_sat:
1658e8d8bef9SDimitry Andric   case Intrinsic::usub_sat: {
1659e8d8bef9SDimitry Andric     if (!ST->hasMVEIntegerOps())
1660e8d8bef9SDimitry Andric       break;
1661e8d8bef9SDimitry Andric     Type *VT = ICA.getReturnType();
1662e8d8bef9SDimitry Andric 
1663fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1664e8d8bef9SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1665e8d8bef9SDimitry Andric         LT.second == MVT::v16i8) {
1666fe6060f1SDimitry Andric       // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1667e8d8bef9SDimitry Andric       // need to extend the type, as it uses shr(qadd(shl, shl)).
1668fe6060f1SDimitry Andric       unsigned Instrs =
1669fe6060f1SDimitry Andric           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1670fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1671e8d8bef9SDimitry Andric     }
1672e8d8bef9SDimitry Andric     break;
1673e8d8bef9SDimitry Andric   }
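  // Example (a sketch): an <8 x i8> uadd.sat gets promoted to <8 x i16>, so
  // it is costed as 4 instructions above, corresponding to a lowering along
  // the lines of
  //   vshl.i16  q0, q0, #8
  //   vshl.i16  q1, q1, #8
  //   vqadd.u16 q0, q0, q1
  //   vshr.u16  q0, q0, #8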
1674fe6060f1SDimitry Andric   case Intrinsic::abs:
1675fe6060f1SDimitry Andric   case Intrinsic::smin:
1676fe6060f1SDimitry Andric   case Intrinsic::smax:
1677fe6060f1SDimitry Andric   case Intrinsic::umin:
1678fe6060f1SDimitry Andric   case Intrinsic::umax: {
1679fe6060f1SDimitry Andric     if (!ST->hasMVEIntegerOps())
1680fe6060f1SDimitry Andric       break;
1681fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1682fe6060f1SDimitry Andric 
1683fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1684fe6060f1SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1685fe6060f1SDimitry Andric         LT.second == MVT::v16i8)
1686fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1687fe6060f1SDimitry Andric     break;
1688fe6060f1SDimitry Andric   }
1689fe6060f1SDimitry Andric   case Intrinsic::minnum:
1690fe6060f1SDimitry Andric   case Intrinsic::maxnum: {
1691fe6060f1SDimitry Andric     if (!ST->hasMVEFloatOps())
1692fe6060f1SDimitry Andric       break;
1693fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1694fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1695fe6060f1SDimitry Andric     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1696fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1697fe6060f1SDimitry Andric     break;
1698fe6060f1SDimitry Andric   }
1699e8d8bef9SDimitry Andric   }
1700e8d8bef9SDimitry Andric 
1701e8d8bef9SDimitry Andric   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1702e8d8bef9SDimitry Andric }
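
// Usage sketch (hypothetical caller; the variable names are assumed):
//   IntrinsicCostAttributes ICA(Intrinsic::smin, VecTy, {VecTy, VecTy});
//   InstructionCost Cost =
//       TTI.getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
// On an MVE target with VecTy == <4 x i32> this returns the legalized cost
// of a single VMIN.S32, scaled by the MVE vector cost factor.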
1703e8d8bef9SDimitry Andric 
17040b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) {
17050b57cec5SDimitry Andric   if (!F->isIntrinsic())
17060b57cec5SDimitry Andric     return BaseT::isLoweredToCall(F);
17070b57cec5SDimitry Andric 
17080b57cec5SDimitry Andric   // Assume all Arm-specific intrinsics map to an instruction.
17090b57cec5SDimitry Andric   if (F->getName().startswith("llvm.arm"))
17100b57cec5SDimitry Andric     return false;
17110b57cec5SDimitry Andric 
17120b57cec5SDimitry Andric   switch (F->getIntrinsicID()) {
17130b57cec5SDimitry Andric   default: break;
17140b57cec5SDimitry Andric   case Intrinsic::powi:
17150b57cec5SDimitry Andric   case Intrinsic::sin:
17160b57cec5SDimitry Andric   case Intrinsic::cos:
17170b57cec5SDimitry Andric   case Intrinsic::pow:
17180b57cec5SDimitry Andric   case Intrinsic::log:
17190b57cec5SDimitry Andric   case Intrinsic::log10:
17200b57cec5SDimitry Andric   case Intrinsic::log2:
17210b57cec5SDimitry Andric   case Intrinsic::exp:
17220b57cec5SDimitry Andric   case Intrinsic::exp2:
17230b57cec5SDimitry Andric     return true;
17240b57cec5SDimitry Andric   case Intrinsic::sqrt:
17250b57cec5SDimitry Andric   case Intrinsic::fabs:
17260b57cec5SDimitry Andric   case Intrinsic::copysign:
17270b57cec5SDimitry Andric   case Intrinsic::floor:
17280b57cec5SDimitry Andric   case Intrinsic::ceil:
17290b57cec5SDimitry Andric   case Intrinsic::trunc:
17300b57cec5SDimitry Andric   case Intrinsic::rint:
17310b57cec5SDimitry Andric   case Intrinsic::nearbyint:
17320b57cec5SDimitry Andric   case Intrinsic::round:
17330b57cec5SDimitry Andric   case Intrinsic::canonicalize:
17340b57cec5SDimitry Andric   case Intrinsic::lround:
17350b57cec5SDimitry Andric   case Intrinsic::llround:
17360b57cec5SDimitry Andric   case Intrinsic::lrint:
17370b57cec5SDimitry Andric   case Intrinsic::llrint:
17380b57cec5SDimitry Andric     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
17390b57cec5SDimitry Andric       return true;
17400b57cec5SDimitry Andric     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
17410b57cec5SDimitry Andric       return true;
17420b57cec5SDimitry Andric     // Some operations can be handled by vector instructions; assume that
17430b57cec5SDimitry Andric     // unsupported vectors will be expanded into supported scalar ones.
17440b57cec5SDimitry Andric     // TODO: Handle scalar operations properly.
17450b57cec5SDimitry Andric     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
17460b57cec5SDimitry Andric   case Intrinsic::masked_store:
17470b57cec5SDimitry Andric   case Intrinsic::masked_load:
17480b57cec5SDimitry Andric   case Intrinsic::masked_gather:
17490b57cec5SDimitry Andric   case Intrinsic::masked_scatter:
17500b57cec5SDimitry Andric     return !ST->hasMVEIntegerOps();
17510b57cec5SDimitry Andric   case Intrinsic::sadd_with_overflow:
17520b57cec5SDimitry Andric   case Intrinsic::uadd_with_overflow:
17530b57cec5SDimitry Andric   case Intrinsic::ssub_with_overflow:
17540b57cec5SDimitry Andric   case Intrinsic::usub_with_overflow:
17550b57cec5SDimitry Andric   case Intrinsic::sadd_sat:
17560b57cec5SDimitry Andric   case Intrinsic::uadd_sat:
17570b57cec5SDimitry Andric   case Intrinsic::ssub_sat:
17580b57cec5SDimitry Andric   case Intrinsic::usub_sat:
17590b57cec5SDimitry Andric     return false;
17600b57cec5SDimitry Andric   }
17610b57cec5SDimitry Andric 
17620b57cec5SDimitry Andric   return BaseT::isLoweredToCall(F);
17630b57cec5SDimitry Andric }
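
// For example, @llvm.sin.f64 always ends up as a libm call on ARM (hence the
// unconditional "return true" above), while @llvm.sqrt.f64 only needs a call
// when the FPU cannot execute a double-precision VSQRT.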
17640b57cec5SDimitry Andric 
1765e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
17660b57cec5SDimitry Andric   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
17670b57cec5SDimitry Andric   EVT VT = TLI->getValueType(DL, I.getType(), true);
17680b57cec5SDimitry Andric   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
17690b57cec5SDimitry Andric     return true;
17700b57cec5SDimitry Andric 
17710b57cec5SDimitry Andric   // Check if an intrinsic will be lowered to a call and assume that any
17720b57cec5SDimitry Andric   // other CallInst will generate a bl.
17730b57cec5SDimitry Andric   if (auto *Call = dyn_cast<CallInst>(&I)) {
1774e8d8bef9SDimitry Andric     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1775e8d8bef9SDimitry Andric       switch(II->getIntrinsicID()) {
1776e8d8bef9SDimitry Andric         case Intrinsic::memcpy:
1777e8d8bef9SDimitry Andric         case Intrinsic::memset:
1778e8d8bef9SDimitry Andric         case Intrinsic::memmove:
1779e8d8bef9SDimitry Andric           return getNumMemOps(II) == -1;
1780e8d8bef9SDimitry Andric         default:
17810b57cec5SDimitry Andric           if (const Function *F = Call->getCalledFunction())
17820b57cec5SDimitry Andric             return isLoweredToCall(F);
17830b57cec5SDimitry Andric       }
1784e8d8bef9SDimitry Andric     }
17850b57cec5SDimitry Andric     return true;
17860b57cec5SDimitry Andric   }
17870b57cec5SDimitry Andric 
17880b57cec5SDimitry Andric   // FPv5 provides conversions between integer, double-precision,
17890b57cec5SDimitry Andric   // single-precision, and half-precision formats.
17900b57cec5SDimitry Andric   switch (I.getOpcode()) {
17910b57cec5SDimitry Andric   default:
17920b57cec5SDimitry Andric     break;
17930b57cec5SDimitry Andric   case Instruction::FPToSI:
17940b57cec5SDimitry Andric   case Instruction::FPToUI:
17950b57cec5SDimitry Andric   case Instruction::SIToFP:
17960b57cec5SDimitry Andric   case Instruction::UIToFP:
17970b57cec5SDimitry Andric   case Instruction::FPTrunc:
17980b57cec5SDimitry Andric   case Instruction::FPExt:
17990b57cec5SDimitry Andric     return !ST->hasFPARMv8Base();
18000b57cec5SDimitry Andric   }
18010b57cec5SDimitry Andric 
18020b57cec5SDimitry Andric   // FIXME: Unfortunately the approach of checking the Operation Action does
18030b57cec5SDimitry Andric   // not catch all cases of Legalization that use library calls. Our
18040b57cec5SDimitry Andric   // Legalization step categorizes some transformations into library calls as
18050b57cec5SDimitry Andric   // Custom, Expand or even Legal when doing type legalization. So for now
18060b57cec5SDimitry Andric   // we have to special-case, for instance, the SDIV of 64-bit integers and
18070b57cec5SDimitry Andric   // the use of floating-point emulation.
18080b57cec5SDimitry Andric   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
18090b57cec5SDimitry Andric     switch (ISD) {
18100b57cec5SDimitry Andric     default:
18110b57cec5SDimitry Andric       break;
18120b57cec5SDimitry Andric     case ISD::SDIV:
18130b57cec5SDimitry Andric     case ISD::UDIV:
18140b57cec5SDimitry Andric     case ISD::SREM:
18150b57cec5SDimitry Andric     case ISD::UREM:
18160b57cec5SDimitry Andric     case ISD::SDIVREM:
18170b57cec5SDimitry Andric     case ISD::UDIVREM:
18180b57cec5SDimitry Andric       return true;
18190b57cec5SDimitry Andric     }
18200b57cec5SDimitry Andric   }
18210b57cec5SDimitry Andric 
18220b57cec5SDimitry Andric   // Assume all other non-float operations are supported.
18230b57cec5SDimitry Andric   if (!VT.isFloatingPoint())
18240b57cec5SDimitry Andric     return false;
18250b57cec5SDimitry Andric 
18260b57cec5SDimitry Andric   // We'll need a library call to handle most floats when using soft-float.
18270b57cec5SDimitry Andric   if (TLI->useSoftFloat()) {
18280b57cec5SDimitry Andric     switch (I.getOpcode()) {
18290b57cec5SDimitry Andric     default:
18300b57cec5SDimitry Andric       return true;
18310b57cec5SDimitry Andric     case Instruction::Alloca:
18320b57cec5SDimitry Andric     case Instruction::Load:
18330b57cec5SDimitry Andric     case Instruction::Store:
18340b57cec5SDimitry Andric     case Instruction::Select:
18350b57cec5SDimitry Andric     case Instruction::PHI:
18360b57cec5SDimitry Andric       return false;
18370b57cec5SDimitry Andric     }
18380b57cec5SDimitry Andric   }
18390b57cec5SDimitry Andric 
18400b57cec5SDimitry Andric   // We'll need a libcall to perform double precision operations on a single
18410b57cec5SDimitry Andric   // precision only FPU.
18420b57cec5SDimitry Andric   if (I.getType()->isDoubleTy() && !ST->hasFP64())
18430b57cec5SDimitry Andric     return true;
18440b57cec5SDimitry Andric 
18450b57cec5SDimitry Andric   // Likewise for half precision arithmetic.
18460b57cec5SDimitry Andric   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
18470b57cec5SDimitry Andric     return true;
18480b57cec5SDimitry Andric 
18490b57cec5SDimitry Andric   return false;
1850e8d8bef9SDimitry Andric }
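
// Example (a sketch): on ARM EABI targets a 64-bit division such as
//   %q = sdiv i64 %a, %b
// is legalized to a call to the __aeabi_ldivmod runtime routine, which is
// what the 64-bit integer division/remainder special case above guards
// against.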
1851e8d8bef9SDimitry Andric 
1852e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1853e8d8bef9SDimitry Andric                                           AssumptionCache &AC,
1854e8d8bef9SDimitry Andric                                           TargetLibraryInfo *LibInfo,
1855e8d8bef9SDimitry Andric                                           HardwareLoopInfo &HWLoopInfo) {
1856e8d8bef9SDimitry Andric   // Low-overhead branches are only supported in the 'low-overhead branch'
1857e8d8bef9SDimitry Andric   // extension of v8.1-m.
1858e8d8bef9SDimitry Andric   if (!ST->hasLOB() || DisableLowOverheadLoops) {
1859e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1860e8d8bef9SDimitry Andric     return false;
1861e8d8bef9SDimitry Andric   }
1862e8d8bef9SDimitry Andric 
1863e8d8bef9SDimitry Andric   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1864e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1865e8d8bef9SDimitry Andric     return false;
1866e8d8bef9SDimitry Andric   }
1867e8d8bef9SDimitry Andric 
1868e8d8bef9SDimitry Andric   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1869e8d8bef9SDimitry Andric   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1870e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1871e8d8bef9SDimitry Andric     return false;
1872e8d8bef9SDimitry Andric   }
1873e8d8bef9SDimitry Andric 
1874e8d8bef9SDimitry Andric   const SCEV *TripCountSCEV =
1875e8d8bef9SDimitry Andric     SE.getAddExpr(BackedgeTakenCount,
1876e8d8bef9SDimitry Andric                   SE.getOne(BackedgeTakenCount->getType()));
1877e8d8bef9SDimitry Andric 
1878e8d8bef9SDimitry Andric   // We need to store the trip count in LR, a 32-bit register.
1879e8d8bef9SDimitry Andric   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1880e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32 bits\n");
1881e8d8bef9SDimitry Andric     return false;
1882e8d8bef9SDimitry Andric   }
1883e8d8bef9SDimitry Andric 
1884e8d8bef9SDimitry Andric   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1885e8d8bef9SDimitry Andric   // point in generating a hardware loop if that's going to happen.
18860b57cec5SDimitry Andric 
18870b57cec5SDimitry Andric   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
18880b57cec5SDimitry Andric     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
18890b57cec5SDimitry Andric       switch (Call->getIntrinsicID()) {
18900b57cec5SDimitry Andric       default:
18910b57cec5SDimitry Andric         break;
1892e8d8bef9SDimitry Andric       case Intrinsic::start_loop_iterations:
1893fe6060f1SDimitry Andric       case Intrinsic::test_start_loop_iterations:
18940b57cec5SDimitry Andric       case Intrinsic::loop_decrement:
18950b57cec5SDimitry Andric       case Intrinsic::loop_decrement_reg:
18960b57cec5SDimitry Andric         return true;
18970b57cec5SDimitry Andric       }
18980b57cec5SDimitry Andric     }
18990b57cec5SDimitry Andric     return false;
19000b57cec5SDimitry Andric   };
19010b57cec5SDimitry Andric 
19020b57cec5SDimitry Andric   // Scan the instructions to see if there are any that we know will turn into
1903e8d8bef9SDimitry Andric   // a call, or if this loop is already a low-overhead loop or will become a
1904e8d8bef9SDimitry Andric   // tail-predicated loop.
1905e8d8bef9SDimitry Andric   bool IsTailPredLoop = false;
19060b57cec5SDimitry Andric   auto ScanLoop = [&](Loop *L) {
19070b57cec5SDimitry Andric     for (auto *BB : L->getBlocks()) {
19080b57cec5SDimitry Andric       for (auto &I : *BB) {
1909e8d8bef9SDimitry Andric         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
1910e8d8bef9SDimitry Andric             isa<InlineAsm>(I)) {
19115ffd83dbSDimitry Andric           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
19120b57cec5SDimitry Andric           return false;
19130b57cec5SDimitry Andric         }
1914e8d8bef9SDimitry Andric         if (auto *II = dyn_cast<IntrinsicInst>(&I))
1915e8d8bef9SDimitry Andric           IsTailPredLoop |=
1916e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
1917e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
1918e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
1919e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
1920e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
19210b57cec5SDimitry Andric       }
19225ffd83dbSDimitry Andric     }
19230b57cec5SDimitry Andric     return true;
19240b57cec5SDimitry Andric   };
19250b57cec5SDimitry Andric 
19260b57cec5SDimitry Andric   // Visit inner loops.
19270b57cec5SDimitry Andric   for (auto Inner : *L)
19280b57cec5SDimitry Andric     if (!ScanLoop(Inner))
19290b57cec5SDimitry Andric       return false;
19300b57cec5SDimitry Andric 
19310b57cec5SDimitry Andric   if (!ScanLoop(L))
19320b57cec5SDimitry Andric     return false;
19330b57cec5SDimitry Andric 
19340b57cec5SDimitry Andric   // TODO: Check whether the trip count calculation is expensive. If L is the
19350b57cec5SDimitry Andric   // inner loop but we know it has a low trip count, calculating that trip
19360b57cec5SDimitry Andric   // count (in the parent loop) may be detrimental.
19370b57cec5SDimitry Andric 
19380b57cec5SDimitry Andric   LLVMContext &C = L->getHeader()->getContext();
19390b57cec5SDimitry Andric   HWLoopInfo.CounterInReg = true;
19400b57cec5SDimitry Andric   HWLoopInfo.IsNestingLegal = false;
1941e8d8bef9SDimitry Andric   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
19420b57cec5SDimitry Andric   HWLoopInfo.CountType = Type::getInt32Ty(C);
19430b57cec5SDimitry Andric   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
19440b57cec5SDimitry Andric   return true;
19450b57cec5SDimitry Andric }
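
// For reference, a converted hardware loop uses the v8.1-M low-overhead
// branch instructions, schematically:
//   dls lr, r0       ; set LR to the trip count (wls if an entry test is
//                    ; performed, as requested via PerformEntryTest)
// loop:
//   ...
//   le  lr, loop     ; decrement LR and branch back while it is non-zero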
19460b57cec5SDimitry Andric 
1947480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
1948480093f4SDimitry Andric   // We don't allow icmps, and because we only look at single-block loops,
1949480093f4SDimitry Andric   // we simply count the icmps, i.e. there should only be 1 for the backedge.
1950480093f4SDimitry Andric   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
1951480093f4SDimitry Andric     return false;
1952480093f4SDimitry Andric 
1953480093f4SDimitry Andric   if (isa<FCmpInst>(&I))
1954480093f4SDimitry Andric     return false;
1955480093f4SDimitry Andric 
1956480093f4SDimitry Andric   // We could allow extending/narrowing FP loads/stores, but codegen is
1957480093f4SDimitry Andric   // too inefficient so reject this for now.
1958480093f4SDimitry Andric   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1959480093f4SDimitry Andric     return false;
1960480093f4SDimitry Andric 
1961480093f4SDimitry Andric   // Extends have to be extending loads.
1962480093f4SDimitry Andric   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
1963480093f4SDimitry Andric     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
1964480093f4SDimitry Andric       return false;
1965480093f4SDimitry Andric 
1966480093f4SDimitry Andric   // Truncs have to be narrowing stores.
1967480093f4SDimitry Andric   if (isa<TruncInst>(&I))
1968480093f4SDimitry Andric     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
1969480093f4SDimitry Andric       return false;
1970480093f4SDimitry Andric 
1971480093f4SDimitry Andric   return true;
1972480093f4SDimitry Andric }
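
// For instance (a sketch), an extending load such as
//   %l = load <8 x i16>, <8 x i16>* %p
//   %e = sext <8 x i16> %l to <8 x i32>
// is accepted above, since MVE can lower it with widening VLDRH.S32 loads,
// whereas the same sext of an arbitrary (non-load) vector value is rejected.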
1973480093f4SDimitry Andric 
1974480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of
1975480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element
1976480093f4SDimitry Andric // size and:
1977480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we
1978480093f4SDimitry Andric //    e.g. don't want any widening/narrowing operations.
1979480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations
1980480093f4SDimitry Andric //    that work on i64s.
1981480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the
1982480093f4SDimitry Andric //    tail-predication masks/predicates the right lanes.
1983480093f4SDimitry Andric //
1984480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1985480093f4SDimitry Andric                                  const DataLayout &DL,
1986480093f4SDimitry Andric                                  const LoopAccessInfo *LAI) {
19875ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
19885ffd83dbSDimitry Andric 
1989e8d8bef9SDimitry Andric   // If there are live-out values, it is probably a reduction. We can predicate
1990e8d8bef9SDimitry Andric   // most reduction operations freely under MVE using a combination of
1991e8d8bef9SDimitry Andric   // prefer-predicated-reduction-select and inloop reductions. We limit this to
1992e8d8bef9SDimitry Andric   // floating point and integer reductions, but don't check for operators
1993e8d8bef9SDimitry Andric   // specifically here. If the value ends up not being a reduction (and so the
1994e8d8bef9SDimitry Andric   // vectorizer cannot tail-fold the loop), we should fall back to standard
1995e8d8bef9SDimitry Andric   // vectorization automatically.
19965ffd83dbSDimitry Andric   SmallVector<Instruction *, 8> LiveOuts =
19975ffd83dbSDimitry Andric       llvm::findDefsUsedOutsideOfLoop(L);
1998e8d8bef9SDimitry Andric   bool ReductionsDisabled =
19995ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::EnabledNoReductions ||
20005ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
20015ffd83dbSDimitry Andric 
20025ffd83dbSDimitry Andric   for (auto *I : LiveOuts) {
2003e8d8bef9SDimitry Andric     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2004e8d8bef9SDimitry Andric         !I->getType()->isHalfTy()) {
2005e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
20065ffd83dbSDimitry Andric                            "live-out value\n");
20075ffd83dbSDimitry Andric       return false;
20085ffd83dbSDimitry Andric     }
2009e8d8bef9SDimitry Andric     if (ReductionsDisabled) {
2010e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
20115ffd83dbSDimitry Andric       return false;
20125ffd83dbSDimitry Andric     }
20135ffd83dbSDimitry Andric   }
20145ffd83dbSDimitry Andric 
20155ffd83dbSDimitry Andric   // Next, check that all instructions can be tail-predicated.
2016480093f4SDimitry Andric   PredicatedScalarEvolution PSE = LAI->getPSE();
20175ffd83dbSDimitry Andric   SmallVector<Instruction *, 16> LoadStores;
2018480093f4SDimitry Andric   int ICmpCount = 0;
2019480093f4SDimitry Andric 
2020480093f4SDimitry Andric   for (BasicBlock *BB : L->blocks()) {
2021480093f4SDimitry Andric     for (Instruction &I : BB->instructionsWithoutDebug()) {
2022480093f4SDimitry Andric       if (isa<PHINode>(&I))
2023480093f4SDimitry Andric         continue;
2024480093f4SDimitry Andric       if (!canTailPredicateInstruction(I, ICmpCount)) {
2025480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2026480093f4SDimitry Andric         return false;
2027480093f4SDimitry Andric       }
2028480093f4SDimitry Andric 
2029480093f4SDimitry Andric       Type *T = I.getType();
2030480093f4SDimitry Andric       if (T->isPointerTy())
2031480093f4SDimitry Andric         T = T->getPointerElementType();
2032480093f4SDimitry Andric 
2033480093f4SDimitry Andric       if (T->getScalarSizeInBits() > 32) {
2034480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2035480093f4SDimitry Andric         return false;
2036480093f4SDimitry Andric       }
2037480093f4SDimitry Andric       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2038480093f4SDimitry Andric         Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
2039480093f4SDimitry Andric         int64_t NextStride = getPtrStride(PSE, Ptr, L);
2040e8d8bef9SDimitry Andric         if (NextStride == 1) {
2041480093f4SDimitry Andric           // TODO: for now only allow consecutive strides of 1. We could
2042e8d8bef9SDimitry Andric           // support other strides as long as they are uniform, but let's keep
2043e8d8bef9SDimitry Andric           // it simple for now.
2044e8d8bef9SDimitry Andric           continue;
2045e8d8bef9SDimitry Andric         } else if (NextStride == -1 ||
2046e8d8bef9SDimitry Andric                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2047e8d8bef9SDimitry Andric                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2048e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
2049e8d8bef9SDimitry Andric                      << "Reversed or interleaving (vld2/vld4) stride found; "
2050e8d8bef9SDimitry Andric                         "these can't be tail-predicated.\n");
2051e8d8bef9SDimitry Andric           return false;
2052e8d8bef9SDimitry Andric           // TODO: don't tail-predicate if there is a reversed load?
2053e8d8bef9SDimitry Andric         } else if (EnableMaskedGatherScatters) {
2054e8d8bef9SDimitry Andric           // Gather/scatters do allow loading from arbitrary strides, at
2055e8d8bef9SDimitry Andric           // least if they are loop invariant.
2056e8d8bef9SDimitry Andric           // TODO: Loop variant strides should in theory work, too, but
2057e8d8bef9SDimitry Andric           // this requires further testing.
2058e8d8bef9SDimitry Andric           const SCEV *PtrScev =
2059e8d8bef9SDimitry Andric               replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
2060e8d8bef9SDimitry Andric           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2061e8d8bef9SDimitry Andric             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2062e8d8bef9SDimitry Andric             if (PSE.getSE()->isLoopInvariant(Step, L))
2063480093f4SDimitry Andric               continue;
2064480093f4SDimitry Andric           }
2065e8d8bef9SDimitry Andric         }
2066e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2067480093f4SDimitry Andric                              "tail-predicate.\n");
2068480093f4SDimitry Andric         return false;
2069480093f4SDimitry Andric       }
2070480093f4SDimitry Andric     }
2071480093f4SDimitry Andric   }
2072480093f4SDimitry Andric 
2073480093f4SDimitry Andric   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2074480093f4SDimitry Andric   return true;
2075480093f4SDimitry Andric }
2076480093f4SDimitry Andric 
2077480093f4SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
2078480093f4SDimitry Andric                                              ScalarEvolution &SE,
2079480093f4SDimitry Andric                                              AssumptionCache &AC,
2080480093f4SDimitry Andric                                              TargetLibraryInfo *TLI,
2081480093f4SDimitry Andric                                              DominatorTree *DT,
2082480093f4SDimitry Andric                                              const LoopAccessInfo *LAI) {
20835ffd83dbSDimitry Andric   if (!EnableTailPredication) {
20845ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2085480093f4SDimitry Andric     return false;
20865ffd83dbSDimitry Andric   }
2087480093f4SDimitry Andric 
2088480093f4SDimitry Andric   // Creating a predicated vector loop is the first step for generating a
2089480093f4SDimitry Andric   // tail-predicated hardware loop, for which we need the MVE masked
2090480093f4SDimitry Andric   // load/stores instructions:
2091480093f4SDimitry Andric   if (!ST->hasMVEIntegerOps())
2092480093f4SDimitry Andric     return false;
2093480093f4SDimitry Andric 
2094480093f4SDimitry Andric   // For now, restrict this to single block loops.
2095480093f4SDimitry Andric   if (L->getNumBlocks() > 1) {
2096480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2097480093f4SDimitry Andric                          "loop.\n");
2098480093f4SDimitry Andric     return false;
2099480093f4SDimitry Andric   }
2100480093f4SDimitry Andric 
2101e8d8bef9SDimitry Andric   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2102480093f4SDimitry Andric 
2103480093f4SDimitry Andric   HardwareLoopInfo HWLoopInfo(L);
2104480093f4SDimitry Andric   if (!HWLoopInfo.canAnalyze(*LI)) {
2105480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2106480093f4SDimitry Andric                          "analyzable.\n");
2107480093f4SDimitry Andric     return false;
2108480093f4SDimitry Andric   }
2109480093f4SDimitry Andric 
2110480093f4SDimitry Andric   // This checks if we have the low-overhead branch architecture
2111480093f4SDimitry Andric   // extension, and if we will create a hardware-loop:
2112480093f4SDimitry Andric   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2113480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2114480093f4SDimitry Andric                          "profitable.\n");
2115480093f4SDimitry Andric     return false;
2116480093f4SDimitry Andric   }
2117480093f4SDimitry Andric 
2118480093f4SDimitry Andric   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2119480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2120480093f4SDimitry Andric                          "a candidate.\n");
2121480093f4SDimitry Andric     return false;
2122480093f4SDimitry Andric   }
2123480093f4SDimitry Andric 
2124480093f4SDimitry Andric   return canTailPredicateLoop(L, LI, SE, DL, LAI);
2125480093f4SDimitry Andric }
2126480093f4SDimitry Andric 
21275ffd83dbSDimitry Andric bool ARMTTIImpl::emitGetActiveLaneMask() const {
21285ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
21295ffd83dbSDimitry Andric     return false;
2130480093f4SDimitry Andric 
21315ffd83dbSDimitry Andric   // Intrinsic @llvm.get.active.lane.mask is supported.
21325ffd83dbSDimitry Andric   // It is used in the MVETailPredication pass, which requires the number of
21335ffd83dbSDimitry Andric   // elements processed by this vector loop to setup the tail-predicated
21345ffd83dbSDimitry Andric   // loop.
21355ffd83dbSDimitry Andric   return true;
21365ffd83dbSDimitry Andric }
21370b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
21380b57cec5SDimitry Andric                                          TTI::UnrollingPreferences &UP) {
2139fe6060f1SDimitry Andric   // Enable upper-bound unrolling universally; it is not dependent on the
2140fe6060f1SDimitry Andric   // conditions below.
2141fe6060f1SDimitry Andric   UP.UpperBound = true;
2142fe6060f1SDimitry Andric 
21430b57cec5SDimitry Andric   // Only currently enable these preferences for M-Class cores.
21440b57cec5SDimitry Andric   if (!ST->isMClass())
21450b57cec5SDimitry Andric     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
21460b57cec5SDimitry Andric 
21470b57cec5SDimitry Andric   // Disable loop unrolling for Oz and Os.
21480b57cec5SDimitry Andric   UP.OptSizeThreshold = 0;
21490b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
21500b57cec5SDimitry Andric   if (L->getHeader()->getParent()->hasOptSize())
21510b57cec5SDimitry Andric     return;
21520b57cec5SDimitry Andric 
21530b57cec5SDimitry Andric   SmallVector<BasicBlock*, 4> ExitingBlocks;
21540b57cec5SDimitry Andric   L->getExitingBlocks(ExitingBlocks);
21550b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Loop has:\n"
21560b57cec5SDimitry Andric                     << "Blocks: " << L->getNumBlocks() << "\n"
21570b57cec5SDimitry Andric                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
21580b57cec5SDimitry Andric 
21590b57cec5SDimitry Andric   // Only allow one exit other than the latch. This acts as an early exit as
21600b57cec5SDimitry Andric   // it mirrors the profitability calculation of the runtime unroller.
21610b57cec5SDimitry Andric   if (ExitingBlocks.size() > 2)
21620b57cec5SDimitry Andric     return;
21630b57cec5SDimitry Andric 
21640b57cec5SDimitry Andric   // Limit the CFG of the loop body for targets with a branch predictor.
21650b57cec5SDimitry Andric   // Allowing 4 blocks permits if-then-else diamonds in the body.
21660b57cec5SDimitry Andric   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
21670b57cec5SDimitry Andric     return;
21680b57cec5SDimitry Andric 
2169e8d8bef9SDimitry Andric   // Don't unroll vectorized loops, including the remainder loop
2170e8d8bef9SDimitry Andric   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2171e8d8bef9SDimitry Andric     return;
2172e8d8bef9SDimitry Andric 
21730b57cec5SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
21740b57cec5SDimitry Andric   // inlining.
2175fe6060f1SDimitry Andric   InstructionCost Cost = 0;
21760b57cec5SDimitry Andric   for (auto *BB : L->getBlocks()) {
21770b57cec5SDimitry Andric     for (auto &I : *BB) {
2178480093f4SDimitry Andric       // Don't unroll vectorized loops. MVE does not benefit from unrolling as
2179480093f4SDimitry Andric       // much as scalar code does.
2180480093f4SDimitry Andric       if (I.getType()->isVectorTy())
2181480093f4SDimitry Andric         return;
2182480093f4SDimitry Andric 
21830b57cec5SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
21845ffd83dbSDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
21850b57cec5SDimitry Andric           if (!isLoweredToCall(F))
21860b57cec5SDimitry Andric             continue;
21870b57cec5SDimitry Andric         }
21880b57cec5SDimitry Andric         return;
21890b57cec5SDimitry Andric       }
21908bcb0991SDimitry Andric 
2191e8d8bef9SDimitry Andric       SmallVector<const Value*, 4> Operands(I.operand_values());
2192e8d8bef9SDimitry Andric       Cost +=
2193e8d8bef9SDimitry Andric         getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
21940b57cec5SDimitry Andric     }
21950b57cec5SDimitry Andric   }
21960b57cec5SDimitry Andric 
2197fe6060f1SDimitry Andric   // On v6m cores, there are very few registers available. We can easily end up
2198fe6060f1SDimitry Andric   // spilling and reloading more registers in an unrolled loop. Look at the
2199fe6060f1SDimitry Andric   // number of LCSSA phis as a rough measure of how many registers will need to
2200fe6060f1SDimitry Andric   // be live out of the loop, reducing the default unroll count if more than 1
2201fe6060f1SDimitry Andric   // value is needed. In the long run, all of this should be learned by a
2202fe6060f1SDimitry Andric   // machine.
2203fe6060f1SDimitry Andric   unsigned UnrollCount = 4;
2204fe6060f1SDimitry Andric   if (ST->isThumb1Only()) {
2205fe6060f1SDimitry Andric     unsigned ExitingValues = 0;
2206fe6060f1SDimitry Andric     SmallVector<BasicBlock *, 4> ExitBlocks;
2207fe6060f1SDimitry Andric     L->getExitBlocks(ExitBlocks);
2208fe6060f1SDimitry Andric     for (auto *Exit : ExitBlocks) {
2209fe6060f1SDimitry Andric       // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2210fe6060f1SDimitry Andric       // only the last one is expected to be needed for address operands.
2211fe6060f1SDimitry Andric       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2212fe6060f1SDimitry Andric         return PH.getNumOperands() != 1 ||
2213fe6060f1SDimitry Andric                !isa<GetElementPtrInst>(PH.getOperand(0));
2214fe6060f1SDimitry Andric       });
2215fe6060f1SDimitry Andric       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2216fe6060f1SDimitry Andric     }
2217fe6060f1SDimitry Andric     if (ExitingValues)
2218fe6060f1SDimitry Andric       UnrollCount /= ExitingValues;
2219fe6060f1SDimitry Andric     if (UnrollCount <= 1)
2220fe6060f1SDimitry Andric       return;
2221fe6060f1SDimitry Andric   }
2222fe6060f1SDimitry Andric 
22230b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2224fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
22250b57cec5SDimitry Andric 
22260b57cec5SDimitry Andric   UP.Partial = true;
22270b57cec5SDimitry Andric   UP.Runtime = true;
22280b57cec5SDimitry Andric   UP.UnrollRemainder = true;
2229fe6060f1SDimitry Andric   UP.DefaultUnrollRuntimeCount = UnrollCount;
22300b57cec5SDimitry Andric   UP.UnrollAndJam = true;
22310b57cec5SDimitry Andric   UP.UnrollAndJamInnerLoopThreshold = 60;
22320b57cec5SDimitry Andric 
22330b57cec5SDimitry Andric   // Force-unrolling small loops can be very useful because of the cost of
22340b57cec5SDimitry Andric   // the taken branch on the backedge.
22350b57cec5SDimitry Andric   if (Cost < 12)
22360b57cec5SDimitry Andric     UP.Force = true;
22370b57cec5SDimitry Andric }
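
// Rough illustration (assumed source code, not from this file): a small loop
// such as
//   for (int i = 0; i < n; i++)
//     a[i] += b[i];
// typically scores below the Cost < 12 threshold, so it is runtime-unrolled
// by DefaultUnrollRuntimeCount with Force set, amortizing the cost of the
// taken backedge branch.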
22388bcb0991SDimitry Andric 
22395ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
22405ffd83dbSDimitry Andric                                        TTI::PeelingPreferences &PP) {
22415ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
22425ffd83dbSDimitry Andric }
22435ffd83dbSDimitry Andric 
2244e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2245e8d8bef9SDimitry Andric                                        TTI::ReductionFlags Flags) const {
2246e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2247e8d8bef9SDimitry Andric     return false;
2248e8d8bef9SDimitry Andric 
2249e8d8bef9SDimitry Andric   unsigned ScalarBits = Ty->getScalarSizeInBits();
2250e8d8bef9SDimitry Andric   switch (Opcode) {
2251e8d8bef9SDimitry Andric   case Instruction::Add:
2252e8d8bef9SDimitry Andric     return ScalarBits <= 64;
2253e8d8bef9SDimitry Andric   default:
2254e8d8bef9SDimitry Andric     return false;
2255e8d8bef9SDimitry Andric   }
2256e8d8bef9SDimitry Andric }
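
// Sketch of the effect: with in-loop reductions the vectorizer keeps the
// accumulator scalar, emitting per-iteration IR along the lines of
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
//   %acc.next = add i32 %acc, %r
// which MVE can select as a single accumulating VADDVA.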
2257e8d8bef9SDimitry Andric 
2258e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect(
2259e8d8bef9SDimitry Andric     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2260e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2261e8d8bef9SDimitry Andric     return false;
2262e8d8bef9SDimitry Andric   return true;
2263e8d8bef9SDimitry Andric }
2264