//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;
/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
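///
/// As an illustrative sketch (IR names hypothetical), a 128-bit vld1 such as
///   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %p, i32 4)
/// becomes a plain aligned load:
///   %c = bitcast i8* %p to <4 x i32>*
///   %v = load <4 x i32>, <4 x i32>* %c, align 4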
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match
  // exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset
  // of the caller's.
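  // Illustrative example (hypothetical feature sets): if features F and G are
  // both on the allowed list, a callee built with {F} can inline into a
  // caller built with {F, G}, but a callee built with {F, G} cannot inline
  // into a caller built with {F} alone.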
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  if (ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

Optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.arg_size() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

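  // The next two cases fold round trips through the MVE predicate intrinsics.
  // As an illustrative sketch (IR names hypothetical), for the pair
  //   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
  //   %q = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)
  // %q can simply be replaced by %p, and an xor of %i against an all-ones
  // mask becomes an xor of %p against an all-true vector.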
  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnes()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
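    // Fold an add of the result into the accumulator operand when the current
    // accumulator is zero; roughly (names hypothetical):
    //   add(vmldava(0, x, y), z) --> vmldava(z, x, y)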
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V =
            IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                       {I->getOperand(0), I->getOperand(1),
                                        I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return None;
  }
  }
  return None;
}

Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {

  // Compute the demanded bits for a narrowing MVE intrinsic. TopOpc is the
  // index of the operand that specifies whether this is a Top or Bottom
  // instruction, which varies between intrinsics.
  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();

    // Only the odd or even lanes of operand 0 will be demanded, depending on
    // whether this is a top or bottom instruction.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));
    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
    // The other lanes will be defined from the inserted elements.
    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
                                                 : APInt::getHighBitsSet(2, 1));
    return None;
  };

  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::arm_mve_vcvt_narrow:
    SimplifyNarrowInstrTopBottom(2);
    break;
  case Intrinsic::arm_mve_vqmovn:
    SimplifyNarrowInstrTopBottom(4);
    break;
  case Intrinsic::arm_mve_vshrn:
    SimplifyNarrowInstrTopBottom(7);
    break;
  }

  return None;
}

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

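  // Illustrative examples (ARM mode): 0xFF and 0xFF0000 are single-MOV
  // rotated immediates (cost 1), while an arbitrary value such as 0x12345678
  // needs a MOVW/MOVT pair on v6T2+ (cost 2) or a constant-pool load (cost 3).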
  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // In Thumb1, any i8 immediate costs 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from the constant pool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of Thumb1
// instructions, so we return a cost of zero for them and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
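// As an illustrative sketch (value names hypothetical), a signed saturation
// to 8 bits looks like
//   %lo = smax(%x, -128); %sat = smin(%lo, 127)
// (or with min/max commuted), which can be selected as "ssat r0, #8, r1".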
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {

    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    if (isSSatMin(Inst->getOperand(1)))
      return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
    if (Inst->hasNUses(2) &&
        (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
      return Inst->getOperand(1);
  }
  return nullptr;
}

// Look for an FP saturation pattern, where the instruction can be simplified
// to a fptosi.sat: max(min(fptosi)). The constant in this case is always free.
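// Illustrative sketch (names hypothetical) for a float -> i32 saturation:
//   %c = fptosi float %f to i32
//   %lo = smin(%c, 2147483647); %sat = smax(%lo, -2147483648)
// can become a single @llvm.fptosi.sat call, so the wide constant should not
// be hoisted out and materialized separately.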
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  if (Imm.getBitWidth() != 64 ||
      Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
    return false;
  Value *FP = isSSATMinMaxPattern(Inst, Imm);
  if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
    FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
  if (!FP)
    return false;
  return isa<FPToSIInst>(FP);
}

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for CodeGenPrepare, which will do a better job of
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1 << 12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1 << 8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnes())
    return 0;

  // Ensure that negative constants in min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted.
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
    return 0;

  // We can convert <= -1 to < 0, which is generally quite cheap.
  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
      return std::min(getIntImmCost(Imm, Ty, CostKind),
                      getIntImmCost(Imm + 1, Ty, CostKind));
  }

  return getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
            (EltVT == MVT::f64 && ST->hasFP64()) ||
            (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked loads/truncating masked stores is expensive because we
  // currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extension of other kinds of loads is free.
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
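    // For example (an illustrative scalar case), a "sext i16 %v to i32" whose
    // operand comes straight from a load can fold into a single LDRSH, so the
    // extend itself is costed at 0 in the table below.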
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so we
        // need to split the load. This introduces an extra load operation,
        // but the extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of the extends
    // above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
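    // Illustrative example: a (sext <4 x i16> to <4 x i32>) feeding an add
    // can be selected as a single VADDL.S16, so the extend is modelled as
    // free here.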
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8,  0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8,  0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8,  0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8,  0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8,  2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
  // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
  // are linearised so take more.
  static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
  };

  if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
    if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
                                                   ISD, DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
  }

  if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
    // As a general rule, fp converts that were not matched above are
    // scalarized and cost 1 vcvt for each lane, so long as the instruction is
    // available. If not, it will become a series of function calls.
    const InstructionCost CallCost =
        getCallInstrCost(nullptr, Dst, {Src}, CostKind);
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();

    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
  }
8368bcb0991SDimitry Andric 
837e8d8bef9SDimitry Andric   if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
838e8d8bef9SDimitry Andric       SrcTy.isFixedLengthVector()) {
839e8d8bef9SDimitry Andric     // Treat a truncate with a larger than legal source (128 bits for MVE) as
840e8d8bef9SDimitry Andric     // expensive, 2 instructions per lane.
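    // E.g. (illustrative): trunc <8 x i32> to <8 x i8> has a 256-bit source,
    // so it is costed as 8 lanes * 2 = 16 below.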
841e8d8bef9SDimitry Andric     if ((SrcTy.getScalarType() == MVT::i8 ||
842e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i16 ||
843e8d8bef9SDimitry Andric          SrcTy.getScalarType() == MVT::i32) &&
844e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > 128 &&
845e8d8bef9SDimitry Andric         SrcTy.getSizeInBits() > DstTy.getSizeInBits())
846e8d8bef9SDimitry Andric       return SrcTy.getVectorNumElements() * 2;
847e8d8bef9SDimitry Andric   }
848e8d8bef9SDimitry Andric 
8490b57cec5SDimitry Andric   // Scalar integer conversion costs.
8500b57cec5SDimitry Andric   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
8510b57cec5SDimitry Andric     // i16 -> i64 requires two dependent operations.
8520b57cec5SDimitry Andric     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
8530b57cec5SDimitry Andric 
8540b57cec5SDimitry Andric     // Truncates on i64 are assumed to be free.
8550b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
8560b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
8570b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
8580b57cec5SDimitry Andric     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
8590b57cec5SDimitry Andric   };
8600b57cec5SDimitry Andric 
8610b57cec5SDimitry Andric   if (SrcTy.isInteger()) {
8620b57cec5SDimitry Andric     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
8630b57cec5SDimitry Andric                                                    DstTy.getSimpleVT(),
8640b57cec5SDimitry Andric                                                    SrcTy.getSimpleVT()))
8655ffd83dbSDimitry Andric       return AdjustCost(Entry->Cost);
8660b57cec5SDimitry Andric   }
8670b57cec5SDimitry Andric 
8688bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
869fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
8708bcb0991SDimitry Andric                      : 1;
8715ffd83dbSDimitry Andric   return AdjustCost(
872e8d8bef9SDimitry Andric       BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
8730b57cec5SDimitry Andric }
8740b57cec5SDimitry Andric 
875fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
8760b57cec5SDimitry Andric                                                unsigned Index) {
8770b57cec5SDimitry Andric   // Penalize inserting into a D-subregister. We end up with a three times
8780b57cec5SDimitry Andric   // lower estimated throughput on Swift.
8790b57cec5SDimitry Andric   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
8800b57cec5SDimitry Andric       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
8810b57cec5SDimitry Andric     return 3;
8820b57cec5SDimitry Andric 
8838bcb0991SDimitry Andric   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
8840b57cec5SDimitry Andric                         Opcode == Instruction::ExtractElement)) {
8850b57cec5SDimitry Andric     // Cross-class copies are expensive on many microarchitectures,
8860b57cec5SDimitry Andric     // so assume they are expensive by default.
8875ffd83dbSDimitry Andric     if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
8880b57cec5SDimitry Andric       return 3;
8890b57cec5SDimitry Andric 
8900b57cec5SDimitry Andric     // Even if it's not a cross class copy, this likely leads to mixing
8910b57cec5SDimitry Andric     // of NEON and VFP code and should therefore be penalized.
8920b57cec5SDimitry Andric     if (ValTy->isVectorTy() &&
8930b57cec5SDimitry Andric         ValTy->getScalarSizeInBits() <= 32)
894fe6060f1SDimitry Andric       return std::max<InstructionCost>(
895fe6060f1SDimitry Andric           BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
8960b57cec5SDimitry Andric   }
8970b57cec5SDimitry Andric 
8988bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
8998bcb0991SDimitry Andric                                  Opcode == Instruction::ExtractElement)) {
900fe6060f1SDimitry Andric     // Integer cross-lane moves are more expensive than float, which can
901fe6060f1SDimitry Andric     // sometimes just be vmovs. Integer moves involve being passed through GPR
902fe6060f1SDimitry Andric     // registers, causing more of a delay.
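    // As a rough illustration: extracting an integer lane needs a vmov from
    // the vector lane into a GPR, while a float lane can often stay within
    // the FP register file; hence the 4x factor for integers below.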
903fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
904fe6060f1SDimitry Andric         getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
905fe6060f1SDimitry Andric     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
9068bcb0991SDimitry Andric   }
9078bcb0991SDimitry Andric 
9080b57cec5SDimitry Andric   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
9090b57cec5SDimitry Andric }
9100b57cec5SDimitry Andric 
911fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
912fe6060f1SDimitry Andric                                                Type *CondTy,
913e8d8bef9SDimitry Andric                                                CmpInst::Predicate VecPred,
9145ffd83dbSDimitry Andric                                                TTI::TargetCostKind CostKind,
9150b57cec5SDimitry Andric                                                const Instruction *I) {
9160b57cec5SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
917e8d8bef9SDimitry Andric 
918e8d8bef9SDimitry Andric   // Thumb scalar code size cost for select.
919e8d8bef9SDimitry Andric   if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
920e8d8bef9SDimitry Andric       ST->isThumb() && !ValTy->isVectorTy()) {
921e8d8bef9SDimitry Andric     // Assume expensive structs.
922e8d8bef9SDimitry Andric     if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
923e8d8bef9SDimitry Andric       return TTI::TCC_Expensive;
924e8d8bef9SDimitry Andric 
925e8d8bef9SDimitry Andric     // Select costs can vary because they:
926e8d8bef9SDimitry Andric     // - may require one or more conditional mov (including an IT),
927e8d8bef9SDimitry Andric     // - can't operate directly on immediates,
928e8d8bef9SDimitry Andric     // - require live flags, which we can't copy around easily.
929fe6060f1SDimitry Andric     InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;
930e8d8bef9SDimitry Andric 
931e8d8bef9SDimitry Andric     // Possible IT instruction for Thumb2, or more for Thumb1.
932e8d8bef9SDimitry Andric     ++Cost;
933e8d8bef9SDimitry Andric 
934e8d8bef9SDimitry Andric     // i1 values may need rematerialising by using mov immediates and/or
935e8d8bef9SDimitry Andric     // flag setting instructions.
936e8d8bef9SDimitry Andric     if (ValTy->isIntegerTy(1))
937e8d8bef9SDimitry Andric       ++Cost;
938e8d8bef9SDimitry Andric 
939e8d8bef9SDimitry Andric     return Cost;
940e8d8bef9SDimitry Andric   }
941e8d8bef9SDimitry Andric 
942fe6060f1SDimitry Andric   // If this is a vector min/max/abs, use the cost of that intrinsic directly
943fe6060f1SDimitry Andric   // instead. Hopefully when min/max intrinsics are more prevalent this code
944fe6060f1SDimitry Andric   // will not be needed.
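  // E.g. (an illustrative pattern): "icmp slt <4 x i32> %a, %b" feeding
  // "select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b" is costed as a single
  // smin intrinsic, with the compare itself reported as free.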
945fe6060f1SDimitry Andric   const Instruction *Sel = I;
946fe6060f1SDimitry Andric   if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
947fe6060f1SDimitry Andric       Sel->hasOneUse())
948fe6060f1SDimitry Andric     Sel = cast<Instruction>(Sel->user_back());
949fe6060f1SDimitry Andric   if (Sel && ValTy->isVectorTy() &&
950fe6060f1SDimitry Andric       (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
951fe6060f1SDimitry Andric     const Value *LHS, *RHS;
952fe6060f1SDimitry Andric     SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
953fe6060f1SDimitry Andric     unsigned IID = 0;
954fe6060f1SDimitry Andric     switch (SPF) {
955fe6060f1SDimitry Andric     case SPF_ABS:
956fe6060f1SDimitry Andric       IID = Intrinsic::abs;
957fe6060f1SDimitry Andric       break;
958fe6060f1SDimitry Andric     case SPF_SMIN:
959fe6060f1SDimitry Andric       IID = Intrinsic::smin;
960fe6060f1SDimitry Andric       break;
961fe6060f1SDimitry Andric     case SPF_SMAX:
962fe6060f1SDimitry Andric       IID = Intrinsic::smax;
963fe6060f1SDimitry Andric       break;
964fe6060f1SDimitry Andric     case SPF_UMIN:
965fe6060f1SDimitry Andric       IID = Intrinsic::umin;
966fe6060f1SDimitry Andric       break;
967fe6060f1SDimitry Andric     case SPF_UMAX:
968fe6060f1SDimitry Andric       IID = Intrinsic::umax;
969fe6060f1SDimitry Andric       break;
970fe6060f1SDimitry Andric     case SPF_FMINNUM:
971fe6060f1SDimitry Andric       IID = Intrinsic::minnum;
972fe6060f1SDimitry Andric       break;
973fe6060f1SDimitry Andric     case SPF_FMAXNUM:
974fe6060f1SDimitry Andric       IID = Intrinsic::maxnum;
975fe6060f1SDimitry Andric       break;
976fe6060f1SDimitry Andric     default:
977fe6060f1SDimitry Andric       break;
978fe6060f1SDimitry Andric     }
979fe6060f1SDimitry Andric     if (IID) {
980fe6060f1SDimitry Andric       // The ICmp is free, the select gets the cost of the min/max/etc.
981fe6060f1SDimitry Andric       if (Sel != I)
982fe6060f1SDimitry Andric         return 0;
983fe6060f1SDimitry Andric       IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
984fe6060f1SDimitry Andric       return getIntrinsicInstrCost(CostAttrs, CostKind);
985fe6060f1SDimitry Andric     }
986fe6060f1SDimitry Andric   }
987fe6060f1SDimitry Andric 
9880b57cec5SDimitry Andric   // On NEON a vector select gets lowered to vbsl.
989e8d8bef9SDimitry Andric   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
9900b57cec5SDimitry Andric     // Lowering of some vector selects is currently far from perfect.
9910b57cec5SDimitry Andric     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
9920b57cec5SDimitry Andric       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
9930b57cec5SDimitry Andric       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
9940b57cec5SDimitry Andric       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
9950b57cec5SDimitry Andric     };
9960b57cec5SDimitry Andric 
9970b57cec5SDimitry Andric     EVT SelCondTy = TLI->getValueType(DL, CondTy);
9980b57cec5SDimitry Andric     EVT SelValTy = TLI->getValueType(DL, ValTy);
9990b57cec5SDimitry Andric     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
10000b57cec5SDimitry Andric       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
10010b57cec5SDimitry Andric                                                      SelCondTy.getSimpleVT(),
10020b57cec5SDimitry Andric                                                      SelValTy.getSimpleVT()))
10030b57cec5SDimitry Andric         return Entry->Cost;
10040b57cec5SDimitry Andric     }
10050b57cec5SDimitry Andric 
1006fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
1007fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
10080b57cec5SDimitry Andric     return LT.first;
10090b57cec5SDimitry Andric   }
10100b57cec5SDimitry Andric 
1011fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1012fe6060f1SDimitry Andric       (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1013fe6060f1SDimitry Andric       cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1014fe6060f1SDimitry Andric     FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1015fe6060f1SDimitry Andric     FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1016fe6060f1SDimitry Andric     if (!VecCondTy)
1017fe6060f1SDimitry Andric       VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1018fe6060f1SDimitry Andric 
1019fe6060f1SDimitry Andric     // If we don't have mve.fp, any fp operations will need to be scalarized.
1020fe6060f1SDimitry Andric     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1021fe6060f1SDimitry Andric       // One scalarization insert, one scalarization extract and the cost of
1022fe6060f1SDimitry Andric       // the scalar fcmps.
1023fe6060f1SDimitry Andric       return BaseT::getScalarizationOverhead(VecValTy, false, true) +
1024fe6060f1SDimitry Andric              BaseT::getScalarizationOverhead(VecCondTy, true, false) +
1025fe6060f1SDimitry Andric              VecValTy->getNumElements() *
1026fe6060f1SDimitry Andric                  getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1027fe6060f1SDimitry Andric                                     VecCondTy->getScalarType(), VecPred, CostKind,
1028fe6060f1SDimitry Andric                                     I);
1029fe6060f1SDimitry Andric     }
1030fe6060f1SDimitry Andric 
1031fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
1032fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
1033fe6060f1SDimitry Andric     int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1034fe6060f1SDimitry Andric     // There are two types - the input that specifies the type of the compare
1035fe6060f1SDimitry Andric     // and the output vXi1 type. Because we don't know how the output will be
1036fe6060f1SDimitry Andric     // split, we may need an expensive shuffle to get the two in sync. This has
1037fe6060f1SDimitry Andric     // the effect of making larger than legal compares (v8i32 for example)
1038fe6060f1SDimitry Andric     // expensive.
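    // For example (illustrative): an icmp on <8 x i32> legalises to two
    // v4i32 compares (LT.first == 2), and the two v4i1 results then need to
    // be recombined, which the extra scalarization overhead models.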
1039f3fd488fSDimitry Andric     if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1040fe6060f1SDimitry Andric       if (LT.first > 1)
1041fe6060f1SDimitry Andric         return LT.first * BaseCost +
1042fe6060f1SDimitry Andric                BaseT::getScalarizationOverhead(VecCondTy, true, false);
1043fe6060f1SDimitry Andric       return BaseCost;
1044fe6060f1SDimitry Andric     }
1045fe6060f1SDimitry Andric   }
1046fe6060f1SDimitry Andric 
1047e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1048e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1049e8d8bef9SDimitry Andric   int BaseCost = 1;
1050fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1051fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
1052e8d8bef9SDimitry Andric 
1053e8d8bef9SDimitry Andric   return BaseCost *
1054e8d8bef9SDimitry Andric          BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
10550b57cec5SDimitry Andric }
10560b57cec5SDimitry Andric 
1057fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
1058fe6060f1SDimitry Andric                                                       ScalarEvolution *SE,
10590b57cec5SDimitry Andric                                                       const SCEV *Ptr) {
10600b57cec5SDimitry Andric   // Address computations in vectorized code with non-consecutive addresses will
10610b57cec5SDimitry Andric   // likely result in more instructions compared to scalar code where the
10620b57cec5SDimitry Andric   // computation can more often be merged into the index mode. The resulting
10630b57cec5SDimitry Andric   // extra micro-ops can significantly decrease throughput.
10640b57cec5SDimitry Andric   unsigned NumVectorInstToHideOverhead = 10;
10650b57cec5SDimitry Andric   int MaxMergeDistance = 64;
10660b57cec5SDimitry Andric 
10678bcb0991SDimitry Andric   if (ST->hasNEON()) {
10680b57cec5SDimitry Andric     if (Ty->isVectorTy() && SE &&
10690b57cec5SDimitry Andric         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
10700b57cec5SDimitry Andric       return NumVectorInstToHideOverhead;
10710b57cec5SDimitry Andric 
10720b57cec5SDimitry Andric     // In many cases the address computation is not merged into the instruction
10730b57cec5SDimitry Andric     // addressing mode.
10740b57cec5SDimitry Andric     return 1;
10750b57cec5SDimitry Andric   }
10768bcb0991SDimitry Andric   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
10778bcb0991SDimitry Andric }
10788bcb0991SDimitry Andric 
10795ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
10805ffd83dbSDimitry Andric   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10815ffd83dbSDimitry Andric     // If a VCTP is part of a chain, it's already profitable and shouldn't be
10825ffd83dbSDimitry Andric     // optimized, else LSR may block tail-predication.
10835ffd83dbSDimitry Andric     switch (II->getIntrinsicID()) {
10845ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp8:
10855ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp16:
10865ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp32:
10875ffd83dbSDimitry Andric     case Intrinsic::arm_mve_vctp64:
10885ffd83dbSDimitry Andric       return true;
10895ffd83dbSDimitry Andric     default:
10905ffd83dbSDimitry Andric       break;
10915ffd83dbSDimitry Andric     }
10925ffd83dbSDimitry Andric   }
10935ffd83dbSDimitry Andric   return false;
10945ffd83dbSDimitry Andric }
10955ffd83dbSDimitry Andric 
10965ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
10978bcb0991SDimitry Andric   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
10988bcb0991SDimitry Andric     return false;
10998bcb0991SDimitry Andric 
11005ffd83dbSDimitry Andric   if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
11018bcb0991SDimitry Andric     // Don't support v2i1 yet.
11028bcb0991SDimitry Andric     if (VecTy->getNumElements() == 2)
11038bcb0991SDimitry Andric       return false;
11048bcb0991SDimitry Andric 
11058bcb0991SDimitry Andric     // We don't support extending fp types.
11068bcb0991SDimitry Andric     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
11078bcb0991SDimitry Andric     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
11088bcb0991SDimitry Andric       return false;
11098bcb0991SDimitry Andric   }
11108bcb0991SDimitry Andric 
11118bcb0991SDimitry Andric   unsigned EltWidth = DataTy->getScalarSizeInBits();
11125ffd83dbSDimitry Andric   return (EltWidth == 32 && Alignment >= 4) ||
11135ffd83dbSDimitry Andric          (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
11148bcb0991SDimitry Andric }
11150b57cec5SDimitry Andric 
11165ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
1117480093f4SDimitry Andric   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1118480093f4SDimitry Andric     return false;
1119480093f4SDimitry Andric 
1120480093f4SDimitry Andric   unsigned EltWidth = Ty->getScalarSizeInBits();
11215ffd83dbSDimitry Andric   return ((EltWidth == 32 && Alignment >= 4) ||
11225ffd83dbSDimitry Andric           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1123480093f4SDimitry Andric }
1124480093f4SDimitry Andric 
1125e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory
1126e8d8bef9SDimitry Andric /// operations performed, by querying findOptimalMemOpLowering. Returns -1 if a
1127e8d8bef9SDimitry Andric /// call is used.
1128e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
1129e8d8bef9SDimitry Andric   MemOp MOp;
1130e8d8bef9SDimitry Andric   unsigned DstAddrSpace = ~0u;
1131e8d8bef9SDimitry Andric   unsigned SrcAddrSpace = ~0u;
1132e8d8bef9SDimitry Andric   const Function *F = I->getParent()->getParent();
11330b57cec5SDimitry Andric 
1134e8d8bef9SDimitry Andric   if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1135e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
11360b57cec5SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
11370b57cec5SDimitry Andric     if (!C)
1138e8d8bef9SDimitry Andric       return -1;
11390b57cec5SDimitry Andric 
11400b57cec5SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1141e8d8bef9SDimitry Andric     const Align DstAlign = *MC->getDestAlign();
1142e8d8bef9SDimitry Andric     const Align SrcAlign = *MC->getSourceAlign();
1143e8d8bef9SDimitry Andric 
1144e8d8bef9SDimitry Andric     MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1145e8d8bef9SDimitry Andric                       /*IsVolatile*/ false);
1146e8d8bef9SDimitry Andric     DstAddrSpace = MC->getDestAddressSpace();
1147e8d8bef9SDimitry Andric     SrcAddrSpace = MC->getSourceAddressSpace();
1148e8d8bef9SDimitry Andric   }
1149e8d8bef9SDimitry Andric   else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1150e8d8bef9SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1151e8d8bef9SDimitry Andric     // If 'size' is not a constant, a library call will be generated.
1152e8d8bef9SDimitry Andric     if (!C)
1153e8d8bef9SDimitry Andric       return -1;
1154e8d8bef9SDimitry Andric 
1155e8d8bef9SDimitry Andric     const unsigned Size = C->getValue().getZExtValue();
1156e8d8bef9SDimitry Andric     const Align DstAlign = *MS->getDestAlign();
1157e8d8bef9SDimitry Andric 
1158e8d8bef9SDimitry Andric     MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1159e8d8bef9SDimitry Andric                      /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1160e8d8bef9SDimitry Andric     DstAddrSpace = MS->getDestAddressSpace();
1161e8d8bef9SDimitry Andric   }
1162e8d8bef9SDimitry Andric   else
1163e8d8bef9SDimitry Andric     llvm_unreachable("Expected a memcpy/move or memset!");
1164e8d8bef9SDimitry Andric 
1165e8d8bef9SDimitry Andric   unsigned Limit, Factor = 2;
1166e8d8bef9SDimitry Andric   switch(I->getIntrinsicID()) {
1167e8d8bef9SDimitry Andric     case Intrinsic::memcpy:
1168e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1169e8d8bef9SDimitry Andric       break;
1170e8d8bef9SDimitry Andric     case Intrinsic::memmove:
1171e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1172e8d8bef9SDimitry Andric       break;
1173e8d8bef9SDimitry Andric     case Intrinsic::memset:
1174e8d8bef9SDimitry Andric       Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1175e8d8bef9SDimitry Andric       Factor = 1;
1176e8d8bef9SDimitry Andric       break;
1177e8d8bef9SDimitry Andric     default:
1178e8d8bef9SDimitry Andric       llvm_unreachable("Expected a memcpy/move or memset!");
1179e8d8bef9SDimitry Andric   }
11800b57cec5SDimitry Andric 
11810b57cec5SDimitry Andric   // MemOps will be populated with a list of data types that need to be
11820b57cec5SDimitry Andric   // loaded and stored. That's why we multiply the number of elements by the
11830b57cec5SDimitry Andric   // Factor (2 for memcpy/memmove, 1 for memset) to get the cost.
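  // For instance (a sketch; the exact types chosen depend on the target): a
  // 16-byte 4-byte-aligned memcpy may lower to MemOps == {i32,i32,i32,i32},
  // giving 4 * Factor(2) = 8 operations (four loads plus four stores). A
  // memset only stores, which is why its Factor is 1.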
1184e8d8bef9SDimitry Andric   std::vector<EVT> MemOps;
11850b57cec5SDimitry Andric   if (getTLI()->findOptimalMemOpLowering(
1186e8d8bef9SDimitry Andric           MemOps, Limit, MOp, DstAddrSpace,
1187e8d8bef9SDimitry Andric           SrcAddrSpace, F->getAttributes()))
1188e8d8bef9SDimitry Andric     return MemOps.size() * Factor;
11890b57cec5SDimitry Andric 
11900b57cec5SDimitry Andric   // If we can't find an optimal memop lowering, return the default cost.
1191e8d8bef9SDimitry Andric   return -1;
1192e8d8bef9SDimitry Andric }
1193e8d8bef9SDimitry Andric 
1194fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
1195e8d8bef9SDimitry Andric   int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1196e8d8bef9SDimitry Andric 
1197e8d8bef9SDimitry Andric   // To model the cost of a library call, we assume 1 for the call, and
1198e8d8bef9SDimitry Andric   // 3 for the argument setup.
1199e8d8bef9SDimitry Andric   if (NumOps == -1)
1200e8d8bef9SDimitry Andric     return 4;
1201e8d8bef9SDimitry Andric   return NumOps;
12020b57cec5SDimitry Andric }
12030b57cec5SDimitry Andric 
1204fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1205fe6060f1SDimitry Andric                                            VectorType *Tp, ArrayRef<int> Mask,
120681ad6265SDimitry Andric                                            int Index, VectorType *SubTp,
120781ad6265SDimitry Andric                                            ArrayRef<const Value *> Args) {
1208fe6060f1SDimitry Andric   Kind = improveShuffleKindFromMask(Kind, Mask);
12098bcb0991SDimitry Andric   if (ST->hasNEON()) {
12100b57cec5SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12110b57cec5SDimitry Andric       static const CostTblEntry NEONDupTbl[] = {
12120b57cec5SDimitry Andric           // VDUP handles these cases.
12130b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12140b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12150b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12160b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12170b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12180b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12190b57cec5SDimitry Andric 
12200b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12210b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12220b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12230b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
12240b57cec5SDimitry Andric 
1225fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12268bcb0991SDimitry Andric       if (const auto *Entry =
12278bcb0991SDimitry Andric               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
12280b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12290b57cec5SDimitry Andric     }
12300b57cec5SDimitry Andric     if (Kind == TTI::SK_Reverse) {
12310b57cec5SDimitry Andric       static const CostTblEntry NEONShuffleTbl[] = {
12320b57cec5SDimitry Andric           // Reverse shuffles cost one instruction if we are shuffling within a
12330b57cec5SDimitry Andric           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
12340b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12350b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12360b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12370b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12380b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
12390b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
12400b57cec5SDimitry Andric 
12410b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12420b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12430b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
12440b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
12450b57cec5SDimitry Andric 
1246fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12478bcb0991SDimitry Andric       if (const auto *Entry =
12488bcb0991SDimitry Andric               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
12490b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12500b57cec5SDimitry Andric     }
12510b57cec5SDimitry Andric     if (Kind == TTI::SK_Select) {
12520b57cec5SDimitry Andric       static const CostTblEntry NEONSelShuffleTbl[] = {
12538bcb0991SDimitry Andric           // Select shuffle cost table for ARM. Cost is the number of
12548bcb0991SDimitry Andric           // instructions required to create the shuffled vector.
12560b57cec5SDimitry Andric 
12570b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
12580b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
12590b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
12600b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
12610b57cec5SDimitry Andric 
12620b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
12630b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
12640b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
12650b57cec5SDimitry Andric 
12660b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
12670b57cec5SDimitry Andric 
12680b57cec5SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
12690b57cec5SDimitry Andric 
1270fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12710b57cec5SDimitry Andric       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
12720b57cec5SDimitry Andric                                               ISD::VECTOR_SHUFFLE, LT.second))
12730b57cec5SDimitry Andric         return LT.first * Entry->Cost;
12740b57cec5SDimitry Andric     }
12758bcb0991SDimitry Andric   }
12768bcb0991SDimitry Andric   if (ST->hasMVEIntegerOps()) {
12778bcb0991SDimitry Andric     if (Kind == TTI::SK_Broadcast) {
12788bcb0991SDimitry Andric       static const CostTblEntry MVEDupTbl[] = {
12798bcb0991SDimitry Andric           // VDUP handles these cases.
12808bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
12818bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
12828bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
12838bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
12848bcb0991SDimitry Andric           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
12858bcb0991SDimitry Andric 
1286fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
12878bcb0991SDimitry Andric       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
12888bcb0991SDimitry Andric                                               LT.second))
1289fe6060f1SDimitry Andric         return LT.first * Entry->Cost *
1290fe6060f1SDimitry Andric                ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
12910b57cec5SDimitry Andric     }
12920b57cec5SDimitry Andric 
1293fe6060f1SDimitry Andric     if (!Mask.empty()) {
1294fe6060f1SDimitry Andric       std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
129556f451bbSDimitry Andric       if (LT.second.isVector() &&
129656f451bbSDimitry Andric           Mask.size() <= LT.second.getVectorNumElements() &&
1297fe6060f1SDimitry Andric           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1298fe6060f1SDimitry Andric            isVREVMask(Mask, LT.second, 64)))
1299fe6060f1SDimitry Andric         return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
1300fe6060f1SDimitry Andric     }
1301fe6060f1SDimitry Andric   }
1302fe6060f1SDimitry Andric 
1303fe6060f1SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
1304fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
1305fe6060f1SDimitry Andric                      : 1;
1306fe6060f1SDimitry Andric   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
1307fe6060f1SDimitry Andric }
1308fe6060f1SDimitry Andric 
1309fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost(
1310fe6060f1SDimitry Andric     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1311fe6060f1SDimitry Andric     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
1312480093f4SDimitry Andric     TTI::OperandValueProperties Opd1PropInfo,
1313fe6060f1SDimitry Andric     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
1314480093f4SDimitry Andric     const Instruction *CxtI) {
13150b57cec5SDimitry Andric   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1316e8d8bef9SDimitry Andric   if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1317e8d8bef9SDimitry Andric     // Make operations on i1 relatively expensive as this often involves
1318e8d8bef9SDimitry Andric     // combining predicates. AND and XOR should be easier to handle with IT
1319e8d8bef9SDimitry Andric     // blocks.
1320e8d8bef9SDimitry Andric     switch (ISDOpcode) {
1321e8d8bef9SDimitry Andric     default:
1322e8d8bef9SDimitry Andric       break;
1323e8d8bef9SDimitry Andric     case ISD::AND:
1324e8d8bef9SDimitry Andric     case ISD::XOR:
1325e8d8bef9SDimitry Andric       return 2;
1326e8d8bef9SDimitry Andric     case ISD::OR:
1327e8d8bef9SDimitry Andric       return 3;
1328e8d8bef9SDimitry Andric     }
1329e8d8bef9SDimitry Andric   }
1330e8d8bef9SDimitry Andric 
1331fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
13320b57cec5SDimitry Andric 
1333480093f4SDimitry Andric   if (ST->hasNEON()) {
13340b57cec5SDimitry Andric     const unsigned FunctionCallDivCost = 20;
13350b57cec5SDimitry Andric     const unsigned ReciprocalDivCost = 10;
13360b57cec5SDimitry Andric     static const CostTblEntry CostTbl[] = {
13370b57cec5SDimitry Andric       // Division.
13380b57cec5SDimitry Andric       // These costs are somewhat random. Choose a cost of 20 to indicate that
13390b57cec5SDimitry Andric       // vectorizing division (added function call) is going to be very expensive.
13400b57cec5SDimitry Andric       // Double register types.
13410b57cec5SDimitry Andric       { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13420b57cec5SDimitry Andric       { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
13430b57cec5SDimitry Andric       { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
13440b57cec5SDimitry Andric       { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
13450b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13460b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
13470b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
13480b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
13490b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
13500b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
13510b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
13520b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
13530b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
13540b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
13550b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
13560b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
13570b57cec5SDimitry Andric       // Quad register types.
13580b57cec5SDimitry Andric       { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13590b57cec5SDimitry Andric       { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
13600b57cec5SDimitry Andric       { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
13610b57cec5SDimitry Andric       { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
13620b57cec5SDimitry Andric       { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13630b57cec5SDimitry Andric       { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
13640b57cec5SDimitry Andric       { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
13650b57cec5SDimitry Andric       { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
13660b57cec5SDimitry Andric       { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13670b57cec5SDimitry Andric       { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
13680b57cec5SDimitry Andric       { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
13690b57cec5SDimitry Andric       { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
13700b57cec5SDimitry Andric       { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13710b57cec5SDimitry Andric       { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
13720b57cec5SDimitry Andric       { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
13730b57cec5SDimitry Andric       { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
13740b57cec5SDimitry Andric       // Multiplication.
13750b57cec5SDimitry Andric     };
13760b57cec5SDimitry Andric 
13770b57cec5SDimitry Andric     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
13780b57cec5SDimitry Andric       return LT.first * Entry->Cost;
13790b57cec5SDimitry Andric 
1380fe6060f1SDimitry Andric     InstructionCost Cost = BaseT::getArithmeticInstrCost(
1381fe6060f1SDimitry Andric         Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
13820b57cec5SDimitry Andric 
13830b57cec5SDimitry Andric     // This is somewhat of a hack. The problem that we are facing is that SROA
13840b57cec5SDimitry Andric     // creates a sequence of shift, and, or instructions to construct values.
13850b57cec5SDimitry Andric     // These sequences are recognized by the ISel and have zero-cost. Not so for
13860b57cec5SDimitry Andric     // the vectorized code. Because we have support for v2i64 but not i64 those
13870b57cec5SDimitry Andric     // sequences look particularly beneficial to vectorize.
13880b57cec5SDimitry Andric     // To work around this we increase the cost of v2i64 operations to make them
13890b57cec5SDimitry Andric     // seem less beneficial.
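    // An illustrative sketch of the kind of sequence meant here:
    //   %hi = shl i64 %a, 32
    //   %v  = or i64 %hi, %lo
    // is folded away by ISel for scalar i64, but the equivalent v2i64 shift
    // and or are real instructions, hence the extra cost of 4 below.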
13900b57cec5SDimitry Andric     if (LT.second == MVT::v2i64 &&
13910b57cec5SDimitry Andric         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
13920b57cec5SDimitry Andric       Cost += 4;
13930b57cec5SDimitry Andric 
13940b57cec5SDimitry Andric     return Cost;
13950b57cec5SDimitry Andric   }
13960b57cec5SDimitry Andric 
1397480093f4SDimitry Andric   // If this operation is a shift on arm/thumb2, it might well be folded into
1398480093f4SDimitry Andric   // the following instruction, hence having a cost of 0.
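  // E.g. (illustrative): "%s = shl i32 %x, 2" followed by
  // "%a = add i32 %y, %s" can become a single "add r0, r1, r2, lsl #2",
  // making the shift itself free.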
1399480093f4SDimitry Andric   auto LooksLikeAFreeShift = [&]() {
1400480093f4SDimitry Andric     if (ST->isThumb1Only() || Ty->isVectorTy())
1401480093f4SDimitry Andric       return false;
1402480093f4SDimitry Andric 
1403480093f4SDimitry Andric     if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1404480093f4SDimitry Andric       return false;
1405480093f4SDimitry Andric     if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
1406480093f4SDimitry Andric       return false;
1407480093f4SDimitry Andric 
1408480093f4SDimitry Andric     // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1409480093f4SDimitry Andric     switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1410480093f4SDimitry Andric     case Instruction::Add:
1411480093f4SDimitry Andric     case Instruction::Sub:
1412480093f4SDimitry Andric     case Instruction::And:
1413480093f4SDimitry Andric     case Instruction::Xor:
1414480093f4SDimitry Andric     case Instruction::Or:
1415480093f4SDimitry Andric     case Instruction::ICmp:
1416480093f4SDimitry Andric       return true;
1417480093f4SDimitry Andric     default:
1418480093f4SDimitry Andric       return false;
1419480093f4SDimitry Andric     }
1420480093f4SDimitry Andric   };
1421480093f4SDimitry Andric   if (LooksLikeAFreeShift())
1422480093f4SDimitry Andric     return 0;
1423480093f4SDimitry Andric 
1424e8d8bef9SDimitry Andric   // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1425e8d8bef9SDimitry Andric   // for "multiple beats" potentially needed by MVE instructions.
1426e8d8bef9SDimitry Andric   int BaseCost = 1;
1427fe6060f1SDimitry Andric   if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1428fe6060f1SDimitry Andric     BaseCost = ST->getMVEVectorCostFactor(CostKind);
14298bcb0991SDimitry Andric 
14308bcb0991SDimitry Andric   // The rest of this mostly follows what is done in
14318bcb0991SDimitry Andric   // BaseT::getArithmeticInstrCost, without treating floats as more expensive
14328bcb0991SDimitry Andric   // than scalars or increasing the costs for custom operations. The result is
14338bcb0991SDimitry Andric   // also multiplied by the MVEVectorCostFactor where appropriate.
14348bcb0991SDimitry Andric   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
14358bcb0991SDimitry Andric     return LT.first * BaseCost;
14368bcb0991SDimitry Andric 
14378bcb0991SDimitry Andric   // Else this is expand, assume that we need to scalarize this op.
14385ffd83dbSDimitry Andric   if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
14395ffd83dbSDimitry Andric     unsigned Num = VTy->getNumElements();
1440fe6060f1SDimitry Andric     InstructionCost Cost =
1441fe6060f1SDimitry Andric         getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
14428bcb0991SDimitry Andric     // Return the cost of multiple scalar invocation plus the cost of
14438bcb0991SDimitry Andric     // inserting and extracting the values.
1444fe6060f1SDimitry Andric     SmallVector<Type *> Tys(Args.size(), Ty);
1445fe6060f1SDimitry Andric     return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
14468bcb0991SDimitry Andric   }
14478bcb0991SDimitry Andric 
14488bcb0991SDimitry Andric   return BaseCost;
14498bcb0991SDimitry Andric }
14508bcb0991SDimitry Andric 
1451fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1452fe6060f1SDimitry Andric                                             MaybeAlign Alignment,
1453fe6060f1SDimitry Andric                                             unsigned AddressSpace,
14545ffd83dbSDimitry Andric                                             TTI::TargetCostKind CostKind,
1455480093f4SDimitry Andric                                             const Instruction *I) {
14565ffd83dbSDimitry Andric   // TODO: Handle other cost kinds.
14575ffd83dbSDimitry Andric   if (CostKind != TTI::TCK_RecipThroughput)
14585ffd83dbSDimitry Andric     return 1;
14595ffd83dbSDimitry Andric 
14605ffd83dbSDimitry Andric   // Type legalization can't handle structs
14615ffd83dbSDimitry Andric   if (TLI->getValueType(DL, Src, true) == MVT::Other)
14625ffd83dbSDimitry Andric     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14635ffd83dbSDimitry Andric                                   CostKind);
14640b57cec5SDimitry Andric 
1465480093f4SDimitry Andric   if (ST->hasNEON() && Src->isVectorTy() &&
1466480093f4SDimitry Andric       (Alignment && *Alignment != Align(16)) &&
14675ffd83dbSDimitry Andric       cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
14680b57cec5SDimitry Andric     // Unaligned loads/stores are extremely inefficient.
14690b57cec5SDimitry Andric     // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1470fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
14710b57cec5SDimitry Andric     return LT.first * 4;
14720b57cec5SDimitry Andric   }
14735ffd83dbSDimitry Andric 
14745ffd83dbSDimitry Andric   // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
14755ffd83dbSDimitry Andric   // Same for stores.
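  // A sketch of the pattern being matched (illustrative): "%l = load
  // <4 x half>" followed by "fpext <4 x half> %l to <4 x float>" can reuse an
  // extending integer load for the widening, so the pair is costed as a
  // single MVE operation here.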
14765ffd83dbSDimitry Andric   if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
14775ffd83dbSDimitry Andric       ((Opcode == Instruction::Load && I->hasOneUse() &&
14785ffd83dbSDimitry Andric         isa<FPExtInst>(*I->user_begin())) ||
14795ffd83dbSDimitry Andric        (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
14805ffd83dbSDimitry Andric     FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
14815ffd83dbSDimitry Andric     Type *DstTy =
14825ffd83dbSDimitry Andric         Opcode == Instruction::Load
14835ffd83dbSDimitry Andric             ? (*I->user_begin())->getType()
14845ffd83dbSDimitry Andric             : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
14855ffd83dbSDimitry Andric     if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
14865ffd83dbSDimitry Andric         DstTy->getScalarType()->isFloatTy())
1487fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
14885ffd83dbSDimitry Andric   }
14895ffd83dbSDimitry Andric 
14908bcb0991SDimitry Andric   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1491fe6060f1SDimitry Andric                      ? ST->getMVEVectorCostFactor(CostKind)
14928bcb0991SDimitry Andric                      : 1;
14935ffd83dbSDimitry Andric   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
14945ffd83dbSDimitry Andric                                            CostKind, I);
14950b57cec5SDimitry Andric }
14960b57cec5SDimitry Andric 
1497fe6060f1SDimitry Andric InstructionCost
1498fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1499e8d8bef9SDimitry Andric                                   unsigned AddressSpace,
1500e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1501e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps()) {
1502e8d8bef9SDimitry Andric     if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment))
1503fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1504e8d8bef9SDimitry Andric     if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment))
1505fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind);
1506e8d8bef9SDimitry Andric   }
1507e8d8bef9SDimitry Andric   if (!isa<FixedVectorType>(Src))
1508e8d8bef9SDimitry Andric     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1509e8d8bef9SDimitry Andric                                         CostKind);
1510e8d8bef9SDimitry Andric   // Scalar cost, which is currently very high due to the inefficiency of the
1511e8d8bef9SDimitry Andric   // generated code.
1512e8d8bef9SDimitry Andric   return cast<FixedVectorType>(Src)->getNumElements() * 8;
1513e8d8bef9SDimitry Andric }
1514e8d8bef9SDimitry Andric 
1515fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1516480093f4SDimitry Andric     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
15175ffd83dbSDimitry Andric     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
15185ffd83dbSDimitry Andric     bool UseMaskForCond, bool UseMaskForGaps) {
15190b57cec5SDimitry Andric   assert(Factor >= 2 && "Invalid interleave factor");
15200b57cec5SDimitry Andric   assert(isa<VectorType>(VecTy) && "Expect a vector type");
15210b57cec5SDimitry Andric 
15220b57cec5SDimitry Andric   // vldN/vstN don't support vector types with i64/f64 elements.
15230b57cec5SDimitry Andric   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
15240b57cec5SDimitry Andric 
15250b57cec5SDimitry Andric   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
15260b57cec5SDimitry Andric       !UseMaskForCond && !UseMaskForGaps) {
15275ffd83dbSDimitry Andric     unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
15285ffd83dbSDimitry Andric     auto *SubVecTy =
15295ffd83dbSDimitry Andric         FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
15300b57cec5SDimitry Andric 
15310b57cec5SDimitry Andric     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
15320b57cec5SDimitry Andric     // Accesses having vector types that are a multiple of 128 bits can be
15330b57cec5SDimitry Andric     // matched to more than one vldN/vstN instruction.
1534fe6060f1SDimitry Andric     int BaseCost =
1535fe6060f1SDimitry Andric         ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
15360b57cec5SDimitry Andric     if (NumElts % Factor == 0 &&
1537fe6060f1SDimitry Andric         TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1538480093f4SDimitry Andric       return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1539480093f4SDimitry Andric 
1540480093f4SDimitry Andric     // Some smaller than legal interleaved patterns are cheap as we can make
1541480093f4SDimitry Andric     // use of the vmovn or vrev patterns to interleave a standard load. This is
1542480093f4SDimitry Andric     // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1543480093f4SDimitry Andric     // promoted differently). The cost of 2 here is then a load and vrev or
1544480093f4SDimitry Andric     // vmovn.
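    // For instance (an illustrative sketch): de-interleaving a <16 x i8>
    // load into its even and odd <8 x i8> halves can be done with one load
    // plus one vrev/vmovn style operation, giving the 2 * BaseCost below.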
1545480093f4SDimitry Andric     if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1546e8d8bef9SDimitry Andric         VecTy->isIntOrIntVectorTy() &&
1547e8d8bef9SDimitry Andric         DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
1548480093f4SDimitry Andric       return 2 * BaseCost;
15490b57cec5SDimitry Andric   }
15500b57cec5SDimitry Andric 
15510b57cec5SDimitry Andric   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
15525ffd83dbSDimitry Andric                                            Alignment, AddressSpace, CostKind,
15530b57cec5SDimitry Andric                                            UseMaskForCond, UseMaskForGaps);
15540b57cec5SDimitry Andric }
15550b57cec5SDimitry Andric 
1556fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1557fe6060f1SDimitry Andric     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1558fe6060f1SDimitry Andric     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
15595ffd83dbSDimitry Andric   using namespace PatternMatch;
15605ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
15615ffd83dbSDimitry Andric     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
15625ffd83dbSDimitry Andric                                          Alignment, CostKind, I);
15635ffd83dbSDimitry Andric 
15645ffd83dbSDimitry Andric   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
15655ffd83dbSDimitry Andric   auto *VTy = cast<FixedVectorType>(DataTy);
15665ffd83dbSDimitry Andric 
15675ffd83dbSDimitry Andric   // TODO: Splitting, once we do that.
15685ffd83dbSDimitry Andric 
15695ffd83dbSDimitry Andric   unsigned NumElems = VTy->getNumElements();
15705ffd83dbSDimitry Andric   unsigned EltSize = VTy->getScalarSizeInBits();
1571fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
15725ffd83dbSDimitry Andric 
15735ffd83dbSDimitry Andric   // For now, it is assumed that for the MVE gather instructions the loads are
15745ffd83dbSDimitry Andric   // all effectively serialised. This means the cost is the scalar cost
15755ffd83dbSDimitry Andric   // multiplied by the number of elements being loaded. This is possibly very
15765ffd83dbSDimitry Andric   // conservative, but even so we still end up vectorising loops because the
15775ffd83dbSDimitry Andric   // cost per iteration for many loops is lower than for scalar loops.
1578fe6060f1SDimitry Andric   InstructionCost VectorCost =
1579fe6060f1SDimitry Andric       NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
15805ffd83dbSDimitry Andric   // The scalarization cost should be a lot higher. We use the number of vector
15815ffd83dbSDimitry Andric   // elements plus the scalarization overhead.
1582fe6060f1SDimitry Andric   InstructionCost ScalarCost =
1583fe6060f1SDimitry Andric       NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
1584fe6060f1SDimitry Andric       BaseT::getScalarizationOverhead(VTy, false, true);
15855ffd83dbSDimitry Andric 
1586e8d8bef9SDimitry Andric   if (EltSize < 8 || Alignment < EltSize / 8)
15875ffd83dbSDimitry Andric     return ScalarCost;
15885ffd83dbSDimitry Andric 
15895ffd83dbSDimitry Andric   unsigned ExtSize = EltSize;
15905ffd83dbSDimitry Andric   // Check whether there's a single user that asks for an extended type.
15915ffd83dbSDimitry Andric   if (I != nullptr) {
15925ffd83dbSDimitry Andric     // Depending on the caller of this function, a gather instruction will
15935ffd83dbSDimitry Andric     // either have opcode Instruction::Load or be a call to the masked_gather
15945ffd83dbSDimitry Andric     // intrinsic.
15955ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Load ||
15965ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
15975ffd83dbSDimitry Andric         I->hasOneUse()) {
15985ffd83dbSDimitry Andric       const User *Us = *I->users().begin();
15995ffd83dbSDimitry Andric       if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
16005ffd83dbSDimitry Andric         // Only allow valid type combinations.
16015ffd83dbSDimitry Andric         unsigned TypeSize =
16025ffd83dbSDimitry Andric             cast<Instruction>(Us)->getType()->getScalarSizeInBits();
16035ffd83dbSDimitry Andric         if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
16045ffd83dbSDimitry Andric              (TypeSize == 16 && EltSize == 8)) &&
16055ffd83dbSDimitry Andric             TypeSize * NumElems == 128) {
16065ffd83dbSDimitry Andric           ExtSize = TypeSize;
16075ffd83dbSDimitry Andric         }
16085ffd83dbSDimitry Andric       }
16095ffd83dbSDimitry Andric     }
16105ffd83dbSDimitry Andric     // Check whether the input data needs to be truncated.
16115ffd83dbSDimitry Andric     TruncInst *T;
16125ffd83dbSDimitry Andric     if ((I->getOpcode() == Instruction::Store ||
16135ffd83dbSDimitry Andric          match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
16145ffd83dbSDimitry Andric         (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
16155ffd83dbSDimitry Andric       // Only allow valid type combinations.
16165ffd83dbSDimitry Andric       unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
16175ffd83dbSDimitry Andric       if (((EltSize == 16 && TypeSize == 32) ||
16185ffd83dbSDimitry Andric            (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
16195ffd83dbSDimitry Andric           TypeSize * NumElems == 128)
16205ffd83dbSDimitry Andric         ExtSize = TypeSize;
16215ffd83dbSDimitry Andric     }
16225ffd83dbSDimitry Andric   }
16235ffd83dbSDimitry Andric 
16245ffd83dbSDimitry Andric   if (ExtSize * NumElems != 128 || NumElems < 4)
16255ffd83dbSDimitry Andric     return ScalarCost;
16265ffd83dbSDimitry Andric 
16275ffd83dbSDimitry Andric   // Any (aligned) i32 gather will not need to be scalarised.
16285ffd83dbSDimitry Andric   if (ExtSize == 32)
16295ffd83dbSDimitry Andric     return VectorCost;
16305ffd83dbSDimitry Andric   // For smaller types, we need to ensure that the gep's inputs are correctly
16315ffd83dbSDimitry Andric   // extended from a small enough value. Other sizes (including i64) are
16325ffd83dbSDimitry Andric   // scalarized for now.
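  // Sketch (illustrative): offsets produced by a zext such as
  // "zext <8 x i8> %o to <8 x i16>" are known to stay within range for an
  // 8/16-bit element gather, so such a gep gets VectorCost below;
  // sign-extended or wider offsets fall back to ScalarCost.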
16335ffd83dbSDimitry Andric   if (ExtSize != 8 && ExtSize != 16)
16345ffd83dbSDimitry Andric     return ScalarCost;
16355ffd83dbSDimitry Andric 
16365ffd83dbSDimitry Andric   if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
16375ffd83dbSDimitry Andric     Ptr = BC->getOperand(0);
16385ffd83dbSDimitry Andric   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
16395ffd83dbSDimitry Andric     if (GEP->getNumOperands() != 2)
16405ffd83dbSDimitry Andric       return ScalarCost;
16415ffd83dbSDimitry Andric     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
16425ffd83dbSDimitry Andric     // Scale needs to be correct (which is only relevant for i16s).
16435ffd83dbSDimitry Andric     if (Scale != 1 && Scale * 8 != ExtSize)
16445ffd83dbSDimitry Andric       return ScalarCost;
16455ffd83dbSDimitry Andric     // And we need to zext (not sext) the indexes from a small enough type.
16465ffd83dbSDimitry Andric     if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
16475ffd83dbSDimitry Andric       if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
16485ffd83dbSDimitry Andric         return VectorCost;
16495ffd83dbSDimitry Andric     }
16505ffd83dbSDimitry Andric     return ScalarCost;
16515ffd83dbSDimitry Andric   }
16525ffd83dbSDimitry Andric   return ScalarCost;
16535ffd83dbSDimitry Andric }
16545ffd83dbSDimitry Andric 
1655fe6060f1SDimitry Andric InstructionCost
1656fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1657fe6060f1SDimitry Andric                                        Optional<FastMathFlags> FMF,
1658e8d8bef9SDimitry Andric                                        TTI::TargetCostKind CostKind) {
1659fe6060f1SDimitry Andric   if (TTI::requiresOrderedReduction(FMF))
1660fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1661fe6060f1SDimitry Andric 
1662e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1663e8d8bef9SDimitry Andric   int ISD = TLI->InstructionOpcodeToISD(Opcode);
1664e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
1665fe6060f1SDimitry Andric     return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1666e8d8bef9SDimitry Andric 
1667fe6060f1SDimitry Andric   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
1668e8d8bef9SDimitry Andric 
1669e8d8bef9SDimitry Andric   static const CostTblEntry CostTblAdd[]{
1670e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v16i8, 1},
1671e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v8i16, 1},
1672e8d8bef9SDimitry Andric       {ISD::ADD, MVT::v4i32, 1},
1673e8d8bef9SDimitry Andric   };
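  // E.g. (illustrative) a reduction such as
  //   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  // matches the v4i32 entry above and is a single VADDV under MVE, so it is
  // costed at one unit scaled by the MVE vector cost factor.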
1674e8d8bef9SDimitry Andric   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1675fe6060f1SDimitry Andric     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1676e8d8bef9SDimitry Andric 
1677fe6060f1SDimitry Andric   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1678e8d8bef9SDimitry Andric }
1679e8d8bef9SDimitry Andric 
1680e8d8bef9SDimitry Andric InstructionCost
1681e8d8bef9SDimitry Andric ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
1682e8d8bef9SDimitry Andric                                         Type *ResTy, VectorType *ValTy,
1683e8d8bef9SDimitry Andric                                         TTI::TargetCostKind CostKind) {
1684e8d8bef9SDimitry Andric   EVT ValVT = TLI->getValueType(DL, ValTy);
1685e8d8bef9SDimitry Andric   EVT ResVT = TLI->getValueType(DL, ResTy);
1686349cc55cSDimitry Andric 
1687e8d8bef9SDimitry Andric   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1688fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT =
1689fe6060f1SDimitry Andric         TLI->getTypeLegalizationCost(DL, ValTy);
1690349cc55cSDimitry Andric 
1691349cc55cSDimitry Andric     // The legal cases are:
1692349cc55cSDimitry Andric     //   VADDV u/s 8/16/32
1693349cc55cSDimitry Andric     //   VMLAV u/s 8/16/32
1694349cc55cSDimitry Andric     //   VADDLV u/s 32
1695349cc55cSDimitry Andric     //   VMLALV u/s 16/32
1696349cc55cSDimitry Andric     // Codegen currently cannot always handle larger-than-legal vectors very
1697349cc55cSDimitry Andric     // well, especially for predicated reductions where the mask needs to be
1698349cc55cSDimitry Andric     // split, so restrict this to 128-bit or smaller input types.
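    // E.g. (illustrative) a pattern such as
    //   %e = sext <8 x i16> %x to <8 x i32>
    //   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
    // legalizes to v8i16 with an i32 result, and so can be selected as a
    // single VADDV.S16; the equivalent extend+mul+reduce form maps onto
    // VMLAV/VMLALV.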
1699349cc55cSDimitry Andric     unsigned ResVTSize = ResVT.getSizeInBits();
1700349cc55cSDimitry Andric     if (ValVT.getSizeInBits() <= 128 &&
1701349cc55cSDimitry Andric         ((LT.second == MVT::v16i8 && ResVTSize <= 32) ||
1702349cc55cSDimitry Andric          (LT.second == MVT::v8i16 && ResVTSize <= (IsMLA ? 64u : 32u)) ||
1703349cc55cSDimitry Andric          (LT.second == MVT::v4i32 && ResVTSize <= 64)))
1704fe6060f1SDimitry Andric       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1705e8d8bef9SDimitry Andric   }
1706e8d8bef9SDimitry Andric 
1707e8d8bef9SDimitry Andric   return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
1708e8d8bef9SDimitry Andric                                             CostKind);
1709e8d8bef9SDimitry Andric }
1710e8d8bef9SDimitry Andric 
1711fe6060f1SDimitry Andric InstructionCost
1712fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1713e8d8bef9SDimitry Andric                                   TTI::TargetCostKind CostKind) {
1714e8d8bef9SDimitry Andric   switch (ICA.getID()) {
1715e8d8bef9SDimitry Andric   case Intrinsic::get_active_lane_mask:
1716e8d8bef9SDimitry Andric     // Currently we make the somewhat optimistic assumption that
1717e8d8bef9SDimitry Andric     // active_lane_masks are always free. In reality one may be freely folded
1718e8d8bef9SDimitry Andric     // into a tail-predicated loop, expanded into a VCTP or expanded into a lot
1719e8d8bef9SDimitry Andric     // of add/icmp code. We may need to improve this in the future, but being
1720e8d8bef9SDimitry Andric     // able to detect whether it is free or not involves looking at a lot of
1721e8d8bef9SDimitry Andric     // other code. We currently assume that the vectorizer inserted these, and
1722e8d8bef9SDimitry Andric     // knew what it was doing when adding one.
1723e8d8bef9SDimitry Andric     if (ST->hasMVEIntegerOps())
1724e8d8bef9SDimitry Andric       return 0;
1725e8d8bef9SDimitry Andric     break;
1726e8d8bef9SDimitry Andric   case Intrinsic::sadd_sat:
1727e8d8bef9SDimitry Andric   case Intrinsic::ssub_sat:
1728e8d8bef9SDimitry Andric   case Intrinsic::uadd_sat:
1729e8d8bef9SDimitry Andric   case Intrinsic::usub_sat: {
1730e8d8bef9SDimitry Andric     if (!ST->hasMVEIntegerOps())
1731e8d8bef9SDimitry Andric       break;
1732e8d8bef9SDimitry Andric     Type *VT = ICA.getReturnType();
1733e8d8bef9SDimitry Andric 
1734fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1735e8d8bef9SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1736e8d8bef9SDimitry Andric         LT.second == MVT::v16i8) {
1737fe6060f1SDimitry Andric       // This is a base cost of 1 for the vqadd, plus 3 extra shifts if we
1738e8d8bef9SDimitry Andric       // need to extend the type, as it uses shr(qadd(shl, shl)).
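      // A rough illustration: a v8i16 @llvm.sadd.sat maps directly onto
      // VQADD.S16 (1 instruction), whereas a hypothetical v8i8 operand is
      // promoted to v8i16 and lowered to something like
      //   vshl.i16 q0, q0, #8 ; vshl.i16 q1, q1, #8
      //   vqadd.s16 q0, q0, q1
      //   vshr.s16 q0, q0, #8
      // i.e. 4 instructions.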
1739fe6060f1SDimitry Andric       unsigned Instrs =
1740fe6060f1SDimitry Andric           LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1741fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
1742e8d8bef9SDimitry Andric     }
1743e8d8bef9SDimitry Andric     break;
1744e8d8bef9SDimitry Andric   }
1745fe6060f1SDimitry Andric   case Intrinsic::abs:
1746fe6060f1SDimitry Andric   case Intrinsic::smin:
1747fe6060f1SDimitry Andric   case Intrinsic::smax:
1748fe6060f1SDimitry Andric   case Intrinsic::umin:
1749fe6060f1SDimitry Andric   case Intrinsic::umax: {
1750fe6060f1SDimitry Andric     if (!ST->hasMVEIntegerOps())
1751fe6060f1SDimitry Andric       break;
1752fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1753fe6060f1SDimitry Andric 
1754fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1755fe6060f1SDimitry Andric     if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1756fe6060f1SDimitry Andric         LT.second == MVT::v16i8)
1757fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1758fe6060f1SDimitry Andric     break;
1759fe6060f1SDimitry Andric   }
1760fe6060f1SDimitry Andric   case Intrinsic::minnum:
1761fe6060f1SDimitry Andric   case Intrinsic::maxnum: {
1762fe6060f1SDimitry Andric     if (!ST->hasMVEFloatOps())
1763fe6060f1SDimitry Andric       break;
1764fe6060f1SDimitry Andric     Type *VT = ICA.getReturnType();
1765fe6060f1SDimitry Andric     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
1766fe6060f1SDimitry Andric     if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
1767fe6060f1SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
1768fe6060f1SDimitry Andric     break;
1769fe6060f1SDimitry Andric   }
177081ad6265SDimitry Andric   case Intrinsic::fptosi_sat:
177181ad6265SDimitry Andric   case Intrinsic::fptoui_sat: {
177281ad6265SDimitry Andric     if (ICA.getArgTypes().empty())
177381ad6265SDimitry Andric       break;
177481ad6265SDimitry Andric     bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
177581ad6265SDimitry Andric     auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
177681ad6265SDimitry Andric     EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
177781ad6265SDimitry Andric     // Check for the legal types, with the correct subtarget features.
177881ad6265SDimitry Andric     if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
177981ad6265SDimitry Andric         (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
178081ad6265SDimitry Andric         (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
178181ad6265SDimitry Andric       return LT.first;
178281ad6265SDimitry Andric 
178381ad6265SDimitry Andric     // Likewise for MVE vector types.
178481ad6265SDimitry Andric     if (ST->hasMVEFloatOps() &&
178581ad6265SDimitry Andric         (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
178681ad6265SDimitry Andric         LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
178781ad6265SDimitry Andric       return LT.first * ST->getMVEVectorCostFactor(CostKind);
178881ad6265SDimitry Andric 
178981ad6265SDimitry Andric     // Otherwise we use a legal convert followed by a min+max
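    // E.g. (illustrative) an @llvm.fptosi.sat.i16.f32 becomes a legal
    // f32->i32 VCVT plus an smin/smax clamp to [-32768, 32767] on the legal
    // i32 type, which is what the min+max costs added below model.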
179081ad6265SDimitry Andric     if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
179181ad6265SDimitry Andric          (ST->hasFP64() && LT.second == MVT::f64) ||
179281ad6265SDimitry Andric          (ST->hasFullFP16() && LT.second == MVT::f16) ||
179381ad6265SDimitry Andric          (ST->hasMVEFloatOps() &&
179481ad6265SDimitry Andric           (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
179581ad6265SDimitry Andric         LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
179681ad6265SDimitry Andric       Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
179781ad6265SDimitry Andric                                       LT.second.getScalarSizeInBits());
179881ad6265SDimitry Andric       InstructionCost Cost =
179981ad6265SDimitry Andric           LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
180081ad6265SDimitry Andric       IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
180181ad6265SDimitry Andric                                               : Intrinsic::umin,
180281ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
180381ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs1, CostKind);
180481ad6265SDimitry Andric       IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
180581ad6265SDimitry Andric                                               : Intrinsic::umax,
180681ad6265SDimitry Andric                                      LegalTy, {LegalTy, LegalTy});
180781ad6265SDimitry Andric       Cost += getIntrinsicInstrCost(Attrs2, CostKind);
180881ad6265SDimitry Andric       return LT.first * Cost;
180981ad6265SDimitry Andric     }
181081ad6265SDimitry Andric     break;
181181ad6265SDimitry Andric   }
1812e8d8bef9SDimitry Andric   }
1813e8d8bef9SDimitry Andric 
1814e8d8bef9SDimitry Andric   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1815e8d8bef9SDimitry Andric }
1816e8d8bef9SDimitry Andric 
18170b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) {
18180b57cec5SDimitry Andric   if (!F->isIntrinsic())
181981ad6265SDimitry Andric     return BaseT::isLoweredToCall(F);
18200b57cec5SDimitry Andric 
18210b57cec5SDimitry Andric   // Assume all Arm-specific intrinsics map to an instruction.
18220b57cec5SDimitry Andric   if (F->getName().startswith("llvm.arm"))
18230b57cec5SDimitry Andric     return false;
18240b57cec5SDimitry Andric 
18250b57cec5SDimitry Andric   switch (F->getIntrinsicID()) {
18260b57cec5SDimitry Andric   default: break;
18270b57cec5SDimitry Andric   case Intrinsic::powi:
18280b57cec5SDimitry Andric   case Intrinsic::sin:
18290b57cec5SDimitry Andric   case Intrinsic::cos:
18300b57cec5SDimitry Andric   case Intrinsic::pow:
18310b57cec5SDimitry Andric   case Intrinsic::log:
18320b57cec5SDimitry Andric   case Intrinsic::log10:
18330b57cec5SDimitry Andric   case Intrinsic::log2:
18340b57cec5SDimitry Andric   case Intrinsic::exp:
18350b57cec5SDimitry Andric   case Intrinsic::exp2:
18360b57cec5SDimitry Andric     return true;
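  // The cases below can map onto hardware instructions when a suitable FPU is
  // present; e.g. (illustrative) @llvm.sqrt.f32 can select to a single
  // VSQRT.F32 given VFPv2, whereas the cases above (sin, pow, ...) are always
  // lowered to library calls on Arm.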
18370b57cec5SDimitry Andric   case Intrinsic::sqrt:
18380b57cec5SDimitry Andric   case Intrinsic::fabs:
18390b57cec5SDimitry Andric   case Intrinsic::copysign:
18400b57cec5SDimitry Andric   case Intrinsic::floor:
18410b57cec5SDimitry Andric   case Intrinsic::ceil:
18420b57cec5SDimitry Andric   case Intrinsic::trunc:
18430b57cec5SDimitry Andric   case Intrinsic::rint:
18440b57cec5SDimitry Andric   case Intrinsic::nearbyint:
18450b57cec5SDimitry Andric   case Intrinsic::round:
18460b57cec5SDimitry Andric   case Intrinsic::canonicalize:
18470b57cec5SDimitry Andric   case Intrinsic::lround:
18480b57cec5SDimitry Andric   case Intrinsic::llround:
18490b57cec5SDimitry Andric   case Intrinsic::lrint:
18500b57cec5SDimitry Andric   case Intrinsic::llrint:
18510b57cec5SDimitry Andric     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
18520b57cec5SDimitry Andric       return true;
18530b57cec5SDimitry Andric     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
18540b57cec5SDimitry Andric       return true;
18550b57cec5SDimitry Andric     // Some operations can be handled by vector instructions; assume that
18560b57cec5SDimitry Andric     // unsupported vectors will be expanded into supported scalar ones.
18570b57cec5SDimitry Andric     // TODO: Handle scalar operations properly.
18580b57cec5SDimitry Andric     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
18590b57cec5SDimitry Andric   case Intrinsic::masked_store:
18600b57cec5SDimitry Andric   case Intrinsic::masked_load:
18610b57cec5SDimitry Andric   case Intrinsic::masked_gather:
18620b57cec5SDimitry Andric   case Intrinsic::masked_scatter:
18630b57cec5SDimitry Andric     return !ST->hasMVEIntegerOps();
18640b57cec5SDimitry Andric   case Intrinsic::sadd_with_overflow:
18650b57cec5SDimitry Andric   case Intrinsic::uadd_with_overflow:
18660b57cec5SDimitry Andric   case Intrinsic::ssub_with_overflow:
18670b57cec5SDimitry Andric   case Intrinsic::usub_with_overflow:
18680b57cec5SDimitry Andric   case Intrinsic::sadd_sat:
18690b57cec5SDimitry Andric   case Intrinsic::uadd_sat:
18700b57cec5SDimitry Andric   case Intrinsic::ssub_sat:
18710b57cec5SDimitry Andric   case Intrinsic::usub_sat:
18720b57cec5SDimitry Andric     return false;
18730b57cec5SDimitry Andric   }
18740b57cec5SDimitry Andric 
18750b57cec5SDimitry Andric   return BaseT::isLoweredToCall(F);
18760b57cec5SDimitry Andric }
18770b57cec5SDimitry Andric 
1878e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
18790b57cec5SDimitry Andric   unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
18800b57cec5SDimitry Andric   EVT VT = TLI->getValueType(DL, I.getType(), true);
18810b57cec5SDimitry Andric   if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
18820b57cec5SDimitry Andric     return true;
18830b57cec5SDimitry Andric 
18840b57cec5SDimitry Andric   // Check if an intrinsic will be lowered to a call and assume that any
18850b57cec5SDimitry Andric   // other CallInst will generate a bl.
18860b57cec5SDimitry Andric   if (auto *Call = dyn_cast<CallInst>(&I)) {
1887e8d8bef9SDimitry Andric     if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
1888e8d8bef9SDimitry Andric       switch(II->getIntrinsicID()) {
1889e8d8bef9SDimitry Andric         case Intrinsic::memcpy:
1890e8d8bef9SDimitry Andric         case Intrinsic::memset:
1891e8d8bef9SDimitry Andric         case Intrinsic::memmove:
1892e8d8bef9SDimitry Andric           return getNumMemOps(II) == -1;
1893e8d8bef9SDimitry Andric         default:
18940b57cec5SDimitry Andric           if (const Function *F = Call->getCalledFunction())
18950b57cec5SDimitry Andric             return isLoweredToCall(F);
18960b57cec5SDimitry Andric       }
1897e8d8bef9SDimitry Andric     }
18980b57cec5SDimitry Andric     return true;
18990b57cec5SDimitry Andric   }
19000b57cec5SDimitry Andric 
19010b57cec5SDimitry Andric   // FPv5 provides conversions between integer, double-precision,
19020b57cec5SDimitry Andric   // single-precision, and half-precision formats.
19030b57cec5SDimitry Andric   switch (I.getOpcode()) {
19040b57cec5SDimitry Andric   default:
19050b57cec5SDimitry Andric     break;
19060b57cec5SDimitry Andric   case Instruction::FPToSI:
19070b57cec5SDimitry Andric   case Instruction::FPToUI:
19080b57cec5SDimitry Andric   case Instruction::SIToFP:
19090b57cec5SDimitry Andric   case Instruction::UIToFP:
19100b57cec5SDimitry Andric   case Instruction::FPTrunc:
19110b57cec5SDimitry Andric   case Instruction::FPExt:
19120b57cec5SDimitry Andric     return !ST->hasFPARMv8Base();
19130b57cec5SDimitry Andric   }
19140b57cec5SDimitry Andric 
19150b57cec5SDimitry Andric   // FIXME: Unfortunately the approach of checking the Operation Action does
19160b57cec5SDimitry Andric   // not catch all cases of Legalization that use library calls. Our
19170b57cec5SDimitry Andric   // Legalization step categorizes some transformations that use library
19180b57cec5SDimitry Andric   // calls as Custom, Expand or even Legal when doing type legalization. So
19190b57cec5SDimitry Andric   // for now we have to special-case, for instance, the SDIV of 64-bit
19200b57cec5SDimitry Andric   // integers and the use of floating-point emulation.
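  // E.g. (illustrative) on AEABI targets a plain 'sdiv i64 %a, %b' is
  // expanded to a call to the __aeabi_ldivmod runtime routine, even though
  // its Operation Action is not marked as LibCall.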
19210b57cec5SDimitry Andric   if (VT.isInteger() && VT.getSizeInBits() >= 64) {
19220b57cec5SDimitry Andric     switch (ISD) {
19230b57cec5SDimitry Andric     default:
19240b57cec5SDimitry Andric       break;
19250b57cec5SDimitry Andric     case ISD::SDIV:
19260b57cec5SDimitry Andric     case ISD::UDIV:
19270b57cec5SDimitry Andric     case ISD::SREM:
19280b57cec5SDimitry Andric     case ISD::UREM:
19290b57cec5SDimitry Andric     case ISD::SDIVREM:
19300b57cec5SDimitry Andric     case ISD::UDIVREM:
19310b57cec5SDimitry Andric       return true;
19320b57cec5SDimitry Andric     }
19330b57cec5SDimitry Andric   }
19340b57cec5SDimitry Andric 
19350b57cec5SDimitry Andric   // Assume all other non-float operations are supported.
19360b57cec5SDimitry Andric   if (!VT.isFloatingPoint())
19370b57cec5SDimitry Andric     return false;
19380b57cec5SDimitry Andric 
19390b57cec5SDimitry Andric   // We'll need a library call to handle most floats when using soft-float.
19400b57cec5SDimitry Andric   if (TLI->useSoftFloat()) {
19410b57cec5SDimitry Andric     switch (I.getOpcode()) {
19420b57cec5SDimitry Andric     default:
19430b57cec5SDimitry Andric       return true;
19440b57cec5SDimitry Andric     case Instruction::Alloca:
19450b57cec5SDimitry Andric     case Instruction::Load:
19460b57cec5SDimitry Andric     case Instruction::Store:
19470b57cec5SDimitry Andric     case Instruction::Select:
19480b57cec5SDimitry Andric     case Instruction::PHI:
19490b57cec5SDimitry Andric       return false;
19500b57cec5SDimitry Andric     }
19510b57cec5SDimitry Andric   }
19520b57cec5SDimitry Andric 
19530b57cec5SDimitry Andric   // We'll need a libcall to perform double precision operations on a single
19540b57cec5SDimitry Andric   // precision only FPU.
19550b57cec5SDimitry Andric   if (I.getType()->isDoubleTy() && !ST->hasFP64())
19560b57cec5SDimitry Andric     return true;
19570b57cec5SDimitry Andric 
19580b57cec5SDimitry Andric   // Likewise for half precision arithmetic.
19590b57cec5SDimitry Andric   if (I.getType()->isHalfTy() && !ST->hasFullFP16())
19600b57cec5SDimitry Andric     return true;
19610b57cec5SDimitry Andric 
19620b57cec5SDimitry Andric   return false;
1963e8d8bef9SDimitry Andric }
1964e8d8bef9SDimitry Andric 
1965e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1966e8d8bef9SDimitry Andric                                           AssumptionCache &AC,
1967e8d8bef9SDimitry Andric                                           TargetLibraryInfo *LibInfo,
1968e8d8bef9SDimitry Andric                                           HardwareLoopInfo &HWLoopInfo) {
1969e8d8bef9SDimitry Andric   // Low-overhead branches are only supported in the 'low-overhead branch'
1970e8d8bef9SDimitry Andric   // extension of v8.1-m.
1971e8d8bef9SDimitry Andric   if (!ST->hasLOB() || DisableLowOverheadLoops) {
1972e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1973e8d8bef9SDimitry Andric     return false;
1974e8d8bef9SDimitry Andric   }
1975e8d8bef9SDimitry Andric 
1976e8d8bef9SDimitry Andric   if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
1977e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1978e8d8bef9SDimitry Andric     return false;
1979e8d8bef9SDimitry Andric   }
1980e8d8bef9SDimitry Andric 
1981e8d8bef9SDimitry Andric   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1982e8d8bef9SDimitry Andric   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
1983e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1984e8d8bef9SDimitry Andric     return false;
1985e8d8bef9SDimitry Andric   }
1986e8d8bef9SDimitry Andric 
1987e8d8bef9SDimitry Andric   const SCEV *TripCountSCEV =
1988e8d8bef9SDimitry Andric     SE.getAddExpr(BackedgeTakenCount,
1989e8d8bef9SDimitry Andric                   SE.getOne(BackedgeTakenCount->getType()));
1990e8d8bef9SDimitry Andric 
1991e8d8bef9SDimitry Andric   // We need to store the trip count in LR, a 32-bit register.
1992e8d8bef9SDimitry Andric   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
1993e8d8bef9SDimitry Andric     LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1994e8d8bef9SDimitry Andric     return false;
1995e8d8bef9SDimitry Andric   }
1996e8d8bef9SDimitry Andric 
1997e8d8bef9SDimitry Andric   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
1998e8d8bef9SDimitry Andric   // point in generating a hardware loop if that's going to happen.
19990b57cec5SDimitry Andric 
20000b57cec5SDimitry Andric   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
20010b57cec5SDimitry Andric     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
20020b57cec5SDimitry Andric       switch (Call->getIntrinsicID()) {
20030b57cec5SDimitry Andric       default:
20040b57cec5SDimitry Andric         break;
2005e8d8bef9SDimitry Andric       case Intrinsic::start_loop_iterations:
2006fe6060f1SDimitry Andric       case Intrinsic::test_start_loop_iterations:
20070b57cec5SDimitry Andric       case Intrinsic::loop_decrement:
20080b57cec5SDimitry Andric       case Intrinsic::loop_decrement_reg:
20090b57cec5SDimitry Andric         return true;
20100b57cec5SDimitry Andric       }
20110b57cec5SDimitry Andric     }
20120b57cec5SDimitry Andric     return false;
20130b57cec5SDimitry Andric   };
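  // For illustration, these are the intrinsics that the HardwareLoops pass
  // itself inserts; a converted loop looks roughly like:
  //   %n = call i32 @llvm.start.loop.iterations.i32(i32 %count)
  //   ...
  //   %rem = call i32 @llvm.loop.decrement.reg.i32(i32 %lr, i32 1)
  //   %cmp = icmp ne i32 %rem, 0
  //   br i1 %cmp, label %loop, label %exit
  // Seeing them here means the loop has already been converted once.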
20140b57cec5SDimitry Andric 
20150b57cec5SDimitry Andric   // Scan the instructions to see if there are any that we know will turn
2016e8d8bef9SDimitry Andric   // into a call, or if this loop is already a low-overhead loop or will
2017e8d8bef9SDimitry Andric   // become a tail-predicated loop.
2018e8d8bef9SDimitry Andric   bool IsTailPredLoop = false;
20190b57cec5SDimitry Andric   auto ScanLoop = [&](Loop *L) {
20200b57cec5SDimitry Andric     for (auto *BB : L->getBlocks()) {
20210b57cec5SDimitry Andric       for (auto &I : *BB) {
2022e8d8bef9SDimitry Andric         if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2023e8d8bef9SDimitry Andric             isa<InlineAsm>(I)) {
20245ffd83dbSDimitry Andric           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
20250b57cec5SDimitry Andric           return false;
20260b57cec5SDimitry Andric         }
2027e8d8bef9SDimitry Andric         if (auto *II = dyn_cast<IntrinsicInst>(&I))
2028e8d8bef9SDimitry Andric           IsTailPredLoop |=
2029e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2030e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2031e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2032e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2033e8d8bef9SDimitry Andric               II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
20340b57cec5SDimitry Andric       }
20355ffd83dbSDimitry Andric     }
20360b57cec5SDimitry Andric     return true;
20370b57cec5SDimitry Andric   };
20380b57cec5SDimitry Andric 
20390b57cec5SDimitry Andric   // Visit inner loops.
20400b57cec5SDimitry Andric   for (auto Inner : *L)
20410b57cec5SDimitry Andric     if (!ScanLoop(Inner))
20420b57cec5SDimitry Andric       return false;
20430b57cec5SDimitry Andric 
20440b57cec5SDimitry Andric   if (!ScanLoop(L))
20450b57cec5SDimitry Andric     return false;
20460b57cec5SDimitry Andric 
20470b57cec5SDimitry Andric   // TODO: Check whether the trip count calculation is expensive. If L is the
20480b57cec5SDimitry Andric   // inner loop but we know it has a low trip count, calculating that trip
20490b57cec5SDimitry Andric   // count (in the parent loop) may be detrimental.
20500b57cec5SDimitry Andric 
20510b57cec5SDimitry Andric   LLVMContext &C = L->getHeader()->getContext();
20520b57cec5SDimitry Andric   HWLoopInfo.CounterInReg = true;
20530b57cec5SDimitry Andric   HWLoopInfo.IsNestingLegal = false;
2054e8d8bef9SDimitry Andric   HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
20550b57cec5SDimitry Andric   HWLoopInfo.CountType = Type::getInt32Ty(C);
20560b57cec5SDimitry Andric   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
20570b57cec5SDimitry Andric   return true;
20580b57cec5SDimitry Andric }
20590b57cec5SDimitry Andric 
2060480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2061480093f4SDimitry Andric   // We don't generally allow icmps; because we only look at single-block
2062480093f4SDimitry Andric   // loops, we simply count them: there should be exactly 1, for the backedge.
2063480093f4SDimitry Andric   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2064480093f4SDimitry Andric     return false;
2065349cc55cSDimitry Andric   // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
2066349cc55cSDimitry Andric   // are not currently canonical, but soon will be. Code without them uses
2067349cc55cSDimitry Andric   // icmp, and so is not tail-predicated as per the condition above. In order
2068349cc55cSDimitry Andric   // to get the same performance we treat min and max the same as an icmp for
2069349cc55cSDimitry Andric   // tailpred purposes for the moment (we often rely on non-tailpred and
2070349cc55cSDimitry Andric   // higher VFs to pick more optimal instructions like VQDMULH; these need to
2071349cc55cSDimitry Andric   // be recognized directly by the vectorizer).
2072349cc55cSDimitry Andric   if (auto *II = dyn_cast<IntrinsicInst>(&I))
2073349cc55cSDimitry Andric     if ((II->getIntrinsicID() == Intrinsic::smin ||
2074349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::smax ||
2075349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umin ||
2076349cc55cSDimitry Andric          II->getIntrinsicID() == Intrinsic::umax) &&
2077349cc55cSDimitry Andric         ++ICmpCount > 1)
2078349cc55cSDimitry Andric       return false;
2079480093f4SDimitry Andric 
2080480093f4SDimitry Andric   if (isa<FCmpInst>(&I))
2081480093f4SDimitry Andric     return false;
2082480093f4SDimitry Andric 
2083480093f4SDimitry Andric   // We could allow extending/narrowing FP loads/stores, but codegen is
2084480093f4SDimitry Andric   // too inefficient so reject this for now.
2085480093f4SDimitry Andric   if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2086480093f4SDimitry Andric     return false;
2087480093f4SDimitry Andric 
2088480093f4SDimitry Andric   // Extends have to be extending-loads
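  // E.g. (illustrative) this widening pattern is accepted:
  //   %l = load <8 x i16>, <8 x i16>* %p
  //   %e = sext <8 x i16> %l to <8 x i32>
  // but a sext/zext of anything other than a single-use load is rejected.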
2089480093f4SDimitry Andric   if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
2090480093f4SDimitry Andric     if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2091480093f4SDimitry Andric       return false;
2092480093f4SDimitry Andric 
2093480093f4SDimitry Andric   // Truncs have to be narrowing-stores
2094480093f4SDimitry Andric   if (isa<TruncInst>(&I))
2095480093f4SDimitry Andric     if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2096480093f4SDimitry Andric       return false;
2097480093f4SDimitry Andric 
2098480093f4SDimitry Andric   return true;
2099480093f4SDimitry Andric }
2100480093f4SDimitry Andric 
2101480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of
2102480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element
2103480093f4SDimitry Andric // size and:
2104480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we
2105480093f4SDimitry Andric //    e.g. don't want any widening/narrowing operations.
2106480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations
2107480093f4SDimitry Andric //    that work on i64s.
2108480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the
2109480093f4SDimitry Andric //    tail-predication masks/predicates the right lanes.
2110480093f4SDimitry Andric //
2111480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
2112480093f4SDimitry Andric                                  const DataLayout &DL,
2113480093f4SDimitry Andric                                  const LoopAccessInfo *LAI) {
21145ffd83dbSDimitry Andric   LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
21155ffd83dbSDimitry Andric 
2116e8d8bef9SDimitry Andric   // If there are live-out values, it is probably a reduction. We can predicate
2117e8d8bef9SDimitry Andric   // most reduction operations freely under MVE using a combination of
2118e8d8bef9SDimitry Andric   // prefer-predicated-reduction-select and inloop reductions. We limit this to
2119e8d8bef9SDimitry Andric   // floating point and integer reductions, but don't check for operators
2120e8d8bef9SDimitry Andric   // specifically here. If the value ends up not being a reduction (and so the
2121e8d8bef9SDimitry Andric   // vectorizer cannot tailfold the loop), we should fall back to standard
2122e8d8bef9SDimitry Andric   // vectorization automatically.
21235ffd83dbSDimitry Andric   SmallVector<Instruction *, 8> LiveOuts =
21245ffd83dbSDimitry Andric       llvm::findDefsUsedOutsideOfLoop(L);
2125e8d8bef9SDimitry Andric   bool ReductionsDisabled =
21265ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::EnabledNoReductions ||
21275ffd83dbSDimitry Andric       EnableTailPredication == TailPredication::ForceEnabledNoReductions;
21285ffd83dbSDimitry Andric 
21295ffd83dbSDimitry Andric   for (auto *I : LiveOuts) {
2130e8d8bef9SDimitry Andric     if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2131e8d8bef9SDimitry Andric         !I->getType()->isHalfTy()) {
2132e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
21335ffd83dbSDimitry Andric                            "live-out value\n");
21345ffd83dbSDimitry Andric       return false;
21355ffd83dbSDimitry Andric     }
2136e8d8bef9SDimitry Andric     if (ReductionsDisabled) {
2137e8d8bef9SDimitry Andric       LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
21385ffd83dbSDimitry Andric       return false;
21395ffd83dbSDimitry Andric     }
21405ffd83dbSDimitry Andric   }
21415ffd83dbSDimitry Andric 
21425ffd83dbSDimitry Andric   // Next, check that all instructions can be tail-predicated.
2143480093f4SDimitry Andric   PredicatedScalarEvolution PSE = LAI->getPSE();
21445ffd83dbSDimitry Andric   SmallVector<Instruction *, 16> LoadStores;
2145480093f4SDimitry Andric   int ICmpCount = 0;
2146480093f4SDimitry Andric 
2147480093f4SDimitry Andric   for (BasicBlock *BB : L->blocks()) {
2148480093f4SDimitry Andric     for (Instruction &I : BB->instructionsWithoutDebug()) {
2149480093f4SDimitry Andric       if (isa<PHINode>(&I))
2150480093f4SDimitry Andric         continue;
2151480093f4SDimitry Andric       if (!canTailPredicateInstruction(I, ICmpCount)) {
2152480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2153480093f4SDimitry Andric         return false;
2154480093f4SDimitry Andric       }
2155480093f4SDimitry Andric 
2156480093f4SDimitry Andric       Type *T = I.getType();
2157480093f4SDimitry Andric       if (T->getScalarSizeInBits() > 32) {
2158480093f4SDimitry Andric         LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2159480093f4SDimitry Andric         return false;
2160480093f4SDimitry Andric       }
2161480093f4SDimitry Andric       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2162349cc55cSDimitry Andric         Value *Ptr = getLoadStorePointerOperand(&I);
2163349cc55cSDimitry Andric         Type *AccessTy = getLoadStoreType(&I);
2164349cc55cSDimitry Andric         int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
2165e8d8bef9SDimitry Andric         if (NextStride == 1) {
2166480093f4SDimitry Andric           // TODO: for now only allow consecutive strides of 1. We could
2167e8d8bef9SDimitry Andric           // support other strides as long as they are uniform, but let's
2168e8d8bef9SDimitry Andric           // keep it simple for now.
2169e8d8bef9SDimitry Andric           continue;
2170e8d8bef9SDimitry Andric         } else if (NextStride == -1 ||
2171e8d8bef9SDimitry Andric                    (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2172e8d8bef9SDimitry Andric                    (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2173e8d8bef9SDimitry Andric           LLVM_DEBUG(dbgs()
2174e8d8bef9SDimitry Andric                      << "Reversed or interleaved stride (-1, 2 or 4) found; "
2175e8d8bef9SDimitry Andric                         "vld2/vst2 and vld4/vst4 can't be tail-predicated.\n");
2176e8d8bef9SDimitry Andric           return false;
2177e8d8bef9SDimitry Andric           // TODO: don't tail predicate if there is a reversed load?
2178e8d8bef9SDimitry Andric         } else if (EnableMaskedGatherScatters) {
2179e8d8bef9SDimitry Andric           // Gather/scatters do allow loading from arbitrary strides, at
2180e8d8bef9SDimitry Andric           // least if they are loop invariant.
2181e8d8bef9SDimitry Andric           // TODO: Loop variant strides should in theory work, too, but
2182e8d8bef9SDimitry Andric           // this requires further testing.
2183349cc55cSDimitry Andric           const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2184e8d8bef9SDimitry Andric           if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2185e8d8bef9SDimitry Andric             const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2186e8d8bef9SDimitry Andric             if (PSE.getSE()->isLoopInvariant(Step, L))
2187480093f4SDimitry Andric               continue;
2188480093f4SDimitry Andric           }
2189e8d8bef9SDimitry Andric         }
2190e8d8bef9SDimitry Andric         LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2191480093f4SDimitry Andric                              "tail-predicate.\n");
2192480093f4SDimitry Andric         return false;
2193480093f4SDimitry Andric       }
2194480093f4SDimitry Andric     }
2195480093f4SDimitry Andric   }
2196480093f4SDimitry Andric 
2197480093f4SDimitry Andric   LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2198480093f4SDimitry Andric   return true;
2199480093f4SDimitry Andric }
2200480093f4SDimitry Andric 
2201fcaf7f86SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(
2202fcaf7f86SDimitry Andric     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
2203fcaf7f86SDimitry Andric     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
22045ffd83dbSDimitry Andric   if (!EnableTailPredication) {
22055ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2206480093f4SDimitry Andric     return false;
22075ffd83dbSDimitry Andric   }
2208480093f4SDimitry Andric 
2209480093f4SDimitry Andric   // Creating a predicated vector loop is the first step for generating a
2210480093f4SDimitry Andric   // tail-predicated hardware loop, for which we need the MVE masked
2211480093f4SDimitry Andric   // load/stores instructions:
2212480093f4SDimitry Andric   if (!ST->hasMVEIntegerOps())
2213480093f4SDimitry Andric     return false;
2214480093f4SDimitry Andric 
2215480093f4SDimitry Andric   // For now, restrict this to single block loops.
2216480093f4SDimitry Andric   if (L->getNumBlocks() > 1) {
2217480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2218480093f4SDimitry Andric                          "loop.\n");
2219480093f4SDimitry Andric     return false;
2220480093f4SDimitry Andric   }
2221480093f4SDimitry Andric 
2222e8d8bef9SDimitry Andric   assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2223480093f4SDimitry Andric 
2224480093f4SDimitry Andric   HardwareLoopInfo HWLoopInfo(L);
2225480093f4SDimitry Andric   if (!HWLoopInfo.canAnalyze(*LI)) {
2226480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2227480093f4SDimitry Andric                          "analyzable.\n");
2228480093f4SDimitry Andric     return false;
2229480093f4SDimitry Andric   }
2230480093f4SDimitry Andric 
2231480093f4SDimitry Andric   // This checks if we have the low-overhead branch architecture
2232480093f4SDimitry Andric   // extension, and if we will create a hardware-loop:
2233480093f4SDimitry Andric   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
2234480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2235480093f4SDimitry Andric                          "profitable.\n");
2236480093f4SDimitry Andric     return false;
2237480093f4SDimitry Andric   }
2238480093f4SDimitry Andric 
2239480093f4SDimitry Andric   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
2240480093f4SDimitry Andric     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2241480093f4SDimitry Andric                          "a candidate.\n");
2242480093f4SDimitry Andric     return false;
2243480093f4SDimitry Andric   }
2244480093f4SDimitry Andric 
2245fcaf7f86SDimitry Andric   return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
2246480093f4SDimitry Andric }
2247480093f4SDimitry Andric 
2248753f127fSDimitry Andric PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
22495ffd83dbSDimitry Andric   if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2250753f127fSDimitry Andric     return PredicationStyle::None;
2251480093f4SDimitry Andric 
22525ffd83dbSDimitry Andric   // Intrinsic @llvm.get.active.lane.mask is supported.
22535ffd83dbSDimitry Andric   // It is used by the MVETailPredication pass, which requires the number of
22545ffd83dbSDimitry Andric   // elements processed by this vector loop to set up the tail-predicated
22555ffd83dbSDimitry Andric   // loop.
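  // E.g. (illustrative):
  //   %m = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %i, i32 %n)
  // computes the predicate for lanes i..i+3, which the MVETailPredication
  // pass can turn into a VCTP-predicated loop.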
2256753f127fSDimitry Andric   return PredicationStyle::Data;
22575ffd83dbSDimitry Andric }

22580b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2259349cc55cSDimitry Andric                                          TTI::UnrollingPreferences &UP,
2260349cc55cSDimitry Andric                                          OptimizationRemarkEmitter *ORE) {
2261fe6060f1SDimitry Andric   // Enable upper-bound unrolling universally; it is not dependent on the
2262fe6060f1SDimitry Andric   // conditions below.
2263fe6060f1SDimitry Andric   UP.UpperBound = true;
2264fe6060f1SDimitry Andric 
22650b57cec5SDimitry Andric   // Only currently enable these preferences for M-Class cores.
22660b57cec5SDimitry Andric   if (!ST->isMClass())
2267349cc55cSDimitry Andric     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
22680b57cec5SDimitry Andric 
22690b57cec5SDimitry Andric   // Disable loop unrolling for Oz and Os.
22700b57cec5SDimitry Andric   UP.OptSizeThreshold = 0;
22710b57cec5SDimitry Andric   UP.PartialOptSizeThreshold = 0;
22720b57cec5SDimitry Andric   if (L->getHeader()->getParent()->hasOptSize())
22730b57cec5SDimitry Andric     return;
22740b57cec5SDimitry Andric 
22750b57cec5SDimitry Andric   SmallVector<BasicBlock*, 4> ExitingBlocks;
22760b57cec5SDimitry Andric   L->getExitingBlocks(ExitingBlocks);
22770b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Loop has:\n"
22780b57cec5SDimitry Andric                     << "Blocks: " << L->getNumBlocks() << "\n"
22790b57cec5SDimitry Andric                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
22800b57cec5SDimitry Andric 
22810b57cec5SDimitry Andric   // Allow at most one exit other than the latch. This acts as an early-exit
22820b57cec5SDimitry Andric   // check, mirroring the profitability calculation of the runtime unroller.
22830b57cec5SDimitry Andric   if (ExitingBlocks.size() > 2)
22840b57cec5SDimitry Andric     return;
22850b57cec5SDimitry Andric 
22860b57cec5SDimitry Andric   // Limit the CFG of the loop body for targets with a branch predictor.
22870b57cec5SDimitry Andric   // Allowing 4 blocks permits if-then-else diamonds in the body.
22880b57cec5SDimitry Andric   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
22890b57cec5SDimitry Andric     return;
22900b57cec5SDimitry Andric 
2291e8d8bef9SDimitry Andric   // Don't unroll vectorized loops, including the remainder loop
2292e8d8bef9SDimitry Andric   if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2293e8d8bef9SDimitry Andric     return;
2294e8d8bef9SDimitry Andric 
22950b57cec5SDimitry Andric   // Scan the loop: don't unroll loops with calls as this could prevent
22960b57cec5SDimitry Andric   // inlining.
2297fe6060f1SDimitry Andric   InstructionCost Cost = 0;
22980b57cec5SDimitry Andric   for (auto *BB : L->getBlocks()) {
22990b57cec5SDimitry Andric     for (auto &I : *BB) {
2300480093f4SDimitry Andric       // Don't unroll vectorised loops. MVE does not benefit from unrolling
2301480093f4SDimitry Andric       // as much as scalar code does.
2302480093f4SDimitry Andric       if (I.getType()->isVectorTy())
2303480093f4SDimitry Andric         return;
2304480093f4SDimitry Andric 
23050b57cec5SDimitry Andric       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
23065ffd83dbSDimitry Andric         if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
23070b57cec5SDimitry Andric           if (!isLoweredToCall(F))
23080b57cec5SDimitry Andric             continue;
23090b57cec5SDimitry Andric         }
23100b57cec5SDimitry Andric         return;
23110b57cec5SDimitry Andric       }
23128bcb0991SDimitry Andric 
2313e8d8bef9SDimitry Andric       SmallVector<const Value*, 4> Operands(I.operand_values());
2314e8d8bef9SDimitry Andric       Cost +=
2315e8d8bef9SDimitry Andric         getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
23160b57cec5SDimitry Andric     }
23170b57cec5SDimitry Andric   }
23180b57cec5SDimitry Andric 
2319fe6060f1SDimitry Andric   // On v6m cores, there are very few registers available. We can easily end
2320fe6060f1SDimitry Andric   // up spilling and reloading more registers in an unrolled loop. Look at the
2321fe6060f1SDimitry Andric   // number of LCSSA phis as a rough measure of how many registers will need
2322fe6060f1SDimitry Andric   // to be live out of the loop, reducing the default unroll count if more
2323fe6060f1SDimitry Andric   // than 1 value is needed. In the long run, all of this should be learnt by
2324fe6060f1SDimitry Andric   // a machine.
2325fe6060f1SDimitry Andric   unsigned UnrollCount = 4;
2326fe6060f1SDimitry Andric   if (ST->isThumb1Only()) {
2327fe6060f1SDimitry Andric     unsigned ExitingValues = 0;
2328fe6060f1SDimitry Andric     SmallVector<BasicBlock *, 4> ExitBlocks;
2329fe6060f1SDimitry Andric     L->getExitBlocks(ExitBlocks);
2330fe6060f1SDimitry Andric     for (auto *Exit : ExitBlocks) {
2331fe6060f1SDimitry Andric       // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2332fe6060f1SDimitry Andric       // only the last one is expected to be needed for address operands.
2333fe6060f1SDimitry Andric       unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2334fe6060f1SDimitry Andric         return PH.getNumOperands() != 1 ||
2335fe6060f1SDimitry Andric                !isa<GetElementPtrInst>(PH.getOperand(0));
2336fe6060f1SDimitry Andric       });
2337fe6060f1SDimitry Andric       ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2338fe6060f1SDimitry Andric     }
2339fe6060f1SDimitry Andric     if (ExitingValues)
2340fe6060f1SDimitry Andric       UnrollCount /= ExitingValues;
2341fe6060f1SDimitry Andric     if (UnrollCount <= 1)
2342fe6060f1SDimitry Andric       return;
2343fe6060f1SDimitry Andric   }
2344fe6060f1SDimitry Andric 
23450b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2346fe6060f1SDimitry Andric   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
23470b57cec5SDimitry Andric 
23480b57cec5SDimitry Andric   UP.Partial = true;
23490b57cec5SDimitry Andric   UP.Runtime = true;
23500b57cec5SDimitry Andric   UP.UnrollRemainder = true;
2351fe6060f1SDimitry Andric   UP.DefaultUnrollRuntimeCount = UnrollCount;
23520b57cec5SDimitry Andric   UP.UnrollAndJam = true;
23530b57cec5SDimitry Andric   UP.UnrollAndJamInnerLoopThreshold = 60;
23540b57cec5SDimitry Andric 
23550b57cec5SDimitry Andric   // Force-unrolling small loops can be very useful because of the
23560b57cec5SDimitry Andric   // branch-taken cost of the backedge.
23570b57cec5SDimitry Andric   if (Cost < 12)
23580b57cec5SDimitry Andric     UP.Force = true;
23590b57cec5SDimitry Andric }
23608bcb0991SDimitry Andric 
23615ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
23625ffd83dbSDimitry Andric                                        TTI::PeelingPreferences &PP) {
23635ffd83dbSDimitry Andric   BaseT::getPeelingPreferences(L, SE, PP);
23645ffd83dbSDimitry Andric }
23655ffd83dbSDimitry Andric 
2366e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
2367e8d8bef9SDimitry Andric                                        TTI::ReductionFlags Flags) const {
2368e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2369e8d8bef9SDimitry Andric     return false;
2370e8d8bef9SDimitry Andric 
2371e8d8bef9SDimitry Andric   unsigned ScalarBits = Ty->getScalarSizeInBits();
2372e8d8bef9SDimitry Andric   switch (Opcode) {
2373e8d8bef9SDimitry Andric   case Instruction::Add:
2374e8d8bef9SDimitry Andric     return ScalarBits <= 64;
2375e8d8bef9SDimitry Andric   default:
2376e8d8bef9SDimitry Andric     return false;
2377e8d8bef9SDimitry Andric   }
2378e8d8bef9SDimitry Andric }
2379e8d8bef9SDimitry Andric 
2380e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect(
2381e8d8bef9SDimitry Andric     unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
2382e8d8bef9SDimitry Andric   if (!ST->hasMVEIntegerOps())
2383e8d8bef9SDimitry Andric     return false;
2384e8d8bef9SDimitry Andric   return true;
2385e8d8bef9SDimitry Andric }
2386