10b57cec5SDimitry Andric //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric 90b57cec5SDimitry Andric #include "ARMTargetTransformInfo.h" 100b57cec5SDimitry Andric #include "ARMSubtarget.h" 110b57cec5SDimitry Andric #include "MCTargetDesc/ARMAddressingModes.h" 120b57cec5SDimitry Andric #include "llvm/ADT/APInt.h" 130b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 140b57cec5SDimitry Andric #include "llvm/Analysis/LoopInfo.h" 150b57cec5SDimitry Andric #include "llvm/CodeGen/CostTable.h" 160b57cec5SDimitry Andric #include "llvm/CodeGen/ISDOpcodes.h" 170b57cec5SDimitry Andric #include "llvm/CodeGen/ValueTypes.h" 180b57cec5SDimitry Andric #include "llvm/IR/BasicBlock.h" 190b57cec5SDimitry Andric #include "llvm/IR/DataLayout.h" 200b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h" 210b57cec5SDimitry Andric #include "llvm/IR/Instruction.h" 220b57cec5SDimitry Andric #include "llvm/IR/Instructions.h" 230b57cec5SDimitry Andric #include "llvm/IR/IntrinsicInst.h" 24fcaf7f86SDimitry Andric #include "llvm/IR/Intrinsics.h" 255ffd83dbSDimitry Andric #include "llvm/IR/IntrinsicsARM.h" 26480093f4SDimitry Andric #include "llvm/IR/PatternMatch.h" 270b57cec5SDimitry Andric #include "llvm/IR/Type.h" 280b57cec5SDimitry Andric #include "llvm/MC/SubtargetFeature.h" 290b57cec5SDimitry Andric #include "llvm/Support/Casting.h" 30e8d8bef9SDimitry Andric #include "llvm/Support/KnownBits.h" 310b57cec5SDimitry Andric #include "llvm/Support/MachineValueType.h" 320b57cec5SDimitry Andric #include 
"llvm/Target/TargetMachine.h" 33e8d8bef9SDimitry Andric #include "llvm/Transforms/InstCombine/InstCombiner.h" 34e8d8bef9SDimitry Andric #include "llvm/Transforms/Utils/Local.h" 355ffd83dbSDimitry Andric #include "llvm/Transforms/Utils/LoopUtils.h" 36fcaf7f86SDimitry Andric #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 370b57cec5SDimitry Andric #include <algorithm> 380b57cec5SDimitry Andric #include <cassert> 390b57cec5SDimitry Andric #include <cstdint> 400b57cec5SDimitry Andric #include <utility> 410b57cec5SDimitry Andric 420b57cec5SDimitry Andric using namespace llvm; 430b57cec5SDimitry Andric 440b57cec5SDimitry Andric #define DEBUG_TYPE "armtti" 450b57cec5SDimitry Andric 468bcb0991SDimitry Andric static cl::opt<bool> EnableMaskedLoadStores( 47480093f4SDimitry Andric "enable-arm-maskedldst", cl::Hidden, cl::init(true), 488bcb0991SDimitry Andric cl::desc("Enable the generation of masked loads and stores")); 498bcb0991SDimitry Andric 500b57cec5SDimitry Andric static cl::opt<bool> DisableLowOverheadLoops( 518bcb0991SDimitry Andric "disable-arm-loloops", cl::Hidden, cl::init(false), 520b57cec5SDimitry Andric cl::desc("Disable the generation of low-overhead loops")); 530b57cec5SDimitry Andric 54e8d8bef9SDimitry Andric static cl::opt<bool> 55e8d8bef9SDimitry Andric AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), 56e8d8bef9SDimitry Andric cl::desc("Enable the generation of WLS loops")); 57e8d8bef9SDimitry Andric 585ffd83dbSDimitry Andric extern cl::opt<TailPredication::Mode> EnableTailPredication; 59480093f4SDimitry Andric 60480093f4SDimitry Andric extern cl::opt<bool> EnableMaskedGatherScatters; 61480093f4SDimitry Andric 62e8d8bef9SDimitry Andric extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; 63e8d8bef9SDimitry Andric 64e8d8bef9SDimitry Andric /// Convert a vector load intrinsic into a simple llvm load instruction. 
65e8d8bef9SDimitry Andric /// This is beneficial when the underlying object being addressed comes 66e8d8bef9SDimitry Andric /// from a constant, since we get constant-folding for free. 67e8d8bef9SDimitry Andric static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, 68e8d8bef9SDimitry Andric InstCombiner::BuilderTy &Builder) { 69e8d8bef9SDimitry Andric auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); 70e8d8bef9SDimitry Andric 71e8d8bef9SDimitry Andric if (!IntrAlign) 72e8d8bef9SDimitry Andric return nullptr; 73e8d8bef9SDimitry Andric 74e8d8bef9SDimitry Andric unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign 75e8d8bef9SDimitry Andric ? MemAlign 76e8d8bef9SDimitry Andric : IntrAlign->getLimitedValue(); 77e8d8bef9SDimitry Andric 78e8d8bef9SDimitry Andric if (!isPowerOf2_32(Alignment)) 79e8d8bef9SDimitry Andric return nullptr; 80e8d8bef9SDimitry Andric 81e8d8bef9SDimitry Andric auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), 82e8d8bef9SDimitry Andric PointerType::get(II.getType(), 0)); 83e8d8bef9SDimitry Andric return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); 84e8d8bef9SDimitry Andric } 85e8d8bef9SDimitry Andric 860b57cec5SDimitry Andric bool ARMTTIImpl::areInlineCompatible(const Function *Caller, 870b57cec5SDimitry Andric const Function *Callee) const { 880b57cec5SDimitry Andric const TargetMachine &TM = getTLI()->getTargetMachine(); 890b57cec5SDimitry Andric const FeatureBitset &CallerBits = 900b57cec5SDimitry Andric TM.getSubtargetImpl(*Caller)->getFeatureBits(); 910b57cec5SDimitry Andric const FeatureBitset &CalleeBits = 920b57cec5SDimitry Andric TM.getSubtargetImpl(*Callee)->getFeatureBits(); 930b57cec5SDimitry Andric 945ffd83dbSDimitry Andric // To inline a callee, all features not in the allowed list must match exactly. 
955ffd83dbSDimitry Andric bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == 965ffd83dbSDimitry Andric (CalleeBits & ~InlineFeaturesAllowed); 975ffd83dbSDimitry Andric // For features in the allowed list, the callee's features must be a subset of 980b57cec5SDimitry Andric // the callers'. 995ffd83dbSDimitry Andric bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == 1005ffd83dbSDimitry Andric (CalleeBits & InlineFeaturesAllowed); 1010b57cec5SDimitry Andric return MatchExact && MatchSubset; 1020b57cec5SDimitry Andric } 1030b57cec5SDimitry Andric 104fe6060f1SDimitry Andric TTI::AddressingModeKind 105fe6060f1SDimitry Andric ARMTTIImpl::getPreferredAddressingMode(const Loop *L, 106fe6060f1SDimitry Andric ScalarEvolution *SE) const { 1075ffd83dbSDimitry Andric if (ST->hasMVEIntegerOps()) 108fe6060f1SDimitry Andric return TTI::AMK_PostIndexed; 1095ffd83dbSDimitry Andric 110fe6060f1SDimitry Andric if (L->getHeader()->getParent()->hasOptSize()) 111fe6060f1SDimitry Andric return TTI::AMK_None; 112fe6060f1SDimitry Andric 113fe6060f1SDimitry Andric if (ST->isMClass() && ST->isThumb2() && 114fe6060f1SDimitry Andric L->getNumBlocks() == 1) 115fe6060f1SDimitry Andric return TTI::AMK_PreIndexed; 116fe6060f1SDimitry Andric 117fe6060f1SDimitry Andric return TTI::AMK_None; 1185ffd83dbSDimitry Andric } 1195ffd83dbSDimitry Andric 120e8d8bef9SDimitry Andric Optional<Instruction *> 121e8d8bef9SDimitry Andric ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 122e8d8bef9SDimitry Andric using namespace PatternMatch; 123e8d8bef9SDimitry Andric Intrinsic::ID IID = II.getIntrinsicID(); 124e8d8bef9SDimitry Andric switch (IID) { 125e8d8bef9SDimitry Andric default: 126e8d8bef9SDimitry Andric break; 127e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld1: { 128e8d8bef9SDimitry Andric Align MemAlign = 129e8d8bef9SDimitry Andric getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, 130e8d8bef9SDimitry Andric 
&IC.getAssumptionCache(), &IC.getDominatorTree()); 131e8d8bef9SDimitry Andric if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { 132e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, V); 133e8d8bef9SDimitry Andric } 134e8d8bef9SDimitry Andric break; 135e8d8bef9SDimitry Andric } 136e8d8bef9SDimitry Andric 137e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld2: 138e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld3: 139e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld4: 140e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld2lane: 141e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld3lane: 142e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vld4lane: 143e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst1: 144e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst2: 145e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst3: 146e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst4: 147e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst2lane: 148e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst3lane: 149e8d8bef9SDimitry Andric case Intrinsic::arm_neon_vst4lane: { 150e8d8bef9SDimitry Andric Align MemAlign = 151e8d8bef9SDimitry Andric getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, 152e8d8bef9SDimitry Andric &IC.getAssumptionCache(), &IC.getDominatorTree()); 153349cc55cSDimitry Andric unsigned AlignArg = II.arg_size() - 1; 154e8d8bef9SDimitry Andric Value *AlignArgOp = II.getArgOperand(AlignArg); 155e8d8bef9SDimitry Andric MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); 156e8d8bef9SDimitry Andric if (Align && *Align < MemAlign) { 157e8d8bef9SDimitry Andric return IC.replaceOperand( 158e8d8bef9SDimitry Andric II, AlignArg, 159e8d8bef9SDimitry Andric ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), 160e8d8bef9SDimitry Andric false)); 161e8d8bef9SDimitry Andric } 162e8d8bef9SDimitry Andric break; 163e8d8bef9SDimitry Andric } 164e8d8bef9SDimitry Andric 165e8d8bef9SDimitry Andric 
case Intrinsic::arm_mve_pred_i2v: { 166e8d8bef9SDimitry Andric Value *Arg = II.getArgOperand(0); 167e8d8bef9SDimitry Andric Value *ArgArg; 168e8d8bef9SDimitry Andric if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( 169e8d8bef9SDimitry Andric PatternMatch::m_Value(ArgArg))) && 170e8d8bef9SDimitry Andric II.getType() == ArgArg->getType()) { 171e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, ArgArg); 172e8d8bef9SDimitry Andric } 173e8d8bef9SDimitry Andric Constant *XorMask; 174e8d8bef9SDimitry Andric if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( 175e8d8bef9SDimitry Andric PatternMatch::m_Value(ArgArg)), 176e8d8bef9SDimitry Andric PatternMatch::m_Constant(XorMask))) && 177e8d8bef9SDimitry Andric II.getType() == ArgArg->getType()) { 178e8d8bef9SDimitry Andric if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 179349cc55cSDimitry Andric if (CI->getValue().trunc(16).isAllOnes()) { 180e8d8bef9SDimitry Andric auto TrueVector = IC.Builder.CreateVectorSplat( 181e8d8bef9SDimitry Andric cast<FixedVectorType>(II.getType())->getNumElements(), 182e8d8bef9SDimitry Andric IC.Builder.getTrue()); 183e8d8bef9SDimitry Andric return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 184e8d8bef9SDimitry Andric } 185e8d8bef9SDimitry Andric } 186e8d8bef9SDimitry Andric } 187e8d8bef9SDimitry Andric KnownBits ScalarKnown(32); 188e8d8bef9SDimitry Andric if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), 189e8d8bef9SDimitry Andric ScalarKnown, 0)) { 190e8d8bef9SDimitry Andric return &II; 191e8d8bef9SDimitry Andric } 192e8d8bef9SDimitry Andric break; 193e8d8bef9SDimitry Andric } 194e8d8bef9SDimitry Andric case Intrinsic::arm_mve_pred_v2i: { 195e8d8bef9SDimitry Andric Value *Arg = II.getArgOperand(0); 196e8d8bef9SDimitry Andric Value *ArgArg; 197e8d8bef9SDimitry Andric if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( 198e8d8bef9SDimitry Andric PatternMatch::m_Value(ArgArg)))) { 
199e8d8bef9SDimitry Andric return IC.replaceInstUsesWith(II, ArgArg); 200e8d8bef9SDimitry Andric } 201e8d8bef9SDimitry Andric if (!II.getMetadata(LLVMContext::MD_range)) { 202e8d8bef9SDimitry Andric Type *IntTy32 = Type::getInt32Ty(II.getContext()); 203e8d8bef9SDimitry Andric Metadata *M[] = { 204e8d8bef9SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), 205fe6060f1SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))}; 206e8d8bef9SDimitry Andric II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); 207e8d8bef9SDimitry Andric return &II; 208e8d8bef9SDimitry Andric } 209e8d8bef9SDimitry Andric break; 210e8d8bef9SDimitry Andric } 211e8d8bef9SDimitry Andric case Intrinsic::arm_mve_vadc: 212e8d8bef9SDimitry Andric case Intrinsic::arm_mve_vadc_predicated: { 213e8d8bef9SDimitry Andric unsigned CarryOp = 214e8d8bef9SDimitry Andric (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 215e8d8bef9SDimitry Andric assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 216e8d8bef9SDimitry Andric "Bad type for intrinsic!"); 217e8d8bef9SDimitry Andric 218e8d8bef9SDimitry Andric KnownBits CarryKnown(32); 219e8d8bef9SDimitry Andric if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), 220e8d8bef9SDimitry Andric CarryKnown)) { 221e8d8bef9SDimitry Andric return &II; 222e8d8bef9SDimitry Andric } 223e8d8bef9SDimitry Andric break; 224e8d8bef9SDimitry Andric } 225e8d8bef9SDimitry Andric case Intrinsic::arm_mve_vmldava: { 226e8d8bef9SDimitry Andric Instruction *I = cast<Instruction>(&II); 227e8d8bef9SDimitry Andric if (I->hasOneUse()) { 228e8d8bef9SDimitry Andric auto *User = cast<Instruction>(*I->user_begin()); 229e8d8bef9SDimitry Andric Value *OpZ; 230e8d8bef9SDimitry Andric if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && 231e8d8bef9SDimitry Andric match(I->getOperand(3), m_Zero())) { 232e8d8bef9SDimitry Andric Value *OpX = I->getOperand(4); 233e8d8bef9SDimitry 
Andric Value *OpY = I->getOperand(5); 234e8d8bef9SDimitry Andric Type *OpTy = OpX->getType(); 235e8d8bef9SDimitry Andric 236e8d8bef9SDimitry Andric IC.Builder.SetInsertPoint(User); 237e8d8bef9SDimitry Andric Value *V = 238e8d8bef9SDimitry Andric IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, 239e8d8bef9SDimitry Andric {I->getOperand(0), I->getOperand(1), 240e8d8bef9SDimitry Andric I->getOperand(2), OpZ, OpX, OpY}); 241e8d8bef9SDimitry Andric 242e8d8bef9SDimitry Andric IC.replaceInstUsesWith(*User, V); 243e8d8bef9SDimitry Andric return IC.eraseInstFromFunction(*User); 244e8d8bef9SDimitry Andric } 245e8d8bef9SDimitry Andric } 246e8d8bef9SDimitry Andric return None; 247e8d8bef9SDimitry Andric } 248e8d8bef9SDimitry Andric } 249e8d8bef9SDimitry Andric return None; 250e8d8bef9SDimitry Andric } 251e8d8bef9SDimitry Andric 252349cc55cSDimitry Andric Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic( 253349cc55cSDimitry Andric InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, 254349cc55cSDimitry Andric APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, 255349cc55cSDimitry Andric std::function<void(Instruction *, unsigned, APInt, APInt &)> 256349cc55cSDimitry Andric SimplifyAndSetOp) const { 257349cc55cSDimitry Andric 258349cc55cSDimitry Andric // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the 259349cc55cSDimitry Andric // opcode specifying a Top/Bottom instruction, which can change between 260349cc55cSDimitry Andric // instructions. 
261349cc55cSDimitry Andric auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) { 262349cc55cSDimitry Andric unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements(); 263349cc55cSDimitry Andric unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue(); 264349cc55cSDimitry Andric 265349cc55cSDimitry Andric // The only odd/even lanes of operand 0 will only be demanded depending 266349cc55cSDimitry Andric // on whether this is a top/bottom instruction. 267349cc55cSDimitry Andric APInt DemandedElts = 268349cc55cSDimitry Andric APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1) 269349cc55cSDimitry Andric : APInt::getHighBitsSet(2, 1)); 270349cc55cSDimitry Andric SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts); 271349cc55cSDimitry Andric // The other lanes will be defined from the inserted elements. 272349cc55cSDimitry Andric UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1) 273349cc55cSDimitry Andric : APInt::getHighBitsSet(2, 1)); 274349cc55cSDimitry Andric return None; 275349cc55cSDimitry Andric }; 276349cc55cSDimitry Andric 277349cc55cSDimitry Andric switch (II.getIntrinsicID()) { 278349cc55cSDimitry Andric default: 279349cc55cSDimitry Andric break; 280349cc55cSDimitry Andric case Intrinsic::arm_mve_vcvt_narrow: 281349cc55cSDimitry Andric SimplifyNarrowInstrTopBottom(2); 282349cc55cSDimitry Andric break; 283349cc55cSDimitry Andric case Intrinsic::arm_mve_vqmovn: 284349cc55cSDimitry Andric SimplifyNarrowInstrTopBottom(4); 285349cc55cSDimitry Andric break; 286349cc55cSDimitry Andric case Intrinsic::arm_mve_vshrn: 287349cc55cSDimitry Andric SimplifyNarrowInstrTopBottom(7); 288349cc55cSDimitry Andric break; 289349cc55cSDimitry Andric } 290349cc55cSDimitry Andric 291349cc55cSDimitry Andric return None; 292349cc55cSDimitry Andric } 293349cc55cSDimitry Andric 294fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, 2955ffd83dbSDimitry Andric 
TTI::TargetCostKind CostKind) { 2960b57cec5SDimitry Andric assert(Ty->isIntegerTy()); 2970b57cec5SDimitry Andric 2980b57cec5SDimitry Andric unsigned Bits = Ty->getPrimitiveSizeInBits(); 2990b57cec5SDimitry Andric if (Bits == 0 || Imm.getActiveBits() >= 64) 3000b57cec5SDimitry Andric return 4; 3010b57cec5SDimitry Andric 3020b57cec5SDimitry Andric int64_t SImmVal = Imm.getSExtValue(); 3030b57cec5SDimitry Andric uint64_t ZImmVal = Imm.getZExtValue(); 3040b57cec5SDimitry Andric if (!ST->isThumb()) { 3050b57cec5SDimitry Andric if ((SImmVal >= 0 && SImmVal < 65536) || 3060b57cec5SDimitry Andric (ARM_AM::getSOImmVal(ZImmVal) != -1) || 3070b57cec5SDimitry Andric (ARM_AM::getSOImmVal(~ZImmVal) != -1)) 3080b57cec5SDimitry Andric return 1; 3090b57cec5SDimitry Andric return ST->hasV6T2Ops() ? 2 : 3; 3100b57cec5SDimitry Andric } 3110b57cec5SDimitry Andric if (ST->isThumb2()) { 3120b57cec5SDimitry Andric if ((SImmVal >= 0 && SImmVal < 65536) || 3130b57cec5SDimitry Andric (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || 3140b57cec5SDimitry Andric (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) 3150b57cec5SDimitry Andric return 1; 3160b57cec5SDimitry Andric return ST->hasV6T2Ops() ? 2 : 3; 3170b57cec5SDimitry Andric } 3180b57cec5SDimitry Andric // Thumb1, any i8 imm cost 1. 3190b57cec5SDimitry Andric if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256)) 3200b57cec5SDimitry Andric return 1; 3210b57cec5SDimitry Andric if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) 3220b57cec5SDimitry Andric return 2; 3230b57cec5SDimitry Andric // Load from constantpool. 3240b57cec5SDimitry Andric return 3; 3250b57cec5SDimitry Andric } 3260b57cec5SDimitry Andric 3270b57cec5SDimitry Andric // Constants smaller than 256 fit in the immediate field of 3280b57cec5SDimitry Andric // Thumb1 instructions so we return a zero cost and 1 otherwise. 
329fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, 3300b57cec5SDimitry Andric const APInt &Imm, Type *Ty) { 3310b57cec5SDimitry Andric if (Imm.isNonNegative() && Imm.getLimitedValue() < 256) 3320b57cec5SDimitry Andric return 0; 3330b57cec5SDimitry Andric 3340b57cec5SDimitry Andric return 1; 3350b57cec5SDimitry Andric } 3360b57cec5SDimitry Andric 337e8d8bef9SDimitry Andric // Checks whether Inst is part of a min(max()) or max(min()) pattern 3384824e7fdSDimitry Andric // that will match to an SSAT instruction. Returns the instruction being 3394824e7fdSDimitry Andric // saturated, or null if no saturation pattern was found. 3404824e7fdSDimitry Andric static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { 341e8d8bef9SDimitry Andric Value *LHS, *RHS; 342e8d8bef9SDimitry Andric ConstantInt *C; 343e8d8bef9SDimitry Andric SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; 344e8d8bef9SDimitry Andric 345e8d8bef9SDimitry Andric if (InstSPF == SPF_SMAX && 346e8d8bef9SDimitry Andric PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && 347349cc55cSDimitry Andric C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) { 348e8d8bef9SDimitry Andric 349e8d8bef9SDimitry Andric auto isSSatMin = [&](Value *MinInst) { 350e8d8bef9SDimitry Andric if (isa<SelectInst>(MinInst)) { 351e8d8bef9SDimitry Andric Value *MinLHS, *MinRHS; 352e8d8bef9SDimitry Andric ConstantInt *MinC; 353e8d8bef9SDimitry Andric SelectPatternFlavor MinSPF = 354e8d8bef9SDimitry Andric matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; 355e8d8bef9SDimitry Andric if (MinSPF == SPF_SMIN && 356e8d8bef9SDimitry Andric PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && 357e8d8bef9SDimitry Andric MinC->getValue() == ((-Imm) - 1)) 358e8d8bef9SDimitry Andric return true; 359e8d8bef9SDimitry Andric } 360e8d8bef9SDimitry Andric return false; 361e8d8bef9SDimitry Andric }; 362e8d8bef9SDimitry Andric 
3634824e7fdSDimitry Andric if (isSSatMin(Inst->getOperand(1))) 3644824e7fdSDimitry Andric return cast<Instruction>(Inst->getOperand(1))->getOperand(1); 3654824e7fdSDimitry Andric if (Inst->hasNUses(2) && 3664824e7fdSDimitry Andric (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin())))) 3674824e7fdSDimitry Andric return Inst->getOperand(1); 368e8d8bef9SDimitry Andric } 3694824e7fdSDimitry Andric return nullptr; 3704824e7fdSDimitry Andric } 3714824e7fdSDimitry Andric 3724824e7fdSDimitry Andric // Look for a FP Saturation pattern, where the instruction can be simplified to 3734824e7fdSDimitry Andric // a fptosi.sat. max(min(fptosi)). The constant in this case is always free. 3744824e7fdSDimitry Andric static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { 3754824e7fdSDimitry Andric if (Imm.getBitWidth() != 64 || 3764824e7fdSDimitry Andric Imm != APInt::getHighBitsSet(64, 33)) // -2147483648 377e8d8bef9SDimitry Andric return false; 3784824e7fdSDimitry Andric Value *FP = isSSATMinMaxPattern(Inst, Imm); 3794824e7fdSDimitry Andric if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse()) 3804824e7fdSDimitry Andric FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm); 3814824e7fdSDimitry Andric if (!FP) 3824824e7fdSDimitry Andric return false; 3834824e7fdSDimitry Andric return isa<FPToSIInst>(FP); 384e8d8bef9SDimitry Andric } 385e8d8bef9SDimitry Andric 386fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, 387e8d8bef9SDimitry Andric const APInt &Imm, Type *Ty, 388e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind, 389e8d8bef9SDimitry Andric Instruction *Inst) { 3900b57cec5SDimitry Andric // Division by a constant can be turned into multiplication, but only if we 3910b57cec5SDimitry Andric // know it's constant. So it's not so much that the immediate is cheap (it's 3920b57cec5SDimitry Andric // not), but that the alternative is worse. 
3930b57cec5SDimitry Andric // FIXME: this is probably unneeded with GlobalISel. 3940b57cec5SDimitry Andric if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || 3950b57cec5SDimitry Andric Opcode == Instruction::SRem || Opcode == Instruction::URem) && 3960b57cec5SDimitry Andric Idx == 1) 3970b57cec5SDimitry Andric return 0; 3980b57cec5SDimitry Andric 399fe6060f1SDimitry Andric // Leave any gep offsets for the CodeGenPrepare, which will do a better job at 400fe6060f1SDimitry Andric // splitting any large offsets. 401fe6060f1SDimitry Andric if (Opcode == Instruction::GetElementPtr && Idx != 0) 402fe6060f1SDimitry Andric return 0; 403fe6060f1SDimitry Andric 4040b57cec5SDimitry Andric if (Opcode == Instruction::And) { 4050b57cec5SDimitry Andric // UXTB/UXTH 4060b57cec5SDimitry Andric if (Imm == 255 || Imm == 65535) 4070b57cec5SDimitry Andric return 0; 4080b57cec5SDimitry Andric // Conversion to BIC is free, and means we can use ~Imm instead. 4095ffd83dbSDimitry Andric return std::min(getIntImmCost(Imm, Ty, CostKind), 4105ffd83dbSDimitry Andric getIntImmCost(~Imm, Ty, CostKind)); 4110b57cec5SDimitry Andric } 4120b57cec5SDimitry Andric 4130b57cec5SDimitry Andric if (Opcode == Instruction::Add) 4140b57cec5SDimitry Andric // Conversion to SUB is free, and means we can use -Imm instead. 
4155ffd83dbSDimitry Andric return std::min(getIntImmCost(Imm, Ty, CostKind), 4165ffd83dbSDimitry Andric getIntImmCost(-Imm, Ty, CostKind)); 4170b57cec5SDimitry Andric 4180b57cec5SDimitry Andric if (Opcode == Instruction::ICmp && Imm.isNegative() && 4190b57cec5SDimitry Andric Ty->getIntegerBitWidth() == 32) { 4200b57cec5SDimitry Andric int64_t NegImm = -Imm.getSExtValue(); 4210b57cec5SDimitry Andric if (ST->isThumb2() && NegImm < 1<<12) 4220b57cec5SDimitry Andric // icmp X, #-C -> cmn X, #C 4230b57cec5SDimitry Andric return 0; 4240b57cec5SDimitry Andric if (ST->isThumb() && NegImm < 1<<8) 4250b57cec5SDimitry Andric // icmp X, #-C -> adds X, #C 4260b57cec5SDimitry Andric return 0; 4270b57cec5SDimitry Andric } 4280b57cec5SDimitry Andric 4290b57cec5SDimitry Andric // xor a, -1 can always be folded to MVN 430349cc55cSDimitry Andric if (Opcode == Instruction::Xor && Imm.isAllOnes()) 4310b57cec5SDimitry Andric return 0; 4320b57cec5SDimitry Andric 433e8d8bef9SDimitry Andric // Ensures negative constant of min(max()) or max(min()) patterns that 434e8d8bef9SDimitry Andric // match to SSAT instructions don't get hoisted 435e8d8bef9SDimitry Andric if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && 436e8d8bef9SDimitry Andric Ty->getIntegerBitWidth() <= 32) { 437e8d8bef9SDimitry Andric if (isSSATMinMaxPattern(Inst, Imm) || 438e8d8bef9SDimitry Andric (isa<ICmpInst>(Inst) && Inst->hasOneUse() && 439e8d8bef9SDimitry Andric isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) 440e8d8bef9SDimitry Andric return 0; 441e8d8bef9SDimitry Andric } 442e8d8bef9SDimitry Andric 4434824e7fdSDimitry Andric if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm)) 4444824e7fdSDimitry Andric return 0; 4454824e7fdSDimitry Andric 446349cc55cSDimitry Andric // We can convert <= -1 to < 0, which is generally quite cheap. 
447349cc55cSDimitry Andric if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) { 448349cc55cSDimitry Andric ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate(); 449349cc55cSDimitry Andric if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) 450349cc55cSDimitry Andric return std::min(getIntImmCost(Imm, Ty, CostKind), 451349cc55cSDimitry Andric getIntImmCost(Imm + 1, Ty, CostKind)); 452349cc55cSDimitry Andric } 453349cc55cSDimitry Andric 4545ffd83dbSDimitry Andric return getIntImmCost(Imm, Ty, CostKind); 4550b57cec5SDimitry Andric } 4560b57cec5SDimitry Andric 457fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode, 458fe6060f1SDimitry Andric TTI::TargetCostKind CostKind, 459fe6060f1SDimitry Andric const Instruction *I) { 460e8d8bef9SDimitry Andric if (CostKind == TTI::TCK_RecipThroughput && 461e8d8bef9SDimitry Andric (ST->hasNEON() || ST->hasMVEIntegerOps())) { 462e8d8bef9SDimitry Andric // FIXME: The vectorizer is highly sensistive to the cost of these 463e8d8bef9SDimitry Andric // instructions, which suggests that it may be using the costs incorrectly. 464e8d8bef9SDimitry Andric // But, for now, just make them free to avoid performance regressions for 465e8d8bef9SDimitry Andric // vector targets. 
466e8d8bef9SDimitry Andric return 0; 467e8d8bef9SDimitry Andric } 468fe6060f1SDimitry Andric return BaseT::getCFInstrCost(Opcode, CostKind, I); 469e8d8bef9SDimitry Andric } 470e8d8bef9SDimitry Andric 471fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, 472fe6060f1SDimitry Andric Type *Src, 473e8d8bef9SDimitry Andric TTI::CastContextHint CCH, 4745ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 4750b57cec5SDimitry Andric const Instruction *I) { 4760b57cec5SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 4770b57cec5SDimitry Andric assert(ISD && "Invalid opcode"); 4780b57cec5SDimitry Andric 4795ffd83dbSDimitry Andric // TODO: Allow non-throughput costs that aren't binary. 480fe6060f1SDimitry Andric auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { 4815ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 4825ffd83dbSDimitry Andric return Cost == 0 ? 0 : 1; 4835ffd83dbSDimitry Andric return Cost; 4840b57cec5SDimitry Andric }; 485e8d8bef9SDimitry Andric auto IsLegalFPType = [this](EVT VT) { 486e8d8bef9SDimitry Andric EVT EltVT = VT.getScalarType(); 487e8d8bef9SDimitry Andric return (EltVT == MVT::f32 && ST->hasVFP2Base()) || 488e8d8bef9SDimitry Andric (EltVT == MVT::f64 && ST->hasFP64()) || 489e8d8bef9SDimitry Andric (EltVT == MVT::f16 && ST->hasFullFP16()); 490e8d8bef9SDimitry Andric }; 4910b57cec5SDimitry Andric 4920b57cec5SDimitry Andric EVT SrcTy = TLI->getValueType(DL, Src); 4930b57cec5SDimitry Andric EVT DstTy = TLI->getValueType(DL, Dst); 4940b57cec5SDimitry Andric 4950b57cec5SDimitry Andric if (!SrcTy.isSimple() || !DstTy.isSimple()) 496e8d8bef9SDimitry Andric return AdjustCost( 497e8d8bef9SDimitry Andric BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 4980b57cec5SDimitry Andric 499e8d8bef9SDimitry Andric // Extending masked load/Truncating masked stores is expensive because we 500e8d8bef9SDimitry Andric // currently don't split them. 
This means that we'll likely end up 501e8d8bef9SDimitry Andric // loading/storing each element individually (hence the high cost). 502e8d8bef9SDimitry Andric if ((ST->hasMVEIntegerOps() && 503e8d8bef9SDimitry Andric (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || 504e8d8bef9SDimitry Andric Opcode == Instruction::SExt)) || 505e8d8bef9SDimitry Andric (ST->hasMVEFloatOps() && 506e8d8bef9SDimitry Andric (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && 507e8d8bef9SDimitry Andric IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) 508e8d8bef9SDimitry Andric if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) 509fe6060f1SDimitry Andric return 2 * DstTy.getVectorNumElements() * 510fe6060f1SDimitry Andric ST->getMVEVectorCostFactor(CostKind); 511e8d8bef9SDimitry Andric 512e8d8bef9SDimitry Andric // The extend of other kinds of load is free 513e8d8bef9SDimitry Andric if (CCH == TTI::CastContextHint::Normal || 514e8d8bef9SDimitry Andric CCH == TTI::CastContextHint::Masked) { 5158bcb0991SDimitry Andric static const TypeConversionCostTblEntry LoadConversionTbl[] = { 5168bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, 5178bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, 5188bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, 5198bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, 5208bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, 5218bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, 5228bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, 5238bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, 5248bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, 5258bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, 5268bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, 5278bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, 5288bcb0991SDimitry Andric }; 
5298bcb0991SDimitry Andric if (const auto *Entry = ConvertCostTableLookup( 5308bcb0991SDimitry Andric LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 5315ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 5328bcb0991SDimitry Andric 5338bcb0991SDimitry Andric static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { 5348bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 5358bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, 5368bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 5378bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, 5388bcb0991SDimitry Andric {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 5398bcb0991SDimitry Andric {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, 5405ffd83dbSDimitry Andric // The following extend from a legal type to an illegal type, so need to 5415ffd83dbSDimitry Andric // split the load. This introduced an extra load operation, but the 5425ffd83dbSDimitry Andric // extend is still "free". 
5435ffd83dbSDimitry Andric {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 5445ffd83dbSDimitry Andric {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, 5455ffd83dbSDimitry Andric {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 5465ffd83dbSDimitry Andric {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, 5475ffd83dbSDimitry Andric {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 5485ffd83dbSDimitry Andric {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, 5498bcb0991SDimitry Andric }; 5508bcb0991SDimitry Andric if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 5518bcb0991SDimitry Andric if (const auto *Entry = 5528bcb0991SDimitry Andric ConvertCostTableLookup(MVELoadConversionTbl, ISD, 5538bcb0991SDimitry Andric DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 554fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 5558bcb0991SDimitry Andric } 5565ffd83dbSDimitry Andric 5575ffd83dbSDimitry Andric static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { 5585ffd83dbSDimitry Andric // FPExtends are similar but also require the VCVT instructions. 5595ffd83dbSDimitry Andric {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, 5605ffd83dbSDimitry Andric {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, 5615ffd83dbSDimitry Andric }; 5625ffd83dbSDimitry Andric if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 5635ffd83dbSDimitry Andric if (const auto *Entry = 5645ffd83dbSDimitry Andric ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, 5655ffd83dbSDimitry Andric DstTy.getSimpleVT(), SrcTy.getSimpleVT())) 566fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 5675ffd83dbSDimitry Andric } 5685ffd83dbSDimitry Andric 5695ffd83dbSDimitry Andric // The truncate of a store is free. This is the mirror of extends above. 
570e8d8bef9SDimitry Andric static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { 5715ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, 5725ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, 5735ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, 5745ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, 575e8d8bef9SDimitry Andric {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, 5765ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, 5775ffd83dbSDimitry Andric {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, 5785ffd83dbSDimitry Andric }; 5795ffd83dbSDimitry Andric if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 5805ffd83dbSDimitry Andric if (const auto *Entry = 581e8d8bef9SDimitry Andric ConvertCostTableLookup(MVEStoreConversionTbl, ISD, 582e8d8bef9SDimitry Andric SrcTy.getSimpleVT(), DstTy.getSimpleVT())) 583fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 5845ffd83dbSDimitry Andric } 5855ffd83dbSDimitry Andric 586e8d8bef9SDimitry Andric static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { 5875ffd83dbSDimitry Andric {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, 5885ffd83dbSDimitry Andric {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, 5895ffd83dbSDimitry Andric }; 5905ffd83dbSDimitry Andric if (SrcTy.isVector() && ST->hasMVEFloatOps()) { 5915ffd83dbSDimitry Andric if (const auto *Entry = 592e8d8bef9SDimitry Andric ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, 593e8d8bef9SDimitry Andric SrcTy.getSimpleVT(), DstTy.getSimpleVT())) 594fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 5955ffd83dbSDimitry Andric } 5965ffd83dbSDimitry Andric } 5975ffd83dbSDimitry Andric 5985ffd83dbSDimitry Andric // NEON vector operations that can extend their inputs. 
5995ffd83dbSDimitry Andric if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && 6005ffd83dbSDimitry Andric I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { 6015ffd83dbSDimitry Andric static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { 6025ffd83dbSDimitry Andric // vaddl 6035ffd83dbSDimitry Andric { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, 6045ffd83dbSDimitry Andric { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, 6055ffd83dbSDimitry Andric // vsubl 6065ffd83dbSDimitry Andric { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, 6075ffd83dbSDimitry Andric { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, 6085ffd83dbSDimitry Andric // vmull 6095ffd83dbSDimitry Andric { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, 6105ffd83dbSDimitry Andric { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, 6115ffd83dbSDimitry Andric // vshll 6125ffd83dbSDimitry Andric { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, 6135ffd83dbSDimitry Andric { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, 6145ffd83dbSDimitry Andric }; 6155ffd83dbSDimitry Andric 6165ffd83dbSDimitry Andric auto *User = cast<Instruction>(*I->user_begin()); 6175ffd83dbSDimitry Andric int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); 6185ffd83dbSDimitry Andric if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, 6195ffd83dbSDimitry Andric DstTy.getSimpleVT(), 6205ffd83dbSDimitry Andric SrcTy.getSimpleVT())) { 6215ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 6225ffd83dbSDimitry Andric } 6235ffd83dbSDimitry Andric } 6245ffd83dbSDimitry Andric 6255ffd83dbSDimitry Andric // Single to/from double precision conversions. 
6265ffd83dbSDimitry Andric if (Src->isVectorTy() && ST->hasNEON() && 6275ffd83dbSDimitry Andric ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && 6285ffd83dbSDimitry Andric DstTy.getScalarType() == MVT::f32) || 6295ffd83dbSDimitry Andric (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && 6305ffd83dbSDimitry Andric DstTy.getScalarType() == MVT::f64))) { 6315ffd83dbSDimitry Andric static const CostTblEntry NEONFltDblTbl[] = { 6325ffd83dbSDimitry Andric // Vector fptrunc/fpext conversions. 6335ffd83dbSDimitry Andric {ISD::FP_ROUND, MVT::v2f64, 2}, 6345ffd83dbSDimitry Andric {ISD::FP_EXTEND, MVT::v2f32, 2}, 6355ffd83dbSDimitry Andric {ISD::FP_EXTEND, MVT::v4f32, 4}}; 6365ffd83dbSDimitry Andric 637fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 6385ffd83dbSDimitry Andric if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) 6395ffd83dbSDimitry Andric return AdjustCost(LT.first * Entry->Cost); 6408bcb0991SDimitry Andric } 6418bcb0991SDimitry Andric 6420b57cec5SDimitry Andric // Some arithmetic, load and store operations have specific instructions 6430b57cec5SDimitry Andric // to cast up/down their types automatically at no extra cost. 6440b57cec5SDimitry Andric // TODO: Get these tables to know at least what the related operations are. 
6450b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { 6465ffd83dbSDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 6475ffd83dbSDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 6480b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 6490b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, 6500b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, 6510b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, 6520b57cec5SDimitry Andric 6530b57cec5SDimitry Andric // The number of vmovl instructions for the extension. 6545ffd83dbSDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 6555ffd83dbSDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 6565ffd83dbSDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 6575ffd83dbSDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 6585ffd83dbSDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 6595ffd83dbSDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, 6605ffd83dbSDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 6615ffd83dbSDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 6620b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 6630b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, 6640b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 6650b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, 6660b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 6670b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, 6680b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 6690b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, 6700b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, 6710b57cec5SDimitry Andric { ISD::ZERO_EXTEND, MVT::v16i32, 
MVT::v16i8, 6 }, 6720b57cec5SDimitry Andric 6730b57cec5SDimitry Andric // Operations that we legalize using splitting. 6740b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 6750b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 6760b57cec5SDimitry Andric 6770b57cec5SDimitry Andric // Vector float <-> i32 conversions. 6780b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 6790b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 6800b57cec5SDimitry Andric 6810b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 6820b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 6830b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 6840b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 6850b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 6860b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 6870b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 6880b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 6890b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 6900b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 6910b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 6920b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 6930b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 6940b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 6950b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 6960b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 6970b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 6980b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 6990b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 7000b57cec5SDimitry Andric { 
ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 7010b57cec5SDimitry Andric 7020b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 7030b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 7040b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, 7050b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, 7060b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 7070b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 7080b57cec5SDimitry Andric 7090b57cec5SDimitry Andric // Vector double <-> i32 conversions. 7100b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 7110b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 7120b57cec5SDimitry Andric 7130b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 7140b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 7150b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 7160b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 7170b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 7180b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 7190b57cec5SDimitry Andric 7200b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 7210b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 7220b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, 7230b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, 7240b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, 7250b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } 7260b57cec5SDimitry Andric }; 7270b57cec5SDimitry Andric 7280b57cec5SDimitry Andric if (SrcTy.isVector() && ST->hasNEON()) { 7290b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, 7300b57cec5SDimitry Andric DstTy.getSimpleVT(), 
7310b57cec5SDimitry Andric SrcTy.getSimpleVT())) 7325ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 7330b57cec5SDimitry Andric } 7340b57cec5SDimitry Andric 7350b57cec5SDimitry Andric // Scalar float to integer conversions. 7360b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { 7370b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, 7380b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, 7390b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, 7400b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, 7410b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, 7420b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, 7430b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, 7440b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, 7450b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, 7460b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, 7470b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, 7480b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, 7490b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, 7500b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, 7510b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, 7520b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, 7530b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, 7540b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, 7550b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, 7560b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } 7570b57cec5SDimitry Andric }; 7580b57cec5SDimitry Andric if (SrcTy.isFloatingPoint() && ST->hasNEON()) { 7590b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, 7600b57cec5SDimitry Andric DstTy.getSimpleVT(), 
7610b57cec5SDimitry Andric SrcTy.getSimpleVT())) 7625ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 7630b57cec5SDimitry Andric } 7640b57cec5SDimitry Andric 7650b57cec5SDimitry Andric // Scalar integer to float conversions. 7660b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { 7670b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, 7680b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, 7690b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, 7700b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, 7710b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, 7720b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, 7730b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, 7740b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, 7750b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, 7760b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, 7770b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, 7780b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, 7790b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, 7800b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, 7810b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, 7820b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, 7830b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, 7840b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, 7850b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, 7860b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } 7870b57cec5SDimitry Andric }; 7880b57cec5SDimitry Andric 7890b57cec5SDimitry Andric if (SrcTy.isInteger() && ST->hasNEON()) { 7900b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, 7910b57cec5SDimitry Andric 
ISD, DstTy.getSimpleVT(), 7920b57cec5SDimitry Andric SrcTy.getSimpleVT())) 7935ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 7940b57cec5SDimitry Andric } 7950b57cec5SDimitry Andric 7968bcb0991SDimitry Andric // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one 7978bcb0991SDimitry Andric // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext 7988bcb0991SDimitry Andric // are linearised so take more. 7998bcb0991SDimitry Andric static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { 8008bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 8018bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 8028bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 8038bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 8048bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, 8058bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, 8068bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 8078bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 8088bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, 8098bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 8108bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, 8118bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, 8128bcb0991SDimitry Andric }; 8138bcb0991SDimitry Andric 8148bcb0991SDimitry Andric if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 8158bcb0991SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, 8168bcb0991SDimitry Andric ISD, DstTy.getSimpleVT(), 8178bcb0991SDimitry Andric SrcTy.getSimpleVT())) 818fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 8195ffd83dbSDimitry Andric } 8205ffd83dbSDimitry Andric 8215ffd83dbSDimitry Andric if (ISD == ISD::FP_ROUND || ISD == 
ISD::FP_EXTEND) { 8225ffd83dbSDimitry Andric // As general rule, fp converts that were not matched above are scalarized 8235ffd83dbSDimitry Andric // and cost 1 vcvt for each lane, so long as the instruction is available. 8245ffd83dbSDimitry Andric // If not it will become a series of function calls. 825fe6060f1SDimitry Andric const InstructionCost CallCost = 826fe6060f1SDimitry Andric getCallInstrCost(nullptr, Dst, {Src}, CostKind); 8275ffd83dbSDimitry Andric int Lanes = 1; 8285ffd83dbSDimitry Andric if (SrcTy.isFixedLengthVector()) 8295ffd83dbSDimitry Andric Lanes = SrcTy.getVectorNumElements(); 8305ffd83dbSDimitry Andric 831e8d8bef9SDimitry Andric if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) 8325ffd83dbSDimitry Andric return Lanes; 8335ffd83dbSDimitry Andric else 8345ffd83dbSDimitry Andric return Lanes * CallCost; 8358bcb0991SDimitry Andric } 8368bcb0991SDimitry Andric 837e8d8bef9SDimitry Andric if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && 838e8d8bef9SDimitry Andric SrcTy.isFixedLengthVector()) { 839e8d8bef9SDimitry Andric // Treat a truncate with larger than legal source (128bits for MVE) as 840e8d8bef9SDimitry Andric // expensive, 2 instructions per lane. 841e8d8bef9SDimitry Andric if ((SrcTy.getScalarType() == MVT::i8 || 842e8d8bef9SDimitry Andric SrcTy.getScalarType() == MVT::i16 || 843e8d8bef9SDimitry Andric SrcTy.getScalarType() == MVT::i32) && 844e8d8bef9SDimitry Andric SrcTy.getSizeInBits() > 128 && 845e8d8bef9SDimitry Andric SrcTy.getSizeInBits() > DstTy.getSizeInBits()) 846e8d8bef9SDimitry Andric return SrcTy.getVectorNumElements() * 2; 847e8d8bef9SDimitry Andric } 848e8d8bef9SDimitry Andric 8490b57cec5SDimitry Andric // Scalar integer conversion costs. 8500b57cec5SDimitry Andric static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { 8510b57cec5SDimitry Andric // i16 -> i64 requires two dependent operations. 
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                                   DstTy.getSimpleVT(),
                                                   SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);
  }

  // No table entry matched: fall back to the generic cost, scaled by the MVE
  // vector cost factor ("beats") when this is an MVE vector cast.
  int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
                     ? ST->getMVEVectorCostFactor(CostKind)
                     : 1;
  return AdjustCost(
      BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}

/// Return the cost of inserting an element into / extracting an element from
/// a vector of type \p ValTy at position \p Index. Applies ARM-specific
/// penalties: slow D-subregister inserts (e.g. Swift), NEON cross-class
/// (NEON<->GPR/VFP) copies, and MVE cross-lane moves through GPRs.
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               unsigned Index) {
  // Penalize inserting into an D-subregister. We end up with a three times
  // lower estimated throughput on swift.
  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
    return 3;

  if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
                        Opcode == Instruction::ExtractElement)) {
    // Cross-class copies are expensive on many microarchitectures,
    // so assume they are expensive by default.
    if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
      return 3;

    // Even if it's not a cross class copy, this likely leads to mixing
    // of NEON and VFP code and should be therefore penalized.
    if (ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
      return std::max<InstructionCost>(
          BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
  }

  if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
                                 Opcode == Instruction::ExtractElement)) {
    // Integer cross-lane moves are more expensive than float, which can
    // sometimes just be vmovs. Integer involve being passes to GPR registers,
    // causing more of a delay.
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
  }

  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}

/// Return the cost of a compare (icmp/fcmp) or select of type \p ValTy with
/// condition type \p CondTy. Handles Thumb scalar code-size selects,
/// min/max/abs patterns costed as intrinsics, NEON vector selects (vbsl) and
/// MVE vector compares before deferring to the base implementation.
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
      ST->isThumb() && !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  // For a single-use compare, cost it as part of its user (the select it
  // feeds), so the compare+select pair is matched as one min/max/abs pattern.
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    // Map the recognised select-pattern flavor to the equivalent intrinsic;
    // IID stays 0 when no min/max/abs pattern was matched.
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    // Otherwise charge one vbsl per legalized register.
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    // Synthesize the vXi1 result type when no condition type was provided.
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalaization insert, one scalarization extract and the cost of the
      // fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred, CostKind,
                                    I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}

/// Return the cost of computing an address for an access of type \p Ty, with
/// the pointer's SCEV \p Ptr used to detect non-constant-strided accesses.
InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
10640b57cec5SDimitry Andric unsigned NumVectorInstToHideOverhead = 10; 10650b57cec5SDimitry Andric int MaxMergeDistance = 64; 10660b57cec5SDimitry Andric 10678bcb0991SDimitry Andric if (ST->hasNEON()) { 10680b57cec5SDimitry Andric if (Ty->isVectorTy() && SE && 10690b57cec5SDimitry Andric !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 10700b57cec5SDimitry Andric return NumVectorInstToHideOverhead; 10710b57cec5SDimitry Andric 10720b57cec5SDimitry Andric // In many cases the address computation is not merged into the instruction 10730b57cec5SDimitry Andric // addressing mode. 10740b57cec5SDimitry Andric return 1; 10750b57cec5SDimitry Andric } 10768bcb0991SDimitry Andric return BaseT::getAddressComputationCost(Ty, SE, Ptr); 10778bcb0991SDimitry Andric } 10788bcb0991SDimitry Andric 10795ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { 10805ffd83dbSDimitry Andric if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 10815ffd83dbSDimitry Andric // If a VCTP is part of a chain, it's already profitable and shouldn't be 10825ffd83dbSDimitry Andric // optimized, else LSR may block tail-predication. 
10835ffd83dbSDimitry Andric switch (II->getIntrinsicID()) { 10845ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp8: 10855ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp16: 10865ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp32: 10875ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp64: 10885ffd83dbSDimitry Andric return true; 10895ffd83dbSDimitry Andric default: 10905ffd83dbSDimitry Andric break; 10915ffd83dbSDimitry Andric } 10925ffd83dbSDimitry Andric } 10935ffd83dbSDimitry Andric return false; 10945ffd83dbSDimitry Andric } 10955ffd83dbSDimitry Andric 10965ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 10978bcb0991SDimitry Andric if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) 10988bcb0991SDimitry Andric return false; 10998bcb0991SDimitry Andric 11005ffd83dbSDimitry Andric if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { 11018bcb0991SDimitry Andric // Don't support v2i1 yet. 11028bcb0991SDimitry Andric if (VecTy->getNumElements() == 2) 11038bcb0991SDimitry Andric return false; 11048bcb0991SDimitry Andric 11058bcb0991SDimitry Andric // We don't support extending fp types. 
11068bcb0991SDimitry Andric unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); 11078bcb0991SDimitry Andric if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) 11088bcb0991SDimitry Andric return false; 11098bcb0991SDimitry Andric } 11108bcb0991SDimitry Andric 11118bcb0991SDimitry Andric unsigned EltWidth = DataTy->getScalarSizeInBits(); 11125ffd83dbSDimitry Andric return (EltWidth == 32 && Alignment >= 4) || 11135ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); 11148bcb0991SDimitry Andric } 11150b57cec5SDimitry Andric 11165ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { 1117480093f4SDimitry Andric if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) 1118480093f4SDimitry Andric return false; 1119480093f4SDimitry Andric 1120480093f4SDimitry Andric unsigned EltWidth = Ty->getScalarSizeInBits(); 11215ffd83dbSDimitry Andric return ((EltWidth == 32 && Alignment >= 4) || 11225ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); 1123480093f4SDimitry Andric } 1124480093f4SDimitry Andric 1125e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory 1126e8d8bef9SDimitry Andric /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a 1127e8d8bef9SDimitry Andric /// call is used. 1128e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { 1129e8d8bef9SDimitry Andric MemOp MOp; 1130e8d8bef9SDimitry Andric unsigned DstAddrSpace = ~0u; 1131e8d8bef9SDimitry Andric unsigned SrcAddrSpace = ~0u; 1132e8d8bef9SDimitry Andric const Function *F = I->getParent()->getParent(); 11330b57cec5SDimitry Andric 1134e8d8bef9SDimitry Andric if (const auto *MC = dyn_cast<MemTransferInst>(I)) { 1135e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); 11360b57cec5SDimitry Andric // If 'size' is not a constant, a library call will be generated. 
11370b57cec5SDimitry Andric if (!C) 1138e8d8bef9SDimitry Andric return -1; 11390b57cec5SDimitry Andric 11400b57cec5SDimitry Andric const unsigned Size = C->getValue().getZExtValue(); 1141e8d8bef9SDimitry Andric const Align DstAlign = *MC->getDestAlign(); 1142e8d8bef9SDimitry Andric const Align SrcAlign = *MC->getSourceAlign(); 1143e8d8bef9SDimitry Andric 1144e8d8bef9SDimitry Andric MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, 1145e8d8bef9SDimitry Andric /*IsVolatile*/ false); 1146e8d8bef9SDimitry Andric DstAddrSpace = MC->getDestAddressSpace(); 1147e8d8bef9SDimitry Andric SrcAddrSpace = MC->getSourceAddressSpace(); 1148e8d8bef9SDimitry Andric } 1149e8d8bef9SDimitry Andric else if (const auto *MS = dyn_cast<MemSetInst>(I)) { 1150e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); 1151e8d8bef9SDimitry Andric // If 'size' is not a constant, a library call will be generated. 1152e8d8bef9SDimitry Andric if (!C) 1153e8d8bef9SDimitry Andric return -1; 1154e8d8bef9SDimitry Andric 1155e8d8bef9SDimitry Andric const unsigned Size = C->getValue().getZExtValue(); 1156e8d8bef9SDimitry Andric const Align DstAlign = *MS->getDestAlign(); 1157e8d8bef9SDimitry Andric 1158e8d8bef9SDimitry Andric MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, 1159e8d8bef9SDimitry Andric /*IsZeroMemset*/ false, /*IsVolatile*/ false); 1160e8d8bef9SDimitry Andric DstAddrSpace = MS->getDestAddressSpace(); 1161e8d8bef9SDimitry Andric } 1162e8d8bef9SDimitry Andric else 1163e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!"); 1164e8d8bef9SDimitry Andric 1165e8d8bef9SDimitry Andric unsigned Limit, Factor = 2; 1166e8d8bef9SDimitry Andric switch(I->getIntrinsicID()) { 1167e8d8bef9SDimitry Andric case Intrinsic::memcpy: 1168e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); 1169e8d8bef9SDimitry Andric break; 1170e8d8bef9SDimitry Andric case Intrinsic::memmove: 1171e8d8bef9SDimitry Andric 
Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); 1172e8d8bef9SDimitry Andric break; 1173e8d8bef9SDimitry Andric case Intrinsic::memset: 1174e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); 1175e8d8bef9SDimitry Andric Factor = 1; 1176e8d8bef9SDimitry Andric break; 1177e8d8bef9SDimitry Andric default: 1178e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!"); 1179e8d8bef9SDimitry Andric } 11800b57cec5SDimitry Andric 11810b57cec5SDimitry Andric // MemOps will be poplulated with a list of data types that needs to be 11820b57cec5SDimitry Andric // loaded and stored. That's why we multiply the number of elements by 2 to 11830b57cec5SDimitry Andric // get the cost for this memcpy. 1184e8d8bef9SDimitry Andric std::vector<EVT> MemOps; 11850b57cec5SDimitry Andric if (getTLI()->findOptimalMemOpLowering( 1186e8d8bef9SDimitry Andric MemOps, Limit, MOp, DstAddrSpace, 1187e8d8bef9SDimitry Andric SrcAddrSpace, F->getAttributes())) 1188e8d8bef9SDimitry Andric return MemOps.size() * Factor; 11890b57cec5SDimitry Andric 11900b57cec5SDimitry Andric // If we can't find an optimal memop lowering, return the default cost 1191e8d8bef9SDimitry Andric return -1; 1192e8d8bef9SDimitry Andric } 1193e8d8bef9SDimitry Andric 1194fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { 1195e8d8bef9SDimitry Andric int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); 1196e8d8bef9SDimitry Andric 1197e8d8bef9SDimitry Andric // To model the cost of a library call, we assume 1 for the call, and 1198e8d8bef9SDimitry Andric // 3 for the argument setup. 
1199e8d8bef9SDimitry Andric if (NumOps == -1) 1200e8d8bef9SDimitry Andric return 4; 1201e8d8bef9SDimitry Andric return NumOps; 12020b57cec5SDimitry Andric } 12030b57cec5SDimitry Andric 1204fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, 1205fe6060f1SDimitry Andric VectorType *Tp, ArrayRef<int> Mask, 120681ad6265SDimitry Andric int Index, VectorType *SubTp, 120781ad6265SDimitry Andric ArrayRef<const Value *> Args) { 1208fe6060f1SDimitry Andric Kind = improveShuffleKindFromMask(Kind, Mask); 12098bcb0991SDimitry Andric if (ST->hasNEON()) { 12100b57cec5SDimitry Andric if (Kind == TTI::SK_Broadcast) { 12110b57cec5SDimitry Andric static const CostTblEntry NEONDupTbl[] = { 12120b57cec5SDimitry Andric // VDUP handles these cases. 12130b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 12140b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 12150b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 12160b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 12170b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 12180b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 12190b57cec5SDimitry Andric 12200b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 12210b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 12220b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 12230b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; 12240b57cec5SDimitry Andric 1225fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 12268bcb0991SDimitry Andric if (const auto *Entry = 12278bcb0991SDimitry Andric CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) 12280b57cec5SDimitry Andric return LT.first * Entry->Cost; 12290b57cec5SDimitry Andric } 12300b57cec5SDimitry Andric if (Kind == TTI::SK_Reverse) { 12310b57cec5SDimitry Andric static const CostTblEntry NEONShuffleTbl[] = { 12320b57cec5SDimitry Andric // Reverse 
shuffle cost one instruction if we are shuffling within a 12330b57cec5SDimitry Andric // double word (vrev) or two if we shuffle a quad word (vrev, vext). 12340b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 12350b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 12360b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 12370b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 12380b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, 12390b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, 12400b57cec5SDimitry Andric 12410b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 12420b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 12430b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, 12440b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; 12450b57cec5SDimitry Andric 1246fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 12478bcb0991SDimitry Andric if (const auto *Entry = 12488bcb0991SDimitry Andric CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) 12490b57cec5SDimitry Andric return LT.first * Entry->Cost; 12500b57cec5SDimitry Andric } 12510b57cec5SDimitry Andric if (Kind == TTI::SK_Select) { 12520b57cec5SDimitry Andric static const CostTblEntry NEONSelShuffleTbl[] = { 12538bcb0991SDimitry Andric // Select shuffle cost table for ARM. Cost is the number of 12548bcb0991SDimitry Andric // instructions 12550b57cec5SDimitry Andric // required to create the shuffled vector. 
12560b57cec5SDimitry Andric 12570b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, 12580b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, 12590b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, 12600b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, 12610b57cec5SDimitry Andric 12620b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, 12630b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, 12640b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, 12650b57cec5SDimitry Andric 12660b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, 12670b57cec5SDimitry Andric 12680b57cec5SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; 12690b57cec5SDimitry Andric 1270fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 12710b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, 12720b57cec5SDimitry Andric ISD::VECTOR_SHUFFLE, LT.second)) 12730b57cec5SDimitry Andric return LT.first * Entry->Cost; 12740b57cec5SDimitry Andric } 12758bcb0991SDimitry Andric } 12768bcb0991SDimitry Andric if (ST->hasMVEIntegerOps()) { 12778bcb0991SDimitry Andric if (Kind == TTI::SK_Broadcast) { 12788bcb0991SDimitry Andric static const CostTblEntry MVEDupTbl[] = { 12798bcb0991SDimitry Andric // VDUP handles these cases. 
12808bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, 12818bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, 12828bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, 12838bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, 12848bcb0991SDimitry Andric {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; 12858bcb0991SDimitry Andric 1286fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 12878bcb0991SDimitry Andric if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, 12888bcb0991SDimitry Andric LT.second)) 1289fe6060f1SDimitry Andric return LT.first * Entry->Cost * 1290fe6060f1SDimitry Andric ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput); 12910b57cec5SDimitry Andric } 12920b57cec5SDimitry Andric 1293fe6060f1SDimitry Andric if (!Mask.empty()) { 1294fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); 129556f451bbSDimitry Andric if (LT.second.isVector() && 129656f451bbSDimitry Andric Mask.size() <= LT.second.getVectorNumElements() && 1297fe6060f1SDimitry Andric (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || 1298fe6060f1SDimitry Andric isVREVMask(Mask, LT.second, 64))) 1299fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first; 1300fe6060f1SDimitry Andric } 1301fe6060f1SDimitry Andric } 1302fe6060f1SDimitry Andric 1303fe6060f1SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() 1304fe6060f1SDimitry Andric ? 
ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) 1305fe6060f1SDimitry Andric : 1; 1306fe6060f1SDimitry Andric return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); 1307fe6060f1SDimitry Andric } 1308fe6060f1SDimitry Andric 1309fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getArithmeticInstrCost( 1310fe6060f1SDimitry Andric unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 1311fe6060f1SDimitry Andric TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, 1312480093f4SDimitry Andric TTI::OperandValueProperties Opd1PropInfo, 1313fe6060f1SDimitry Andric TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 1314480093f4SDimitry Andric const Instruction *CxtI) { 13150b57cec5SDimitry Andric int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); 1316e8d8bef9SDimitry Andric if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { 1317e8d8bef9SDimitry Andric // Make operations on i1 relatively expensive as this often involves 1318e8d8bef9SDimitry Andric // combining predicates. AND and XOR should be easier to handle with IT 1319e8d8bef9SDimitry Andric // blocks. 1320e8d8bef9SDimitry Andric switch (ISDOpcode) { 1321e8d8bef9SDimitry Andric default: 1322e8d8bef9SDimitry Andric break; 1323e8d8bef9SDimitry Andric case ISD::AND: 1324e8d8bef9SDimitry Andric case ISD::XOR: 1325e8d8bef9SDimitry Andric return 2; 1326e8d8bef9SDimitry Andric case ISD::OR: 1327e8d8bef9SDimitry Andric return 3; 1328e8d8bef9SDimitry Andric } 1329e8d8bef9SDimitry Andric } 1330e8d8bef9SDimitry Andric 1331fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 13320b57cec5SDimitry Andric 1333480093f4SDimitry Andric if (ST->hasNEON()) { 13340b57cec5SDimitry Andric const unsigned FunctionCallDivCost = 20; 13350b57cec5SDimitry Andric const unsigned ReciprocalDivCost = 10; 13360b57cec5SDimitry Andric static const CostTblEntry CostTbl[] = { 13370b57cec5SDimitry Andric // Division. 
13380b57cec5SDimitry Andric // These costs are somewhat random. Choose a cost of 20 to indicate that 13390b57cec5SDimitry Andric // vectorizing devision (added function call) is going to be very expensive. 13400b57cec5SDimitry Andric // Double registers types. 13410b57cec5SDimitry Andric { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 13420b57cec5SDimitry Andric { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, 13430b57cec5SDimitry Andric { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, 13440b57cec5SDimitry Andric { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, 13450b57cec5SDimitry Andric { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 13460b57cec5SDimitry Andric { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, 13470b57cec5SDimitry Andric { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, 13480b57cec5SDimitry Andric { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, 13490b57cec5SDimitry Andric { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, 13500b57cec5SDimitry Andric { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, 13510b57cec5SDimitry Andric { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, 13520b57cec5SDimitry Andric { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, 13530b57cec5SDimitry Andric { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, 13540b57cec5SDimitry Andric { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, 13550b57cec5SDimitry Andric { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, 13560b57cec5SDimitry Andric { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, 13570b57cec5SDimitry Andric // Quad register types. 
13580b57cec5SDimitry Andric { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 13590b57cec5SDimitry Andric { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, 13600b57cec5SDimitry Andric { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, 13610b57cec5SDimitry Andric { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, 13620b57cec5SDimitry Andric { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 13630b57cec5SDimitry Andric { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, 13640b57cec5SDimitry Andric { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, 13650b57cec5SDimitry Andric { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, 13660b57cec5SDimitry Andric { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 13670b57cec5SDimitry Andric { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, 13680b57cec5SDimitry Andric { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, 13690b57cec5SDimitry Andric { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, 13700b57cec5SDimitry Andric { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 13710b57cec5SDimitry Andric { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, 13720b57cec5SDimitry Andric { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, 13730b57cec5SDimitry Andric { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, 13740b57cec5SDimitry Andric // Multiplication. 13750b57cec5SDimitry Andric }; 13760b57cec5SDimitry Andric 13770b57cec5SDimitry Andric if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) 13780b57cec5SDimitry Andric return LT.first * Entry->Cost; 13790b57cec5SDimitry Andric 1380fe6060f1SDimitry Andric InstructionCost Cost = BaseT::getArithmeticInstrCost( 1381fe6060f1SDimitry Andric Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); 13820b57cec5SDimitry Andric 13830b57cec5SDimitry Andric // This is somewhat of a hack. The problem that we are facing is that SROA 13840b57cec5SDimitry Andric // creates a sequence of shift, and, or instructions to construct values. 
13850b57cec5SDimitry Andric // These sequences are recognized by the ISel and have zero-cost. Not so for 13860b57cec5SDimitry Andric // the vectorized code. Because we have support for v2i64 but not i64 those 13870b57cec5SDimitry Andric // sequences look particularly beneficial to vectorize. 13880b57cec5SDimitry Andric // To work around this we increase the cost of v2i64 operations to make them 13890b57cec5SDimitry Andric // seem less beneficial. 13900b57cec5SDimitry Andric if (LT.second == MVT::v2i64 && 13910b57cec5SDimitry Andric Op2Info == TargetTransformInfo::OK_UniformConstantValue) 13920b57cec5SDimitry Andric Cost += 4; 13930b57cec5SDimitry Andric 13940b57cec5SDimitry Andric return Cost; 13950b57cec5SDimitry Andric } 13960b57cec5SDimitry Andric 1397480093f4SDimitry Andric // If this operation is a shift on arm/thumb2, it might well be folded into 1398480093f4SDimitry Andric // the following instruction, hence having a cost of 0. 1399480093f4SDimitry Andric auto LooksLikeAFreeShift = [&]() { 1400480093f4SDimitry Andric if (ST->isThumb1Only() || Ty->isVectorTy()) 1401480093f4SDimitry Andric return false; 1402480093f4SDimitry Andric 1403480093f4SDimitry Andric if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift()) 1404480093f4SDimitry Andric return false; 1405480093f4SDimitry Andric if (Op2Info != TargetTransformInfo::OK_UniformConstantValue) 1406480093f4SDimitry Andric return false; 1407480093f4SDimitry Andric 1408480093f4SDimitry Andric // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB 1409480093f4SDimitry Andric switch (cast<Instruction>(CxtI->user_back())->getOpcode()) { 1410480093f4SDimitry Andric case Instruction::Add: 1411480093f4SDimitry Andric case Instruction::Sub: 1412480093f4SDimitry Andric case Instruction::And: 1413480093f4SDimitry Andric case Instruction::Xor: 1414480093f4SDimitry Andric case Instruction::Or: 1415480093f4SDimitry Andric case Instruction::ICmp: 1416480093f4SDimitry Andric return true; 1417480093f4SDimitry Andric 
default: 1418480093f4SDimitry Andric return false; 1419480093f4SDimitry Andric } 1420480093f4SDimitry Andric }; 1421480093f4SDimitry Andric if (LooksLikeAFreeShift()) 1422480093f4SDimitry Andric return 0; 1423480093f4SDimitry Andric 1424e8d8bef9SDimitry Andric // Default to cheap (throughput/size of 1 instruction) but adjust throughput 1425e8d8bef9SDimitry Andric // for "multiple beats" potentially needed by MVE instructions. 1426e8d8bef9SDimitry Andric int BaseCost = 1; 1427fe6060f1SDimitry Andric if (ST->hasMVEIntegerOps() && Ty->isVectorTy()) 1428fe6060f1SDimitry Andric BaseCost = ST->getMVEVectorCostFactor(CostKind); 14298bcb0991SDimitry Andric 14308bcb0991SDimitry Andric // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, 14318bcb0991SDimitry Andric // without treating floats as more expensive that scalars or increasing the 14328bcb0991SDimitry Andric // costs for custom operations. The results is also multiplied by the 14338bcb0991SDimitry Andric // MVEVectorCostFactor where appropriate. 14348bcb0991SDimitry Andric if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) 14358bcb0991SDimitry Andric return LT.first * BaseCost; 14368bcb0991SDimitry Andric 14378bcb0991SDimitry Andric // Else this is expand, assume that we need to scalarize this op. 14385ffd83dbSDimitry Andric if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 14395ffd83dbSDimitry Andric unsigned Num = VTy->getNumElements(); 1440fe6060f1SDimitry Andric InstructionCost Cost = 1441fe6060f1SDimitry Andric getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); 14428bcb0991SDimitry Andric // Return the cost of multiple scalar invocation plus the cost of 14438bcb0991SDimitry Andric // inserting and extracting the values. 
1444fe6060f1SDimitry Andric SmallVector<Type *> Tys(Args.size(), Ty); 1445fe6060f1SDimitry Andric return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; 14468bcb0991SDimitry Andric } 14478bcb0991SDimitry Andric 14488bcb0991SDimitry Andric return BaseCost; 14498bcb0991SDimitry Andric } 14508bcb0991SDimitry Andric 1451fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1452fe6060f1SDimitry Andric MaybeAlign Alignment, 1453fe6060f1SDimitry Andric unsigned AddressSpace, 14545ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 1455480093f4SDimitry Andric const Instruction *I) { 14565ffd83dbSDimitry Andric // TODO: Handle other cost kinds. 14575ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 14585ffd83dbSDimitry Andric return 1; 14595ffd83dbSDimitry Andric 14605ffd83dbSDimitry Andric // Type legalization can't handle structs 14615ffd83dbSDimitry Andric if (TLI->getValueType(DL, Src, true) == MVT::Other) 14625ffd83dbSDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 14635ffd83dbSDimitry Andric CostKind); 14640b57cec5SDimitry Andric 1465480093f4SDimitry Andric if (ST->hasNEON() && Src->isVectorTy() && 1466480093f4SDimitry Andric (Alignment && *Alignment != Align(16)) && 14675ffd83dbSDimitry Andric cast<VectorType>(Src)->getElementType()->isDoubleTy()) { 14680b57cec5SDimitry Andric // Unaligned loads/stores are extremely inefficient. 14690b57cec5SDimitry Andric // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. 1470fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 14710b57cec5SDimitry Andric return LT.first * 4; 14720b57cec5SDimitry Andric } 14735ffd83dbSDimitry Andric 14745ffd83dbSDimitry Andric // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. 14755ffd83dbSDimitry Andric // Same for stores. 
14765ffd83dbSDimitry Andric if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && 14775ffd83dbSDimitry Andric ((Opcode == Instruction::Load && I->hasOneUse() && 14785ffd83dbSDimitry Andric isa<FPExtInst>(*I->user_begin())) || 14795ffd83dbSDimitry Andric (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { 14805ffd83dbSDimitry Andric FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); 14815ffd83dbSDimitry Andric Type *DstTy = 14825ffd83dbSDimitry Andric Opcode == Instruction::Load 14835ffd83dbSDimitry Andric ? (*I->user_begin())->getType() 14845ffd83dbSDimitry Andric : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); 14855ffd83dbSDimitry Andric if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && 14865ffd83dbSDimitry Andric DstTy->getScalarType()->isFloatTy()) 1487fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 14885ffd83dbSDimitry Andric } 14895ffd83dbSDimitry Andric 14908bcb0991SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 1491fe6060f1SDimitry Andric ? 
ST->getMVEVectorCostFactor(CostKind) 14928bcb0991SDimitry Andric : 1; 14935ffd83dbSDimitry Andric return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 14945ffd83dbSDimitry Andric CostKind, I); 14950b57cec5SDimitry Andric } 14960b57cec5SDimitry Andric 1497fe6060f1SDimitry Andric InstructionCost 1498fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, 1499e8d8bef9SDimitry Andric unsigned AddressSpace, 1500e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) { 1501e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps()) { 1502e8d8bef9SDimitry Andric if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) 1503fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 1504e8d8bef9SDimitry Andric if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) 1505fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 1506e8d8bef9SDimitry Andric } 1507e8d8bef9SDimitry Andric if (!isa<FixedVectorType>(Src)) 1508e8d8bef9SDimitry Andric return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1509e8d8bef9SDimitry Andric CostKind); 1510e8d8bef9SDimitry Andric // Scalar cost, which is currently very high due to the efficiency of the 1511e8d8bef9SDimitry Andric // generated code. 
1512e8d8bef9SDimitry Andric return cast<FixedVectorType>(Src)->getNumElements() * 8; 1513e8d8bef9SDimitry Andric } 1514e8d8bef9SDimitry Andric 1515fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( 1516480093f4SDimitry Andric unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 15175ffd83dbSDimitry Andric Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 15185ffd83dbSDimitry Andric bool UseMaskForCond, bool UseMaskForGaps) { 15190b57cec5SDimitry Andric assert(Factor >= 2 && "Invalid interleave factor"); 15200b57cec5SDimitry Andric assert(isa<VectorType>(VecTy) && "Expect a vector type"); 15210b57cec5SDimitry Andric 15220b57cec5SDimitry Andric // vldN/vstN doesn't support vector types of i64/f64 element. 15230b57cec5SDimitry Andric bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; 15240b57cec5SDimitry Andric 15250b57cec5SDimitry Andric if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && 15260b57cec5SDimitry Andric !UseMaskForCond && !UseMaskForGaps) { 15275ffd83dbSDimitry Andric unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); 15285ffd83dbSDimitry Andric auto *SubVecTy = 15295ffd83dbSDimitry Andric FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 15300b57cec5SDimitry Andric 15310b57cec5SDimitry Andric // vldN/vstN only support legal vector types of size 64 or 128 in bits. 15320b57cec5SDimitry Andric // Accesses having vector types that are a multiple of 128 bits can be 15330b57cec5SDimitry Andric // matched to more than one vldN/vstN instruction. 1534fe6060f1SDimitry Andric int BaseCost = 1535fe6060f1SDimitry Andric ST->hasMVEIntegerOps() ? 
ST->getMVEVectorCostFactor(CostKind) : 1;
    if (NumElts % Factor == 0 &&
        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);

    // Some smaller than legal interleaved patterns are cheap as we can make
    // use of the vmovn or vrev patterns to interleave a standard load. This is
    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
    // promoted differently). The cost of 2 here is then a load and vrev or
    // vmovn.
    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
        VecTy->isIntOrIntVectorTy() &&
        DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64)
      return 2 * BaseCost;
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

/// Estimate the cost of a gather (load) or scatter (store) of vector type
/// \p DataTy through the pointer(s) \p Ptr. Returns either the cheap
/// "vector" cost, when the access pattern matches what MVE gather/scatter
/// instructions can natively handle (128-bit total, element/extension sizes
/// of 8/16/32, suitably extended GEP indices), or a deliberately high
/// "scalarized" cost otherwise.
InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  using namespace PatternMatch;
  // Without MVE integer ops, or with gather/scatter generation disabled,
  // defer entirely to the generic cost model.
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
  InstructionCost VectorCost =
      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  InstructionCost ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
      BaseT::getScalarizationOverhead(VTy, false, true);

  // Sub-byte elements or under-aligned accesses cannot use the gather forms.
  if (EltSize < 8 || Alignment < EltSize / 8)
    return ScalarCost;

  // ExtSize is the effective element size after any single extending user
  // (for gathers) or truncating input (for scatters) is folded in.
  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type
  if (I != nullptr) {
    // Depending on the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // only allow valid type combinations
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
      if (((EltSize == 16 && TypeSize == 32) ||
           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
          TypeSize * NumElems == 128)
        ExtSize = TypeSize;
    }
  }

  // MVE gathers/scatters operate on full 128-bit vectors of >= 4 elements.
  if (ExtSize * NumElems != 128 || NumElems < 4)
    return ScalarCost;

  // Any (aligned) i32 gather will not need to be scalarised.
  if (ExtSize == 32)
    return VectorCost;
  // For smaller types, we need to ensure that the gep's inputs are correctly
  // extended from a small enough value. Other sizes (including i64) are
  // scalarized for now.
  if (ExtSize != 8 && ExtSize != 16)
    return ScalarCost;

  // Look through a bitcast of the pointer to inspect the underlying GEP.
  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
    Ptr = BC->getOperand(0);
  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
    if (GEP->getNumOperands() != 2)
      return ScalarCost;
    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
    // Scale needs to be correct (which is only relevant for i16s).
    if (Scale != 1 && Scale * 8 != ExtSize)
      return ScalarCost;
    // And we need to zext (not sext) the indexes from a small enough type.
    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
        return VectorCost;
    }
    return ScalarCost;
  }
  return ScalarCost;
}

/// Cost of a vector add reduction. Under MVE a legal 128-bit integer add
/// reduction maps onto a single VADDV-style operation (cost-table entries of
/// 1 below, scaled by the MVE cost factor); anything else falls back to the
/// generic expansion cost.
InstructionCost
ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  // Ordered (strict FP) reductions cannot use the fast MVE forms.
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  EVT ValVT = TLI->getValueType(DL, ValTy);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // Only integer ADD reductions on MVE-capable subtargets are special-cased.
  if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  static const CostTblEntry CostTblAdd[]{
      {ISD::ADD, MVT::v16i8, 1},
      {ISD::ADD, MVT::v8i16, 1},
      {ISD::ADD, MVT::v4i32, 1},
  };
  if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
    return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;

  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

/// Cost of an add reduction that also widens (and optionally multiplies, when
/// \p IsMLA) its inputs — i.e. the patterns matched by the MVE
/// VADDV/VMLAV/VADDLV/VMLALV instructions.
InstructionCost
ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                        Type *ResTy, VectorType *ValTy,
                                        TTI::TargetCostKind CostKind) {
  EVT ValVT = TLI->getValueType(DL, ValTy);
  EVT ResVT = TLI->getValueType(DL, ResTy);

  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);

    // The legal cases are:
    //   VADDV u/s 8/16/32
    //   VMLAV u/s 8/16/32
    //   VADDLV u/s 32
    //   VMLALV u/s 16/32
    // Codegen currently cannot always handle larger than legal vectors very
    // well, especially for predicated reductions where the mask needs to be
    // split, so restrict to 128bit or smaller input types.
    unsigned RevVTSize = ResVT.getSizeInBits();
    if (ValVT.getSizeInBits() <= 128 &&
        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
         (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ?
64u : 32u)) ||
         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
      return ST->getMVEVectorCostFactor(CostKind) * LT.first;
  }

  return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
                                            CostKind);
}

/// Cost of intrinsic calls. Special-cases the intrinsics that MVE/VFP can
/// implement directly (saturating arithmetic, min/max, fp<->int saturating
/// conversions, active-lane masks); everything else defers to the base
/// implementation.
InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  switch (ICA.getID()) {
  case Intrinsic::get_active_lane_mask:
    // Currently we make a somewhat optimistic assumption that
    // active_lane_mask's are always free. In reality it may be freely folded
    // into a tail predicated loop, expanded into a VCPT or expanded into a lot
    // of add/icmp code. We may need to improve this in the future, but being
    // able to detect if it is free or not involves looking at a lot of other
    // code. We currently assume that the vectorizer inserted these, and knew
    // what it was doing in adding one.
    if (ST->hasMVEIntegerOps())
      return 0;
    break;
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    if (!ST->hasMVEIntegerOps())
      break;
    Type *VT = ICA.getReturnType();

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
        LT.second == MVT::v16i8) {
      // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
      // need to extend the type, as it uses shr(qadd(shl, shl)).
      unsigned Instrs =
          LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
      return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
    }
    break;
  }
  case Intrinsic::abs:
  case Intrinsic::smin:
  case Intrinsic::smax:
  case Intrinsic::umin:
  case Intrinsic::umax: {
    // Single-instruction MVE ops for legal 128-bit integer vector types.
    if (!ST->hasMVEIntegerOps())
      break;
    Type *VT = ICA.getReturnType();

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
        LT.second == MVT::v16i8)
      return LT.first * ST->getMVEVectorCostFactor(CostKind);
    break;
  }
  case Intrinsic::minnum:
  case Intrinsic::maxnum: {
    // Single-instruction MVE ops for legal 128-bit float vector types.
    if (!ST->hasMVEFloatOps())
      break;
    Type *VT = ICA.getReturnType();
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT);
    if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
      return LT.first * ST->getMVEVectorCostFactor(CostKind);
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
    // Check for the legal types, with the correct subtarget features.
    if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
        (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
        (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
      return LT.first;

    // Equally for MVE vector types
    if (ST->hasMVEFloatOps() &&
        (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
        LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
      return LT.first * ST->getMVEVectorCostFactor(CostKind);

    // Otherwise we use a legal convert followed by a min+max
    if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
         (ST->hasFP64() && LT.second == MVT::f64) ||
         (ST->hasFullFP16() && LT.second == MVT::f16) ||
         (ST->hasMVEFloatOps() &&
          (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
                                      LT.second.getScalarSizeInBits());
      InstructionCost Cost =
          LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
      IntrinsicCostAttributes Attrs1(IsSigned ?
Intrinsic::smin
                                              : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
                                              : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Return true if a call to \p F will be lowered to an actual call (libcall)
/// rather than expanded inline. Non-intrinsics defer to the base; for
/// intrinsics the answer depends on which FP/MVE subtarget features are
/// available.
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);

  // Assume all Arm-specific intrinsics map to an instruction.
  if (F->getName().startswith("llvm.arm"))
    return false;

  switch (F->getIntrinsicID()) {
  default: break;
  // Transcendental-style intrinsics: always lowered to libcalls.
  case Intrinsic::powi:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::pow:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::exp:
  case Intrinsic::exp2:
    return true;
  // These can be handled by hardware instructions, provided the subtarget
  // supports the required floating-point width.
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::canonicalize:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
      return true;
    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
      return true;
    // Some operations can be handled by vector instructions and assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO Handle scalar operations properly.
    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  // Masked memory intrinsics are instructions only when MVE is available.
  case Intrinsic::masked_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_gather:
  case Intrinsic::masked_scatter:
    return !ST->hasMVEIntegerOps();
  // Overflow-checking and saturating arithmetic never become libcalls.
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::usub_sat:
    return false;
  }

  return BaseT::isLoweredToCall(F);
}

/// Conservatively determine whether instruction \p I may end up being lowered
/// to a (library) call. Used by isHardwareLoopProfitable below, since a call
/// inside the loop clobbers LR and defeats low-overhead loops.
bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) {
  unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
  EVT VT = TLI->getValueType(DL, I.getType(), true);
  if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
    return true;

  // Check if an intrinsic will be lowered to a call and assume that any
  // other CallInst will generate a bl.
  if (auto *Call = dyn_cast<CallInst>(&I)) {
    if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
      switch(II->getIntrinsicID()) {
        // mem* intrinsics are only calls when they cannot be inlined
        // (getNumMemOps reports -1 in that case).
        case Intrinsic::memcpy:
        case Intrinsic::memset:
        case Intrinsic::memmove:
          return getNumMemOps(II) == -1;
        default:
          if (const Function *F = Call->getCalledFunction())
            return isLoweredToCall(F);
      }
    }
    return true;
  }

  // FPv5 provides conversions between integer, double-precision,
  // single-precision, and half-precision formats.
  switch (I.getOpcode()) {
  default:
    break;
  case Instruction::FPToSI:
  case Instruction::FPToUI:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::FPTrunc:
  case Instruction::FPExt:
    return !ST->hasFPARMv8Base();
  }

  // FIXME: Unfortunately the approach of checking the Operation Action does
  // not catch all cases of Legalization that use library calls. Our
  // Legalization step categorizes some transformations into library calls as
  // Custom, Expand or even Legal when doing type legalization. So for now
  // we have to special case for instance the SDIV of 64bit integers and the
  // use of floating point emulation.
  if (VT.isInteger() && VT.getSizeInBits() >= 64) {
    switch (ISD) {
    default:
      break;
    case ISD::SDIV:
    case ISD::UDIV:
    case ISD::SREM:
    case ISD::UREM:
    case ISD::SDIVREM:
    case ISD::UDIVREM:
      return true;
    }
  }

  // Assume all other non-float operations are supported.
  if (!VT.isFloatingPoint())
    return false;

  // We'll need a library call to handle most floats when using soft.
  if (TLI->useSoftFloat()) {
    switch (I.getOpcode()) {
    default:
      return true;
    // Pure data movement / selection needs no FP arithmetic, so no libcall.
    case Instruction::Alloca:
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::Select:
    case Instruction::PHI:
      return false;
    }
  }

  // We'll need a libcall to perform double precision operations on a single
  // precision only FPU.
  if (I.getType()->isDoubleTy() && !ST->hasFP64())
    return true;

  // Likewise for half precision arithmetic.
  if (I.getType()->isHalfTy() && !ST->hasFullFP16())
    return true;

  return false;
}

/// Decide whether loop \p L can profitably be converted to a v8.1-M
/// low-overhead hardware loop, filling in \p HWLoopInfo on success. Rejects
/// loops without a computable 32-bit trip count and loops containing
/// instructions that may clobber LR (calls, inline asm).
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  // Low-overhead branches are only supported in the 'low-overhead branch'
  // extension of v8.1-m.
  if (!ST->hasLOB() || DisableLowOverheadLoops) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
    return false;
  }

  if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
    return false;
  }

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
    return false;
  }

  // Trip count = backedge-taken count + 1.
  const SCEV *TripCountSCEV =
    SE.getAddExpr(BackedgeTakenCount,
                  SE.getOne(BackedgeTakenCount->getType()));

  // We need to store the trip count in LR, a 32-bit register.
  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
    LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
    return false;
  }

  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
  // point in generating a hardware loop if that's going to happen.

  auto IsHardwareLoopIntrinsic = [](Instruction &I) {
    if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
      switch (Call->getIntrinsicID()) {
      default:
        break;
      case Intrinsic::start_loop_iterations:
      case Intrinsic::test_start_loop_iterations:
      case Intrinsic::loop_decrement:
      case Intrinsic::loop_decrement_reg:
        return true;
      }
    }
    return false;
  };

  // Scan the instructions to see if there's any that we know will turn into a
  // call or if this loop is already a low-overhead loop or will become a tail
  // predicated loop.
  bool IsTailPredLoop = false;
  auto ScanLoop = [&](Loop *L) {
    for (auto *BB : L->getBlocks()) {
      for (auto &I : *BB) {
        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
            isa<InlineAsm>(I)) {
          LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
          return false;
        }
        // Remember whether the loop already uses tail-predication
        // intrinsics; that disables the WLS entry test below.
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          IsTailPredLoop |=
              II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
              II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
      }
    }
    return true;
  };

  // Visit inner loops.
  for (auto Inner : *L)
    if (!ScanLoop(Inner))
      return false;

  if (!ScanLoop(L))
    return false;

  // TODO: Check whether the trip count calculation is expensive. If L is the
  // inner loop but we know it has a low trip count, calculating that trip
  // count (in the parent loop) may be detrimental.

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CounterInReg = true;
  HWLoopInfo.IsNestingLegal = false;
  HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
  HWLoopInfo.CountType = Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

/// Return true if instruction \p I is allowed inside a tail-predicated (MVE)
/// loop body. \p ICmpCount accumulates the number of icmp-like instructions
/// seen so far across the loop; more than one means a compare beyond the
/// backedge condition, which is rejected.
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // We don't allow icmp's, and because we only look at single block loops,
  // we simply count the icmps, i.e. there should only be 1 for the backedge.
  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
    return false;
  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
  // not currently canonical, but soon will be. Code without them uses icmp, and
  // so is not tail predicated as per the condition above. In order to get the
  // same performance we treat min and max the same as an icmp for tailpred
  // purposes for the moment (we often rely on non-tailpred and higher VF's to
  // pick more optimal instructions like VQDMULH. They need to be recognized
  // directly by the vectorizer).
  if (auto *II = dyn_cast<IntrinsicInst>(&I))
    if ((II->getIntrinsicID() == Intrinsic::smin ||
         II->getIntrinsicID() == Intrinsic::smax ||
         II->getIntrinsicID() == Intrinsic::umin ||
         II->getIntrinsicID() == Intrinsic::umax) &&
        ++ICmpCount > 1)
      return false;

  if (isa<FCmpInst>(&I))
    return false;

  // We could allow extending/narrowing FP loads/stores, but codegen is
  // too inefficient so reject this for now.
  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
    return false;

  // Extends have to be extending-loads
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
      return false;

  // Truncs have to be narrowing-stores
  if (isa<TruncInst>(&I) )
    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
      return false;

  return true;
}

// To set up a tail-predicated loop, we need to know the total number of
// elements processed by that loop. Thus, we need to determine the element
// size and:
// 1) it should be uniform for all operations in the vector loop, so we
//    e.g.
don't want any widening/narrowing operations. 2106480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations 2107480093f4SDimitry Andric // that work on i64s. 2108480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the 2109480093f4SDimitry Andric // tail-predication masks/predicates the right lanes. 2110480093f4SDimitry Andric // 2111480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 2112480093f4SDimitry Andric const DataLayout &DL, 2113480093f4SDimitry Andric const LoopAccessInfo *LAI) { 21145ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); 21155ffd83dbSDimitry Andric 2116e8d8bef9SDimitry Andric // If there are live-out values, it is probably a reduction. We can predicate 2117e8d8bef9SDimitry Andric // most reduction operations freely under MVE using a combination of 2118e8d8bef9SDimitry Andric // prefer-predicated-reduction-select and inloop reductions. We limit this to 2119e8d8bef9SDimitry Andric // floating point and integer reductions, but don't check for operators 2120e8d8bef9SDimitry Andric // specifically here. If the value ends up not being a reduction (and so the 2121e8d8bef9SDimitry Andric // vectorizer cannot tailfold the loop), we should fall back to standard 2122e8d8bef9SDimitry Andric // vectorization automatically. 
21235ffd83dbSDimitry Andric SmallVector< Instruction *, 8 > LiveOuts; 21245ffd83dbSDimitry Andric LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); 2125e8d8bef9SDimitry Andric bool ReductionsDisabled = 21265ffd83dbSDimitry Andric EnableTailPredication == TailPredication::EnabledNoReductions || 21275ffd83dbSDimitry Andric EnableTailPredication == TailPredication::ForceEnabledNoReductions; 21285ffd83dbSDimitry Andric 21295ffd83dbSDimitry Andric for (auto *I : LiveOuts) { 2130e8d8bef9SDimitry Andric if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && 2131e8d8bef9SDimitry Andric !I->getType()->isHalfTy()) { 2132e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " 21335ffd83dbSDimitry Andric "live-out value\n"); 21345ffd83dbSDimitry Andric return false; 21355ffd83dbSDimitry Andric } 2136e8d8bef9SDimitry Andric if (ReductionsDisabled) { 2137e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); 21385ffd83dbSDimitry Andric return false; 21395ffd83dbSDimitry Andric } 21405ffd83dbSDimitry Andric } 21415ffd83dbSDimitry Andric 21425ffd83dbSDimitry Andric // Next, check that all instructions can be tail-predicated. 
2143480093f4SDimitry Andric PredicatedScalarEvolution PSE = LAI->getPSE(); 21445ffd83dbSDimitry Andric SmallVector<Instruction *, 16> LoadStores; 2145480093f4SDimitry Andric int ICmpCount = 0; 2146480093f4SDimitry Andric 2147480093f4SDimitry Andric for (BasicBlock *BB : L->blocks()) { 2148480093f4SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) { 2149480093f4SDimitry Andric if (isa<PHINode>(&I)) 2150480093f4SDimitry Andric continue; 2151480093f4SDimitry Andric if (!canTailPredicateInstruction(I, ICmpCount)) { 2152480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); 2153480093f4SDimitry Andric return false; 2154480093f4SDimitry Andric } 2155480093f4SDimitry Andric 2156480093f4SDimitry Andric Type *T = I.getType(); 2157480093f4SDimitry Andric if (T->getScalarSizeInBits() > 32) { 2158480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); 2159480093f4SDimitry Andric return false; 2160480093f4SDimitry Andric } 2161480093f4SDimitry Andric if (isa<StoreInst>(I) || isa<LoadInst>(I)) { 2162349cc55cSDimitry Andric Value *Ptr = getLoadStorePointerOperand(&I); 2163349cc55cSDimitry Andric Type *AccessTy = getLoadStoreType(&I); 2164349cc55cSDimitry Andric int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L); 2165e8d8bef9SDimitry Andric if (NextStride == 1) { 2166480093f4SDimitry Andric // TODO: for now only allow consecutive strides of 1. We could support 2167e8d8bef9SDimitry Andric // other strides as long as it is uniform, but let's keep it simple 2168e8d8bef9SDimitry Andric // for now. 
2169e8d8bef9SDimitry Andric continue; 2170e8d8bef9SDimitry Andric } else if (NextStride == -1 || 2171e8d8bef9SDimitry Andric (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || 2172e8d8bef9SDimitry Andric (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { 2173e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() 2174e8d8bef9SDimitry Andric << "Consecutive strides of 2 found, vld2/vstr2 can't " 2175e8d8bef9SDimitry Andric "be tail-predicated\n."); 2176e8d8bef9SDimitry Andric return false; 2177e8d8bef9SDimitry Andric // TODO: don't tail predicate if there is a reversed load? 2178e8d8bef9SDimitry Andric } else if (EnableMaskedGatherScatters) { 2179e8d8bef9SDimitry Andric // Gather/scatters do allow loading from arbitrary strides, at 2180e8d8bef9SDimitry Andric // least if they are loop invariant. 2181e8d8bef9SDimitry Andric // TODO: Loop variant strides should in theory work, too, but 2182e8d8bef9SDimitry Andric // this requires further testing. 2183349cc55cSDimitry Andric const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr); 2184e8d8bef9SDimitry Andric if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { 2185e8d8bef9SDimitry Andric const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); 2186e8d8bef9SDimitry Andric if (PSE.getSE()->isLoopInvariant(Step, L)) 2187480093f4SDimitry Andric continue; 2188480093f4SDimitry Andric } 2189e8d8bef9SDimitry Andric } 2190e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Bad stride found, can't " 2191480093f4SDimitry Andric "tail-predicate\n."); 2192480093f4SDimitry Andric return false; 2193480093f4SDimitry Andric } 2194480093f4SDimitry Andric } 2195480093f4SDimitry Andric } 2196480093f4SDimitry Andric 2197480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); 2198480093f4SDimitry Andric return true; 2199480093f4SDimitry Andric } 2200480093f4SDimitry Andric 2201fcaf7f86SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue( 2202fcaf7f86SDimitry Andric Loop *L, LoopInfo *LI, ScalarEvolution 
    &SE, AssumptionCache &AC,
    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
  // Bail out early if tail-predication has been disabled on the command line.
  if (!EnableTailPredication) {
    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
    return false;
  }

  // Creating a predicated vector loop is the first step for generating a
  // tail-predicated hardware loop, for which we need the MVE masked
  // load/stores instructions:
  if (!ST->hasMVEIntegerOps())
    return false;

  // For now, restrict this to single block loops.
  if (L->getNumBlocks() > 1) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
                         "loop.\n");
    return false;
  }

  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");

  // The loop must also be a viable hardware-loop candidate; first check its
  // control flow can be analyzed at all.
  HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(*LI)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "analyzable.\n");
    return false;
  }

  // This checks if we have the low-overhead branch architecture
  // extension, and if we will create a hardware-loop:
  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "profitable.\n");
    return false;
  }

  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
                         "a candidate.\n");
    return false;
  }

  // All cheap structural checks passed; now inspect every instruction in the
  // loop body to decide whether it can actually be tail-predicated.
  return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
}

// Report which style of predication the vectorizer should use for
// tail-folding: data predication via @llvm.get.active.lane.mask when MVE and
// tail-predication are enabled, otherwise none.
PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
  if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
    return PredicationStyle::None;

  // Intrinsic @llvm.get.active.lane.mask is supported.
  // It is used in the MVETailPredication pass, which requires the number of
  // elements processed by this vector loop to setup the tail-predicated
  // loop.
  return PredicationStyle::Data;
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  // Enable Upper bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Only currently enable these preferences for M-Class cores.
  if (!ST->isMClass())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining. While scanning, also accumulate an estimated size/latency cost
  // of the loop body, used further down to decide whether to force unrolling.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Don't unroll vectorised loop. MVE does not benefit from it as much as
      // scalar code.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        // Intrinsics that are not lowered to an actual call are acceptable;
        // anything that becomes a real call blocks unrolling.
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value*, 4> Operands(I.operand_values());
      Cost +=
        getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  // On v6m cores, there are very few registers available. We can easily end up
  // spilling and reloading more registers in an unrolled loop. Look at the
  // number of LCSSA phis as a rough measure of how many registers will need to
  // be live out of the loop, reducing the default unroll count if more than 1
  // value is needed. In the long run, all of this should be being learnt by a
  // machine.
  unsigned UnrollCount = 4;
  if (ST->isThumb1Only()) {
    unsigned ExitingValues = 0;
    SmallVector<BasicBlock *, 4> ExitBlocks;
    L->getExitBlocks(ExitBlocks);
    for (auto *Exit : ExitBlocks) {
      // Count the number of LCSSA phis. Exclude values coming from GEP's as
      // only the last is expected to be needed for address operands.
      unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
        return PH.getNumOperands() != 1 ||
               !isa<GetElementPtrInst>(PH.getOperand(0));
      });
      ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
    }
    // Scale the unroll count down by the number of live-out values, and give
    // up on unrolling altogether if that leaves a count of at most 1.
    if (ExitingValues)
      UnrollCount /= ExitingValues;
    if (UnrollCount <= 1)
      return;
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
  LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.DefaultUnrollRuntimeCount = UnrollCount;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
23570b57cec5SDimitry Andric if (Cost < 12) 23580b57cec5SDimitry Andric UP.Force = true; 23590b57cec5SDimitry Andric } 23608bcb0991SDimitry Andric 23615ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 23625ffd83dbSDimitry Andric TTI::PeelingPreferences &PP) { 23635ffd83dbSDimitry Andric BaseT::getPeelingPreferences(L, SE, PP); 23645ffd83dbSDimitry Andric } 23655ffd83dbSDimitry Andric 2366e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, 2367e8d8bef9SDimitry Andric TTI::ReductionFlags Flags) const { 2368e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps()) 2369e8d8bef9SDimitry Andric return false; 2370e8d8bef9SDimitry Andric 2371e8d8bef9SDimitry Andric unsigned ScalarBits = Ty->getScalarSizeInBits(); 2372e8d8bef9SDimitry Andric switch (Opcode) { 2373e8d8bef9SDimitry Andric case Instruction::Add: 2374e8d8bef9SDimitry Andric return ScalarBits <= 64; 2375e8d8bef9SDimitry Andric default: 2376e8d8bef9SDimitry Andric return false; 2377e8d8bef9SDimitry Andric } 2378e8d8bef9SDimitry Andric } 2379e8d8bef9SDimitry Andric 2380e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect( 2381e8d8bef9SDimitry Andric unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { 2382e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps()) 2383e8d8bef9SDimitry Andric return false; 2384e8d8bef9SDimitry Andric return true; 2385e8d8bef9SDimitry Andric } 2386