//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "armtti"

// Command-line switch: allow masked load/store generation (on by default).
static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

// Command-line switch: disable low-overhead (LOB) loop generation.
static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

// Command-line switch: allow while-loop-start (WLS) loop generation.
static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

// Flags defined in other ARM backend translation units.
extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  // The vld1 intrinsic carries its alignment as a constant-int operand; we
  // can only reason about it when it really is a compile-time constant.
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  // Use the larger of the alignment declared on the intrinsic and the
  // alignment already known for the pointer (MemAlign).
  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
                           ? MemAlign
                           : IntrAlign->getLimitedValue();

  // A load's alignment must be a power of two.
  if (!isPowerOf2_32(Alignment))
    return nullptr;

  // Rewrite as an ordinary aligned load through a pointer to the result type.
  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

/// Decide whether Callee may be inlined into Caller by comparing the
/// subtarget feature bits of the two functions.
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // To inline a callee, all features not in the allowed list must match exactly.
  bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
                    (CalleeBits & ~InlineFeaturesAllowed);
  // For features in the allowed list, the callee's features must be a subset of
  // the callers'.
  bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                     (CalleeBits & InlineFeaturesAllowed);
  return MatchExact && MatchSubset;
}

/// Return the addressing mode that loop strength reduction should prefer
/// for loops on this subtarget.
TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const {
  // MVE targets prefer post-incremented addressing.
  if (ST->hasMVEIntegerOps())
    return TTI::AMK_PostIndexed;

  // When optimizing for size, do not favour indexed forms.
  if (L->getHeader()->getParent()->hasOptSize())
    return TTI::AMK_None;

  // Single-block Thumb2 M-class loops: prefer pre-indexed addressing.
  if (ST->isMClass() && ST->isThumb2() &&
      L->getNumBlocks() == 1)
    return TTI::AMK_PreIndexed;

  return TTI::AMK_None;
}

/// Target-specific InstCombine folds for ARM NEON/MVE intrinsics.
/// Returns the replacement instruction, or None to fall back to the
/// generic handling.
Optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  using namespace PatternMatch;
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::arm_neon_vld1: {
    // Try to turn a vld1 of a sufficiently aligned pointer into a plain load
    // (see simplifyNeonVld1 above).
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    // For the multi-register NEON load/store intrinsics the alignment is the
    // last argument; raise it to the alignment we can prove for the pointer.
    Align MemAlign =
        getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                          &IC.getAssumptionCache(), &IC.getDominatorTree());
    unsigned AlignArg = II.getNumArgOperands() - 1;
    Value *AlignArgOp = II.getArgOperand(AlignArg);
    MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
    if (Align && *Align < MemAlign) {
      return IC.replaceOperand(
          II, AlignArg,
          ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
                           false));
    }
    break;
  }

  case Intrinsic::arm_mve_pred_i2v: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    // i2v(v2i(x)) == x when the types agree.
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                       PatternMatch::m_Value(ArgArg))) &&
        II.getType() == ArgArg->getType()) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    Constant *XorMask;
    // i2v(v2i(x) ^ 0xffff) == ~x: the xor with an all-ones 16-bit mask flips
    // every predicate lane, which is an xor with an all-true vector.
    if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
                             PatternMatch::m_Value(ArgArg)),
                         PatternMatch::m_Constant(XorMask))) &&
        II.getType() == ArgArg->getType()) {
      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
        if (CI->getValue().trunc(16).isAllOnesValue()) {
          auto TrueVector = IC.Builder.CreateVectorSplat(
              cast<FixedVectorType>(II.getType())->getNumElements(),
              IC.Builder.getTrue());
          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
        }
      }
    }
    // Only the low 16 bits of the scalar predicate are meaningful.
    KnownBits ScalarKnown(32);
    if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
                                ScalarKnown, 0)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_pred_v2i: {
    Value *Arg = II.getArgOperand(0);
    Value *ArgArg;
    // v2i(i2v(x)) == x.
    if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
                       PatternMatch::m_Value(ArgArg)))) {
      return IC.replaceInstUsesWith(II, ArgArg);
    }
    // Annotate the result with range metadata [0, 0x10000): the scalar
    // predicate only ever occupies the low 16 bits.
    if (!II.getMetadata(LLVMContext::MD_range)) {
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      Metadata *M[] = {
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
          ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))};
      II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vadc:
  case Intrinsic::arm_mve_vadc_predicated: {
    // The carry-in operand index differs between the plain and predicated
    // forms; only bit 29 of that operand is actually consumed.
    unsigned CarryOp =
        (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
    assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
           "Bad type for intrinsic!");

    KnownBits CarryKnown(32);
    if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
                                CarryKnown)) {
      return &II;
    }
    break;
  }
  case Intrinsic::arm_mve_vmldava: {
    // Fold  add(vmldava(..., 0, x, y), z)  into  vmldava(..., z, x, y):
    // when the existing accumulator (operand 3) is zero and the only use is
    // an add, the added value can become the accumulator instead.
    Instruction *I = cast<Instruction>(&II);
    if (I->hasOneUse()) {
      auto *User = cast<Instruction>(*I->user_begin());
      Value *OpZ;
      if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
          match(I->getOperand(3), m_Zero())) {
        Value *OpX = I->getOperand(4);
        Value *OpY = I->getOperand(5);
        Type *OpTy = OpX->getType();

        IC.Builder.SetInsertPoint(User);
        Value *V = IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
                                              {I->getOperand(0), I->getOperand(1),
                                               I->getOperand(2), OpZ, OpX, OpY});

        IC.replaceInstUsesWith(*User, V);
        return IC.eraseInstFromFunction(*User);
      }
    }
    return None;
  }
  }
  return None;
}

/// Cost (in instructions) of materialising the integer immediate Imm of
/// type Ty, depending on the instruction set in use (ARM/Thumb2/Thumb1).
InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  // Immediates wider than 64 bits (or of zero-sized types) are assumed to
  // need a full constant-pool style sequence.
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Imm.getActiveBits() >= 64)
    return 4;

  int64_t SImmVal = Imm.getSExtValue();
  uint64_t ZImmVal = Imm.getZExtValue();
  if (!ST->isThumb()) {
    // ARM mode: MOVW range, or an (inverted) shifter-operand immediate.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  if (ST->isThumb2()) {
    // Thumb2: MOVW range, or an (inverted) T2 modified immediate.
    if ((SImmVal >= 0 && SImmVal < 65536) ||
        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
      return 1;
    return ST->hasV6T2Ops() ? 2 : 3;
  }
  // Thumb1, any i8 imm cost 1.
  if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
    return 1;
  // MVN-able or shifted-immediate values take two instructions.
  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    return 2;
  // Load from constantpool.
  return 3;
}

// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) {
  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
    return 0;

  return 1;
}

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Imm must be the (negative,
// power-of-2) lower-bound constant of the smax side of the pattern.
static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
  Value *LHS, *RHS;
  ConstantInt *C;
  SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;

  // Look for smax(x, -2^k): the clamping low side of an SSAT.
  if (InstSPF == SPF_SMAX &&
      PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
      C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {

    // Matching smin(y, 2^k - 1): the clamping high side.
    auto isSSatMin = [&](Value *MinInst) {
      if (isa<SelectInst>(MinInst)) {
        Value *MinLHS, *MinRHS;
        ConstantInt *MinC;
        SelectPatternFlavor MinSPF =
            matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
        if (MinSPF == SPF_SMIN &&
            PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
            MinC->getValue() == ((-Imm) - 1))
          return true;
      }
      return false;
    };

    // The smin may either feed this smax (min(max(...))) or consume it
    // (max(min(...))); check operand and users.
    if (isSSatMin(Inst->getOperand(1)) ||
        (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) ||
                               isSSatMin(*(++Inst->user_begin())))))
      return true;
  }
  return false;
}

/// Cost of an integer immediate when used as operand Idx of an instruction
/// with the given Opcode. Returns 0 where the immediate can be folded into
/// the instruction (or handled better elsewhere), discouraging hoisting.
InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  // Division by a constant can be turned into multiplication, but only if we
  // know it's constant. So it's not so much that the immediate is cheap (it's
  // not), but that the alternative is worse.
  // FIXME: this is probably unneeded with GlobalISel.
  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
      Idx == 1)
    return 0;

  // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
  // splitting any large offsets.
  if (Opcode == Instruction::GetElementPtr && Idx != 0)
    return 0;

  if (Opcode == Instruction::And) {
    // UXTB/UXTH
    if (Imm == 255 || Imm == 65535)
      return 0;
    // Conversion to BIC is free, and means we can use ~Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(~Imm, Ty, CostKind));
  }

  if (Opcode == Instruction::Add)
    // Conversion to SUB is free, and means we can use -Imm instead.
    return std::min(getIntImmCost(Imm, Ty, CostKind),
                    getIntImmCost(-Imm, Ty, CostKind));

  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
      Ty->getIntegerBitWidth() == 32) {
    int64_t NegImm = -Imm.getSExtValue();
    if (ST->isThumb2() && NegImm < 1<<12)
      // icmp X, #-C -> cmn X, #C
      return 0;
    if (ST->isThumb() && NegImm < 1<<8)
      // icmp X, #-C -> adds X, #C
      return 0;
  }

  // xor a, -1 can always be folded to MVN
  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
    return 0;

  // Ensures negative constant of min(max()) or max(min()) patterns that
  // match to SSAT instructions don't get hoisted
  if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) ||
ST->isThumb2()) &&
      Ty->getIntegerBitWidth() <= 32) {
    // SSAT is available (ARM v6+, or Thumb2): treat the clamp constants as
    // free so they stay next to the compare/select they belong to.
    if (isSSATMinMaxPattern(Inst, Imm) ||
        (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
         isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
      return 0;
  }

  // Fall back to the plain materialisation cost of the immediate.
  return getIntImmCost(Imm, Ty, CostKind);
}

/// Cost of a control-flow instruction (branch, etc.) for this subtarget.
InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind == TTI::TCK_RecipThroughput &&
      (ST->hasNEON() || ST->hasMVEIntegerOps())) {
    // FIXME: The vectorizer is highly sensitive to the cost of these
    // instructions, which suggests that it may be using the costs incorrectly.
    // But, for now, just make them free to avoid performance regressions for
    // vector targets.
    return 0;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}

/// Cost of a cast (extend/truncate/fp-convert/...) from Src to Dst.
/// CCH describes the context (e.g. a cast fused with a (masked) load/store),
/// which frequently makes the cast free on ARM.
InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // TODO: Allow non-throughput costs that aren't binary.
  // For non-throughput cost kinds, collapse table costs to 0 (free) or 1.
  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
  // True if the element type is a scalar FP type the subtarget supports
  // natively (f32 with VFP2, f64 with FP64, f16 with FullFP16).
  auto IsLegalFPType = [this](EVT VT) {
    EVT EltVT = VT.getScalarType();
    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
           (EltVT == MVT::f64 && ST->hasFP64()) ||
           (EltVT == MVT::f16 && ST->hasFullFP16());
  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // Extending masked load/Truncating masked stores is expensive because we
  // currently don't split them. This means that we'll likely end up
  // loading/storing each element individually (hence the high cost).
  if ((ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
        Opcode == Instruction::SExt)) ||
      (ST->hasMVEFloatOps() &&
       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
      return 2 * DstTy.getVectorNumElements() *
             ST->getMVEVectorCostFactor(CostKind);

  // The extend of other kinds of load is free
  if (CCH == TTI::CastContextHint::Normal ||
      CCH == TTI::CastContextHint::Masked) {
    // Scalar extending loads: cost is the number of extra instructions needed
    // beyond the load itself (0 when the load can extend for free).
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
        {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
        {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
    };
    if (const auto *Entry = ConvertCostTableLookup(
            LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
      return AdjustCost(Entry->Cost);

    // MVE vector extending loads.
    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
        {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
        {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
        // The following extend from a legal type to an illegal type, so need to
        // split the load. This introduced an extra load operation, but the
        // extend is still "free".
        {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
        {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
        {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
        {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
        // FPExtends are similar but also require the VCVT instructions.
        {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    // The truncate of a store is free. This is the mirror of extends above.
    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
        {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
        {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
        {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
    };
    // Note: src/dst are swapped in the lookup for the store direction.
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }

    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
    }
  }

  // NEON vector operations that can extend their inputs.
  if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
      I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
    // An extend whose single user is one of these widening NEON ops is
    // folded into the op itself, hence cost 0.
    static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
      // vaddl
      { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
      // vsubl
      { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
      // vmull
      { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
      // vshll
      { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
      { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
    };

    auto *User = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
    if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
                                             DstTy.getSimpleVT(),
                                             SrcTy.getSimpleVT())) {
      return AdjustCost(Entry->Cost);
    }
  }

  // Single to/from double precision conversions.
  if (Src->isVectorTy() && ST->hasNEON() &&
      ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
        DstTy.getScalarType() == MVT::f32) ||
       (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
        DstTy.getScalarType() == MVT::f64))) {
    static const CostTblEntry NEONFltDblTbl[] = {
        // Vector fptrunc/fpext conversions.
        {ISD::FP_ROUND, MVT::v2f64, 2},
        {ISD::FP_EXTEND, MVT::v2f32, 2},
        {ISD::FP_EXTEND, MVT::v4f32, 4}};

    // Scale by the legalisation count for types that need splitting.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
      return AdjustCost(LT.first * Entry->Cost);
  }

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32,
MVT::v16i8, 6 }, 6020b57cec5SDimitry Andric 6030b57cec5SDimitry Andric // Operations that we legalize using splitting. 6040b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, 6050b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, 6060b57cec5SDimitry Andric 6070b57cec5SDimitry Andric // Vector float <-> i32 conversions. 6080b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 6090b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, 6100b57cec5SDimitry Andric 6110b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 6120b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, 6130b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 6140b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, 6150b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 6160b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, 6170b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 6180b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, 6190b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 6200b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, 6210b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 6220b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, 6230b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 6240b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, 6250b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 6260b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, 6270b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 6280b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, 6290b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 6300b57cec5SDimitry Andric { 
ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, 6310b57cec5SDimitry Andric 6320b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, 6330b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, 6340b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, 6350b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, 6360b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, 6370b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, 6380b57cec5SDimitry Andric 6390b57cec5SDimitry Andric // Vector double <-> i32 conversions. 6400b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 6410b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 6420b57cec5SDimitry Andric 6430b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 6440b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, 6450b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 6460b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, 6470b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 6480b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, 6510b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, 6520b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, 6530b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, 6540b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, 6550b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } 6560b57cec5SDimitry Andric }; 6570b57cec5SDimitry Andric 6580b57cec5SDimitry Andric if (SrcTy.isVector() && ST->hasNEON()) { 6590b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, 6600b57cec5SDimitry Andric DstTy.getSimpleVT(), 
6610b57cec5SDimitry Andric SrcTy.getSimpleVT())) 6625ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 6630b57cec5SDimitry Andric } 6640b57cec5SDimitry Andric 6650b57cec5SDimitry Andric // Scalar float to integer conversions. 6660b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { 6670b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, 6680b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, 6690b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, 6700b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, 6710b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, 6720b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, 6730b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, 6740b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, 6750b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, 6760b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, 6770b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, 6780b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, 6790b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, 6800b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, 6810b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, 6820b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, 6830b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, 6840b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, 6850b57cec5SDimitry Andric { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, 6860b57cec5SDimitry Andric { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } 6870b57cec5SDimitry Andric }; 6880b57cec5SDimitry Andric if (SrcTy.isFloatingPoint() && ST->hasNEON()) { 6890b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, 6900b57cec5SDimitry Andric DstTy.getSimpleVT(), 
6910b57cec5SDimitry Andric SrcTy.getSimpleVT())) 6925ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 6930b57cec5SDimitry Andric } 6940b57cec5SDimitry Andric 6950b57cec5SDimitry Andric // Scalar integer to float conversions. 6960b57cec5SDimitry Andric static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { 6970b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, 6980b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, 6990b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, 7000b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, 7010b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, 7020b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, 7030b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, 7040b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, 7050b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, 7060b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, 7070b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, 7080b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, 7090b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, 7100b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, 7110b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, 7120b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, 7130b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, 7140b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, 7150b57cec5SDimitry Andric { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, 7160b57cec5SDimitry Andric { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } 7170b57cec5SDimitry Andric }; 7180b57cec5SDimitry Andric 7190b57cec5SDimitry Andric if (SrcTy.isInteger() && ST->hasNEON()) { 7200b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, 7210b57cec5SDimitry Andric 
ISD, DstTy.getSimpleVT(), 7220b57cec5SDimitry Andric SrcTy.getSimpleVT())) 7235ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 7240b57cec5SDimitry Andric } 7250b57cec5SDimitry Andric 7268bcb0991SDimitry Andric // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one 7278bcb0991SDimitry Andric // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext 7288bcb0991SDimitry Andric // are linearised so take more. 7298bcb0991SDimitry Andric static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { 7308bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 7318bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, 7328bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 7338bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, 7348bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, 7358bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, 7368bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 7378bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, 7388bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, 7398bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, 7408bcb0991SDimitry Andric { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, 7418bcb0991SDimitry Andric { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, 7428bcb0991SDimitry Andric }; 7438bcb0991SDimitry Andric 7448bcb0991SDimitry Andric if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { 7458bcb0991SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, 7468bcb0991SDimitry Andric ISD, DstTy.getSimpleVT(), 7478bcb0991SDimitry Andric SrcTy.getSimpleVT())) 748fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); 7495ffd83dbSDimitry Andric } 7505ffd83dbSDimitry Andric 7515ffd83dbSDimitry Andric if (ISD == ISD::FP_ROUND || ISD == 
ISD::FP_EXTEND) { 7525ffd83dbSDimitry Andric // As general rule, fp converts that were not matched above are scalarized 7535ffd83dbSDimitry Andric // and cost 1 vcvt for each lane, so long as the instruction is available. 7545ffd83dbSDimitry Andric // If not it will become a series of function calls. 755fe6060f1SDimitry Andric const InstructionCost CallCost = 756fe6060f1SDimitry Andric getCallInstrCost(nullptr, Dst, {Src}, CostKind); 7575ffd83dbSDimitry Andric int Lanes = 1; 7585ffd83dbSDimitry Andric if (SrcTy.isFixedLengthVector()) 7595ffd83dbSDimitry Andric Lanes = SrcTy.getVectorNumElements(); 7605ffd83dbSDimitry Andric 761e8d8bef9SDimitry Andric if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) 7625ffd83dbSDimitry Andric return Lanes; 7635ffd83dbSDimitry Andric else 7645ffd83dbSDimitry Andric return Lanes * CallCost; 7658bcb0991SDimitry Andric } 7668bcb0991SDimitry Andric 767e8d8bef9SDimitry Andric if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && 768e8d8bef9SDimitry Andric SrcTy.isFixedLengthVector()) { 769e8d8bef9SDimitry Andric // Treat a truncate with larger than legal source (128bits for MVE) as 770e8d8bef9SDimitry Andric // expensive, 2 instructions per lane. 771e8d8bef9SDimitry Andric if ((SrcTy.getScalarType() == MVT::i8 || 772e8d8bef9SDimitry Andric SrcTy.getScalarType() == MVT::i16 || 773e8d8bef9SDimitry Andric SrcTy.getScalarType() == MVT::i32) && 774e8d8bef9SDimitry Andric SrcTy.getSizeInBits() > 128 && 775e8d8bef9SDimitry Andric SrcTy.getSizeInBits() > DstTy.getSizeInBits()) 776e8d8bef9SDimitry Andric return SrcTy.getVectorNumElements() * 2; 777e8d8bef9SDimitry Andric } 778e8d8bef9SDimitry Andric 7790b57cec5SDimitry Andric // Scalar integer conversion costs. 7800b57cec5SDimitry Andric static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { 7810b57cec5SDimitry Andric // i16 -> i64 requires two dependent operations. 
7820b57cec5SDimitry Andric { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, 7830b57cec5SDimitry Andric 7840b57cec5SDimitry Andric // Truncates on i64 are assumed to be free. 7850b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, 7860b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, 7870b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, 7880b57cec5SDimitry Andric { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } 7890b57cec5SDimitry Andric }; 7900b57cec5SDimitry Andric 7910b57cec5SDimitry Andric if (SrcTy.isInteger()) { 7920b57cec5SDimitry Andric if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, 7930b57cec5SDimitry Andric DstTy.getSimpleVT(), 7940b57cec5SDimitry Andric SrcTy.getSimpleVT())) 7955ffd83dbSDimitry Andric return AdjustCost(Entry->Cost); 7960b57cec5SDimitry Andric } 7970b57cec5SDimitry Andric 7988bcb0991SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 799fe6060f1SDimitry Andric ? ST->getMVEVectorCostFactor(CostKind) 8008bcb0991SDimitry Andric : 1; 8015ffd83dbSDimitry Andric return AdjustCost( 802e8d8bef9SDimitry Andric BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); 8030b57cec5SDimitry Andric } 8040b57cec5SDimitry Andric 805fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, 8060b57cec5SDimitry Andric unsigned Index) { 8070b57cec5SDimitry Andric // Penalize inserting into an D-subregister. We end up with a three times 8080b57cec5SDimitry Andric // lower estimated throughput on swift. 
8090b57cec5SDimitry Andric if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && 8100b57cec5SDimitry Andric ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) 8110b57cec5SDimitry Andric return 3; 8120b57cec5SDimitry Andric 8138bcb0991SDimitry Andric if (ST->hasNEON() && (Opcode == Instruction::InsertElement || 8140b57cec5SDimitry Andric Opcode == Instruction::ExtractElement)) { 8150b57cec5SDimitry Andric // Cross-class copies are expensive on many microarchitectures, 8160b57cec5SDimitry Andric // so assume they are expensive by default. 8175ffd83dbSDimitry Andric if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) 8180b57cec5SDimitry Andric return 3; 8190b57cec5SDimitry Andric 8200b57cec5SDimitry Andric // Even if it's not a cross class copy, this likely leads to mixing 8210b57cec5SDimitry Andric // of NEON and VFP code and should be therefore penalized. 8220b57cec5SDimitry Andric if (ValTy->isVectorTy() && 8230b57cec5SDimitry Andric ValTy->getScalarSizeInBits() <= 32) 824fe6060f1SDimitry Andric return std::max<InstructionCost>( 825fe6060f1SDimitry Andric BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); 8260b57cec5SDimitry Andric } 8270b57cec5SDimitry Andric 8288bcb0991SDimitry Andric if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || 8298bcb0991SDimitry Andric Opcode == Instruction::ExtractElement)) { 830fe6060f1SDimitry Andric // Integer cross-lane moves are more expensive than float, which can 831fe6060f1SDimitry Andric // sometimes just be vmovs. Integer involve being passes to GPR registers, 832fe6060f1SDimitry Andric // causing more of a delay. 833fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = 834fe6060f1SDimitry Andric getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType()); 835fe6060f1SDimitry Andric return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 
/// Compute the cost of a compare or select. Target-specific cases are handled
/// first: Thumb scalar selects when optimising for code size, compare+select
/// pairs that match a min/max/abs intrinsic, NEON vector selects (vbsl) and
/// MVE vector compares. Anything else falls through to the base cost model.
InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Thumb scalar code size cost for select.
  if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && ST->isThumb() &&
      !ValTy->isVectorTy()) {
    // Assume expensive structs.
    if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
      return TTI::TCC_Expensive;

    // Select costs can vary because they:
    // - may require one or more conditional mov (including an IT),
    // - can't operate directly on immediates,
    // - require live flags, which we can't copy around easily.
    InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first;

    // Possible IT instruction for Thumb2, or more for Thumb1.
    ++Cost;

    // i1 values may need rematerialising by using mov immediates and/or
    // flag setting instructions.
    if (ValTy->isIntegerTy(1))
      ++Cost;

    return Cost;
  }

  // If this is a vector min/max/abs, use the cost of that intrinsic directly
  // instead. Hopefully when min/max intrinsics are more prevalent this code
  // will not be needed.
  const Instruction *Sel = I;
  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
      Sel->hasOneUse())
    // Look through a compare at its single user, which may be the select.
    Sel = cast<Instruction>(Sel->user_back());
  if (Sel && ValTy->isVectorTy() &&
      (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
    const Value *LHS, *RHS;
    SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
    // Map the matched select-pattern flavor to the equivalent intrinsic.
    unsigned IID = 0;
    switch (SPF) {
    case SPF_ABS:
      IID = Intrinsic::abs;
      break;
    case SPF_SMIN:
      IID = Intrinsic::smin;
      break;
    case SPF_SMAX:
      IID = Intrinsic::smax;
      break;
    case SPF_UMIN:
      IID = Intrinsic::umin;
      break;
    case SPF_UMAX:
      IID = Intrinsic::umax;
      break;
    case SPF_FMINNUM:
      IID = Intrinsic::minnum;
      break;
    case SPF_FMAXNUM:
      IID = Intrinsic::maxnum;
      break;
    default:
      break;
    }
    if (IID) {
      // The ICmp is free, the select gets the cost of the min/max/etc.
      if (Sel != I)
        return 0;
      IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
      return getIntrinsicInstrCost(CostAttrs, CostKind);
    }
  }

  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
    // Lowering of some vector selects is currently far from perfect.
    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(DL, CondTy);
    EVT SelValTy = TLI->getValueType(DL, ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                                     SelCondTy.getSimpleVT(),
                                                     SelValTy.getSimpleVT()))
        return Entry->Cost;
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    return LT.first;
  }

  // MVE vector compares.
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
      (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
    FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
    FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
    if (!VecCondTy)
      VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));

    // If we don't have mve.fp any fp operations will need to be scalarized.
    if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
      // One scalarization insert, one scalarization extract and the cost of
      // the fcmps.
      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
                                    VecCondTy->getScalarType(), VecPred,
                                    CostKind, I);
    }

    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);
    int BaseCost = ST->getMVEVectorCostFactor(CostKind);
    // There are two types - the input that specifies the type of the compare
    // and the output vXi1 type. Because we don't know how the output will be
    // split, we may need an expensive shuffle to get two in sync. This has the
    // effect of making larger than legal compares (v8i32 for example)
    // expensive.
    if (LT.second.getVectorNumElements() > 2) {
      if (LT.first > 1)
        return LT.first * BaseCost +
               BaseT::getScalarizationOverhead(VecCondTy, true, false);
      return BaseCost;
    }
  }

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  return BaseCost *
         BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
This has the 967fe6060f1SDimitry Andric // effect of making larger than legal compares (v8i32 for example) 968fe6060f1SDimitry Andric // expensive. 969fe6060f1SDimitry Andric if (LT.second.getVectorNumElements() > 2) { 970fe6060f1SDimitry Andric if (LT.first > 1) 971fe6060f1SDimitry Andric return LT.first * BaseCost + 972fe6060f1SDimitry Andric BaseT::getScalarizationOverhead(VecCondTy, true, false); 973fe6060f1SDimitry Andric return BaseCost; 974fe6060f1SDimitry Andric } 975fe6060f1SDimitry Andric } 976fe6060f1SDimitry Andric 977e8d8bef9SDimitry Andric // Default to cheap (throughput/size of 1 instruction) but adjust throughput 978e8d8bef9SDimitry Andric // for "multiple beats" potentially needed by MVE instructions. 979e8d8bef9SDimitry Andric int BaseCost = 1; 980fe6060f1SDimitry Andric if (ST->hasMVEIntegerOps() && ValTy->isVectorTy()) 981fe6060f1SDimitry Andric BaseCost = ST->getMVEVectorCostFactor(CostKind); 982e8d8bef9SDimitry Andric 983e8d8bef9SDimitry Andric return BaseCost * 984e8d8bef9SDimitry Andric BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); 9850b57cec5SDimitry Andric } 9860b57cec5SDimitry Andric 987fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, 988fe6060f1SDimitry Andric ScalarEvolution *SE, 9890b57cec5SDimitry Andric const SCEV *Ptr) { 9900b57cec5SDimitry Andric // Address computations in vectorized code with non-consecutive addresses will 9910b57cec5SDimitry Andric // likely result in more instructions compared to scalar code where the 9920b57cec5SDimitry Andric // computation can more often be merged into the index mode. The resulting 9930b57cec5SDimitry Andric // extra micro-ops can significantly decrease throughput. 
9940b57cec5SDimitry Andric unsigned NumVectorInstToHideOverhead = 10; 9950b57cec5SDimitry Andric int MaxMergeDistance = 64; 9960b57cec5SDimitry Andric 9978bcb0991SDimitry Andric if (ST->hasNEON()) { 9980b57cec5SDimitry Andric if (Ty->isVectorTy() && SE && 9990b57cec5SDimitry Andric !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) 10000b57cec5SDimitry Andric return NumVectorInstToHideOverhead; 10010b57cec5SDimitry Andric 10020b57cec5SDimitry Andric // In many cases the address computation is not merged into the instruction 10030b57cec5SDimitry Andric // addressing mode. 10040b57cec5SDimitry Andric return 1; 10050b57cec5SDimitry Andric } 10068bcb0991SDimitry Andric return BaseT::getAddressComputationCost(Ty, SE, Ptr); 10078bcb0991SDimitry Andric } 10088bcb0991SDimitry Andric 10095ffd83dbSDimitry Andric bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { 10105ffd83dbSDimitry Andric if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 10115ffd83dbSDimitry Andric // If a VCTP is part of a chain, it's already profitable and shouldn't be 10125ffd83dbSDimitry Andric // optimized, else LSR may block tail-predication. 
10135ffd83dbSDimitry Andric switch (II->getIntrinsicID()) { 10145ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp8: 10155ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp16: 10165ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp32: 10175ffd83dbSDimitry Andric case Intrinsic::arm_mve_vctp64: 10185ffd83dbSDimitry Andric return true; 10195ffd83dbSDimitry Andric default: 10205ffd83dbSDimitry Andric break; 10215ffd83dbSDimitry Andric } 10225ffd83dbSDimitry Andric } 10235ffd83dbSDimitry Andric return false; 10245ffd83dbSDimitry Andric } 10255ffd83dbSDimitry Andric 10265ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { 10278bcb0991SDimitry Andric if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) 10288bcb0991SDimitry Andric return false; 10298bcb0991SDimitry Andric 10305ffd83dbSDimitry Andric if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { 10318bcb0991SDimitry Andric // Don't support v2i1 yet. 10328bcb0991SDimitry Andric if (VecTy->getNumElements() == 2) 10338bcb0991SDimitry Andric return false; 10348bcb0991SDimitry Andric 10358bcb0991SDimitry Andric // We don't support extending fp types. 
10368bcb0991SDimitry Andric unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); 10378bcb0991SDimitry Andric if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) 10388bcb0991SDimitry Andric return false; 10398bcb0991SDimitry Andric } 10408bcb0991SDimitry Andric 10418bcb0991SDimitry Andric unsigned EltWidth = DataTy->getScalarSizeInBits(); 10425ffd83dbSDimitry Andric return (EltWidth == 32 && Alignment >= 4) || 10435ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); 10448bcb0991SDimitry Andric } 10450b57cec5SDimitry Andric 10465ffd83dbSDimitry Andric bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { 1047480093f4SDimitry Andric if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) 1048480093f4SDimitry Andric return false; 1049480093f4SDimitry Andric 1050480093f4SDimitry Andric // This method is called in 2 places: 1051480093f4SDimitry Andric // - from the vectorizer with a scalar type, in which case we need to get 1052480093f4SDimitry Andric // this as good as we can with the limited info we have (and rely on the cost 1053480093f4SDimitry Andric // model for the rest). 1054480093f4SDimitry Andric // - from the masked intrinsic lowering pass with the actual vector type. 1055480093f4SDimitry Andric // For MVE, we have a custom lowering pass that will already have custom 1056480093f4SDimitry Andric // legalised any gathers that we can to MVE intrinsics, and want to expand all 1057480093f4SDimitry Andric // the rest. The pass runs before the masked intrinsic lowering pass, so if we 1058480093f4SDimitry Andric // are here, we know we want to expand. 
1059480093f4SDimitry Andric if (isa<VectorType>(Ty)) 1060480093f4SDimitry Andric return false; 1061480093f4SDimitry Andric 1062480093f4SDimitry Andric unsigned EltWidth = Ty->getScalarSizeInBits(); 10635ffd83dbSDimitry Andric return ((EltWidth == 32 && Alignment >= 4) || 10645ffd83dbSDimitry Andric (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); 1065480093f4SDimitry Andric } 1066480093f4SDimitry Andric 1067e8d8bef9SDimitry Andric /// Given a memcpy/memset/memmove instruction, return the number of memory 1068e8d8bef9SDimitry Andric /// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a 1069e8d8bef9SDimitry Andric /// call is used. 1070e8d8bef9SDimitry Andric int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { 1071e8d8bef9SDimitry Andric MemOp MOp; 1072e8d8bef9SDimitry Andric unsigned DstAddrSpace = ~0u; 1073e8d8bef9SDimitry Andric unsigned SrcAddrSpace = ~0u; 1074e8d8bef9SDimitry Andric const Function *F = I->getParent()->getParent(); 10750b57cec5SDimitry Andric 1076e8d8bef9SDimitry Andric if (const auto *MC = dyn_cast<MemTransferInst>(I)) { 1077e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); 10780b57cec5SDimitry Andric // If 'size' is not a constant, a library call will be generated. 
10790b57cec5SDimitry Andric if (!C) 1080e8d8bef9SDimitry Andric return -1; 10810b57cec5SDimitry Andric 10820b57cec5SDimitry Andric const unsigned Size = C->getValue().getZExtValue(); 1083e8d8bef9SDimitry Andric const Align DstAlign = *MC->getDestAlign(); 1084e8d8bef9SDimitry Andric const Align SrcAlign = *MC->getSourceAlign(); 1085e8d8bef9SDimitry Andric 1086e8d8bef9SDimitry Andric MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, 1087e8d8bef9SDimitry Andric /*IsVolatile*/ false); 1088e8d8bef9SDimitry Andric DstAddrSpace = MC->getDestAddressSpace(); 1089e8d8bef9SDimitry Andric SrcAddrSpace = MC->getSourceAddressSpace(); 1090e8d8bef9SDimitry Andric } 1091e8d8bef9SDimitry Andric else if (const auto *MS = dyn_cast<MemSetInst>(I)) { 1092e8d8bef9SDimitry Andric ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); 1093e8d8bef9SDimitry Andric // If 'size' is not a constant, a library call will be generated. 1094e8d8bef9SDimitry Andric if (!C) 1095e8d8bef9SDimitry Andric return -1; 1096e8d8bef9SDimitry Andric 1097e8d8bef9SDimitry Andric const unsigned Size = C->getValue().getZExtValue(); 1098e8d8bef9SDimitry Andric const Align DstAlign = *MS->getDestAlign(); 1099e8d8bef9SDimitry Andric 1100e8d8bef9SDimitry Andric MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, 1101e8d8bef9SDimitry Andric /*IsZeroMemset*/ false, /*IsVolatile*/ false); 1102e8d8bef9SDimitry Andric DstAddrSpace = MS->getDestAddressSpace(); 1103e8d8bef9SDimitry Andric } 1104e8d8bef9SDimitry Andric else 1105e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!"); 1106e8d8bef9SDimitry Andric 1107e8d8bef9SDimitry Andric unsigned Limit, Factor = 2; 1108e8d8bef9SDimitry Andric switch(I->getIntrinsicID()) { 1109e8d8bef9SDimitry Andric case Intrinsic::memcpy: 1110e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); 1111e8d8bef9SDimitry Andric break; 1112e8d8bef9SDimitry Andric case Intrinsic::memmove: 1113e8d8bef9SDimitry Andric 
Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); 1114e8d8bef9SDimitry Andric break; 1115e8d8bef9SDimitry Andric case Intrinsic::memset: 1116e8d8bef9SDimitry Andric Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); 1117e8d8bef9SDimitry Andric Factor = 1; 1118e8d8bef9SDimitry Andric break; 1119e8d8bef9SDimitry Andric default: 1120e8d8bef9SDimitry Andric llvm_unreachable("Expected a memcpy/move or memset!"); 1121e8d8bef9SDimitry Andric } 11220b57cec5SDimitry Andric 11230b57cec5SDimitry Andric // MemOps will be poplulated with a list of data types that needs to be 11240b57cec5SDimitry Andric // loaded and stored. That's why we multiply the number of elements by 2 to 11250b57cec5SDimitry Andric // get the cost for this memcpy. 1126e8d8bef9SDimitry Andric std::vector<EVT> MemOps; 11270b57cec5SDimitry Andric if (getTLI()->findOptimalMemOpLowering( 1128e8d8bef9SDimitry Andric MemOps, Limit, MOp, DstAddrSpace, 1129e8d8bef9SDimitry Andric SrcAddrSpace, F->getAttributes())) 1130e8d8bef9SDimitry Andric return MemOps.size() * Factor; 11310b57cec5SDimitry Andric 11320b57cec5SDimitry Andric // If we can't find an optimal memop lowering, return the default cost 1133e8d8bef9SDimitry Andric return -1; 1134e8d8bef9SDimitry Andric } 1135e8d8bef9SDimitry Andric 1136fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { 1137e8d8bef9SDimitry Andric int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); 1138e8d8bef9SDimitry Andric 1139e8d8bef9SDimitry Andric // To model the cost of a library call, we assume 1 for the call, and 1140e8d8bef9SDimitry Andric // 3 for the argument setup. 
/// Return the cost of a vector shuffle of kind \p Kind on type \p Tp.
/// Broadcast, reverse and select shuffles have per-type tables for NEON;
/// broadcasts and VREV-style masks have dedicated costs for MVE. Everything
/// else falls back to the base implementation, scaled by the MVE cost factor
/// for MVE vector types.
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           int Index, VectorType *SubTp) {
  // A concrete mask may reveal a more specific (cheaper) shuffle kind.
  Kind = improveShuffleKindFromMask(Kind, Mask);
  if (ST->hasNEON()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry NEONDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Reverse) {
      static const CostTblEntry NEONShuffleTbl[] = {
          // Reverse shuffle cost one instruction if we are shuffling within a
          // double word (vrev) or two if we shuffle a quad word (vrev, vext).
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry =
              CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
    if (Kind == TTI::SK_Select) {
      static const CostTblEntry NEONSelShuffleTbl[] = {
          // Select shuffle cost table for ARM. Cost is the number of
          // instructions required to create the shuffled vector.

          {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
          {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                              ISD::VECTOR_SHUFFLE, LT.second))
        return LT.first * Entry->Cost;
    }
  }
  if (ST->hasMVEIntegerOps()) {
    if (Kind == TTI::SK_Broadcast) {
      static const CostTblEntry MVEDupTbl[] = {
          // VDUP handles these cases.
          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
          {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};

      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                              LT.second))
        return LT.first * Entry->Cost *
               ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput);
    }

    // Masks matching a VREV16/32/64 pattern get the single-instruction cost,
    // scaled by the MVE vector cost factor.
    if (!Mask.empty()) {
      std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
      if (Mask.size() <= LT.second.getVectorNumElements() &&
          (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
           isVREVMask(Mask, LT.second, 64)))
        return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first;
    }
  }

  // Otherwise defer to the base cost, multiplied by the MVE factor for MVE
  // vector types (MVE operations take multiple beats).
  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
                     ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                     : 1;
  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
/// Return the cost of an arithmetic instruction with opcode \p Opcode on type
/// \p Ty. Handles Thumb i1 predicate ops, NEON vector division via a cost
/// table, shifts that fold into a following instruction for free, and MVE
/// vector scaling; anything else defers to the base implementation.
InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
    // Make operations on i1 relatively expensive as this often involves
    // combining predicates. AND and XOR should be easier to handle with IT
    // blocks.
    switch (ISDOpcode) {
    default:
      break;
    case ISD::AND:
    case ISD::XOR:
      return 2;
    case ISD::OR:
      return 3;
    }
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->hasNEON()) {
    const unsigned FunctionCallDivCost = 20;
    const unsigned ReciprocalDivCost = 10;
    static const CostTblEntry CostTbl[] = {
      // Division.
      // These costs are somewhat random. Choose a cost of 20 to indicate that
      // vectorizing division (added function call) is going to be very
      // expensive.
      // Double registers types.
      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
      // Quad register types.
      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
      // Multiplication.
    };

    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
      return LT.first * Entry->Cost;

    InstructionCost Cost = BaseT::getArithmeticInstrCost(
        Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);

    // This is somewhat of a hack. The problem that we are facing is that SROA
    // creates a sequence of shift, and, or instructions to construct values.
    // These sequences are recognized by the ISel and have zero-cost. Not so
    // for the vectorized code. Because we have support for v2i64 but not i64
    // those sequences look particularly beneficial to vectorize.
    // To work around this we increase the cost of v2i64 operations to make
    // them seem less beneficial.
    if (LT.second == MVT::v2i64 &&
        Op2Info == TargetTransformInfo::OK_UniformConstantValue)
      Cost += 4;

    return Cost;
  }

  // If this operation is a shift on arm/thumb2, it might well be folded into
  // the following instruction, hence having a cost of 0.
  auto LooksLikeAFreeShift = [&]() {
    // Thumb1 has no flexible second operand; vector shifts don't fold.
    if (ST->isThumb1Only() || Ty->isVectorTy())
      return false;

    // Only a single-use shift by a uniform constant can fold.
    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
      return false;
    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
      return false;

    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::And:
    case Instruction::Xor:
    case Instruction::Or:
    case Instruction::ICmp:
      return true;
    default:
      return false;
    }
  };
  if (LooksLikeAFreeShift())
    return 0;

  // Default to cheap (throughput/size of 1 instruction) but adjust throughput
  // for "multiple beats" potentially needed by MVE instructions.
  int BaseCost = 1;
  if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
    BaseCost = ST->getMVEVectorCostFactor(CostKind);

  // The rest of this mostly follows what is done in
  // BaseT::getArithmeticInstrCost, without treating floats as more expensive
  // that scalars or increasing the costs for custom operations. The results
  // is also multiplied by the MVEVectorCostFactor where appropriate.
  if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
    return LT.first * BaseCost;

  // Else this is expand, assume that we need to scalarize this op.
  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
    unsigned Num = VTy->getNumElements();
    // Cost of the op on a single scalar element (recursive call).
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
    SmallVector<Type *> Tys(Args.size(), Ty);
    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
  }

  return BaseCost;
}
1384fe6060f1SDimitry Andric SmallVector<Type *> Tys(Args.size(), Ty); 1385fe6060f1SDimitry Andric return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; 13868bcb0991SDimitry Andric } 13878bcb0991SDimitry Andric 13888bcb0991SDimitry Andric return BaseCost; 13898bcb0991SDimitry Andric } 13908bcb0991SDimitry Andric 1391fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, 1392fe6060f1SDimitry Andric MaybeAlign Alignment, 1393fe6060f1SDimitry Andric unsigned AddressSpace, 13945ffd83dbSDimitry Andric TTI::TargetCostKind CostKind, 1395480093f4SDimitry Andric const Instruction *I) { 13965ffd83dbSDimitry Andric // TODO: Handle other cost kinds. 13975ffd83dbSDimitry Andric if (CostKind != TTI::TCK_RecipThroughput) 13985ffd83dbSDimitry Andric return 1; 13995ffd83dbSDimitry Andric 14005ffd83dbSDimitry Andric // Type legalization can't handle structs 14015ffd83dbSDimitry Andric if (TLI->getValueType(DL, Src, true) == MVT::Other) 14025ffd83dbSDimitry Andric return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 14035ffd83dbSDimitry Andric CostKind); 14040b57cec5SDimitry Andric 1405480093f4SDimitry Andric if (ST->hasNEON() && Src->isVectorTy() && 1406480093f4SDimitry Andric (Alignment && *Alignment != Align(16)) && 14075ffd83dbSDimitry Andric cast<VectorType>(Src)->getElementType()->isDoubleTy()) { 14080b57cec5SDimitry Andric // Unaligned loads/stores are extremely inefficient. 14090b57cec5SDimitry Andric // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. 1410fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); 14110b57cec5SDimitry Andric return LT.first * 4; 14120b57cec5SDimitry Andric } 14135ffd83dbSDimitry Andric 14145ffd83dbSDimitry Andric // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. 14155ffd83dbSDimitry Andric // Same for stores. 
14165ffd83dbSDimitry Andric if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && 14175ffd83dbSDimitry Andric ((Opcode == Instruction::Load && I->hasOneUse() && 14185ffd83dbSDimitry Andric isa<FPExtInst>(*I->user_begin())) || 14195ffd83dbSDimitry Andric (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { 14205ffd83dbSDimitry Andric FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); 14215ffd83dbSDimitry Andric Type *DstTy = 14225ffd83dbSDimitry Andric Opcode == Instruction::Load 14235ffd83dbSDimitry Andric ? (*I->user_begin())->getType() 14245ffd83dbSDimitry Andric : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); 14255ffd83dbSDimitry Andric if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && 14265ffd83dbSDimitry Andric DstTy->getScalarType()->isFloatTy()) 1427fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 14285ffd83dbSDimitry Andric } 14295ffd83dbSDimitry Andric 14308bcb0991SDimitry Andric int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() 1431fe6060f1SDimitry Andric ? 
ST->getMVEVectorCostFactor(CostKind) 14328bcb0991SDimitry Andric : 1; 14335ffd83dbSDimitry Andric return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 14345ffd83dbSDimitry Andric CostKind, I); 14350b57cec5SDimitry Andric } 14360b57cec5SDimitry Andric 1437fe6060f1SDimitry Andric InstructionCost 1438fe6060f1SDimitry Andric ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, 1439e8d8bef9SDimitry Andric unsigned AddressSpace, 1440e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) { 1441e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps()) { 1442e8d8bef9SDimitry Andric if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) 1443fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 1444e8d8bef9SDimitry Andric if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) 1445fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind); 1446e8d8bef9SDimitry Andric } 1447e8d8bef9SDimitry Andric if (!isa<FixedVectorType>(Src)) 1448e8d8bef9SDimitry Andric return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, 1449e8d8bef9SDimitry Andric CostKind); 1450e8d8bef9SDimitry Andric // Scalar cost, which is currently very high due to the efficiency of the 1451e8d8bef9SDimitry Andric // generated code. 
1452e8d8bef9SDimitry Andric return cast<FixedVectorType>(Src)->getNumElements() * 8; 1453e8d8bef9SDimitry Andric } 1454e8d8bef9SDimitry Andric 1455fe6060f1SDimitry Andric InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( 1456480093f4SDimitry Andric unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 14575ffd83dbSDimitry Andric Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 14585ffd83dbSDimitry Andric bool UseMaskForCond, bool UseMaskForGaps) { 14590b57cec5SDimitry Andric assert(Factor >= 2 && "Invalid interleave factor"); 14600b57cec5SDimitry Andric assert(isa<VectorType>(VecTy) && "Expect a vector type"); 14610b57cec5SDimitry Andric 14620b57cec5SDimitry Andric // vldN/vstN doesn't support vector types of i64/f64 element. 14630b57cec5SDimitry Andric bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; 14640b57cec5SDimitry Andric 14650b57cec5SDimitry Andric if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && 14660b57cec5SDimitry Andric !UseMaskForCond && !UseMaskForGaps) { 14675ffd83dbSDimitry Andric unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); 14685ffd83dbSDimitry Andric auto *SubVecTy = 14695ffd83dbSDimitry Andric FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); 14700b57cec5SDimitry Andric 14710b57cec5SDimitry Andric // vldN/vstN only support legal vector types of size 64 or 128 in bits. 14720b57cec5SDimitry Andric // Accesses having vector types that are a multiple of 128 bits can be 14730b57cec5SDimitry Andric // matched to more than one vldN/vstN instruction. 1474fe6060f1SDimitry Andric int BaseCost = 1475fe6060f1SDimitry Andric ST->hasMVEIntegerOps() ? 
/// Return the cost of an MVE gather/scatter. A vector cost (serialised
/// element accesses at the MVE cost factor) is used when the access can be
/// lowered to an MVE gather/scatter instruction; otherwise a much higher
/// scalarization cost is returned. The extend/truncate user analysis mirrors
/// what the MVE gather/scatter lowering pass accepts.
InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  using namespace PatternMatch;
  if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
  auto *VTy = cast<FixedVectorType>(DataTy);

  // TODO: Splitting, once we do that.

  unsigned NumElems = VTy->getNumElements();
  unsigned EltSize = VTy->getScalarSizeInBits();
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);

  // For now, it is assumed that for the MVE gather instructions the loads are
  // all effectively serialised. This means the cost is the scalar cost
  // multiplied by the number of elements being loaded. This is possibly very
  // conservative, but even so we still end up vectorising loops because the
  // cost per iteration for many loops is lower than for scalar loops.
  InstructionCost VectorCost =
      NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
  // The scalarization cost should be a lot higher. We use the number of vector
  // elements plus the scalarization overhead.
  InstructionCost ScalarCost =
      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
      BaseT::getScalarizationOverhead(VTy, false, true);

  // Sub-byte elements or under-aligned accesses cannot use the gather.
  if (EltSize < 8 || Alignment < EltSize / 8)
    return ScalarCost;

  unsigned ExtSize = EltSize;
  // Check whether there's a single user that asks for an extended type
  if (I != nullptr) {
    // Dependent of the caller of this function, a gather instruction will
    // either have opcode Instruction::Load or be a call to the masked_gather
    // intrinsic
    if ((I->getOpcode() == Instruction::Load ||
         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
        I->hasOneUse()) {
      const User *Us = *I->users().begin();
      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
        // only allow valid type combinations
        unsigned TypeSize =
            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
             (TypeSize == 16 && EltSize == 8)) &&
            TypeSize * NumElems == 128) {
          ExtSize = TypeSize;
        }
      }
    }
    // Check whether the input data needs to be truncated
    TruncInst *T;
    if ((I->getOpcode() == Instruction::Store ||
         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
      // Only allow valid type combinations
      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
      if (((EltSize == 16 && TypeSize == 32) ||
           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
          TypeSize * NumElems == 128)
        ExtSize = TypeSize;
    }
  }

  // Only 128-bit (extended) accesses with at least 4 lanes map to an MVE
  // gather/scatter.
  if (ExtSize * NumElems != 128 || NumElems < 4)
    return ScalarCost;

  // Any (aligned) i32 gather will not need to be scalarised.
  if (ExtSize == 32)
    return VectorCost;
  // For smaller types, we need to ensure that the gep's inputs are correctly
  // extended from a small enough value. Other sizes (including i64) are
  // scalarized for now.
  if (ExtSize != 8 && ExtSize != 16)
    return ScalarCost;

  // Look through a bitcast to the underlying GEP producing the addresses.
  if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
    Ptr = BC->getOperand(0);
  if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
    if (GEP->getNumOperands() != 2)
      return ScalarCost;
    unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
    // Scale needs to be correct (which is only relevant for i16s).
    if (Scale != 1 && Scale * 8 != ExtSize)
      return ScalarCost;
    // And we need to zext (not sext) the indexes from a small enough type.
    if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
        return VectorCost;
    }
    return ScalarCost;
  }
  return ScalarCost;
}
15835ffd83dbSDimitry Andric if (Scale != 1 && Scale * 8 != ExtSize) 15845ffd83dbSDimitry Andric return ScalarCost; 15855ffd83dbSDimitry Andric // And we need to zext (not sext) the indexes from a small enough type. 15865ffd83dbSDimitry Andric if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { 15875ffd83dbSDimitry Andric if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) 15885ffd83dbSDimitry Andric return VectorCost; 15895ffd83dbSDimitry Andric } 15905ffd83dbSDimitry Andric return ScalarCost; 15915ffd83dbSDimitry Andric } 15925ffd83dbSDimitry Andric return ScalarCost; 15935ffd83dbSDimitry Andric } 15945ffd83dbSDimitry Andric 1595fe6060f1SDimitry Andric InstructionCost 1596fe6060f1SDimitry Andric ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, 1597fe6060f1SDimitry Andric Optional<FastMathFlags> FMF, 1598e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) { 1599fe6060f1SDimitry Andric if (TTI::requiresOrderedReduction(FMF)) 1600fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1601fe6060f1SDimitry Andric 1602e8d8bef9SDimitry Andric EVT ValVT = TLI->getValueType(DL, ValTy); 1603e8d8bef9SDimitry Andric int ISD = TLI->InstructionOpcodeToISD(Opcode); 1604e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) 1605fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1606e8d8bef9SDimitry Andric 1607fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); 1608e8d8bef9SDimitry Andric 1609e8d8bef9SDimitry Andric static const CostTblEntry CostTblAdd[]{ 1610e8d8bef9SDimitry Andric {ISD::ADD, MVT::v16i8, 1}, 1611e8d8bef9SDimitry Andric {ISD::ADD, MVT::v8i16, 1}, 1612e8d8bef9SDimitry Andric {ISD::ADD, MVT::v4i32, 1}, 1613e8d8bef9SDimitry Andric }; 1614e8d8bef9SDimitry Andric if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) 
1615fe6060f1SDimitry Andric return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; 1616e8d8bef9SDimitry Andric 1617fe6060f1SDimitry Andric return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); 1618e8d8bef9SDimitry Andric } 1619e8d8bef9SDimitry Andric 1620e8d8bef9SDimitry Andric InstructionCost 1621e8d8bef9SDimitry Andric ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, 1622e8d8bef9SDimitry Andric Type *ResTy, VectorType *ValTy, 1623e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) { 1624e8d8bef9SDimitry Andric EVT ValVT = TLI->getValueType(DL, ValTy); 1625e8d8bef9SDimitry Andric EVT ResVT = TLI->getValueType(DL, ResTy); 1626e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { 1627fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = 1628fe6060f1SDimitry Andric TLI->getTypeLegalizationCost(DL, ValTy); 1629e8d8bef9SDimitry Andric if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || 1630e8d8bef9SDimitry Andric (LT.second == MVT::v8i16 && 1631e8d8bef9SDimitry Andric ResVT.getSizeInBits() <= (IsMLA ? 
64 : 32)) || 1632e8d8bef9SDimitry Andric (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) 1633fe6060f1SDimitry Andric return ST->getMVEVectorCostFactor(CostKind) * LT.first; 1634e8d8bef9SDimitry Andric } 1635e8d8bef9SDimitry Andric 1636e8d8bef9SDimitry Andric return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, 1637e8d8bef9SDimitry Andric CostKind); 1638e8d8bef9SDimitry Andric } 1639e8d8bef9SDimitry Andric 1640fe6060f1SDimitry Andric InstructionCost 1641fe6060f1SDimitry Andric ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1642e8d8bef9SDimitry Andric TTI::TargetCostKind CostKind) { 1643e8d8bef9SDimitry Andric switch (ICA.getID()) { 1644e8d8bef9SDimitry Andric case Intrinsic::get_active_lane_mask: 1645e8d8bef9SDimitry Andric // Currently we make a somewhat optimistic assumption that 1646e8d8bef9SDimitry Andric // active_lane_mask's are always free. In reality it may be freely folded 1647e8d8bef9SDimitry Andric // into a tail predicated loop, expanded into a VCPT or expanded into a lot 1648e8d8bef9SDimitry Andric // of add/icmp code. We may need to improve this in the future, but being 1649e8d8bef9SDimitry Andric // able to detect if it is free or not involves looking at a lot of other 1650e8d8bef9SDimitry Andric // code. We currently assume that the vectorizer inserted these, and knew 1651e8d8bef9SDimitry Andric // what it was doing in adding one. 
1652e8d8bef9SDimitry Andric if (ST->hasMVEIntegerOps()) 1653e8d8bef9SDimitry Andric return 0; 1654e8d8bef9SDimitry Andric break; 1655e8d8bef9SDimitry Andric case Intrinsic::sadd_sat: 1656e8d8bef9SDimitry Andric case Intrinsic::ssub_sat: 1657e8d8bef9SDimitry Andric case Intrinsic::uadd_sat: 1658e8d8bef9SDimitry Andric case Intrinsic::usub_sat: { 1659e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps()) 1660e8d8bef9SDimitry Andric break; 1661e8d8bef9SDimitry Andric Type *VT = ICA.getReturnType(); 1662e8d8bef9SDimitry Andric 1663fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); 1664e8d8bef9SDimitry Andric if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || 1665e8d8bef9SDimitry Andric LT.second == MVT::v16i8) { 1666fe6060f1SDimitry Andric // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we 1667e8d8bef9SDimitry Andric // need to extend the type, as it uses shr(qadd(shl, shl)). 1668fe6060f1SDimitry Andric unsigned Instrs = 1669fe6060f1SDimitry Andric LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 
1 : 4; 1670fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs; 1671e8d8bef9SDimitry Andric } 1672e8d8bef9SDimitry Andric break; 1673e8d8bef9SDimitry Andric } 1674fe6060f1SDimitry Andric case Intrinsic::abs: 1675fe6060f1SDimitry Andric case Intrinsic::smin: 1676fe6060f1SDimitry Andric case Intrinsic::smax: 1677fe6060f1SDimitry Andric case Intrinsic::umin: 1678fe6060f1SDimitry Andric case Intrinsic::umax: { 1679fe6060f1SDimitry Andric if (!ST->hasMVEIntegerOps()) 1680fe6060f1SDimitry Andric break; 1681fe6060f1SDimitry Andric Type *VT = ICA.getReturnType(); 1682fe6060f1SDimitry Andric 1683fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); 1684fe6060f1SDimitry Andric if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || 1685fe6060f1SDimitry Andric LT.second == MVT::v16i8) 1686fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind); 1687fe6060f1SDimitry Andric break; 1688fe6060f1SDimitry Andric } 1689fe6060f1SDimitry Andric case Intrinsic::minnum: 1690fe6060f1SDimitry Andric case Intrinsic::maxnum: { 1691fe6060f1SDimitry Andric if (!ST->hasMVEFloatOps()) 1692fe6060f1SDimitry Andric break; 1693fe6060f1SDimitry Andric Type *VT = ICA.getReturnType(); 1694fe6060f1SDimitry Andric std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); 1695fe6060f1SDimitry Andric if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) 1696fe6060f1SDimitry Andric return LT.first * ST->getMVEVectorCostFactor(CostKind); 1697fe6060f1SDimitry Andric break; 1698fe6060f1SDimitry Andric } 1699e8d8bef9SDimitry Andric } 1700e8d8bef9SDimitry Andric 1701e8d8bef9SDimitry Andric return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1702e8d8bef9SDimitry Andric } 1703e8d8bef9SDimitry Andric 17040b57cec5SDimitry Andric bool ARMTTIImpl::isLoweredToCall(const Function *F) { 17050b57cec5SDimitry Andric if (!F->isIntrinsic()) 17060b57cec5SDimitry Andric BaseT::isLoweredToCall(F); 
17070b57cec5SDimitry Andric 17080b57cec5SDimitry Andric // Assume all Arm-specific intrinsics map to an instruction. 17090b57cec5SDimitry Andric if (F->getName().startswith("llvm.arm")) 17100b57cec5SDimitry Andric return false; 17110b57cec5SDimitry Andric 17120b57cec5SDimitry Andric switch (F->getIntrinsicID()) { 17130b57cec5SDimitry Andric default: break; 17140b57cec5SDimitry Andric case Intrinsic::powi: 17150b57cec5SDimitry Andric case Intrinsic::sin: 17160b57cec5SDimitry Andric case Intrinsic::cos: 17170b57cec5SDimitry Andric case Intrinsic::pow: 17180b57cec5SDimitry Andric case Intrinsic::log: 17190b57cec5SDimitry Andric case Intrinsic::log10: 17200b57cec5SDimitry Andric case Intrinsic::log2: 17210b57cec5SDimitry Andric case Intrinsic::exp: 17220b57cec5SDimitry Andric case Intrinsic::exp2: 17230b57cec5SDimitry Andric return true; 17240b57cec5SDimitry Andric case Intrinsic::sqrt: 17250b57cec5SDimitry Andric case Intrinsic::fabs: 17260b57cec5SDimitry Andric case Intrinsic::copysign: 17270b57cec5SDimitry Andric case Intrinsic::floor: 17280b57cec5SDimitry Andric case Intrinsic::ceil: 17290b57cec5SDimitry Andric case Intrinsic::trunc: 17300b57cec5SDimitry Andric case Intrinsic::rint: 17310b57cec5SDimitry Andric case Intrinsic::nearbyint: 17320b57cec5SDimitry Andric case Intrinsic::round: 17330b57cec5SDimitry Andric case Intrinsic::canonicalize: 17340b57cec5SDimitry Andric case Intrinsic::lround: 17350b57cec5SDimitry Andric case Intrinsic::llround: 17360b57cec5SDimitry Andric case Intrinsic::lrint: 17370b57cec5SDimitry Andric case Intrinsic::llrint: 17380b57cec5SDimitry Andric if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) 17390b57cec5SDimitry Andric return true; 17400b57cec5SDimitry Andric if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) 17410b57cec5SDimitry Andric return true; 17420b57cec5SDimitry Andric // Some operations can be handled by vector instructions and assume 17430b57cec5SDimitry Andric // unsupported vectors will be expanded into 
supported scalar ones. 17440b57cec5SDimitry Andric // TODO Handle scalar operations properly. 17450b57cec5SDimitry Andric return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); 17460b57cec5SDimitry Andric case Intrinsic::masked_store: 17470b57cec5SDimitry Andric case Intrinsic::masked_load: 17480b57cec5SDimitry Andric case Intrinsic::masked_gather: 17490b57cec5SDimitry Andric case Intrinsic::masked_scatter: 17500b57cec5SDimitry Andric return !ST->hasMVEIntegerOps(); 17510b57cec5SDimitry Andric case Intrinsic::sadd_with_overflow: 17520b57cec5SDimitry Andric case Intrinsic::uadd_with_overflow: 17530b57cec5SDimitry Andric case Intrinsic::ssub_with_overflow: 17540b57cec5SDimitry Andric case Intrinsic::usub_with_overflow: 17550b57cec5SDimitry Andric case Intrinsic::sadd_sat: 17560b57cec5SDimitry Andric case Intrinsic::uadd_sat: 17570b57cec5SDimitry Andric case Intrinsic::ssub_sat: 17580b57cec5SDimitry Andric case Intrinsic::usub_sat: 17590b57cec5SDimitry Andric return false; 17600b57cec5SDimitry Andric } 17610b57cec5SDimitry Andric 17620b57cec5SDimitry Andric return BaseT::isLoweredToCall(F); 17630b57cec5SDimitry Andric } 17640b57cec5SDimitry Andric 1765e8d8bef9SDimitry Andric bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { 17660b57cec5SDimitry Andric unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); 17670b57cec5SDimitry Andric EVT VT = TLI->getValueType(DL, I.getType(), true); 17680b57cec5SDimitry Andric if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) 17690b57cec5SDimitry Andric return true; 17700b57cec5SDimitry Andric 17710b57cec5SDimitry Andric // Check if an intrinsic will be lowered to a call and assume that any 17720b57cec5SDimitry Andric // other CallInst will generate a bl. 
17730b57cec5SDimitry Andric if (auto *Call = dyn_cast<CallInst>(&I)) { 1774e8d8bef9SDimitry Andric if (auto *II = dyn_cast<IntrinsicInst>(Call)) { 1775e8d8bef9SDimitry Andric switch(II->getIntrinsicID()) { 1776e8d8bef9SDimitry Andric case Intrinsic::memcpy: 1777e8d8bef9SDimitry Andric case Intrinsic::memset: 1778e8d8bef9SDimitry Andric case Intrinsic::memmove: 1779e8d8bef9SDimitry Andric return getNumMemOps(II) == -1; 1780e8d8bef9SDimitry Andric default: 17810b57cec5SDimitry Andric if (const Function *F = Call->getCalledFunction()) 17820b57cec5SDimitry Andric return isLoweredToCall(F); 17830b57cec5SDimitry Andric } 1784e8d8bef9SDimitry Andric } 17850b57cec5SDimitry Andric return true; 17860b57cec5SDimitry Andric } 17870b57cec5SDimitry Andric 17880b57cec5SDimitry Andric // FPv5 provides conversions between integer, double-precision, 17890b57cec5SDimitry Andric // single-precision, and half-precision formats. 17900b57cec5SDimitry Andric switch (I.getOpcode()) { 17910b57cec5SDimitry Andric default: 17920b57cec5SDimitry Andric break; 17930b57cec5SDimitry Andric case Instruction::FPToSI: 17940b57cec5SDimitry Andric case Instruction::FPToUI: 17950b57cec5SDimitry Andric case Instruction::SIToFP: 17960b57cec5SDimitry Andric case Instruction::UIToFP: 17970b57cec5SDimitry Andric case Instruction::FPTrunc: 17980b57cec5SDimitry Andric case Instruction::FPExt: 17990b57cec5SDimitry Andric return !ST->hasFPARMv8Base(); 18000b57cec5SDimitry Andric } 18010b57cec5SDimitry Andric 18020b57cec5SDimitry Andric // FIXME: Unfortunately the approach of checking the Operation Action does 18030b57cec5SDimitry Andric // not catch all cases of Legalization that use library calls. Our 18040b57cec5SDimitry Andric // Legalization step categorizes some transformations into library calls as 18050b57cec5SDimitry Andric // Custom, Expand or even Legal when doing type legalization. 
So for now 18060b57cec5SDimitry Andric // we have to special case for instance the SDIV of 64bit integers and the 18070b57cec5SDimitry Andric // use of floating point emulation. 18080b57cec5SDimitry Andric if (VT.isInteger() && VT.getSizeInBits() >= 64) { 18090b57cec5SDimitry Andric switch (ISD) { 18100b57cec5SDimitry Andric default: 18110b57cec5SDimitry Andric break; 18120b57cec5SDimitry Andric case ISD::SDIV: 18130b57cec5SDimitry Andric case ISD::UDIV: 18140b57cec5SDimitry Andric case ISD::SREM: 18150b57cec5SDimitry Andric case ISD::UREM: 18160b57cec5SDimitry Andric case ISD::SDIVREM: 18170b57cec5SDimitry Andric case ISD::UDIVREM: 18180b57cec5SDimitry Andric return true; 18190b57cec5SDimitry Andric } 18200b57cec5SDimitry Andric } 18210b57cec5SDimitry Andric 18220b57cec5SDimitry Andric // Assume all other non-float operations are supported. 18230b57cec5SDimitry Andric if (!VT.isFloatingPoint()) 18240b57cec5SDimitry Andric return false; 18250b57cec5SDimitry Andric 18260b57cec5SDimitry Andric // We'll need a library call to handle most floats when using soft. 18270b57cec5SDimitry Andric if (TLI->useSoftFloat()) { 18280b57cec5SDimitry Andric switch (I.getOpcode()) { 18290b57cec5SDimitry Andric default: 18300b57cec5SDimitry Andric return true; 18310b57cec5SDimitry Andric case Instruction::Alloca: 18320b57cec5SDimitry Andric case Instruction::Load: 18330b57cec5SDimitry Andric case Instruction::Store: 18340b57cec5SDimitry Andric case Instruction::Select: 18350b57cec5SDimitry Andric case Instruction::PHI: 18360b57cec5SDimitry Andric return false; 18370b57cec5SDimitry Andric } 18380b57cec5SDimitry Andric } 18390b57cec5SDimitry Andric 18400b57cec5SDimitry Andric // We'll need a libcall to perform double precision operations on a single 18410b57cec5SDimitry Andric // precision only FPU. 
18420b57cec5SDimitry Andric if (I.getType()->isDoubleTy() && !ST->hasFP64()) 18430b57cec5SDimitry Andric return true; 18440b57cec5SDimitry Andric 18450b57cec5SDimitry Andric // Likewise for half precision arithmetic. 18460b57cec5SDimitry Andric if (I.getType()->isHalfTy() && !ST->hasFullFP16()) 18470b57cec5SDimitry Andric return true; 18480b57cec5SDimitry Andric 18490b57cec5SDimitry Andric return false; 1850e8d8bef9SDimitry Andric } 1851e8d8bef9SDimitry Andric 1852e8d8bef9SDimitry Andric bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 1853e8d8bef9SDimitry Andric AssumptionCache &AC, 1854e8d8bef9SDimitry Andric TargetLibraryInfo *LibInfo, 1855e8d8bef9SDimitry Andric HardwareLoopInfo &HWLoopInfo) { 1856e8d8bef9SDimitry Andric // Low-overhead branches are only supported in the 'low-overhead branch' 1857e8d8bef9SDimitry Andric // extension of v8.1-m. 1858e8d8bef9SDimitry Andric if (!ST->hasLOB() || DisableLowOverheadLoops) { 1859e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n"); 1860e8d8bef9SDimitry Andric return false; 1861e8d8bef9SDimitry Andric } 1862e8d8bef9SDimitry Andric 1863e8d8bef9SDimitry Andric if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 1864e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n"); 1865e8d8bef9SDimitry Andric return false; 1866e8d8bef9SDimitry Andric } 1867e8d8bef9SDimitry Andric 1868e8d8bef9SDimitry Andric const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 1869e8d8bef9SDimitry Andric if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { 1870e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n"); 1871e8d8bef9SDimitry Andric return false; 1872e8d8bef9SDimitry Andric } 1873e8d8bef9SDimitry Andric 1874e8d8bef9SDimitry Andric const SCEV *TripCountSCEV = 1875e8d8bef9SDimitry Andric SE.getAddExpr(BackedgeTakenCount, 1876e8d8bef9SDimitry Andric SE.getOne(BackedgeTakenCount->getType())); 1877e8d8bef9SDimitry Andric 1878e8d8bef9SDimitry Andric // We need 
to store the trip count in LR, a 32-bit register. 1879e8d8bef9SDimitry Andric if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) { 1880e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n"); 1881e8d8bef9SDimitry Andric return false; 1882e8d8bef9SDimitry Andric } 1883e8d8bef9SDimitry Andric 1884e8d8bef9SDimitry Andric // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little 1885e8d8bef9SDimitry Andric // point in generating a hardware loop if that's going to happen. 18860b57cec5SDimitry Andric 18870b57cec5SDimitry Andric auto IsHardwareLoopIntrinsic = [](Instruction &I) { 18880b57cec5SDimitry Andric if (auto *Call = dyn_cast<IntrinsicInst>(&I)) { 18890b57cec5SDimitry Andric switch (Call->getIntrinsicID()) { 18900b57cec5SDimitry Andric default: 18910b57cec5SDimitry Andric break; 1892e8d8bef9SDimitry Andric case Intrinsic::start_loop_iterations: 1893fe6060f1SDimitry Andric case Intrinsic::test_start_loop_iterations: 18940b57cec5SDimitry Andric case Intrinsic::loop_decrement: 18950b57cec5SDimitry Andric case Intrinsic::loop_decrement_reg: 18960b57cec5SDimitry Andric return true; 18970b57cec5SDimitry Andric } 18980b57cec5SDimitry Andric } 18990b57cec5SDimitry Andric return false; 19000b57cec5SDimitry Andric }; 19010b57cec5SDimitry Andric 19020b57cec5SDimitry Andric // Scan the instructions to see if there's any that we know will turn into a 1903e8d8bef9SDimitry Andric // call or if this loop is already a low-overhead loop or will become a tail 1904e8d8bef9SDimitry Andric // predicated loop. 
1905e8d8bef9SDimitry Andric bool IsTailPredLoop = false; 19060b57cec5SDimitry Andric auto ScanLoop = [&](Loop *L) { 19070b57cec5SDimitry Andric for (auto *BB : L->getBlocks()) { 19080b57cec5SDimitry Andric for (auto &I : *BB) { 1909e8d8bef9SDimitry Andric if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || 1910e8d8bef9SDimitry Andric isa<InlineAsm>(I)) { 19115ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); 19120b57cec5SDimitry Andric return false; 19130b57cec5SDimitry Andric } 1914e8d8bef9SDimitry Andric if (auto *II = dyn_cast<IntrinsicInst>(&I)) 1915e8d8bef9SDimitry Andric IsTailPredLoop |= 1916e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::get_active_lane_mask || 1917e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || 1918e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || 1919e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || 1920e8d8bef9SDimitry Andric II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; 19210b57cec5SDimitry Andric } 19225ffd83dbSDimitry Andric } 19230b57cec5SDimitry Andric return true; 19240b57cec5SDimitry Andric }; 19250b57cec5SDimitry Andric 19260b57cec5SDimitry Andric // Visit inner loops. 19270b57cec5SDimitry Andric for (auto Inner : *L) 19280b57cec5SDimitry Andric if (!ScanLoop(Inner)) 19290b57cec5SDimitry Andric return false; 19300b57cec5SDimitry Andric 19310b57cec5SDimitry Andric if (!ScanLoop(L)) 19320b57cec5SDimitry Andric return false; 19330b57cec5SDimitry Andric 19340b57cec5SDimitry Andric // TODO: Check whether the trip count calculation is expensive. If L is the 19350b57cec5SDimitry Andric // inner loop but we know it has a low trip count, calculating that trip 19360b57cec5SDimitry Andric // count (in the parent loop) may be detrimental. 
19370b57cec5SDimitry Andric 19380b57cec5SDimitry Andric LLVMContext &C = L->getHeader()->getContext(); 19390b57cec5SDimitry Andric HWLoopInfo.CounterInReg = true; 19400b57cec5SDimitry Andric HWLoopInfo.IsNestingLegal = false; 1941e8d8bef9SDimitry Andric HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; 19420b57cec5SDimitry Andric HWLoopInfo.CountType = Type::getInt32Ty(C); 19430b57cec5SDimitry Andric HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); 19440b57cec5SDimitry Andric return true; 19450b57cec5SDimitry Andric } 19460b57cec5SDimitry Andric 1947480093f4SDimitry Andric static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { 1948480093f4SDimitry Andric // We don't allow icmp's, and because we only look at single block loops, 1949480093f4SDimitry Andric // we simply count the icmps, i.e. there should only be 1 for the backedge. 1950480093f4SDimitry Andric if (isa<ICmpInst>(&I) && ++ICmpCount > 1) 1951480093f4SDimitry Andric return false; 1952480093f4SDimitry Andric 1953480093f4SDimitry Andric if (isa<FCmpInst>(&I)) 1954480093f4SDimitry Andric return false; 1955480093f4SDimitry Andric 1956480093f4SDimitry Andric // We could allow extending/narrowing FP loads/stores, but codegen is 1957480093f4SDimitry Andric // too inefficient so reject this for now. 
1958480093f4SDimitry Andric if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I)) 1959480093f4SDimitry Andric return false; 1960480093f4SDimitry Andric 1961480093f4SDimitry Andric // Extends have to be extending-loads 1962480093f4SDimitry Andric if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) ) 1963480093f4SDimitry Andric if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0))) 1964480093f4SDimitry Andric return false; 1965480093f4SDimitry Andric 1966480093f4SDimitry Andric // Truncs have to be narrowing-stores 1967480093f4SDimitry Andric if (isa<TruncInst>(&I) ) 1968480093f4SDimitry Andric if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin())) 1969480093f4SDimitry Andric return false; 1970480093f4SDimitry Andric 1971480093f4SDimitry Andric return true; 1972480093f4SDimitry Andric } 1973480093f4SDimitry Andric 1974480093f4SDimitry Andric // To set up a tail-predicated loop, we need to know the total number of 1975480093f4SDimitry Andric // elements processed by that loop. Thus, we need to determine the element 1976480093f4SDimitry Andric // size and: 1977480093f4SDimitry Andric // 1) it should be uniform for all operations in the vector loop, so we 1978480093f4SDimitry Andric // e.g. don't want any widening/narrowing operations. 1979480093f4SDimitry Andric // 2) it should be smaller than i64s because we don't have vector operations 1980480093f4SDimitry Andric // that work on i64s. 1981480093f4SDimitry Andric // 3) we don't want elements to be reversed or shuffled, to make sure the 1982480093f4SDimitry Andric // tail-predication masks/predicates the right lanes. 
1983480093f4SDimitry Andric // 1984480093f4SDimitry Andric static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 1985480093f4SDimitry Andric const DataLayout &DL, 1986480093f4SDimitry Andric const LoopAccessInfo *LAI) { 19875ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); 19885ffd83dbSDimitry Andric 1989e8d8bef9SDimitry Andric // If there are live-out values, it is probably a reduction. We can predicate 1990e8d8bef9SDimitry Andric // most reduction operations freely under MVE using a combination of 1991e8d8bef9SDimitry Andric // prefer-predicated-reduction-select and inloop reductions. We limit this to 1992e8d8bef9SDimitry Andric // floating point and integer reductions, but don't check for operators 1993e8d8bef9SDimitry Andric // specifically here. If the value ends up not being a reduction (and so the 1994e8d8bef9SDimitry Andric // vectorizer cannot tailfold the loop), we should fall back to standard 1995e8d8bef9SDimitry Andric // vectorization automatically. 
19965ffd83dbSDimitry Andric SmallVector< Instruction *, 8 > LiveOuts; 19975ffd83dbSDimitry Andric LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); 1998e8d8bef9SDimitry Andric bool ReductionsDisabled = 19995ffd83dbSDimitry Andric EnableTailPredication == TailPredication::EnabledNoReductions || 20005ffd83dbSDimitry Andric EnableTailPredication == TailPredication::ForceEnabledNoReductions; 20015ffd83dbSDimitry Andric 20025ffd83dbSDimitry Andric for (auto *I : LiveOuts) { 2003e8d8bef9SDimitry Andric if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && 2004e8d8bef9SDimitry Andric !I->getType()->isHalfTy()) { 2005e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " 20065ffd83dbSDimitry Andric "live-out value\n"); 20075ffd83dbSDimitry Andric return false; 20085ffd83dbSDimitry Andric } 2009e8d8bef9SDimitry Andric if (ReductionsDisabled) { 2010e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); 20115ffd83dbSDimitry Andric return false; 20125ffd83dbSDimitry Andric } 20135ffd83dbSDimitry Andric } 20145ffd83dbSDimitry Andric 20155ffd83dbSDimitry Andric // Next, check that all instructions can be tail-predicated. 
2016480093f4SDimitry Andric PredicatedScalarEvolution PSE = LAI->getPSE(); 20175ffd83dbSDimitry Andric SmallVector<Instruction *, 16> LoadStores; 2018480093f4SDimitry Andric int ICmpCount = 0; 2019480093f4SDimitry Andric 2020480093f4SDimitry Andric for (BasicBlock *BB : L->blocks()) { 2021480093f4SDimitry Andric for (Instruction &I : BB->instructionsWithoutDebug()) { 2022480093f4SDimitry Andric if (isa<PHINode>(&I)) 2023480093f4SDimitry Andric continue; 2024480093f4SDimitry Andric if (!canTailPredicateInstruction(I, ICmpCount)) { 2025480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump()); 2026480093f4SDimitry Andric return false; 2027480093f4SDimitry Andric } 2028480093f4SDimitry Andric 2029480093f4SDimitry Andric Type *T = I.getType(); 2030480093f4SDimitry Andric if (T->isPointerTy()) 2031480093f4SDimitry Andric T = T->getPointerElementType(); 2032480093f4SDimitry Andric 2033480093f4SDimitry Andric if (T->getScalarSizeInBits() > 32) { 2034480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); 2035480093f4SDimitry Andric return false; 2036480093f4SDimitry Andric } 2037480093f4SDimitry Andric if (isa<StoreInst>(I) || isa<LoadInst>(I)) { 2038480093f4SDimitry Andric Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); 2039480093f4SDimitry Andric int64_t NextStride = getPtrStride(PSE, Ptr, L); 2040e8d8bef9SDimitry Andric if (NextStride == 1) { 2041480093f4SDimitry Andric // TODO: for now only allow consecutive strides of 1. We could support 2042e8d8bef9SDimitry Andric // other strides as long as it is uniform, but let's keep it simple 2043e8d8bef9SDimitry Andric // for now. 
2044e8d8bef9SDimitry Andric continue; 2045e8d8bef9SDimitry Andric } else if (NextStride == -1 || 2046e8d8bef9SDimitry Andric (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || 2047e8d8bef9SDimitry Andric (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { 2048e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() 2049e8d8bef9SDimitry Andric << "Consecutive strides of 2 found, vld2/vstr2 can't " 2050e8d8bef9SDimitry Andric "be tail-predicated\n."); 2051e8d8bef9SDimitry Andric return false; 2052e8d8bef9SDimitry Andric // TODO: don't tail predicate if there is a reversed load? 2053e8d8bef9SDimitry Andric } else if (EnableMaskedGatherScatters) { 2054e8d8bef9SDimitry Andric // Gather/scatters do allow loading from arbitrary strides, at 2055e8d8bef9SDimitry Andric // least if they are loop invariant. 2056e8d8bef9SDimitry Andric // TODO: Loop variant strides should in theory work, too, but 2057e8d8bef9SDimitry Andric // this requires further testing. 2058e8d8bef9SDimitry Andric const SCEV *PtrScev = 2059e8d8bef9SDimitry Andric replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); 2060e8d8bef9SDimitry Andric if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { 2061e8d8bef9SDimitry Andric const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); 2062e8d8bef9SDimitry Andric if (PSE.getSE()->isLoopInvariant(Step, L)) 2063480093f4SDimitry Andric continue; 2064480093f4SDimitry Andric } 2065e8d8bef9SDimitry Andric } 2066e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << "Bad stride found, can't " 2067480093f4SDimitry Andric "tail-predicate\n."); 2068480093f4SDimitry Andric return false; 2069480093f4SDimitry Andric } 2070480093f4SDimitry Andric } 2071480093f4SDimitry Andric } 2072480093f4SDimitry Andric 2073480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n"); 2074480093f4SDimitry Andric return true; 2075480093f4SDimitry Andric } 2076480093f4SDimitry Andric 2077480093f4SDimitry Andric bool ARMTTIImpl::preferPredicateOverEpilogue(Loop 
*L, LoopInfo *LI, 2078480093f4SDimitry Andric ScalarEvolution &SE, 2079480093f4SDimitry Andric AssumptionCache &AC, 2080480093f4SDimitry Andric TargetLibraryInfo *TLI, 2081480093f4SDimitry Andric DominatorTree *DT, 2082480093f4SDimitry Andric const LoopAccessInfo *LAI) { 20835ffd83dbSDimitry Andric if (!EnableTailPredication) { 20845ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); 2085480093f4SDimitry Andric return false; 20865ffd83dbSDimitry Andric } 2087480093f4SDimitry Andric 2088480093f4SDimitry Andric // Creating a predicated vector loop is the first step for generating a 2089480093f4SDimitry Andric // tail-predicated hardware loop, for which we need the MVE masked 2090480093f4SDimitry Andric // load/stores instructions: 2091480093f4SDimitry Andric if (!ST->hasMVEIntegerOps()) 2092480093f4SDimitry Andric return false; 2093480093f4SDimitry Andric 2094480093f4SDimitry Andric // For now, restrict this to single block loops. 2095480093f4SDimitry Andric if (L->getNumBlocks() > 1) { 2096480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block " 2097480093f4SDimitry Andric "loop.\n"); 2098480093f4SDimitry Andric return false; 2099480093f4SDimitry Andric } 2100480093f4SDimitry Andric 2101e8d8bef9SDimitry Andric assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); 2102480093f4SDimitry Andric 2103480093f4SDimitry Andric HardwareLoopInfo HWLoopInfo(L); 2104480093f4SDimitry Andric if (!HWLoopInfo.canAnalyze(*LI)) { 2105480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2106480093f4SDimitry Andric "analyzable.\n"); 2107480093f4SDimitry Andric return false; 2108480093f4SDimitry Andric } 2109480093f4SDimitry Andric 2110480093f4SDimitry Andric // This checks if we have the low-overhead branch architecture 2111480093f4SDimitry Andric // extension, and if we will create a hardware-loop: 2112480093f4SDimitry Andric if 
(!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { 2113480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2114480093f4SDimitry Andric "profitable.\n"); 2115480093f4SDimitry Andric return false; 2116480093f4SDimitry Andric } 2117480093f4SDimitry Andric 2118480093f4SDimitry Andric if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { 2119480093f4SDimitry Andric LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not " 2120480093f4SDimitry Andric "a candidate.\n"); 2121480093f4SDimitry Andric return false; 2122480093f4SDimitry Andric } 2123480093f4SDimitry Andric 2124480093f4SDimitry Andric return canTailPredicateLoop(L, LI, SE, DL, LAI); 2125480093f4SDimitry Andric } 2126480093f4SDimitry Andric 21275ffd83dbSDimitry Andric bool ARMTTIImpl::emitGetActiveLaneMask() const { 21285ffd83dbSDimitry Andric if (!ST->hasMVEIntegerOps() || !EnableTailPredication) 21295ffd83dbSDimitry Andric return false; 2130480093f4SDimitry Andric 21315ffd83dbSDimitry Andric // Intrinsic @llvm.get.active.lane.mask is supported. 21325ffd83dbSDimitry Andric // It is used in the MVETailPredication pass, which requires the number of 21335ffd83dbSDimitry Andric // elements processed by this vector loop to setup the tail-predicated 21345ffd83dbSDimitry Andric // loop. 21355ffd83dbSDimitry Andric return true; 21365ffd83dbSDimitry Andric } 21370b57cec5SDimitry Andric void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 21380b57cec5SDimitry Andric TTI::UnrollingPreferences &UP) { 2139fe6060f1SDimitry Andric // Enable Upper bound unrolling universally, not dependant upon the conditions 2140fe6060f1SDimitry Andric // below. 2141fe6060f1SDimitry Andric UP.UpperBound = true; 2142fe6060f1SDimitry Andric 21430b57cec5SDimitry Andric // Only currently enable these preferences for M-Class cores. 
21440b57cec5SDimitry Andric if (!ST->isMClass()) 21450b57cec5SDimitry Andric return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP); 21460b57cec5SDimitry Andric 21470b57cec5SDimitry Andric // Disable loop unrolling for Oz and Os. 21480b57cec5SDimitry Andric UP.OptSizeThreshold = 0; 21490b57cec5SDimitry Andric UP.PartialOptSizeThreshold = 0; 21500b57cec5SDimitry Andric if (L->getHeader()->getParent()->hasOptSize()) 21510b57cec5SDimitry Andric return; 21520b57cec5SDimitry Andric 21530b57cec5SDimitry Andric SmallVector<BasicBlock*, 4> ExitingBlocks; 21540b57cec5SDimitry Andric L->getExitingBlocks(ExitingBlocks); 21550b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Loop has:\n" 21560b57cec5SDimitry Andric << "Blocks: " << L->getNumBlocks() << "\n" 21570b57cec5SDimitry Andric << "Exit blocks: " << ExitingBlocks.size() << "\n"); 21580b57cec5SDimitry Andric 21590b57cec5SDimitry Andric // Only allow another exit other than the latch. This acts as an early exit 21600b57cec5SDimitry Andric // as it mirrors the profitability calculation of the runtime unroller. 21610b57cec5SDimitry Andric if (ExitingBlocks.size() > 2) 21620b57cec5SDimitry Andric return; 21630b57cec5SDimitry Andric 21640b57cec5SDimitry Andric // Limit the CFG of the loop body for targets with a branch predictor. 21650b57cec5SDimitry Andric // Allowing 4 blocks permits if-then-else diamonds in the body. 21660b57cec5SDimitry Andric if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) 21670b57cec5SDimitry Andric return; 21680b57cec5SDimitry Andric 2169e8d8bef9SDimitry Andric // Don't unroll vectorized loops, including the remainder loop 2170e8d8bef9SDimitry Andric if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) 2171e8d8bef9SDimitry Andric return; 2172e8d8bef9SDimitry Andric 21730b57cec5SDimitry Andric // Scan the loop: don't unroll loops with calls as this could prevent 21740b57cec5SDimitry Andric // inlining. 
2175fe6060f1SDimitry Andric InstructionCost Cost = 0; 21760b57cec5SDimitry Andric for (auto *BB : L->getBlocks()) { 21770b57cec5SDimitry Andric for (auto &I : *BB) { 2178480093f4SDimitry Andric // Don't unroll vectorised loop. MVE does not benefit from it as much as 2179480093f4SDimitry Andric // scalar code. 2180480093f4SDimitry Andric if (I.getType()->isVectorTy()) 2181480093f4SDimitry Andric return; 2182480093f4SDimitry Andric 21830b57cec5SDimitry Andric if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 21845ffd83dbSDimitry Andric if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 21850b57cec5SDimitry Andric if (!isLoweredToCall(F)) 21860b57cec5SDimitry Andric continue; 21870b57cec5SDimitry Andric } 21880b57cec5SDimitry Andric return; 21890b57cec5SDimitry Andric } 21908bcb0991SDimitry Andric 2191e8d8bef9SDimitry Andric SmallVector<const Value*, 4> Operands(I.operand_values()); 2192e8d8bef9SDimitry Andric Cost += 2193e8d8bef9SDimitry Andric getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); 21940b57cec5SDimitry Andric } 21950b57cec5SDimitry Andric } 21960b57cec5SDimitry Andric 2197fe6060f1SDimitry Andric // On v6m cores, there are very few registers available. We can easily end up 2198fe6060f1SDimitry Andric // spilling and reloading more registers in an unrolled loop. Look at the 2199fe6060f1SDimitry Andric // number of LCSSA phis as a rough measure of how many registers will need to 2200fe6060f1SDimitry Andric // be live out of the loop, reducing the default unroll count if more than 1 2201fe6060f1SDimitry Andric // value is needed. In the long run, all of this should be being learnt by a 2202fe6060f1SDimitry Andric // machine. 
2203fe6060f1SDimitry Andric unsigned UnrollCount = 4; 2204fe6060f1SDimitry Andric if (ST->isThumb1Only()) { 2205fe6060f1SDimitry Andric unsigned ExitingValues = 0; 2206fe6060f1SDimitry Andric SmallVector<BasicBlock *, 4> ExitBlocks; 2207fe6060f1SDimitry Andric L->getExitBlocks(ExitBlocks); 2208fe6060f1SDimitry Andric for (auto *Exit : ExitBlocks) { 2209fe6060f1SDimitry Andric // Count the number of LCSSA phis. Exclude values coming from GEP's as 2210fe6060f1SDimitry Andric // only the last is expected to be needed for address operands. 2211fe6060f1SDimitry Andric unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) { 2212fe6060f1SDimitry Andric return PH.getNumOperands() != 1 || 2213fe6060f1SDimitry Andric !isa<GetElementPtrInst>(PH.getOperand(0)); 2214fe6060f1SDimitry Andric }); 2215fe6060f1SDimitry Andric ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues; 2216fe6060f1SDimitry Andric } 2217fe6060f1SDimitry Andric if (ExitingValues) 2218fe6060f1SDimitry Andric UnrollCount /= ExitingValues; 2219fe6060f1SDimitry Andric if (UnrollCount <= 1) 2220fe6060f1SDimitry Andric return; 2221fe6060f1SDimitry Andric } 2222fe6060f1SDimitry Andric 22230b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); 2224fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n"); 22250b57cec5SDimitry Andric 22260b57cec5SDimitry Andric UP.Partial = true; 22270b57cec5SDimitry Andric UP.Runtime = true; 22280b57cec5SDimitry Andric UP.UnrollRemainder = true; 2229fe6060f1SDimitry Andric UP.DefaultUnrollRuntimeCount = UnrollCount; 22300b57cec5SDimitry Andric UP.UnrollAndJam = true; 22310b57cec5SDimitry Andric UP.UnrollAndJamInnerLoopThreshold = 60; 22320b57cec5SDimitry Andric 22330b57cec5SDimitry Andric // Force unrolling small loops can be very useful because of the branch 22340b57cec5SDimitry Andric // taken cost of the backedge. 
22350b57cec5SDimitry Andric if (Cost < 12) 22360b57cec5SDimitry Andric UP.Force = true; 22370b57cec5SDimitry Andric } 22388bcb0991SDimitry Andric 22395ffd83dbSDimitry Andric void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, 22405ffd83dbSDimitry Andric TTI::PeelingPreferences &PP) { 22415ffd83dbSDimitry Andric BaseT::getPeelingPreferences(L, SE, PP); 22425ffd83dbSDimitry Andric } 22435ffd83dbSDimitry Andric 2244e8d8bef9SDimitry Andric bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, 2245e8d8bef9SDimitry Andric TTI::ReductionFlags Flags) const { 2246e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps()) 2247e8d8bef9SDimitry Andric return false; 2248e8d8bef9SDimitry Andric 2249e8d8bef9SDimitry Andric unsigned ScalarBits = Ty->getScalarSizeInBits(); 2250e8d8bef9SDimitry Andric switch (Opcode) { 2251e8d8bef9SDimitry Andric case Instruction::Add: 2252e8d8bef9SDimitry Andric return ScalarBits <= 64; 2253e8d8bef9SDimitry Andric default: 2254e8d8bef9SDimitry Andric return false; 2255e8d8bef9SDimitry Andric } 2256e8d8bef9SDimitry Andric } 2257e8d8bef9SDimitry Andric 2258e8d8bef9SDimitry Andric bool ARMTTIImpl::preferPredicatedReductionSelect( 2259e8d8bef9SDimitry Andric unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { 2260e8d8bef9SDimitry Andric if (!ST->hasMVEIntegerOps()) 2261e8d8bef9SDimitry Andric return false; 2262e8d8bef9SDimitry Andric return true; 2263e8d8bef9SDimitry Andric } 2264