//===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // /// \file /// This file implements a TargetTransformInfo analysis pass specific to the /// Hexagon target machine. It uses the target's detailed information to provide /// more precise answers to certain TTI queries, while letting the target /// independent and default TTI implementations handle the rest. /// //===----------------------------------------------------------------------===// #include "HexagonTargetTransformInfo.h" #include "HexagonSubtarget.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/User.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/UnrollLoop.h" using namespace llvm; #define DEBUG_TYPE "hexagontti" static cl::opt HexagonAutoHVX("hexagon-autohvx", cl::init(false), cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); static cl::opt EmitLookupTables("hexagon-emit-lookup-tables", cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); // Constant "cost factor" to make floating point operations more expensive // in terms of vectorization cost. This isn't the best way, but it should // do. Ultimately, the cost should use cycles. static const unsigned FloatFactor = 4; bool HexagonTTIImpl::useHVX() const { return ST.useHVXOps() && HexagonAutoHVX; } bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { assert(VecTy->isVectorTy()); if (cast(VecTy)->isScalable()) return false; // Avoid types like <2 x i32*>. if (!cast(VecTy)->getElementType()->isIntegerTy()) return false; EVT VecVT = EVT::getEVT(VecTy); if (!VecVT.isSimple() || VecVT.getSizeInBits() <= 64) return false; if (ST.isHVXVectorType(VecVT.getSimpleVT())) return true; auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT()); return Action == TargetLoweringBase::TypeWidenVector; } unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { if (Ty->isVectorTy()) return Ty->getVectorNumElements(); assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && "Expecting scalar type"); return 1; } TargetTransformInfo::PopcntSupportKind HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { // Return fast hardware support as every input < 64 bits will be promoted // to 64 bits. return TargetTransformInfo::PSK_FastHardware; } // The Hexagon target can unroll loops with run-time trip counts. void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Runtime = UP.Partial = true; // Only try to peel innermost loops with small runtime trip counts. if (L && L->empty() && canPeel(L) && SE.getSmallConstantTripCount(L) == 0 && SE.getSmallConstantMaxTripCount(L) > 0 && SE.getSmallConstantMaxTripCount(L) <= 5) { UP.PeelCount = 2; } } bool HexagonTTIImpl::shouldFavorPostInc() const { return true; } /// --- Vector TTI begin --- unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { if (Vector) return useHVX() ? 32 : 0; return 32; } unsigned HexagonTTIImpl::getMaxInterleaveFactor(unsigned VF) { return useHVX() ? 2 : 0; } unsigned HexagonTTIImpl::getRegisterBitWidth(bool Vector) const { return Vector ? getMinVectorRegisterBitWidth() : 32; } unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { return useHVX() ? ST.getVectorLength()*8 : 0; } unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { return (8 * ST.getVectorLength()) / ElemWidth; } unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { return BaseT::getScalarizationOverhead(Ty, Insert, Extract); } unsigned HexagonTTIImpl::getOperandsScalarizationOverhead( ArrayRef Args, unsigned VF) { return BaseT::getOperandsScalarizationOverhead(Args, VF); } unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) { return BaseT::getCallInstrCost(F, RetTy, Tys); } unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Args, FastMathFlags FMF, unsigned VF) { return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); } unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF, unsigned ScalarizationCostPassed) { if (ID == Intrinsic::bswap) { std::pair LT = TLI.getTypeLegalizationCost(DL, RetTy); return LT.first + 2; } return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, ScalarizationCostPassed); } unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp, ScalarEvolution *SE, const SCEV *S) { return 0; } unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I) { assert(Opcode == Instruction::Load || Opcode == Instruction::Store); if (Opcode == Instruction::Store) return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); if (Src->isVectorTy()) { VectorType *VecTy = cast(Src); unsigned VecWidth = VecTy->getBitWidth(); if (useHVX() && isTypeForHVX(VecTy)) { unsigned RegWidth = getRegisterBitWidth(true); assert(RegWidth && "Non-zero vector register width expected"); // Cost of HVX loads. if (VecWidth % RegWidth == 0) return VecWidth / RegWidth; // Cost of constructing HVX vector from scalar loads const Align RegAlign(RegWidth / 8); if (!Alignment || *Alignment > RegAlign) Alignment = RegAlign; assert(Alignment); unsigned AlignWidth = 8 * Alignment->value(); unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; return 3 * NumLoads; } // Non-HVX vectors. // Add extra cost for floating point types. unsigned Cost = VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1; // At this point unspecified alignment is considered as Align::None(). const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8)); unsigned AlignWidth = 8 * BoundAlignment.value(); unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth; if (Alignment == Align(4) || Alignment == Align(8)) return Cost * NumLoads; // Loads of less than 32 bits will need extra inserts to compose a vector. assert(BoundAlignment <= Align(8)); unsigned LogA = Log2(BoundAlignment); return (3 - LogA) * Cost * NumLoads; } return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I); } unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) { return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { return 1; } unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment) { return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment); } unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, bool UseMaskForGaps) { if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, nullptr); } unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I) { if (ValTy->isVectorTy()) { std::pair LT = TLI.getTypeLegalizationCost(DL, ValTy); if (Opcode == Instruction::FCmp) return LT.first + FloatFactor * getTypeNumElements(ValTy); } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } unsigned HexagonTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI) { if (Ty->isVectorTy()) { std::pair LT = TLI.getTypeLegalizationCost(DL, Ty); if (LT.second.isFloatingPoint()) return LT.first + FloatFactor * getTypeNumElements(Ty); } return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI); } unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, Type *SrcTy, const Instruction *I) { if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; std::pair SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy); std::pair DstLT = TLI.getTypeLegalizationCost(DL, DstTy); return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN); } return 1; } unsigned HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { Type *ElemTy = Val->isVectorTy() ? cast(Val)->getElementType() : Val; if (Opcode == Instruction::InsertElement) { // Need two rotations for non-zero index. unsigned Cost = (Index != 0) ? 2 : 0; if (ElemTy->isIntegerTy(32)) return Cost; // If it's not a 32-bit value, there will need to be an extract. return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index); } if (Opcode == Instruction::ExtractElement) return 2; return 1; } /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { return ST.getL1PrefetchDistance(); } unsigned HexagonTTIImpl::getCacheLineSize() const { return ST.getL1CacheLineSize(); } int HexagonTTIImpl::getUserCost(const User *U, ArrayRef Operands) { auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { if (!CI->isIntegerCast()) return false; // Only extensions from an integer type shorter than 32-bit to i32 // can be folded into the load. const DataLayout &DL = getDataLayout(); unsigned SBW = DL.getTypeSizeInBits(CI->getSrcTy()); unsigned DBW = DL.getTypeSizeInBits(CI->getDestTy()); if (DBW != 32 || SBW >= DBW) return false; const LoadInst *LI = dyn_cast(CI->getOperand(0)); // Technically, this code could allow multiple uses of the load, and // check if all the uses are the same extension operation, but this // should be sufficient for most cases. return LI && LI->hasOneUse(); }; if (const CastInst *CI = dyn_cast(U)) if (isCastFoldedIntoLoad(CI)) return TargetTransformInfo::TCC_Free; return BaseT::getUserCost(U, Operands); } bool HexagonTTIImpl::shouldBuildLookupTables() const { return EmitLookupTables; }