//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H

#include "AArch64.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include <cstdint>
#include <optional>

namespace llvm {

class APInt;
class Instruction;
class IntrinsicInst;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;
class VectorType;

class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
  using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  const AArch64Subtarget *getST() const { return ST; }
  const AArch64TargetLowering *getTLI() const { return TLI; }

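  // Interleave factors (two, three or four elements) recognized for structured
  // vector load/store memory intrinsics, such as NEON ld2/st2, ld3/st3 and
  // ld4/st4.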
  enum MemIntrinsicType {
    VECTOR_LDST_TWO_ELEMENTS,
    VECTOR_LDST_THREE_ELEMENTS,
    VECTOR_LDST_FOUR_ELEMENTS
  };

  bool isWideningInstruction(Type *DstTy, unsigned Opcode,
                             ArrayRef<const Value *> Args,
                             Type *SrcOverrideTy = nullptr);

  // A helper function called by 'getVectorInstrCost'.
  //
  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
  // indicates whether the vector instruction actually exists in the input IR
  // or is merely hypothetical, as when vectorizer passes query costs for
  // instructions they have not yet created.
  InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
                                           unsigned Index, bool HasRealUse);

public:
  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
        TLI(ST->getTargetLowering()) {}

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                             const ArrayRef<Type *> &Types) const;

  /// \name Scalar TTI Implementations
  /// @{

  using BaseT::getIntImmCost;
  InstructionCost getIntImmCost(int64_t Val);
  InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                TTI::TargetCostKind CostKind);
  InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind,
                                    Instruction *Inst = nullptr);
  InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                      const APInt &Imm, Type *Ty,
                                      TTI::TargetCostKind CostKind);
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  bool enableInterleavedAccessVectorization() { return true; }

  bool enableMaskedInterleavedAccessVectorization() { return ST->hasSVE(); }

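  // ClassID 1 is the vector register class: AArch64 has 32 vector registers
  // (V0-V31) when NEON is available. Any other class ID refers to the
  // general-purpose registers, of which 31 (X0-X30) are allocatable.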
  unsigned getNumberOfRegisters(unsigned ClassID) const {
    bool Vector = (ClassID == 1);
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);

  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;

  unsigned getMinVectorRegisterBitWidth() const {
    return ST->getMinVectorRegisterBitWidth();
  }

  std::optional<unsigned> getVScaleForTuning() const {
    return ST->getVScaleForTuning();
  }

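  // Although the architecture permits SVE vector lengths that are any multiple
  // of 128 bits, the backend only supports power-of-two lengths, so vscale can
  // be assumed to be a power of two.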
  bool isVScaleKnownToBeAPowerOfTwo() const { return true; }

  bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;

  /// Try to return an estimate cost factor that can be used as a multiplier
  /// when scalarizing an operation for a vector with ElementCount \p VF.
  /// For scalable vectors this is based upon the value of vscale the subtarget
  /// is tuned for, since the actual element count is not known at compile
  /// time.
  unsigned getMaxNumElements(ElementCount VF) const {
    if (!VF.isScalable())
      return VF.getFixedValue();

    return VF.getKnownMinValue() * ST->getVScaleForTuning();
  }

  unsigned getMaxInterleaveFactor(ElementCount VF);

  bool prefersVectorizedAddressing() const;

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                        Align Alignment, unsigned AddressSpace,
                                        TTI::TargetCostKind CostKind);

  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I = nullptr);

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr);

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);
  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index);

  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  InstructionCost getArithmeticReductionCostSVE(unsigned Opcode,
                                                VectorType *ValTy,
                                                TTI::TargetCostKind CostKind);

  InstructionCost getSpliceCost(VectorType *Tp, int Index);

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                            const SCEV *Ptr);

  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                     CmpInst::Predicate VecPred,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I = nullptr);

  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                    bool IsZeroCmp) const;
  bool useNeonVector(const Type *Ty) const;

  InstructionCost
  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
                  TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                  const Instruction *I = nullptr);

  InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                           Type *ExpectedType);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);

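  /// Return true if \p Ty is an element type that SVE masked memory operations
  /// support directly: pointers, bf16 (when +bf16 is available), half, float,
  /// double, and the i8/i16/i32/i64 integer types.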
  bool isElementTypeLegalForScalableVector(Type *Ty) const {
    if (Ty->isPointerTy())
      return true;

    if (Ty->isBFloatTy() && ST->hasBF16())
      return true;

    if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
      return true;

    if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
        Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
      return true;

    return false;
  }

  bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
    if (!ST->hasSVE())
      return false;

    // For fixed vectors, avoid scalarization if using SVE for them.
    if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
      return false; // Fall back to scalarization of masked operations.

    return isElementTypeLegalForScalableVector(DataType->getScalarType());
  }

  bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
    return isLegalMaskedLoadStore(DataType, Alignment);
  }

  bool isLegalMaskedStore(Type *DataType, Align Alignment) {
    return isLegalMaskedLoadStore(DataType, Alignment);
  }

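  // Masked gathers and scatters are lowered to SVE gather/scatter loads and
  // stores, so they require SVE. Fixed-length vectors are only handled when
  // SVE codegen for fixed vectors is enabled and there are at least two
  // elements; otherwise they are scalarized.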
  bool isLegalMaskedGatherScatter(Type *DataType) const {
    if (!ST->hasSVE() || !ST->isNeonAvailable())
      return false;

    // For fixed vectors, scalarize if not using SVE for them.
    auto *DataTypeFVTy = dyn_cast<FixedVectorType>(DataType);
    if (DataTypeFVTy && (!ST->useSVEForFixedLengthVectors() ||
                         DataTypeFVTy->getNumElements() < 2))
      return false;

    return isElementTypeLegalForScalableVector(DataType->getScalarType());
  }

  bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
    return isLegalMaskedGatherScatter(DataType);
  }
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
    return isLegalMaskedGatherScatter(DataType);
  }

  bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const {
    // Return true if we can generate a `ld1r` splat load instruction.
    if (!ST->hasNEON() || NumElements.isScalable())
      return false;
    switch (unsigned ElementBits = ElementTy->getScalarSizeInBits()) {
    case 8:
    case 16:
    case 32:
    case 64: {
      // We accept a total vector width of 64 bits or more for elements of
      // 8, 16, 32 or 64 bits.
      unsigned VectorBits = NumElements.getFixedValue() * ElementBits;
      return VectorBits >= 64;
    }
    }
    return false;
  }

  bool isLegalNTStoreLoad(Type *DataType, Align Alignment) {
    // NOTE: The logic below is mostly geared towards the loop vectorizer,
    //       which calls it with two-element vectors. We might want to improve
    //       this if other users show up.
    // Nontemporal vector loads/stores can be directly lowered to LDNP/STNP, if
    // the vector can be halved so that each half fits into a register. That's
    // the case if the element type fits into a register and the number of
    // elements is a power of 2 > 1.
    if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) {
      unsigned NumElements = DataTypeTy->getNumElements();
      unsigned EltSize = DataTypeTy->getElementType()->getScalarSizeInBits();
      return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
             EltSize <= 128 && isPowerOf2_64(EltSize);
    }
    return BaseT::isLegalNTStore(DataType, Alignment);
  }

  bool isLegalNTStore(Type *DataType, Align Alignment) {
    return isLegalNTStoreLoad(DataType, Alignment);
  }

  bool isLegalNTLoad(Type *DataType, Align Alignment) {
    // Only supports little-endian targets.
    if (ST->isLittleEndian())
      return isLegalNTStoreLoad(DataType, Alignment);
    return BaseT::isLegalNTLoad(DataType, Alignment);
  }

  bool enableOrderedReductions() const { return true; }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false);

  bool
  shouldConsiderAddressTypePromotion(const Instruction &I,
                                     bool &AllowPromotionWithoutCommonHeader);

  bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }

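  // Rematerializing a global in GlobalISel is costed at 2, which roughly
  // corresponds to the ADRP + ADD pair typically used to materialize a global
  // address on AArch64.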
  unsigned getGISelRematGlobalCost() const {
    return 2;
  }

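  // When SVE is available, only consider tail folding for loops with an
  // estimated trip count of at least 5; otherwise no minimum is imposed.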
  unsigned getMinTripCountTailFoldingThreshold() const {
    return ST->hasSVE() ? 5 : 0;
  }

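  // With SVE, predicate both the data and the control flow of the loop using
  // an active lane mask; if the IV update may overflow, use the variant that
  // adjusts the trip count so that no runtime overflow check is needed.
  // Without SVE, only predicate the data, without using a lane-mask intrinsic.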
  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
    if (ST->hasSVE())
      return IVUpdateMayOverflow
                 ? TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck
                 : TailFoldingStyle::DataAndControlFlow;

    return TailFoldingStyle::DataWithoutLaneMask;
  }

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);

  bool supportsScalableVectors() const { return ST->hasSVE(); }

  bool enableScalableVectorization() const { return ST->hasSVE(); }

  bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
                                   ElementCount VF) const;

  bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
                                       TTI::ReductionFlags Flags) const {
    return ST->hasSVE();
  }

  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                             std::optional<FastMathFlags> FMF,
                                             TTI::TargetCostKind CostKind);

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt);

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) const;
  /// @}

  bool enableSelectOptimize() { return ST->enableSelectOptimize(); }

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const {
    // We can vectorize a v4i8 store, so use a minimum VF of 4 for stores of
    // i8 elements.
    if (ScalarMemTy->isIntegerTy(8) && isPowerOf2_32(VF) && VF >= 4)
      return 4;

    return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
  }
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H