1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// This file declares a TargetTransformInfo::Concept conforming object specific
11 /// AMDGPU target machine. It uses the target's detailed information to
12 /// provide more precise answers to certain TTI queries, while letting the
13 /// target independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19 
20 #include "AMDGPU.h"
21 #include "llvm/CodeGen/BasicTTIImpl.h"
22 #include <optional>
23 
24 namespace llvm {
25 
26 class AMDGPUTargetMachine;
27 class GCNSubtarget;
28 class InstCombiner;
29 class Loop;
30 class ScalarEvolution;
31 class SITargetLowering;
32 class Type;
33 class Value;
34 
/// Common TTI implementation shared across AMDGPU subtargets. It only
/// customizes loop unrolling and peeling preferences; every other query falls
/// through to the BasicTTIImplBase defaults.
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  // BasicTTIImplBase uses CRTP and calls back into getST()/getTLI().
  friend BaseT;

  Triple TargetTriple;

  // Generic (base-class) subtarget/lowering views; the GCN-specific class
  // below keeps the concrete GCNSubtarget/SITargetLowering instead.
  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  /// Tune loop-unrolling heuristics for \p L (defined in the .cpp).
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  /// Tune loop-peeling heuristics for \p L (defined in the .cpp).
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
};
59 
/// TTI implementation for GCN subtargets. Refines cost-model, divergence,
/// vectorization-legality, and inlining queries using GCNSubtarget and
/// SITargetLowering details; unhandled queries fall through to
/// BasicTTIImplBase.
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  // BasicTTIImplBase uses CRTP and calls back into getST()/getTLI().
  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  // Shared AMDGPU TTI — presumably delegated to for the unrolling/peeling
  // preferences below; confirm against the .cpp.
  AMDGPUTTIImpl CommonTTI;
  // True for graphics shaders; used by getFlatAddressSpace() to skip flat
  // addressing.
  bool IsGraphics;
  // Cached denormal-mode flags for the function being queried — presumably
  // initialized from the function's FP mode in the constructor; confirm in
  // the .cpp.
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;

  // NOTE(review): by its name, a set of subtarget features ignored when
  // checking caller/callee compatibility in areInlineCompatible() — the
  // definition lives in the .cpp; confirm there.
  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  // Cost helpers expressed relative to a full-rate instruction: half-rate
  // ops cost 2x and quarter-rate ops 4x the basic cost for throughput-style
  // cost kinds; for TCK_CodeSize both report a flat 2.
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  /// GCN targets always have divergent branches.
  bool hasBranchDivergence() { return true; }
  bool useGPUDivergenceAnalysis() const;

  /// Tune loop-unrolling heuristics for \p L (defined in the .cpp).
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  /// Tune loop-peeling heuristics for \p L (defined in the .cpp).
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  /// Population count is reported as fast in hardware for any power-of-two
  /// type width.
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  // Register-file and vectorization-factor queries (defined in the .cpp).
  unsigned getNumberOfRegisters(unsigned RCID) const;
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  // Load/store-vectorizer legality queries, per address space.
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  // Element types used when lowering memcpy/memmove to load/store loops.
  Type *getMemcpyLoopLoweringType(
      LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
      std::optional<uint32_t> AtomicElementSize) const;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      unsigned SrcAlign, unsigned DestAlign,
      std::optional<uint32_t> AtomicCpySize) const;
  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  // Instruction-cost queries (defined in the .cpp).
  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);

  // Divergence/uniformity analysis queries.
  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  /// Returns AMDGPUAS::FLAT_ADDRESS, or -1 (no flat address space) for
  /// graphics shaders so InferAddressSpaces is skipped for them.
  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  /// Non-undef global initializers are allowed everywhere except the LDS
  /// (local), region, and private address spaces.
  bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  // InstCombine hooks for AMDGPU intrinsics (defined in the .cpp).
  bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                 InstCombiner &IC) const;
  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  /// Vector splits are considered free.
  InstructionCost getVectorSplitCost() { return 0; }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt);

  // Inlining heuristics.
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  /// Scale the generic inline threshold up 11x for this target.
  unsigned getInliningThresholdMultiplier() { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const;

  /// No extra inlining bonus for vector code on this target.
  int getInlinerVectorBonusPercent() { return 0; }

  // Reduction and intrinsic cost queries (defined in the .cpp).
  InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);
  InstructionCost getMinMaxReductionCost(
      VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
      TTI::TargetCostKind CostKind);
};
227 
228 } // end namespace llvm
229 
230 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
231