1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// This file declares a TargetTransformInfo::Concept conforming object specific
/// to the AMDGPU target machine. It uses the target's detailed information to
12 /// provide more precise answers to certain TTI queries, while letting the
13 /// target independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19 
20 #include "AMDGPU.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/CodeGen/BasicTTIImpl.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/MC/SubtargetFeature.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <cassert>
32 
33 namespace llvm {
34 
35 class AMDGPUTargetLowering;
36 class Loop;
37 class ScalarEvolution;
38 class Type;
39 class Value;
40 
41 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
42   using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
43   using TTI = TargetTransformInfo;
44 
45   friend BaseT;
46 
47   Triple TargetTriple;
48 
49   const GCNSubtarget *ST;
50   const TargetLoweringBase *TLI;
51 
52   const TargetSubtargetInfo *getST() const { return ST; }
53   const TargetLoweringBase *getTLI() const { return TLI; }
54 
55 public:
56   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
57       : BaseT(TM, F.getParent()->getDataLayout()),
58         TargetTriple(TM->getTargetTriple()),
59         ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
60         TLI(ST->getTargetLowering()) {}
61 
62   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
63                                TTI::UnrollingPreferences &UP);
64 };
65 
66 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
67   using BaseT = BasicTTIImplBase<GCNTTIImpl>;
68   using TTI = TargetTransformInfo;
69 
70   friend BaseT;
71 
72   const GCNSubtarget *ST;
73   const AMDGPUTargetLowering *TLI;
74   AMDGPUTTIImpl CommonTTI;
75   bool IsGraphicsShader;
76   bool HasFP32Denormals;
77 
78   const FeatureBitset InlineFeatureIgnoreList = {
79     // Codegen control options which don't matter.
80     AMDGPU::FeatureEnableLoadStoreOpt,
81     AMDGPU::FeatureEnableSIScheduler,
82     AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
83     AMDGPU::FeatureFlatForGlobal,
84     AMDGPU::FeaturePromoteAlloca,
85     AMDGPU::FeatureUnalignedBufferAccess,
86     AMDGPU::FeatureUnalignedScratchAccess,
87 
88     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
89 
90     // Property of the kernel/environment which can't actually differ.
91     AMDGPU::FeatureSGPRInitBug,
92     AMDGPU::FeatureXNACK,
93     AMDGPU::FeatureTrapHandler,
94     AMDGPU::FeatureCodeObjectV3,
95 
96     // The default assumption needs to be ecc is enabled, but no directly
97     // exposed operations depend on it, so it can be safely inlined.
98     AMDGPU::FeatureSRAMECC,
99 
100     // Perf-tuning features
101     AMDGPU::FeatureFastFMAF32,
102     AMDGPU::HalfRate64Ops
103   };
104 
105   const GCNSubtarget *getST() const { return ST; }
106   const AMDGPUTargetLowering *getTLI() const { return TLI; }
107 
108   static inline int getFullRateInstrCost() {
109     return TargetTransformInfo::TCC_Basic;
110   }
111 
112   static inline int getHalfRateInstrCost() {
113     return 2 * TargetTransformInfo::TCC_Basic;
114   }
115 
116   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
117   // should be 2 or 4.
118   static inline int getQuarterRateInstrCost() {
119     return 3 * TargetTransformInfo::TCC_Basic;
120   }
121 
122    // On some parts, normal fp64 operations are half rate, and others
123    // quarter. This also applies to some integer operations.
124   inline int get64BitInstrCost() const {
125     return ST->hasHalfRate64Ops() ?
126       getHalfRateInstrCost() : getQuarterRateInstrCost();
127   }
128 
129 public:
130   explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
131     : BaseT(TM, F.getParent()->getDataLayout()),
132       ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
133       TLI(ST->getTargetLowering()),
134       CommonTTI(TM, F),
135       IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
136       HasFP32Denormals(ST->hasFP32Denormals(F)) { }
137 
138   bool hasBranchDivergence() { return true; }
139 
140   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
141                                TTI::UnrollingPreferences &UP);
142 
143   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
144     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
145     return TTI::PSK_FastHardware;
146   }
147 
148   unsigned getHardwareNumberOfRegisters(bool Vector) const;
149   unsigned getNumberOfRegisters(bool Vector) const;
150   unsigned getRegisterBitWidth(bool Vector) const;
151   unsigned getMinVectorRegisterBitWidth() const;
152   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
153                                unsigned ChainSizeInBytes,
154                                VectorType *VecTy) const;
155   unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
156                                 unsigned ChainSizeInBytes,
157                                 VectorType *VecTy) const;
158   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
159 
160   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
161                                   unsigned Alignment,
162                                   unsigned AddrSpace) const;
163   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
164                                    unsigned Alignment,
165                                    unsigned AddrSpace) const;
166   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
167                                     unsigned Alignment,
168                                     unsigned AddrSpace) const;
169 
170   unsigned getMaxInterleaveFactor(unsigned VF);
171 
172   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
173 
174   int getArithmeticInstrCost(
175       unsigned Opcode, Type *Ty,
176       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
177       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
178       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
179       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
180       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
181       const Instruction *CxtI = nullptr);
182 
183   unsigned getCFInstrCost(unsigned Opcode);
184 
185   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
186   bool isSourceOfDivergence(const Value *V) const;
187   bool isAlwaysUniform(const Value *V) const;
188 
189   unsigned getFlatAddressSpace() const {
190     // Don't bother running InferAddressSpaces pass on graphics shaders which
191     // don't use flat addressing.
192     if (IsGraphicsShader)
193       return -1;
194     return AMDGPUAS::FLAT_ADDRESS;
195   }
196 
197   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
198                                   Intrinsic::ID IID) const;
199   bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
200                                         Value *OldV, Value *NewV) const;
201 
202   unsigned getVectorSplitCost() { return 0; }
203 
204   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
205                           Type *SubTp);
206 
207   bool areInlineCompatible(const Function *Caller,
208                            const Function *Callee) const;
209 
210   unsigned getInliningThresholdMultiplier() { return 11; }
211 
212   int getInlinerVectorBonusPercent() { return 0; }
213 
214   int getArithmeticReductionCost(unsigned Opcode,
215                                  Type *Ty,
216                                  bool IsPairwise);
217   template <typename T>
218   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
219                             ArrayRef<T *> Args, FastMathFlags FMF,
220                             unsigned VF);
221   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
222                             ArrayRef<Type *> Tys, FastMathFlags FMF,
223                             unsigned ScalarizationCostPassed = UINT_MAX);
224   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
225                             ArrayRef<Value *> Args, FastMathFlags FMF,
226                             unsigned VF = 1);
227   int getMinMaxReductionCost(Type *Ty, Type *CondTy,
228                              bool IsPairwiseForm,
229                              bool IsUnsigned);
230   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
231 };
232 
233 class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
234   using BaseT = BasicTTIImplBase<R600TTIImpl>;
235   using TTI = TargetTransformInfo;
236 
237   friend BaseT;
238 
239   const R600Subtarget *ST;
240   const AMDGPUTargetLowering *TLI;
241   AMDGPUTTIImpl CommonTTI;
242 
243 public:
244   explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
245     : BaseT(TM, F.getParent()->getDataLayout()),
246       ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
247       TLI(ST->getTargetLowering()),
248       CommonTTI(TM, F)	{}
249 
250   const R600Subtarget *getST() const { return ST; }
251   const AMDGPUTargetLowering *getTLI() const { return TLI; }
252 
253   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
254                                TTI::UnrollingPreferences &UP);
255   unsigned getHardwareNumberOfRegisters(bool Vec) const;
256   unsigned getNumberOfRegisters(bool Vec) const;
257   unsigned getRegisterBitWidth(bool Vector) const;
258   unsigned getMinVectorRegisterBitWidth() const;
259   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
260   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
261                                   unsigned AddrSpace) const;
262   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
263 		                   unsigned Alignment,
264                                    unsigned AddrSpace) const;
265   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
266                                     unsigned Alignment,
267                                     unsigned AddrSpace) const;
268   unsigned getMaxInterleaveFactor(unsigned VF);
269   unsigned getCFInstrCost(unsigned Opcode);
270   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
271 };
272 
273 } // end namespace llvm
274 
275 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
276