//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file describes a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
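///
/// These implementations are not used directly; the AMDGPU target machines
/// hand them to the TargetTransformInfo wrapper. A minimal sketch of that
/// wiring (the actual definition lives in AMDGPUTargetMachine.cpp):
/// \code
///   TargetTransformInfo
///   GCNTargetMachine::getTargetTransformInfo(const Function &F) {
///     return TargetTransformInfo(GCNTTIImpl(this, F));
///   }
/// \endcode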
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

namespace llvm {

class AMDGPUTargetLowering;
class Loop;
class ScalarEvolution;
class Type;
class Value;

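// Shared TTI implementation: it carries only the loop unrolling and peeling
// preferences common to the GCN and R600 paths, and is embedded as CommonTTI
// in the subtarget-specific implementations below.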
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const GCNSubtarget *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        TargetTriple(TM->getTargetTriple()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
};

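// TTI implementation for the GCN (Southern Islands and later) subtargets.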
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphicsShader;
  bool HasFP32Denormals;
  unsigned MaxVGPRs;

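  // Subtarget features that are ignored when checking whether a caller and
  // callee have compatible feature sets for inlining (see areInlineCompatible
  // below).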
  const FeatureBitset InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt,
    AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
    AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca,
    AMDGPU::FeatureUnalignedBufferAccess,
    AMDGPU::FeatureUnalignedScratchAccess,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Properties of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug,
    AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,
    AMDGPU::FeatureCodeObjectV3,

    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features.
    AMDGPU::FeatureFastFMAF32,
    AMDGPU::HalfRate64Ops
  };

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

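  // Map the relative issue rate of an instruction class onto abstract TTI
  // cost units (multiples of TCC_Basic): full-rate ops cost 1, half-rate ops
  // cost 2, and quarter-rate ops are charged 3 (see the TODO below).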
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost() {
    return 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost() {
    return 3 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  inline int get64BitInstrCost() const {
    return ST->hasHalfRate64Ops() ?
      getHalfRateInstrCost() : getQuarterRateInstrCost();
  }

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F),
      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
      HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
      MaxVGPRs(ST->getMaxNumVGPRs(
          std::max(ST->getWavesPerEU(F).first,
                   ST->getWavesPerEUForWorkGroup(
                       ST->getFlatWorkGroupSizes(F).second)))) {}

  bool hasBranchDivergence() { return true; }
  bool useGPUDivergenceAnalysis() const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(unsigned RCID) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
                                  unsigned SrcAlign, unsigned DestAlign) const;

  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
                                         LLVMContext &Context,
                                         unsigned RemainingBytes,
                                         unsigned SrcAddrSpace,
                                         unsigned DestAddrSpace,
                                         unsigned SrcAlign,
                                         unsigned DestAlign) const;
  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

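  // Address space to treat as generic (flat); returning -1 tells
  // InferAddressSpaces there is nothing to do for this function.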
  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphicsShader)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

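  // Function calls are relatively expensive on AMDGPU, so bias the inliner
  // strongly towards inlining.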
  unsigned getInliningThresholdMultiplier() { return 11; }

  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(
      unsigned Opcode,
      VectorType *Ty,
      bool IsPairwise,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);

  int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                            TTI::TargetCostKind CostKind);
  int getMinMaxReductionCost(
    VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};

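// TTI implementation for the pre-GCN R600 family of subtargets.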
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H