1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file a TargetTransformInfo::Concept conforming object specific to the
11 /// AMDGPU target machine. It uses the target's detailed information to
12 /// provide more precise answers to certain TTI queries, while letting the
13 /// target independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19 
20 #include "AMDGPU.h"
21 #include "llvm/CodeGen/BasicTTIImpl.h"
22 
23 namespace llvm {
24 
25 class AMDGPUTargetMachine;
26 class GCNSubtarget;
27 class InstCombiner;
28 class Loop;
29 class ScalarEvolution;
30 class SITargetLowering;
31 class Type;
32 class Value;
33 
34 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
35   using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
36   using TTI = TargetTransformInfo;
37 
38   friend BaseT;
39 
40   Triple TargetTriple;
41 
42   const TargetSubtargetInfo *ST;
43   const TargetLoweringBase *TLI;
44 
45   const TargetSubtargetInfo *getST() const { return ST; }
46   const TargetLoweringBase *getTLI() const { return TLI; }
47 
48 public:
49   explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
50 
51   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
52                                TTI::UnrollingPreferences &UP,
53                                OptimizationRemarkEmitter *ORE);
54 
55   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
56                              TTI::PeelingPreferences &PP);
57 };
58 
59 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
60   using BaseT = BasicTTIImplBase<GCNTTIImpl>;
61   using TTI = TargetTransformInfo;
62 
63   friend BaseT;
64 
65   const GCNSubtarget *ST;
66   const SITargetLowering *TLI;
67   AMDGPUTTIImpl CommonTTI;
68   bool IsGraphics;
69   bool HasFP32Denormals;
70   bool HasFP64FP16Denormals;
71 
72   static const FeatureBitset InlineFeatureIgnoreList;
73 
74   const GCNSubtarget *getST() const { return ST; }
75   const SITargetLowering *getTLI() const { return TLI; }
76 
77   static inline int getFullRateInstrCost() {
78     return TargetTransformInfo::TCC_Basic;
79   }
80 
81   static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
82     return CostKind == TTI::TCK_CodeSize ? 2
83                                          : 2 * TargetTransformInfo::TCC_Basic;
84   }
85 
86   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
87   // should be 2 or 4.
88   static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
89     return CostKind == TTI::TCK_CodeSize ? 2
90                                          : 4 * TargetTransformInfo::TCC_Basic;
91   }
92 
93   // On some parts, normal fp64 operations are half rate, and others
94   // quarter. This also applies to some integer operations.
95   int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
96 
97 public:
98   explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
99 
100   bool hasBranchDivergence() { return true; }
101   bool useGPUDivergenceAnalysis() const;
102 
103   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104                                TTI::UnrollingPreferences &UP,
105                                OptimizationRemarkEmitter *ORE);
106 
107   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
108                              TTI::PeelingPreferences &PP);
109 
110   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
111     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
112     return TTI::PSK_FastHardware;
113   }
114 
115   unsigned getNumberOfRegisters(unsigned RCID) const;
116   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
117   unsigned getMinVectorRegisterBitWidth() const;
118   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
119   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
120                                unsigned ChainSizeInBytes,
121                                VectorType *VecTy) const;
122   unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
123                                 unsigned ChainSizeInBytes,
124                                 VectorType *VecTy) const;
125   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
126 
127   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
128                                   unsigned AddrSpace) const;
129   bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
130                                    unsigned AddrSpace) const;
131   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
132                                     unsigned AddrSpace) const;
133   Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
134                                   unsigned SrcAddrSpace, unsigned DestAddrSpace,
135                                   unsigned SrcAlign, unsigned DestAlign,
136                                   Optional<uint32_t> AtomicElementSize) const;
137 
138   void getMemcpyLoopResidualLoweringType(
139       SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
140       unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
141       unsigned SrcAlign, unsigned DestAlign,
142       Optional<uint32_t> AtomicCpySize) const;
143   unsigned getMaxInterleaveFactor(unsigned VF);
144 
145   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
146 
147   InstructionCost getArithmeticInstrCost(
148       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
149       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
150       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
151       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
152       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
153       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
154       const Instruction *CxtI = nullptr);
155 
156   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
157                                  const Instruction *I = nullptr);
158 
159   bool isInlineAsmSourceOfDivergence(const CallInst *CI,
160                                      ArrayRef<unsigned> Indices = {}) const;
161 
162   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
163                                      unsigned Index);
164   bool isSourceOfDivergence(const Value *V) const;
165   bool isAlwaysUniform(const Value *V) const;
166 
167   unsigned getFlatAddressSpace() const {
168     // Don't bother running InferAddressSpaces pass on graphics shaders which
169     // don't use flat addressing.
170     if (IsGraphics)
171       return -1;
172     return AMDGPUAS::FLAT_ADDRESS;
173   }
174 
175   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
176                                   Intrinsic::ID IID) const;
177 
178   bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
179     return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
180            AS != AMDGPUAS::PRIVATE_ADDRESS;
181   }
182 
183   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
184                                           Value *NewV) const;
185 
186   bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
187                                  InstCombiner &IC) const;
188   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
189                                                IntrinsicInst &II) const;
190   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
191       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
192       APInt &UndefElts2, APInt &UndefElts3,
193       std::function<void(Instruction *, unsigned, APInt, APInt &)>
194           SimplifyAndSetOp) const;
195 
196   InstructionCost getVectorSplitCost() { return 0; }
197 
198   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
199                                  ArrayRef<int> Mask, int Index,
200                                  VectorType *SubTp,
201                                  ArrayRef<const Value *> Args = None);
202 
203   bool areInlineCompatible(const Function *Caller,
204                            const Function *Callee) const;
205 
206   unsigned getInliningThresholdMultiplier() { return 11; }
207   unsigned adjustInliningThreshold(const CallBase *CB) const;
208 
209   int getInlinerVectorBonusPercent() { return 0; }
210 
211   InstructionCost getArithmeticReductionCost(
212       unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
213       TTI::TargetCostKind CostKind);
214 
215   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
216                                         TTI::TargetCostKind CostKind);
217   InstructionCost getMinMaxReductionCost(
218       VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
219       TTI::TargetCostKind CostKind);
220 };
221 
222 } // end namespace llvm
223 
224 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
225