1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIModeRegisterDefaults.h"
21 #include "llvm/Analysis/InlineCost.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/PatternMatch.h"
28 #include "llvm/Support/KnownBits.h"
29 #include <optional>
30
31 using namespace llvm;
32
33 #define DEBUG_TYPE "AMDGPUtti"
34
// Command-line knobs for the AMDGPU unrolling and inlining heuristics
// implemented below. All are hidden developer options.

// Boosted unroll threshold applied when a loop addresses private (scratch)
// memory, to give SROA a chance to eliminate allocas.
static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

// Boosted unroll threshold applied when a loop addresses local (LDS) memory,
// to let DS instructions with different offsets combine.
static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

// Additive bonus per "if" whose condition depends on a PHI of the loop.
static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(200), cl::Hidden);

// Whether runtime unrolling may be enabled for loops that use LDS.
static cl::opt<bool> UnrollRuntimeLocal(
  "amdgpu-unroll-runtime-local",
  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
  cl::init(true), cl::Hidden);

// Cap on inner-loop block size considered when estimating unroll cost.
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));
77
dependsOnLocalPhi(const Loop * L,const Value * Cond,unsigned Depth=0)78 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79 unsigned Depth = 0) {
80 const Instruction *I = dyn_cast<Instruction>(Cond);
81 if (!I)
82 return false;
83
84 for (const Value *V : I->operand_values()) {
85 if (!L->contains(I))
86 continue;
87 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89 return SubLoop->contains(PHI); }))
90 return true;
91 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92 return true;
93 }
94 return false;
95 }
96
// Common (target-family-wide) TTI implementation. Caches the subtarget and
// its target lowering for \p F so later queries avoid repeated lookups.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
102
// Tune loop-unrolling preferences for AMDGPU.
//
// The base threshold comes from the "amdgpu-unroll-threshold" function
// attribute (default 300) and is boosted, up to max(ThresholdPrivate,
// ThresholdLocal), when the loop contains:
//   - conditional branches whose condition depends on a PHI of this loop
//     (helps eliminate divergent if-regions), or
//   - GEPs into private (scratch) or local (LDS) memory that depend on a
//     value defined in this loop (helps SROA / DS-offset combining).
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        // The metadata value also caps the memory-based boosts below.
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  // Upper bound for any boost applied below; once reached we stop scanning.
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip branches that lead straight out of the loop.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only boost for small static allocas that could plausibly be
        // promoted to registers after unrolling.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
264
// AMDGPU adds no peeling heuristics of its own; defer entirely to the
// target-independent defaults.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
269
// Largest memory-intrinsic (memcpy/memmove/memset) size, in bytes, that
// should be expanded inline rather than left as a call/loop.
int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
273
// Subtarget features that are allowed to differ between caller and callee
// without blocking inlining; anything not listed must match.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
// GCN-specific TTI implementation. Besides caching the subtarget/lowering,
// this records the function's denormal modes, which several FP cost queries
// (fdiv, fused fmul+fadd) depend on.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  SIModeRegisterDefaults Mode(F, *ST);
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
304
hasBranchDivergence(const Function * F) const305 bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306 return !F || !ST->isSingleLaneExecution(*F);
307 }
308
// Number of "registers" reported to the vectorizers for register-class
// \p RCID.
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
  // registers. See getRegisterClassForType for the implementation.
  // In this case vector registers are not vector in terms of
  // VGPRs, but those which can hold multiple values.

  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return 4;
}
319
320 TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const321 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322 switch (K) {
323 case TargetTransformInfo::RGK_Scalar:
324 return TypeSize::getFixed(32);
325 case TargetTransformInfo::RGK_FixedWidthVector:
326 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
327 case TargetTransformInfo::RGK_ScalableVector:
328 return TypeSize::getScalable(0);
329 }
330 llvm_unreachable("Unsupported register kind");
331 }
332
// Smallest vector register width considered by the vectorizers: one dword.
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}
336
getMaximumVF(unsigned ElemWidth,unsigned Opcode) const337 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339 return 32 * 4 / ElemWidth;
340 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342 : 1;
343 }
344
getLoadVectorFactor(unsigned VF,unsigned LoadSize,unsigned ChainSizeInBytes,VectorType * VecTy) const345 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346 unsigned ChainSizeInBytes,
347 VectorType *VecTy) const {
348 unsigned VecRegBitWidth = VF * LoadSize;
349 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350 // TODO: Support element-size less than 32bit?
351 return 128 / LoadSize;
352
353 return VF;
354 }
355
getStoreVectorFactor(unsigned VF,unsigned StoreSize,unsigned ChainSizeInBytes,VectorType * VecTy) const356 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357 unsigned ChainSizeInBytes,
358 VectorType *VecTy) const {
359 unsigned VecRegBitWidth = VF * StoreSize;
360 if (VecRegBitWidth > 128)
361 return 128 / StoreSize;
362
363 return VF;
364 }
365
getLoadStoreVecRegBitWidth(unsigned AddrSpace) const366 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
372 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
373 return 512;
374 }
375
376 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
377 return 8 * ST->getMaxPrivateElementSize();
378
379 // Common to flat, global, local and region. Assume for unknown addrspace.
380 return 128;
381 }
382
isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,Align Alignment,unsigned AddrSpace) const383 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
384 Align Alignment,
385 unsigned AddrSpace) const {
386 // We allow vectorization of flat stores, even though we may need to decompose
387 // them later if they may access private memory. We don't have enough context
388 // here, and legalization can handle it.
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
390 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
391 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
392 }
393 return true;
394 }
395
// Load chains follow the same legality rule as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
401
// Store chains follow the same legality rule as generic memory chains.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
407
// Largest memory-intrinsic size, in bytes, to expand inline (same value as
// the common AMDGPU implementation).
int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
  return 1024;
}
411
// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
//
// Choose the per-iteration element type used when lowering a memcpy/memmove
// to a load/store loop, based on alignment and the address spaces involved.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  // Atomic element-wise copies must use exactly the requested element size.
  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}
449
getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type * > & OpsOut,LLVMContext & Context,unsigned RemainingBytes,unsigned SrcAddrSpace,unsigned DestAddrSpace,unsigned SrcAlign,unsigned DestAlign,std::optional<uint32_t> AtomicCpySize) const450 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
451 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
452 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
453 unsigned SrcAlign, unsigned DestAlign,
454 std::optional<uint32_t> AtomicCpySize) const {
455 assert(RemainingBytes < 16);
456
457 if (AtomicCpySize)
458 BaseT::getMemcpyLoopResidualLoweringType(
459 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
460 DestAlign, AtomicCpySize);
461
462 unsigned MinAlign = std::min(SrcAlign, DestAlign);
463
464 if (MinAlign != 2) {
465 Type *I64Ty = Type::getInt64Ty(Context);
466 while (RemainingBytes >= 8) {
467 OpsOut.push_back(I64Ty);
468 RemainingBytes -= 8;
469 }
470
471 Type *I32Ty = Type::getInt32Ty(Context);
472 while (RemainingBytes >= 4) {
473 OpsOut.push_back(I32Ty);
474 RemainingBytes -= 4;
475 }
476 }
477
478 Type *I16Ty = Type::getInt16Ty(Context);
479 while (RemainingBytes >= 2) {
480 OpsOut.push_back(I16Ty);
481 RemainingBytes -= 2;
482 }
483
484 Type *I8Ty = Type::getInt8Ty(Context);
485 while (RemainingBytes) {
486 OpsOut.push_back(I8Ty);
487 --RemainingBytes;
488 }
489 }
490
getMaxInterleaveFactor(ElementCount VF)491 unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
492 // Disable unrolling if the loop is not vectorized.
493 // TODO: Enable this again.
494 if (VF.isScalar())
495 return 1;
496
497 return 8;
498 }
499
// Describe AMDGPU DS (LDS) atomic intrinsics as target memory intrinsics so
// generic analyses know they read and write memory through their pointer
// operand. Returns false for unhandled or malformed intrinsics.
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    // Operand 2 is the atomic ordering and operand 4 the volatile flag; both
    // must be immediate constants.
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    // Reject orderings outside the AtomicOrdering enum range.
    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}
528
// Cost model for scalar/vector arithmetic on GCN. Costs are expressed in
// units of the subtarget's instruction rates (full/half/quarter/64-bit) and
// scaled by the type-legalization split factor (LT.first) and the number of
// elements remaining after legalization (NElts).
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Packed 16-bit ops process two elements per instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      // 64-bit multiply lowers to 4 quarter-rate multiplies plus 4 full-rate
      // add/carry instructions.
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed FP32 ops (where available) also process two elements at once.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Reciprocal (1.0 / x) can use a cheap rcp-based lowering.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
                            TLI->getTargetMachine().Options.UnsafeFPMath)) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
687
// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
    return true;
  default:
    return false;
  }
}
704
705 InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)706 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
707 TTI::TargetCostKind CostKind) {
708 if (ICA.getID() == Intrinsic::fabs)
709 return 0;
710
711 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
712 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
713
714 Type *RetTy = ICA.getReturnType();
715
716 // Legalize the type.
717 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
718
719 unsigned NElts = LT.second.isVector() ?
720 LT.second.getVectorNumElements() : 1;
721
722 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
723
724 if (SLT == MVT::f64)
725 return LT.first * NElts * get64BitInstrCost(CostKind);
726
727 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
728 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
729 NElts = (NElts + 1) / 2;
730
731 // TODO: Get more refined intrinsic costs?
732 unsigned InstRate = getQuarterRateInstrCost(CostKind);
733
734 switch (ICA.getID()) {
735 case Intrinsic::fma:
736 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
737 : getQuarterRateInstrCost(CostKind);
738 break;
739 case Intrinsic::uadd_sat:
740 case Intrinsic::usub_sat:
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
744 if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; }))
745 NElts = 1;
746 break;
747 }
748
749 return LT.first * NElts * InstRate;
750 }
751
// Cost model for control-flow instructions. Size/latency cost kinds (SCost)
// use smaller values than throughput kinds; conditional branches include the
// estimated exec-mask manipulation overhead.
InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::Br: {
    // Branch instruction takes about 4 slots on gfx900.
    auto BI = dyn_cast_or_null<BranchInst>(I);
    if (BI && BI->isUnconditional())
      return SCost ? 1 : 4;
    // Suppose conditional branch takes additional 3 exec manipulations
    // instructions in average.
    return CBrCost;
  }
  case Instruction::Switch: {
    auto SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions in
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
781
// Cost of an unordered arithmetic vector reduction. Only 16-bit element
// types on subtargets with packed-math (VOP3P) get the cheap target cost;
// everything else (and all ordered reductions) defers to the generic model.
InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}
799
// Cost of a min/max vector reduction; same packed-math (16-bit, VOP3P)
// fast path as arithmetic reductions, otherwise the generic model.
InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}
814
// Cost of extractelement/insertelement. On GCN, dword-or-wider element
// accesses at constant indices are subregister operations and therefore
// free; dynamic indices require indirect addressing and cost extra.
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      // A 16-bit element at index 0 is free with 16-bit instructions; other
      // sub-dword cases use the generic (scalarization) cost.
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}
842
843 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
844 /// this is analyzing the collective result of all output registers. Otherwise,
845 /// this is only querying a specific result index if this returns multiple
846 /// registers in a struct.
isInlineAsmSourceOfDivergence(const CallInst * CI,ArrayRef<unsigned> Indices) const847 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
848 const CallInst *CI, ArrayRef<unsigned> Indices) const {
849 // TODO: Handle complex extract indices
850 if (Indices.size() > 1)
851 return true;
852
853 const DataLayout &DL = CI->getModule()->getDataLayout();
854 const SIRegisterInfo *TRI = ST->getRegisterInfo();
855 TargetLowering::AsmOperandInfoVector TargetConstraints =
856 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
857
858 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
859
860 int OutputIdx = 0;
861 for (auto &TC : TargetConstraints) {
862 if (TC.Type != InlineAsm::isOutput)
863 continue;
864
865 // Skip outputs we don't care about.
866 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
867 continue;
868
869 TLI->ComputeConstraintToUse(TC, SDValue());
870
871 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
872 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
873
874 // For AGPR constraints null is returned on subtargets without AGPRs, so
875 // assume divergent for null.
876 if (!RC || !TRI->isSGPRClass(RC))
877 return true;
878 }
879
880 return false;
881 }
882
isReadRegisterSourceOfDivergence(const IntrinsicInst * ReadReg) const883 bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
884 const IntrinsicInst *ReadReg) const {
885 Metadata *MD =
886 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
887 StringRef RegName =
888 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
889
890 // Special case registers that look like VCC.
891 MVT VT = MVT::getVT(ReadReg->getType());
892 if (VT == MVT::i1)
893 return true;
894
895 // Special case scalar registers that start with 'v'.
896 if (RegName.starts_with("vcc") || RegName.empty())
897 return false;
898
899 // VGPR or AGPR is divergent. There aren't any specially named vector
900 // registers.
901 return RegName[0] == 'v' || RegName[0] == 'a';
902 }
903
904 /// \returns true if the result of the value could potentially be
905 /// different across workitems in a wavefront.
isSourceOfDivergence(const Value * V) const906 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
907 if (const Argument *A = dyn_cast<Argument>(V))
908 return !AMDGPU::isArgPassedInSGPR(A);
909
910 // Loads from the private and flat address spaces are divergent, because
911 // threads can execute the load instruction with the same inputs and get
912 // different results.
913 //
914 // All other loads are not divergent, because if threads issue loads with the
915 // same arguments, they will always get the same result.
916 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
917 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
918 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
919
920 // Atomics are divergent because they are executed sequentially: when an
921 // atomic operation refers to the same address in each thread, then each
922 // thread after the first sees the value written by the previous thread as
923 // original value.
924 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
925 return true;
926
927 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
928 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
929 return isReadRegisterSourceOfDivergence(Intrinsic);
930
931 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
932 }
933
934 // Assume all function calls are a source of divergence.
935 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
936 if (CI->isInlineAsm())
937 return isInlineAsmSourceOfDivergence(CI);
938 return true;
939 }
940
941 // Assume all function calls are a source of divergence.
942 if (isa<InvokeInst>(V))
943 return true;
944
945 return false;
946 }
947
/// \returns true if \p V is known to produce the same value in every lane of
/// a wavefront, overriding the default divergence analysis.
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  // Some intrinsics (e.g. readfirstlane-like operations) are uniform by
  // definition; defer to the target's table.
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    // Inline asm whose outputs all land in SGPR classes is uniform; any other
    // call is conservatively not.
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  // In most cases TID / wavefrontsize is uniform.
  //
  // However, if a kernel has uneven dimensions we can have a value of
  // workitem-id-x divided by the wavefrontsize non-uniform. For example
  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
  // packed into a same wave which gives 1 and 0 after the division by 64
  // respectively.
  //
  // FIXME: limit it to 1D kernels only, although that shall be possible
  // to perform this optimization if the size of the X dimension is a power
  // of 2, we just do not currently have infrastructure to query it.
  using namespace llvm::PatternMatch;
  uint64_t C;
  // workitem.id.x >> C is uniform when C covers the whole wavefront and the
  // kernel is effectively 1D (max Y and Z workitem IDs are 0).
  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C))) ||
      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                      m_ConstantInt(C)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    return C >= ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  Value *Mask;
  // Similarly, workitem.id.x & Mask is uniform when the mask is known to
  // clear at least the low log2(wavefront-size) bits in a 1D kernel.
  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_Value(Mask)))) {
    const Function *F = cast<Instruction>(V)->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      // Result element 1 of amdgcn.if/else is the exec-mask value, which is
      // wave-uniform by construction.
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}
1018
collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID) const1019 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1020 Intrinsic::ID IID) const {
1021 switch (IID) {
1022 case Intrinsic::amdgcn_ds_fadd:
1023 case Intrinsic::amdgcn_ds_fmin:
1024 case Intrinsic::amdgcn_ds_fmax:
1025 case Intrinsic::amdgcn_is_shared:
1026 case Intrinsic::amdgcn_is_private:
1027 case Intrinsic::amdgcn_flat_atomic_fadd:
1028 case Intrinsic::amdgcn_flat_atomic_fmax:
1029 case Intrinsic::amdgcn_flat_atomic_fmin:
1030 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1031 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1032 OpIndexes.push_back(0);
1033 return true;
1034 default:
1035 return false;
1036 }
1037 }
1038
/// Rewrite intrinsic call \p II after its pointer operand \p OldV has been
/// inferred to be \p NewV in a more specific address space. Returns the
/// replacement value (possibly \p II mutated in place), or nullptr if the
/// call cannot be rewritten.
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    // Operand 4 is the volatile flag; do not rewrite volatile operations.
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    // Re-declare the intrinsic for the new pointer type and mutate the call
    // in place.
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // Once the address space is known, the query folds to a constant bool.
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    // Whether the 64-bit mask must be narrowed to match a 32-bit pointer.
    bool DoTruncate = false;

    const GCNTargetMachine &TM =
        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the new
      // address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      // Require the high 32 mask bits to be known-ones so dropping them is
      // a no-op.
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    // Build a fresh ptrmask over the new pointer/mask types.
    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    unsigned NewAS = SrcTy->getPointerAddressSpace();
    // Only global-like address spaces are valid targets for these atomics.
    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
      return nullptr;
    Module *M = II->getModule();
    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
                                                  {DestTy, SrcTy, DestTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  default:
    return nullptr;
  }
}
1125
getShuffleCost(TTI::ShuffleKind Kind,VectorType * VT,ArrayRef<int> Mask,TTI::TargetCostKind CostKind,int Index,VectorType * SubTp,ArrayRef<const Value * > Args)1126 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1127 VectorType *VT, ArrayRef<int> Mask,
1128 TTI::TargetCostKind CostKind,
1129 int Index, VectorType *SubTp,
1130 ArrayRef<const Value *> Args) {
1131 Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1132
1133 if (ST->hasVOP3PInsts()) {
1134 if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1135 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1136 // With op_sel VOP3P instructions freely can access the low half or high
1137 // half of a register, so any swizzle is free.
1138
1139 switch (Kind) {
1140 case TTI::SK_Broadcast:
1141 case TTI::SK_Reverse:
1142 case TTI::SK_PermuteSingleSrc:
1143 return 0;
1144 default:
1145 break;
1146 }
1147 }
1148 }
1149
1150 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1151 }
1152
/// Inlining is allowed when the callee's required subtarget features (modulo
/// the ignore list) are a subset of the caller's, the FP mode-register
/// settings are compatible, and (optionally) the resulting basic-block count
/// stays under InlineMaxBB.
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  // Every non-ignored feature the callee needs must be present in the caller.
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  if (!CallerMode.isInlineCompatible(CalleeMode))
    return false;

  // Always honor explicit inlining hints on the callee.
  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))
    return true;

  // Hack to make compile times reasonable.
  if (InlineMaxBB) {
    // Single BB does not increase total BB amount.
    if (Callee->size() == 1)
      return true;
    // Cap the caller's post-inlining basic-block count.
    size_t BBSize = Caller->size() + Callee->size() - 1;
    return BBSize <= InlineMaxBB;
  }

  return true;
}
1191
/// Compute an inlining-threshold bonus for calls whose arguments would spill
/// argument registers: passing arguments through the stack costs extra
/// instructions in both caller and callee that inlining avoids.
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {
  // Number of argument registers available before arguments spill to the
  // stack. NOTE(review): these look like calling-convention constants —
  // confirm against the AMDGPU calling convention before changing.
  const int NrOfSGPRUntilSpill = 26;
  const int NrOfVGPRUntilSpill = 32;

  const DataLayout &DL = TTIImpl->getDataLayout();

  unsigned adjustThreshold = 0;
  int SGPRsInUse = 0;
  int VGPRsInUse = 0;
  // Count how many SGPRs/VGPRs the call's arguments occupy, accounting for
  // type legalization (one IR argument may need several registers).
  for (const Use &A : CB->args()) {
    SmallVector<EVT, 4> ValueVTs;
    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
    for (auto ArgVT : ValueVTs) {
      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
          CB->getContext(), CB->getCallingConv(), ArgVT);
      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
        SGPRsInUse += CCRegNum;
      else
        VGPRsInUse += CCRegNum;
    }
  }

  // The cost of passing function arguments through the stack:
  // 1 instruction to put a function argument on the stack in the caller.
  // 1 instruction to take a function argument from the stack in callee.
  // 1 instruction is explicitly take care of data dependencies in callee
  // function.
  InstructionCost ArgStackCost(1);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  // The penalty cost is computed relative to the cost of instructions and does
  // not model any storage costs.
  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
  return adjustThreshold;
}
1237
getCallArgsTotalAllocaSize(const CallBase * CB,const DataLayout & DL)1238 static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1239 const DataLayout &DL) {
1240 // If we have a pointer to a private array passed into a function
1241 // it will not be optimized out, leaving scratch usage.
1242 // This function calculates the total size in bytes of the memory that would
1243 // end in scratch if the call was not inlined.
1244 unsigned AllocaSize = 0;
1245 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1246 for (Value *PtrArg : CB->args()) {
1247 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1248 if (!Ty)
1249 continue;
1250
1251 unsigned AddrSpace = Ty->getAddressSpace();
1252 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1253 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1254 continue;
1255
1256 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1257 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1258 continue;
1259
1260 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1261 }
1262 return AllocaSize;
1263 }
1264
adjustInliningThreshold(const CallBase * CB) const1265 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1266 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1267
1268 // Private object passed as arguments may end up in scratch usage if the call
1269 // is not inlined. Increase the inline threshold to promote inlining.
1270 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1271 if (AllocaSize > 0)
1272 Threshold += ArgAllocaCost;
1273 return Threshold;
1274 }
1275
/// Per-alloca cost charged by the inliner cost model for a private alloca
/// argument of \p CB, sized so that the sum over all allocas cancels the
/// ArgAllocaCost bonus granted by adjustInliningThreshold.
unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {

  // Below the cutoff, assume that the private memory objects would be
  // optimized
  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
  if (AllocaSize <= ArgAllocaCutoff)
    return 0;

  // Above the cutoff, we give a cost to each private memory object
  // depending its size. If the array can be optimized by SROA this cost is not
  // added to the total-cost in the inliner cost analysis.
  //
  // We choose the total cost of the alloca such that their sum cancels the
  // bonus given in the threshold (ArgAllocaCost).
  //
  // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
  //
  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
  // the single-bb bonus and the vector-bonus.
  //
  // We compensate the first two multipliers, by repeating logic from the
  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();

  // Mirror the inliner's single-basic-block bonus (callee has no branching
  // terminators).
  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
    return BB.getTerminator()->getNumSuccessors() > 1;
  });
  if (SingleBB) {
    Threshold += Threshold / 2;
  }

  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());

  // Attribute the bonus proportionally to the alloca size
  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

  return AllocaThresholdBonus;
}
1316
// Delegate unrolling preferences to the AMDGPU-common implementation.
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}
1322
// Delegate peeling preferences to the AMDGPU-common implementation.
void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}
1327
get64BitInstrCost(TTI::TargetCostKind CostKind) const1328 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1329 return ST->hasFullRate64Ops()
1330 ? getFullRateInstrCost()
1331 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1332 : getQuarterRateInstrCost(CostKind);
1333 }
1334
1335 std::pair<InstructionCost, MVT>
getTypeLegalizationCost(Type * Ty) const1336 GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1337 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1338 auto Size = DL.getTypeSizeInBits(Ty);
1339 // Maximum load or store can handle 8 dwords for scalar and 4 for
1340 // vector ALU. Let's assume anything above 8 dwords is expensive
1341 // even if legal.
1342 if (Size <= 256)
1343 return Cost;
1344
1345 Cost.first += (Size + 255) / 256;
1346 return Cost;
1347 }
1348
getPrefetchDistance() const1349 unsigned GCNTTIImpl::getPrefetchDistance() const {
1350 return ST->hasPrefetch() ? 128 : 0;
1351 }
1352
// Only prefetch from flat/global address spaces.
bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
  return AMDGPU::isFlatGlobalAddrSpace(AS);
}
1356