1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/Analysis/LoopInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/IR/PatternMatch.h"
25 #include "llvm/Support/KnownBits.h"
26 #include <optional>
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "AMDGPUtti"
31 
32 static cl::opt<unsigned> UnrollThresholdPrivate(
33   "amdgpu-unroll-threshold-private",
34   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
35   cl::init(2700), cl::Hidden);
36 
37 static cl::opt<unsigned> UnrollThresholdLocal(
38   "amdgpu-unroll-threshold-local",
39   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
40   cl::init(1000), cl::Hidden);
41 
42 static cl::opt<unsigned> UnrollThresholdIf(
43   "amdgpu-unroll-threshold-if",
44   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
45   cl::init(200), cl::Hidden);
46 
47 static cl::opt<bool> UnrollRuntimeLocal(
48   "amdgpu-unroll-runtime-local",
49   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
50   cl::init(true), cl::Hidden);
51 
52 static cl::opt<bool> UseLegacyDA(
53   "amdgpu-use-legacy-divergence-analysis",
54   cl::desc("Enable legacy divergence analysis for AMDGPU"),
55   cl::init(false), cl::Hidden);
56 
57 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
58     "amdgpu-unroll-max-block-to-analyze",
59     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
60     cl::init(32), cl::Hidden);
61 
62 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
63                                        cl::Hidden, cl::init(4000),
64                                        cl::desc("Cost of alloca argument"));
65 
66 // If the amount of scratch memory to eliminate exceeds our ability to allocate
67 // it into registers we gain nothing by aggressively inlining functions for that
68 // heuristic.
69 static cl::opt<unsigned>
70     ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
71                     cl::init(256),
72                     cl::desc("Maximum alloca size to use for inline cost"));
73 
74 // Inliner constraint to achieve reasonable compilation time.
75 static cl::opt<size_t> InlineMaxBB(
76     "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
77     cl::desc("Maximum number of BBs allowed in a function after inlining"
78              " (compile time constraint)"));
79 
80 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
81                               unsigned Depth = 0) {
82   const Instruction *I = dyn_cast<Instruction>(Cond);
83   if (!I)
84     return false;
85 
86   for (const Value *V : I->operand_values()) {
87     if (!L->contains(I))
88       continue;
89     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
90       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
91                   return SubLoop->contains(PHI); }))
92         return true;
93     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
94       return true;
95   }
96   return false;
97 }
98 
/// Construct the common AMDGPU TTI implementation for function \p F.
///
/// Initializes the base class with \p TM and the data layout of the module
/// that owns \p F, and caches the target triple, the subtarget \p TM selects
/// for \p F, and that subtarget's target lowering object.
AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
104 
/// Fill in the unrolling preferences \p UP for loop \p L.
///
/// The starting threshold is read from the "amdgpu-unroll-threshold" function
/// attribute (300 is passed as the default) and may be replaced by the
/// "amdgpu.loop.unroll.threshold" loop metadata. The blocks that belong to
/// \p L directly (not to one of its sub-loops) are then scanned, and the
/// threshold is raised for:
///  - conditional branches whose condition depends on a PHI (see
///    dependsOnLocalPhi), by UnrollThresholdIf each;
///  - GEPs into private or local/region memory that use a value defined in
///    this loop, up to the private/local threshold.
/// Scanning stops as soon as the threshold reaches the largest value any of
/// these boosts could produce. \p SE and \p ORE are not used here.
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations on average.
  UP.BEInsns += 3;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        UP.PartialThreshold = UP.Threshold;
        // The metadata value also caps the private/local boosts below.
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  // Largest threshold a GEP-based boost can set. Once UP.Threshold reaches
  // it, no further boost is applied and the scan returns early.
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    // Counted per basic block: number of local/region GEPs considered so far.
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
               return SubLoop->contains(BB); }))
        continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // No bonus for a branch with an in-loop successor that is itself a
          // loop-exiting block.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      // Pick the candidate threshold from the GEP's address space; other
      // address spaces never boost the threshold.
      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      // Nothing to gain if the current threshold is already at least as high.
      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only static allocas no larger than MaxAlloca qualify.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        // Values defined in a sub-loop do not count as defined by this loop.
        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}
263 
/// Fill in the loop peeling preferences \p PP for loop \p L.
///
/// No AMDGPU-specific adjustment is made here; the query is forwarded
/// unchanged to the base implementation.
void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
268 
269 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
270     // Codegen control options which don't matter.
271     AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
272     AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
273     AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
274     AMDGPU::FeatureUnalignedAccessMode,
275 
276     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
277 
278     // Property of the kernel/environment which can't actually differ.
279     AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
280     AMDGPU::FeatureTrapHandler,
281 
282     // The default assumption needs to be ecc is enabled, but no directly
283     // exposed operations depend on it, so it can be safely inlined.
284     AMDGPU::FeatureSRAMECC,
285 
286     // Perf-tuning features
287     AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
288 
/// Construct the GCN TTI implementation for function \p F.
///
/// Besides the subtarget and its target lowering object, this builds the
/// embedded common TTI (CommonTTI), records whether the calling convention of
/// \p F is a graphics one, and caches the FP32 and FP64/FP16 denormal
/// settings derived from the function's mode-register defaults.
GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  // The denormal flags are read by the cost model (fmul fusion and fdiv
  // costs in getArithmeticInstrCost).
  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();
  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
}
298 
299 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
300   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
301   // registers. See getRegisterClassForType for the implementation.
302   // In this case vector registers are not vector in terms of
303   // VGPRs, but those which can hold multiple values.
304 
305   // This is really the number of registers to fill when vectorizing /
306   // interleaving loops, so we lie to avoid trying to use all registers.
307   return 4;
308 }
309 
310 TypeSize
311 GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
312   switch (K) {
313   case TargetTransformInfo::RGK_Scalar:
314     return TypeSize::getFixed(32);
315   case TargetTransformInfo::RGK_FixedWidthVector:
316     return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
317   case TargetTransformInfo::RGK_ScalableVector:
318     return TypeSize::getScalable(0);
319   }
320   llvm_unreachable("Unsupported register kind");
321 }
322 
323 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
324   return 32;
325 }
326 
327 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
328   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
329     return 32 * 4 / ElemWidth;
330   return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
331        : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
332        : 1;
333 }
334 
335 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
336                                          unsigned ChainSizeInBytes,
337                                          VectorType *VecTy) const {
338   unsigned VecRegBitWidth = VF * LoadSize;
339   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
340     // TODO: Support element-size less than 32bit?
341     return 128 / LoadSize;
342 
343   return VF;
344 }
345 
346 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
347                                              unsigned ChainSizeInBytes,
348                                              VectorType *VecTy) const {
349   unsigned VecRegBitWidth = VF * StoreSize;
350   if (VecRegBitWidth > 128)
351     return 128 / StoreSize;
352 
353   return VF;
354 }
355 
356 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
357   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
358       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
359       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
360       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
361     return 512;
362   }
363 
364   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
365     return 8 * ST->getMaxPrivateElementSize();
366 
367   // Common to flat, global, local and region. Assume for unknown addrspace.
368   return 128;
369 }
370 
371 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
372                                             Align Alignment,
373                                             unsigned AddrSpace) const {
374   // We allow vectorization of flat stores, even though we may need to decompose
375   // them later if they may access private memory. We don't have enough context
376   // here, and legalization can handle it.
377   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
378     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
379       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
380   }
381   return true;
382 }
383 
/// Decide whether a load chain may be vectorized. Loads have no rules of
/// their own; the answer comes from isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
389 
/// Decide whether a store chain may be vectorized. Stores have no rules of
/// their own; the answer comes from isLegalToVectorizeMemChain.
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
395 
396 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
397 // iteration. Should we report a larger size and let it legalize?
398 //
399 // FIXME: Should we use narrower types for local/region, or account for when
400 // unaligned access is legal?
401 //
402 // FIXME: This could use fine tuning and microbenchmarks.
403 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
404     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
405     unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
406     std::optional<uint32_t> AtomicElementSize) const {
407 
408   if (AtomicElementSize)
409     return Type::getIntNTy(Context, *AtomicElementSize * 8);
410 
411   unsigned MinAlign = std::min(SrcAlign, DestAlign);
412 
413   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
414   // hardware into byte accesses. If you assume all alignments are equally
415   // probable, it's more efficient on average to use short accesses for this
416   // case.
417   if (MinAlign == 2)
418     return Type::getInt16Ty(Context);
419 
420   // Not all subtargets have 128-bit DS instructions, and we currently don't
421   // form them by default.
422   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
423       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
424       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
425       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
426     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
427   }
428 
429   // Global memory works best with 16-byte accesses. Private memory will also
430   // hit this, although they'll be decomposed.
431   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
432 }
433 
434 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
435     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
436     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
437     unsigned SrcAlign, unsigned DestAlign,
438     std::optional<uint32_t> AtomicCpySize) const {
439   assert(RemainingBytes < 16);
440 
441   if (AtomicCpySize)
442     BaseT::getMemcpyLoopResidualLoweringType(
443         OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
444         DestAlign, AtomicCpySize);
445 
446   unsigned MinAlign = std::min(SrcAlign, DestAlign);
447 
448   if (MinAlign != 2) {
449     Type *I64Ty = Type::getInt64Ty(Context);
450     while (RemainingBytes >= 8) {
451       OpsOut.push_back(I64Ty);
452       RemainingBytes -= 8;
453     }
454 
455     Type *I32Ty = Type::getInt32Ty(Context);
456     while (RemainingBytes >= 4) {
457       OpsOut.push_back(I32Ty);
458       RemainingBytes -= 4;
459     }
460   }
461 
462   Type *I16Ty = Type::getInt16Ty(Context);
463   while (RemainingBytes >= 2) {
464     OpsOut.push_back(I16Ty);
465     RemainingBytes -= 2;
466   }
467 
468   Type *I8Ty = Type::getInt8Ty(Context);
469   while (RemainingBytes) {
470     OpsOut.push_back(I8Ty);
471     --RemainingBytes;
472   }
473 }
474 
475 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
476   // Disable unrolling if the loop is not vectorized.
477   // TODO: Enable this again.
478   if (VF == 1)
479     return 1;
480 
481   return 8;
482 }
483 
484 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
485                                        MemIntrinsicInfo &Info) const {
486   switch (Inst->getIntrinsicID()) {
487   case Intrinsic::amdgcn_atomic_inc:
488   case Intrinsic::amdgcn_atomic_dec:
489   case Intrinsic::amdgcn_ds_ordered_add:
490   case Intrinsic::amdgcn_ds_ordered_swap:
491   case Intrinsic::amdgcn_ds_fadd:
492   case Intrinsic::amdgcn_ds_fmin:
493   case Intrinsic::amdgcn_ds_fmax: {
494     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
495     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
496     if (!Ordering || !Volatile)
497       return false; // Invalid.
498 
499     unsigned OrderingVal = Ordering->getZExtValue();
500     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
501       return false;
502 
503     Info.PtrVal = Inst->getArgOperand(0);
504     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
505     Info.ReadMem = true;
506     Info.WriteMem = true;
507     Info.IsVolatile = !Volatile->isZero();
508     return true;
509   }
510   default:
511     return false;
512   }
513 }
514 
/// Estimate the cost of the arithmetic instruction \p Opcode on type \p Ty.
///
/// The type is legalized first. Most results are the per-element instruction
/// rate multiplied by the legalization cost (LT.first) and by the element
/// count of the legalized type, which is halved (rounded up) where two
/// elements can share one packed instruction. Opcode/type combinations not
/// handled by the switch are forwarded to the base implementation.
///
/// \p Args is only inspected for FDIV/FREM, to detect a constant 1.0 first
/// operand. \p CxtI is only inspected for FMUL, to detect a multiply whose
/// single user is an fadd/fsub it may be fused with.
InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // Two i16 elements are counted as one instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    // Two i16 elements are counted as one instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      // An i64 multiply is costed as four quarter-rate plus four full-rate
      // instructions.
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // Two i16 elements are counted as one instruction.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          // Free for f32 with mad/mac instructions and for f16 with 16-bit
          // instructions, in both cases only when the matching denormal
          // flag is off.
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
            return TargetTransformInfo::TCC_Free;
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
            return TargetTransformInfo::TCC_Free;

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              Options.UnsafeFPMath ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
            return TargetTransformInfo::TCC_Free;
        }
      }
    // An fmul that is not fused is costed like fadd/fsub.
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Two f32 elements are counted as one instruction with packed FP32 ops.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    // Two f16 elements are counted as one instruction.
    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // A constant 1.0 first operand is costed as a single quarter-rate
    // instruction per element in the cases below.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost =
          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getQuarterRateInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  // Anything not costed above is left to the generic implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
664 
665 // Return true if there's a potential benefit from using v2f16/v2i16
666 // instructions for an intrinsic, even if it requires nontrivial legalization.
667 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
668   switch (ID) {
669   case Intrinsic::fma: // TODO: fmuladd
670   // There's a small benefit to using vector ops in the legalized code.
671   case Intrinsic::round:
672   case Intrinsic::uadd_sat:
673   case Intrinsic::usub_sat:
674   case Intrinsic::sadd_sat:
675   case Intrinsic::ssub_sat:
676     return true;
677   default:
678     return false;
679   }
680 }
681 
682 InstructionCost
683 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
684                                   TTI::TargetCostKind CostKind) {
685   if (ICA.getID() == Intrinsic::fabs)
686     return 0;
687 
688   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
689     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
690 
691   Type *RetTy = ICA.getReturnType();
692 
693   // Legalize the type.
694   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
695 
696   unsigned NElts = LT.second.isVector() ?
697     LT.second.getVectorNumElements() : 1;
698 
699   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
700 
701   if (SLT == MVT::f64)
702     return LT.first * NElts * get64BitInstrCost(CostKind);
703 
704   if ((ST->has16BitInsts() && SLT == MVT::f16) ||
705       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
706     NElts = (NElts + 1) / 2;
707 
708   // TODO: Get more refined intrinsic costs?
709   unsigned InstRate = getQuarterRateInstrCost(CostKind);
710 
711   switch (ICA.getID()) {
712   case Intrinsic::fma:
713     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
714                                    : getQuarterRateInstrCost(CostKind);
715     break;
716   case Intrinsic::uadd_sat:
717   case Intrinsic::usub_sat:
718   case Intrinsic::sadd_sat:
719   case Intrinsic::ssub_sat:
720     static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
721     if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
722       NElts = 1;
723     break;
724   }
725 
726   return LT.first * NElts * InstRate;
727 }
728 
729 InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
730                                            TTI::TargetCostKind CostKind,
731                                            const Instruction *I) {
732   assert((I == nullptr || I->getOpcode() == Opcode) &&
733          "Opcode should reflect passed instruction.");
734   const bool SCost =
735       (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
736   const int CBrCost = SCost ? 5 : 7;
737   switch (Opcode) {
738   case Instruction::Br: {
739     // Branch instruction takes about 4 slots on gfx900.
740     auto BI = dyn_cast_or_null<BranchInst>(I);
741     if (BI && BI->isUnconditional())
742       return SCost ? 1 : 4;
743     // Suppose conditional branch takes additional 3 exec manipulations
744     // instructions in average.
745     return CBrCost;
746   }
747   case Instruction::Switch: {
748     auto SI = dyn_cast_or_null<SwitchInst>(I);
749     // Each case (including default) takes 1 cmp + 1 cbr instructions in
750     // average.
751     return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
752   }
753   case Instruction::Ret:
754     return SCost ? 1 : 10;
755   }
756   return BaseT::getCFInstrCost(Opcode, CostKind, I);
757 }
758 
759 InstructionCost
760 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
761                                        std::optional<FastMathFlags> FMF,
762                                        TTI::TargetCostKind CostKind) {
763   if (TTI::requiresOrderedReduction(FMF))
764     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
765 
766   EVT OrigTy = TLI->getValueType(DL, Ty);
767 
768   // Computes cost on targets that have packed math instructions(which support
769   // 16-bit types only).
770   if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
771     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
772 
773   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
774   return LT.first * getFullRateInstrCost();
775 }
776 
777 InstructionCost
778 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
779                                    bool IsUnsigned,
780                                    TTI::TargetCostKind CostKind) {
781   EVT OrigTy = TLI->getValueType(DL, Ty);
782 
783   // Computes cost on targets that have packed math instructions(which support
784   // 16-bit types only).
785   if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
786     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
787 
788   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
789   return LT.first * getHalfRateInstrCost(CostKind);
790 }
791 
792 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
793                                                TTI::TargetCostKind CostKind,
794                                                unsigned Index, Value *Op0,
795                                                Value *Op1) {
796   switch (Opcode) {
797   case Instruction::ExtractElement:
798   case Instruction::InsertElement: {
799     unsigned EltSize
800       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
801     if (EltSize < 32) {
802       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
803         return 0;
804       return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
805                                        Op1);
806     }
807 
808     // Extracts are just reads of a subregister, so are free. Inserts are
809     // considered free because we don't want to have any cost for scalarizing
810     // operations, and we don't have to copy into a different register class.
811 
812     // Dynamic indexing isn't free and is best avoided.
813     return Index == ~0u ? 2 : 0;
814   }
815   default:
816     return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
817   }
818 }
819 
820 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
821 /// this is analyzing the collective result of all output registers. Otherwise,
822 /// this is only querying a specific result index if this returns multiple
823 /// registers in a struct.
824 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
825   const CallInst *CI, ArrayRef<unsigned> Indices) const {
826   // TODO: Handle complex extract indices
827   if (Indices.size() > 1)
828     return true;
829 
830   const DataLayout &DL = CI->getModule()->getDataLayout();
831   const SIRegisterInfo *TRI = ST->getRegisterInfo();
832   TargetLowering::AsmOperandInfoVector TargetConstraints =
833       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
834 
835   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
836 
837   int OutputIdx = 0;
838   for (auto &TC : TargetConstraints) {
839     if (TC.Type != InlineAsm::isOutput)
840       continue;
841 
842     // Skip outputs we don't care about.
843     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
844       continue;
845 
846     TLI->ComputeConstraintToUse(TC, SDValue());
847 
848     const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
849         TRI, TC.ConstraintCode, TC.ConstraintVT).second;
850 
851     // For AGPR constraints null is returned on subtargets without AGPRs, so
852     // assume divergent for null.
853     if (!RC || !TRI->isSGPRClass(RC))
854       return true;
855   }
856 
857   return false;
858 }
859 
860 /// \returns true if the new GPU divergence analysis is enabled.
861 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
862   return !UseLegacyDA;
863 }
864 
865 bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
866     const IntrinsicInst *ReadReg) const {
867   Metadata *MD =
868       cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
869   StringRef RegName =
870       cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
871 
872   // Special case registers that look like VCC.
873   MVT VT = MVT::getVT(ReadReg->getType());
874   if (VT == MVT::i1)
875     return true;
876 
877   // Special case scalar registers that start with 'v'.
878   if (RegName.startswith("vcc") || RegName.empty())
879     return false;
880 
881   // VGPR or AGPR is divergent. There aren't any specially named vector
882   // registers.
883   return RegName[0] == 'v' || RegName[0] == 'a';
884 }
885 
886 /// \returns true if the result of the value could potentially be
887 /// different across workitems in a wavefront.
888 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
889   if (const Argument *A = dyn_cast<Argument>(V))
890     return !AMDGPU::isArgPassedInSGPR(A);
891 
892   // Loads from the private and flat address spaces are divergent, because
893   // threads can execute the load instruction with the same inputs and get
894   // different results.
895   //
896   // All other loads are not divergent, because if threads issue loads with the
897   // same arguments, they will always get the same result.
898   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
899     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
900            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
901 
902   // Atomics are divergent because they are executed sequentially: when an
903   // atomic operation refers to the same address in each thread, then each
904   // thread after the first sees the value written by the previous thread as
905   // original value.
906   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
907     return true;
908 
909   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
910     if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
911       return isReadRegisterSourceOfDivergence(Intrinsic);
912 
913     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
914   }
915 
916   // Assume all function calls are a source of divergence.
917   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
918     if (CI->isInlineAsm())
919       return isInlineAsmSourceOfDivergence(CI);
920     return true;
921   }
922 
923   // Assume all function calls are a source of divergence.
924   if (isa<InvokeInst>(V))
925     return true;
926 
927   return false;
928 }
929 
930 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
931   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
932     switch (Intrinsic->getIntrinsicID()) {
933     default:
934       return false;
935     case Intrinsic::amdgcn_readfirstlane:
936     case Intrinsic::amdgcn_readlane:
937     case Intrinsic::amdgcn_icmp:
938     case Intrinsic::amdgcn_fcmp:
939     case Intrinsic::amdgcn_ballot:
940     case Intrinsic::amdgcn_if_break:
941       return true;
942     }
943   }
944 
945   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
946     if (CI->isInlineAsm())
947       return !isInlineAsmSourceOfDivergence(CI);
948     return false;
949   }
950 
951   // In most cases TID / wavefrontsize is uniform.
952   //
953   // However, if a kernel has uneven dimesions we can have a value of
954   // workitem-id-x divided by the wavefrontsize non-uniform. For example
955   // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
956   // packed into a same wave which gives 1 and 0 after the division by 64
957   // respectively.
958   //
959   // FIXME: limit it to 1D kernels only, although that shall be possible
960   // to perform this optimization is the size of the X dimension is a power
961   // of 2, we just do not currently have infrastructure to query it.
962   using namespace llvm::PatternMatch;
963   uint64_t C;
964   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
965                       m_ConstantInt(C))) ||
966       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
967                       m_ConstantInt(C)))) {
968     const Function *F = cast<Instruction>(V)->getFunction();
969     return C >= ST->getWavefrontSizeLog2() &&
970            ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
971   }
972 
973   Value *Mask;
974   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
975                        m_Value(Mask)))) {
976     const Function *F = cast<Instruction>(V)->getFunction();
977     const DataLayout &DL = F->getParent()->getDataLayout();
978     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
979                ST->getWavefrontSizeLog2() &&
980            ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
981   }
982 
983   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
984   if (!ExtValue)
985     return false;
986 
987   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
988   if (!CI)
989     return false;
990 
991   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
992     switch (Intrinsic->getIntrinsicID()) {
993     default:
994       return false;
995     case Intrinsic::amdgcn_if:
996     case Intrinsic::amdgcn_else: {
997       ArrayRef<unsigned> Indices = ExtValue->getIndices();
998       return Indices.size() == 1 && Indices[0] == 1;
999     }
1000     }
1001   }
1002 
1003   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1004   // divergent for the overall struct return. We need to override it in the
1005   // case we're extracting an SGPR component here.
1006   if (CI->isInlineAsm())
1007     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1008 
1009   return false;
1010 }
1011 
1012 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1013                                             Intrinsic::ID IID) const {
1014   switch (IID) {
1015   case Intrinsic::amdgcn_atomic_inc:
1016   case Intrinsic::amdgcn_atomic_dec:
1017   case Intrinsic::amdgcn_ds_fadd:
1018   case Intrinsic::amdgcn_ds_fmin:
1019   case Intrinsic::amdgcn_ds_fmax:
1020   case Intrinsic::amdgcn_is_shared:
1021   case Intrinsic::amdgcn_is_private:
1022   case Intrinsic::amdgcn_flat_atomic_fadd:
1023   case Intrinsic::amdgcn_flat_atomic_fmax:
1024   case Intrinsic::amdgcn_flat_atomic_fmin:
1025     OpIndexes.push_back(0);
1026     return true;
1027   default:
1028     return false;
1029   }
1030 }
1031 
1032 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1033                                                     Value *OldV,
1034                                                     Value *NewV) const {
1035   auto IntrID = II->getIntrinsicID();
1036   switch (IntrID) {
1037   case Intrinsic::amdgcn_atomic_inc:
1038   case Intrinsic::amdgcn_atomic_dec:
1039   case Intrinsic::amdgcn_ds_fadd:
1040   case Intrinsic::amdgcn_ds_fmin:
1041   case Intrinsic::amdgcn_ds_fmax: {
1042     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1043     if (!IsVolatile->isZero())
1044       return nullptr;
1045     Module *M = II->getParent()->getParent()->getParent();
1046     Type *DestTy = II->getType();
1047     Type *SrcTy = NewV->getType();
1048     Function *NewDecl =
1049         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1050     II->setArgOperand(0, NewV);
1051     II->setCalledFunction(NewDecl);
1052     return II;
1053   }
1054   case Intrinsic::amdgcn_is_shared:
1055   case Intrinsic::amdgcn_is_private: {
1056     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1057       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1058     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1059     LLVMContext &Ctx = NewV->getType()->getContext();
1060     ConstantInt *NewVal = (TrueAS == NewAS) ?
1061       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1062     return NewVal;
1063   }
1064   case Intrinsic::ptrmask: {
1065     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1066     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1067     Value *MaskOp = II->getArgOperand(1);
1068     Type *MaskTy = MaskOp->getType();
1069 
1070     bool DoTruncate = false;
1071 
1072     const GCNTargetMachine &TM =
1073         static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1074     if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1075       // All valid 64-bit to 32-bit casts work by chopping off the high
1076       // bits. Any masking only clearing the low bits will also apply in the new
1077       // address space.
1078       if (DL.getPointerSizeInBits(OldAS) != 64 ||
1079           DL.getPointerSizeInBits(NewAS) != 32)
1080         return nullptr;
1081 
1082       // TODO: Do we need to thread more context in here?
1083       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1084       if (Known.countMinLeadingOnes() < 32)
1085         return nullptr;
1086 
1087       DoTruncate = true;
1088     }
1089 
1090     IRBuilder<> B(II);
1091     if (DoTruncate) {
1092       MaskTy = B.getInt32Ty();
1093       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1094     }
1095 
1096     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1097                              {NewV, MaskOp});
1098   }
1099   case Intrinsic::amdgcn_flat_atomic_fadd:
1100   case Intrinsic::amdgcn_flat_atomic_fmax:
1101   case Intrinsic::amdgcn_flat_atomic_fmin: {
1102     Module *M = II->getParent()->getParent()->getParent();
1103     Type *DestTy = II->getType();
1104     Type *SrcTy = NewV->getType();
1105     Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1106                                                   {DestTy, SrcTy, DestTy});
1107     II->setArgOperand(0, NewV);
1108     II->setCalledFunction(NewDecl);
1109     return II;
1110   }
1111   default:
1112     return nullptr;
1113   }
1114 }
1115 
1116 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1117                                            VectorType *VT, ArrayRef<int> Mask,
1118                                            TTI::TargetCostKind CostKind,
1119                                            int Index, VectorType *SubTp,
1120                                            ArrayRef<const Value *> Args) {
1121   Kind = improveShuffleKindFromMask(Kind, Mask);
1122   if (ST->hasVOP3PInsts()) {
1123     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1124         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1125       // With op_sel VOP3P instructions freely can access the low half or high
1126       // half of a register, so any swizzle is free.
1127 
1128       switch (Kind) {
1129       case TTI::SK_Broadcast:
1130       case TTI::SK_Reverse:
1131       case TTI::SK_PermuteSingleSrc:
1132         return 0;
1133       default:
1134         break;
1135       }
1136     }
1137   }
1138 
1139   return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1140 }
1141 
1142 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1143                                      const Function *Callee) const {
1144   const TargetMachine &TM = getTLI()->getTargetMachine();
1145   const GCNSubtarget *CallerST
1146     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1147   const GCNSubtarget *CalleeST
1148     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1149 
1150   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1151   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1152 
1153   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1154   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1155   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1156     return false;
1157 
1158   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1159   // no way to support merge for backend defined attributes.
1160   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1161   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1162   if (!CallerMode.isInlineCompatible(CalleeMode))
1163     return false;
1164 
1165   if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1166       Callee->hasFnAttribute(Attribute::InlineHint))
1167     return true;
1168 
1169   // Hack to make compile times reasonable.
1170   if (InlineMaxBB) {
1171     // Single BB does not increase total BB amount.
1172     if (Callee->size() == 1)
1173       return true;
1174     size_t BBSize = Caller->size() + Callee->size() - 1;
1175     return BBSize <= InlineMaxBB;
1176   }
1177 
1178   return true;
1179 }
1180 
1181 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1182   // If we have a pointer to private array passed into a function
1183   // it will not be optimized out, leaving scratch usage.
1184   // Increase the inline threshold to allow inlining in this case.
1185   uint64_t AllocaSize = 0;
1186   SmallPtrSet<const AllocaInst *, 8> AIVisited;
1187   for (Value *PtrArg : CB->args()) {
1188     PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1189     if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1190                 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1191       continue;
1192 
1193     PtrArg = getUnderlyingObject(PtrArg);
1194     if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1195       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1196         continue;
1197       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1198       // If the amount of stack memory is excessive we will not be able
1199       // to get rid of the scratch anyway, bail out.
1200       if (AllocaSize > ArgAllocaCutoff) {
1201         AllocaSize = 0;
1202         break;
1203       }
1204     }
1205   }
1206   if (AllocaSize)
1207     return ArgAllocaCost;
1208   return 0;
1209 }
1210 
1211 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1212                                          TTI::UnrollingPreferences &UP,
1213                                          OptimizationRemarkEmitter *ORE) {
1214   CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1215 }
1216 
1217 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1218                                        TTI::PeelingPreferences &PP) {
1219   CommonTTI.getPeelingPreferences(L, SE, PP);
1220 }
1221 
1222 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1223   return ST->hasFullRate64Ops()
1224              ? getFullRateInstrCost()
1225              : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1226                                       : getQuarterRateInstrCost(CostKind);
1227 }
1228 
1229 std::pair<InstructionCost, MVT>
1230 GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1231   std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1232   auto Size = DL.getTypeSizeInBits(Ty);
1233   // Maximum load or store can handle 8 dwords for scalar and 4 for
1234   // vector ALU. Let's assume anything above 8 dwords is expensive
1235   // even if legal.
1236   if (Size <= 256)
1237     return Cost;
1238 
1239   Cost.first += (Size + 255) / 256;
1240   return Cost;
1241 }
1242