1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/Analysis/ValueTracking.h"
24 #include "llvm/CodeGen/ISDOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/Argument.h"
27 #include "llvm/IR/Attributes.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PatternMatch.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Value.h"
40 #include "llvm/MC/SubtargetFeature.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/MachineValueType.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/Target/TargetMachine.h"
48 #include <algorithm>
49 #include <cassert>
50 #include <limits>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "AMDGPUtti"
56 
57 static cl::opt<unsigned> UnrollThresholdPrivate(
58   "amdgpu-unroll-threshold-private",
59   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
60   cl::init(2700), cl::Hidden);
61 
62 static cl::opt<unsigned> UnrollThresholdLocal(
63   "amdgpu-unroll-threshold-local",
64   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
65   cl::init(1000), cl::Hidden);
66 
67 static cl::opt<unsigned> UnrollThresholdIf(
68   "amdgpu-unroll-threshold-if",
69   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
70   cl::init(150), cl::Hidden);
71 
72 static cl::opt<bool> UnrollRuntimeLocal(
73   "amdgpu-unroll-runtime-local",
74   cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
75   cl::init(true), cl::Hidden);
76 
77 static cl::opt<bool> UseLegacyDA(
78   "amdgpu-use-legacy-divergence-analysis",
79   cl::desc("Enable legacy divergence analysis for AMDGPU"),
80   cl::init(false), cl::Hidden);
81 
82 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
83     "amdgpu-unroll-max-block-to-analyze",
84     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
85     cl::init(20), cl::Hidden);
86 
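// Returns true if \p Cond (looking through its operands, up to a small
// recursion depth) depends on a PHI node that belongs to \p L itself rather
// than to one of its subloops. getUnrollingPreferences below uses this to
// give a threshold bonus to branches controlled by loop-carried values.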
87 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
88                               unsigned Depth = 0) {
89   const Instruction *I = dyn_cast<Instruction>(Cond);
90   if (!I)
91     return false;
92 
93   for (const Value *V : I->operand_values()) {
94     if (!L->contains(I))
95       continue;
96     if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
97       if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
98                   return SubLoop->contains(PHI); }))
99         return true;
100     } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
101       return true;
102   }
103   return false;
104 }
105 
106 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
107                                             TTI::UnrollingPreferences &UP) {
108   const Function &F = *L->getHeader()->getParent();
109   UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
110   UP.MaxCount = std::numeric_limits<unsigned>::max();
111   UP.Partial = true;
112 
113   // TODO: Do we want runtime unrolling?
114 
115   // Maximum alloca size that can fit into registers. Reserve 16 registers.
116   const unsigned MaxAlloca = (256 - 16) * 4;
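  // (256 is the per-wave VGPR budget, 16 registers are reserved, and each
  // 32-bit VGPR holds 4 bytes, giving 960 bytes.)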
117   unsigned ThresholdPrivate = UnrollThresholdPrivate;
118   unsigned ThresholdLocal = UnrollThresholdLocal;
119   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
120   for (const BasicBlock *BB : L->getBlocks()) {
121     const DataLayout &DL = BB->getModule()->getDataLayout();
122     unsigned LocalGEPsSeen = 0;
123 
124     if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
125                return SubLoop->contains(BB); }))
126         continue; // Block belongs to an inner loop.
127 
128     for (const Instruction &I : *BB) {
129       // Unroll a loop which contains an "if" statement whose condition is
130       // defined by a PHI belonging to the loop. This may help to eliminate
131       // the if region and potentially even the PHI itself, saving on both
132       // divergence and registers used for the PHI.
133       // Add a small bonus for each such "if" statement.
134       if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
135         if (UP.Threshold < MaxBoost && Br->isConditional()) {
136           BasicBlock *Succ0 = Br->getSuccessor(0);
137           BasicBlock *Succ1 = Br->getSuccessor(1);
138           if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
139               (L->contains(Succ1) && L->isLoopExiting(Succ1)))
140             continue;
141           if (dependsOnLocalPhi(L, Br->getCondition())) {
142             UP.Threshold += UnrollThresholdIf;
143             LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
144                               << " for loop:\n"
145                               << *L << " due to " << *Br << '\n');
146             if (UP.Threshold >= MaxBoost)
147               return;
148           }
149         }
150         continue;
151       }
152 
153       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
154       if (!GEP)
155         continue;
156 
157       unsigned AS = GEP->getAddressSpace();
158       unsigned Threshold = 0;
159       if (AS == AMDGPUAS::PRIVATE_ADDRESS)
160         Threshold = ThresholdPrivate;
161       else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
162         Threshold = ThresholdLocal;
163       else
164         continue;
165 
166       if (UP.Threshold >= Threshold)
167         continue;
168 
169       if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
170         const Value *Ptr = GEP->getPointerOperand();
171         const AllocaInst *Alloca =
172             dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
173         if (!Alloca || !Alloca->isStaticAlloca())
174           continue;
175         Type *Ty = Alloca->getAllocatedType();
176         unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
177         if (AllocaSize > MaxAlloca)
178           continue;
179       } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
180                  AS == AMDGPUAS::REGION_ADDRESS) {
181         LocalGEPsSeen++;
182         // Inhibit unrolling for local memory if we have seen addressing not
183         // based on a variable; most likely we will be unable to combine it.
184         // Do not unroll too-deep inner loops for local memory, to give an
185         // outer loop a chance to be unrolled for a more important reason.
186         if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
187             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
188              !isa<Argument>(GEP->getPointerOperand())))
189           continue;
190         LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
191                           << *L << " due to LDS use.\n");
192         UP.Runtime = UnrollRuntimeLocal;
193       }
194 
195       // Check if GEP depends on a value defined by this loop itself.
196       bool HasLoopDef = false;
197       for (const Value *Op : GEP->operands()) {
198         const Instruction *Inst = dyn_cast<Instruction>(Op);
199         if (!Inst || L->isLoopInvariant(Op))
200           continue;
201 
202         if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
203              return SubLoop->contains(Inst); }))
204           continue;
205         HasLoopDef = true;
206         break;
207       }
208       if (!HasLoopDef)
209         continue;
210 
211       // We want to do whatever we can to limit the number of alloca
212       // instructions that make it through to the code generator.  Allocas
213       // require us to use indirect addressing, which is slow and prone to
214       // compiler bugs.  If this loop does an address calculation on an
215       // alloca ptr, then we want to use a higher-than-normal loop unroll
216       // threshold. This will give SROA a better chance to eliminate these
217       // allocas.
218       //
219       // We also want to have more unrolling for local memory to let ds
220       // instructions with different offsets combine.
221       //
222       // Don't use the maximum allowed value here as it will make some
223       // programs way too big.
224       UP.Threshold = Threshold;
225       LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
226                         << " for loop:\n"
227                         << *L << " due to " << *GEP << '\n');
228       if (UP.Threshold >= MaxBoost)
229         return;
230     }
231 
232     // If we got a GEP in a small BB from an inner loop then increase the max
233     // trip count to analyze for a better cost estimation in the unroller.
234     if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
235       UP.MaxIterationsCountToAnalyze = 32;
236   }
237 }
238 
239 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
240                                           TTI::PeelingPreferences &PP) {
241   BaseT::getPeelingPreferences(L, SE, PP);
242 }
243 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
244   // The concept of vector registers doesn't really exist. Some packed vector
245   // operations operate on the normal 32-bit registers.
246   return MaxVGPRs;
247 }
248 
249 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
250   // This is really the number of registers to fill when vectorizing /
251   // interleaving loops, so we lie to avoid trying to use all registers.
252   return getHardwareNumberOfRegisters(Vec) >> 3;
253 }
254 
255 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
256   const SIRegisterInfo *TRI = ST->getRegisterInfo();
257   const TargetRegisterClass *RC = TRI->getRegClass(RCID);
258   unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
259   return getHardwareNumberOfRegisters(false) / NumVGPRs;
260 }
261 
262 unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
263   return 32;
264 }
265 
266 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
267   return 32;
268 }
269 
270 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
271                                          unsigned ChainSizeInBytes,
272                                          VectorType *VecTy) const {
273   unsigned VecRegBitWidth = VF * LoadSize;
274   if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
275     // TODO: Support element-size less than 32bit?
276     return 128 / LoadSize;
277 
278   return VF;
279 }
280 
281 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
282                                              unsigned ChainSizeInBytes,
283                                              VectorType *VecTy) const {
284   unsigned VecRegBitWidth = VF * StoreSize;
285   if (VecRegBitWidth > 128)
286     return 128 / StoreSize;
287 
288   return VF;
289 }
290 
291 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
292   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
293       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
294       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
295       AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
296     return 512;
297   }
298 
299   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
300     return 8 * ST->getMaxPrivateElementSize();
301 
302   // Common to flat, global, local and region. Assume for unknown addrspace.
303   return 128;
304 }
305 
306 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
307                                             Align Alignment,
308                                             unsigned AddrSpace) const {
309   // We allow vectorization of flat stores, even though we may need to decompose
310   // them later if they may access private memory. We don't have enough context
311   // here, and legalization can handle it.
312   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
313     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
314       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
315   }
316   return true;
317 }
318 
319 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
320                                              Align Alignment,
321                                              unsigned AddrSpace) const {
322   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
323 }
324 
325 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
326                                               Align Alignment,
327                                               unsigned AddrSpace) const {
328   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
329 }
330 
331 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
332 // iteration. Should we report a larger size and let it legalize?
333 //
334 // FIXME: Should we use narrower types for local/region, or account for when
335 // unaligned access is legal?
336 //
337 // FIXME: This could use fine tuning and microbenchmarks.
338 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
339                                             unsigned SrcAddrSpace,
340                                             unsigned DestAddrSpace,
341                                             unsigned SrcAlign,
342                                             unsigned DestAlign) const {
343   unsigned MinAlign = std::min(SrcAlign, DestAlign);
344 
345   // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
346   // hardware into byte accesses. If you assume all alignments are equally
347   // probable, it's more efficient on average to use short accesses for this
348   // case.
349   if (MinAlign == 2)
350     return Type::getInt16Ty(Context);
351 
352   // Not all subtargets have 128-bit DS instructions, and we currently don't
353   // form them by default.
354   if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
355       SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
356       DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
357       DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
358     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
359   }
360 
361   // Global memory works best with 16-byte accesses. Private memory will also
362   // hit this, although they'll be decomposed.
363   return FixedVectorType::get(Type::getInt32Ty(Context), 4);
364 }
365 
366 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
367   SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
368   unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
369   unsigned SrcAlign, unsigned DestAlign) const {
370   assert(RemainingBytes < 16);
371 
372   unsigned MinAlign = std::min(SrcAlign, DestAlign);
373 
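  // For example, 7 remaining bytes become i32 + i16 + i8 below when the
  // minimum alignment is not 2, and i16 + i16 + i16 + i8 when it is 2.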
374   if (MinAlign != 2) {
375     Type *I64Ty = Type::getInt64Ty(Context);
376     while (RemainingBytes >= 8) {
377       OpsOut.push_back(I64Ty);
378       RemainingBytes -= 8;
379     }
380 
381     Type *I32Ty = Type::getInt32Ty(Context);
382     while (RemainingBytes >= 4) {
383       OpsOut.push_back(I32Ty);
384       RemainingBytes -= 4;
385     }
386   }
387 
388   Type *I16Ty = Type::getInt16Ty(Context);
389   while (RemainingBytes >= 2) {
390     OpsOut.push_back(I16Ty);
391     RemainingBytes -= 2;
392   }
393 
394   Type *I8Ty = Type::getInt8Ty(Context);
395   while (RemainingBytes) {
396     OpsOut.push_back(I8Ty);
397     --RemainingBytes;
398   }
399 }
400 
401 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
402   // Disable unrolling if the loop is not vectorized.
403   // TODO: Enable this again.
404   if (VF == 1)
405     return 1;
406 
407   return 8;
408 }
409 
410 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
411                                        MemIntrinsicInfo &Info) const {
412   switch (Inst->getIntrinsicID()) {
413   case Intrinsic::amdgcn_atomic_inc:
414   case Intrinsic::amdgcn_atomic_dec:
415   case Intrinsic::amdgcn_ds_ordered_add:
416   case Intrinsic::amdgcn_ds_ordered_swap:
417   case Intrinsic::amdgcn_ds_fadd:
418   case Intrinsic::amdgcn_ds_fmin:
419   case Intrinsic::amdgcn_ds_fmax: {
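    // The code below assumes these DS/atomic intrinsics carry their atomic
    // ordering in operand 2 and their volatile flag in operand 4.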
420     auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
421     auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
422     if (!Ordering || !Volatile)
423       return false; // Invalid.
424 
425     unsigned OrderingVal = Ordering->getZExtValue();
426     if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
427       return false;
428 
429     Info.PtrVal = Inst->getArgOperand(0);
430     Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
431     Info.ReadMem = true;
432     Info.WriteMem = true;
433     Info.IsVolatile = !Volatile->isNullValue();
434     return true;
435   }
436   default:
437     return false;
438   }
439 }
440 
441 int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
442                                        TTI::TargetCostKind CostKind,
443                                        TTI::OperandValueKind Opd1Info,
444                                        TTI::OperandValueKind Opd2Info,
445                                        TTI::OperandValueProperties Opd1PropInfo,
446                                        TTI::OperandValueProperties Opd2PropInfo,
447                                        ArrayRef<const Value *> Args,
448                                        const Instruction *CxtI) {
449   EVT OrigTy = TLI->getValueType(DL, Ty);
450   if (!OrigTy.isSimple()) {
451     // FIXME: We're having to query the throughput cost so that the basic
452     // implementation tries to generate legalize and scalarization costs. Maybe
453     // we could hoist the scalarization code here?
454     return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
455                                          Opd1Info, Opd2Info,
456                                          Opd1PropInfo, Opd2PropInfo);
457   }
458 
459   // Legalize the type.
460   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
461   int ISD = TLI->InstructionOpcodeToISD(Opcode);
462 
463   // Because we don't have any legal vector operations, only legal types, we
464   // need to account for split vectors.
465   unsigned NElts = LT.second.isVector() ?
466     LT.second.getVectorNumElements() : 1;
467 
468   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
469 
470   switch (ISD) {
471   case ISD::SHL:
472   case ISD::SRL:
473   case ISD::SRA:
474     if (SLT == MVT::i64)
475       return get64BitInstrCost() * LT.first * NElts;
476 
477     if (ST->has16BitInsts() && SLT == MVT::i16)
478       NElts = (NElts + 1) / 2;
479 
480     // i32
481     return getFullRateInstrCost() * LT.first * NElts;
482   case ISD::ADD:
483   case ISD::SUB:
484   case ISD::AND:
485   case ISD::OR:
486   case ISD::XOR:
487     if (SLT == MVT::i64) {
488       // 64-bit add/sub/and/or/xor are typically split into 2 VALU instructions.
489       return 2 * getFullRateInstrCost() * LT.first * NElts;
490     }
491 
492     if (ST->has16BitInsts() && SLT == MVT::i16)
493       NElts = (NElts + 1) / 2;
494 
495     return LT.first * NElts * getFullRateInstrCost();
496   case ISD::MUL: {
497     const int QuarterRateCost = getQuarterRateInstrCost();
498     if (SLT == MVT::i64) {
499       const int FullRateCost = getFullRateInstrCost();
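      // This roughly models expanding a 64-bit multiply into four 32-bit
      // multiplies (quarter rate) plus four full-rate add/carry operations to
      // combine the partial products; it is an estimate, not an exact count.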
500       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
501     }
502 
503     if (ST->has16BitInsts() && SLT == MVT::i16)
504       NElts = (NElts + 1) / 2;
505 
506     // i32
507     return QuarterRateCost * NElts * LT.first;
508   }
509   case ISD::FADD:
510   case ISD::FSUB:
511   case ISD::FMUL:
512     if (SLT == MVT::f64)
513       return LT.first * NElts * get64BitInstrCost();
514 
515     if (ST->has16BitInsts() && SLT == MVT::f16)
516       NElts = (NElts + 1) / 2;
517 
518     if (SLT == MVT::f32 || SLT == MVT::f16)
519       return LT.first * NElts * getFullRateInstrCost();
520     break;
521   case ISD::FDIV:
522   case ISD::FREM:
523     // FIXME: frem should be handled separately. Most of its cost is the fdiv
524     // it contains, but the current lowering is also not entirely correct.
525     if (SLT == MVT::f64) {
526       int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
527       // Add cost of workaround.
528       if (!ST->hasUsableDivScaleConditionOutput())
529         Cost += 3 * getFullRateInstrCost();
530 
531       return LT.first * Cost * NElts;
532     }
533 
534     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
535       // TODO: This is more complicated, unsafe flags etc.
536       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
537           (SLT == MVT::f16 && ST->has16BitInsts())) {
538         return LT.first * getQuarterRateInstrCost() * NElts;
539       }
540     }
541 
542     if (SLT == MVT::f16 && ST->has16BitInsts()) {
543       // 2 x v_cvt_f32_f16
544       // f32 rcp
545       // f32 fmul
546       // v_cvt_f16_f32
547       // f16 div_fixup
548       int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
549       return LT.first * Cost * NElts;
550     }
551 
552     if (SLT == MVT::f32 || SLT == MVT::f16) {
553       int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
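      // A rough model of the full fdiv expansion (div_scale, a quarter-rate
      // reciprocal, an FMA refinement chain, div_fmas and div_fixup); the
      // constants are estimates, not exact instruction counts.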
554 
555       if (!HasFP32Denormals) {
556         // FP mode switches.
557         Cost += 2 * getFullRateInstrCost();
558       }
559 
560       return LT.first * NElts * Cost;
561     }
562     break;
563   case ISD::FNEG:
564     // Use the backend's estimation. If fneg is not free, each element will
565     // cost one additional instruction.
566     return TLI->isFNegFree(SLT) ? 0 : NElts;
567   default:
568     break;
569   }
570 
571   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
572                                        Opd2Info,
573                                        Opd1PropInfo, Opd2PropInfo);
574 }
575 
576 // Return true if there's a potential benefit from using v2f16 instructions for
577 // an intrinsic, even if it requires nontrivial legalization.
578 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
579   switch (ID) {
580   case Intrinsic::fma: // TODO: fmuladd
581   // There's a small benefit to using vector ops in the legalized code.
582   case Intrinsic::round:
583     return true;
584   default:
585     return false;
586   }
587 }
588 
589 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
590                                       TTI::TargetCostKind CostKind) {
591   if (ICA.getID() == Intrinsic::fabs)
592     return 0;
593 
594   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
595     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
596 
597   Type *RetTy = ICA.getReturnType();
598   EVT OrigTy = TLI->getValueType(DL, RetTy);
599   if (!OrigTy.isSimple()) {
600     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
601   }
602 
603   // Legalize the type.
604   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
605 
606   unsigned NElts = LT.second.isVector() ?
607     LT.second.getVectorNumElements() : 1;
608 
609   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
610 
611   if (SLT == MVT::f64)
612     return LT.first * NElts * get64BitInstrCost();
613 
614   if (ST->has16BitInsts() && SLT == MVT::f16)
615     NElts = (NElts + 1) / 2;
616 
617   // TODO: Get more refined intrinsic costs?
618   unsigned InstRate = getQuarterRateInstrCost();
619   if (ICA.getID() == Intrinsic::fma) {
620     InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
621                                    : getQuarterRateInstrCost();
622   }
623 
624   return LT.first * NElts * InstRate;
625 }
626 
627 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
628                                     TTI::TargetCostKind CostKind) {
629   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
630     return Opcode == Instruction::PHI ? 0 : 1;
631 
632   // XXX - For some reason this isn't called for switch.
633   switch (Opcode) {
634   case Instruction::Br:
635   case Instruction::Ret:
636     return 10;
637   default:
638     return BaseT::getCFInstrCost(Opcode, CostKind);
639   }
640 }
641 
642 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
643                                            bool IsPairwise,
644                                            TTI::TargetCostKind CostKind) {
645   EVT OrigTy = TLI->getValueType(DL, Ty);
646 
647   // Computes the cost on targets that have packed math instructions (which
648   // support 16-bit types only).
649   if (IsPairwise ||
650       !ST->hasVOP3PInsts() ||
651       OrigTy.getScalarSizeInBits() != 16)
652     return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
653 
654   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
655   return LT.first * getFullRateInstrCost();
656 }
657 
658 int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
659                                        bool IsPairwise, bool IsUnsigned,
660                                        TTI::TargetCostKind CostKind) {
661   EVT OrigTy = TLI->getValueType(DL, Ty);
662 
663   // Computes the cost on targets that have packed math instructions (which
664   // support 16-bit types only).
665   if (IsPairwise ||
666       !ST->hasVOP3PInsts() ||
667       OrigTy.getScalarSizeInBits() != 16)
668     return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
669                                          CostKind);
670 
671   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
672   return LT.first * getHalfRateInstrCost();
673 }
674 
675 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
676                                       unsigned Index) {
677   switch (Opcode) {
678   case Instruction::ExtractElement:
679   case Instruction::InsertElement: {
680     unsigned EltSize
681       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
682     if (EltSize < 32) {
683       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
684         return 0;
685       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
686     }
687 
688     // Extracts are just reads of a subregister, so are free. Inserts are
689     // considered free because we don't want to have any cost for scalarizing
690     // operations, and we don't have to copy into a different register class.
691 
692     // Dynamic indexing isn't free and is best avoided.
693     return Index == ~0u ? 2 : 0;
694   }
695   default:
696     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
697   }
698 }
699 
700 static bool isArgPassedInSGPR(const Argument *A) {
701   const Function *F = A->getParent();
702 
703   // Arguments to compute shaders are never a source of divergence.
704   CallingConv::ID CC = F->getCallingConv();
705   switch (CC) {
706   case CallingConv::AMDGPU_KERNEL:
707   case CallingConv::SPIR_KERNEL:
708     return true;
709   case CallingConv::AMDGPU_VS:
710   case CallingConv::AMDGPU_LS:
711   case CallingConv::AMDGPU_HS:
712   case CallingConv::AMDGPU_ES:
713   case CallingConv::AMDGPU_GS:
714   case CallingConv::AMDGPU_PS:
715   case CallingConv::AMDGPU_CS:
716     // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
717     // Everything else is in VGPRs.
718     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
719            F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
720   default:
721     // TODO: Should calls support inreg for SGPR inputs?
722     return false;
723   }
724 }
725 
726 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
727 /// this is analyzing the collective result of all output registers. Otherwise,
728 /// this is only querying a specific result index if this returns multiple
729 /// registers in a struct.
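/// For example, an output constrained to a scalar register (e.g. "=s") can be
/// treated as uniform, while one constrained to a vector register ("=v") is
/// divergent.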
730 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
731   const CallInst *CI, ArrayRef<unsigned> Indices) const {
732   // TODO: Handle complex extract indices
733   if (Indices.size() > 1)
734     return true;
735 
736   const DataLayout &DL = CI->getModule()->getDataLayout();
737   const SIRegisterInfo *TRI = ST->getRegisterInfo();
738   TargetLowering::AsmOperandInfoVector TargetConstraints =
739       TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
740 
741   const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
742 
743   int OutputIdx = 0;
744   for (auto &TC : TargetConstraints) {
745     if (TC.Type != InlineAsm::isOutput)
746       continue;
747 
748     // Skip outputs we don't care about.
749     if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
750       continue;
751 
752     TLI->ComputeConstraintToUse(TC, SDValue());
753 
754     Register AssignedReg;
755     const TargetRegisterClass *RC;
756     std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
757       TRI, TC.ConstraintCode, TC.ConstraintVT);
758     if (AssignedReg) {
759       // FIXME: This is a workaround for getRegForInlineAsmConstraint
760       // returning VS_32
761       RC = TRI->getPhysRegClass(AssignedReg);
762     }
763 
764     // For AGPR constraints null is returned on subtargets without AGPRs, so
765     // assume divergent for null.
766     if (!RC || !TRI->isSGPRClass(RC))
767       return true;
768   }
769 
770   return false;
771 }
772 
773 /// \returns true if the new GPU divergence analysis is enabled.
774 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
775   return !UseLegacyDA;
776 }
777 
778 /// \returns true if the result of the value could potentially be
779 /// different across workitems in a wavefront.
780 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
781   if (const Argument *A = dyn_cast<Argument>(V))
782     return !isArgPassedInSGPR(A);
783 
784   // Loads from the private and flat address spaces are divergent, because
785   // threads can execute the load instruction with the same inputs and get
786   // different results.
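  // (For example, a private (scratch) pointer with the same numeric value in
  // every lane names a different per-lane scratch location.)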
787   //
788   // All other loads are not divergent, because if threads issue loads with the
789   // same arguments, they will always get the same result.
790   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
791     return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
792            Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
793 
794   // Atomics are divergent because they are executed sequentially: when an
795   // atomic operation refers to the same address in each thread, then each
796   // thread after the first sees the value written by the previous thread as
797   // the original value.
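  // For example, an atomicrmw add performed by every lane on the same address
  // returns a different intermediate value to each lane.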
798   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
799     return true;
800 
801   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
802     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
803 
804   // Assume all function calls are a source of divergence.
805   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
806     if (CI->isInlineAsm())
807       return isInlineAsmSourceOfDivergence(CI);
808     return true;
809   }
810 
811   // Assume all function calls are a source of divergence.
812   if (isa<InvokeInst>(V))
813     return true;
814 
815   return false;
816 }
817 
818 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
819   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
820     switch (Intrinsic->getIntrinsicID()) {
821     default:
822       return false;
823     case Intrinsic::amdgcn_readfirstlane:
824     case Intrinsic::amdgcn_readlane:
825     case Intrinsic::amdgcn_icmp:
826     case Intrinsic::amdgcn_fcmp:
827     case Intrinsic::amdgcn_ballot:
828     case Intrinsic::amdgcn_if_break:
829       return true;
830     }
831   }
832 
833   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
834     if (CI->isInlineAsm())
835       return !isInlineAsmSourceOfDivergence(CI);
836     return false;
837   }
838 
839   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
840   if (!ExtValue)
841     return false;
842 
843   const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
844   if (!CI)
845     return false;
846 
847   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
848     switch (Intrinsic->getIntrinsicID()) {
849     default:
850       return false;
851     case Intrinsic::amdgcn_if:
852     case Intrinsic::amdgcn_else: {
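      // Struct index 1 holds the saved-exec/mask value used to reconstruct
      // control flow, which lives in scalar registers and is therefore uniform.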
853       ArrayRef<unsigned> Indices = ExtValue->getIndices();
854       return Indices.size() == 1 && Indices[0] == 1;
855     }
856     }
857   }
858 
859   // If we have inline asm returning mixed SGPR and VGPR results, we inferred
860   // divergence for the overall struct return. We need to override that here
861   // if we are extracting an SGPR component.
862   if (CI->isInlineAsm())
863     return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
864 
865   return false;
866 }
867 
868 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
869                                             Intrinsic::ID IID) const {
870   switch (IID) {
871   case Intrinsic::amdgcn_atomic_inc:
872   case Intrinsic::amdgcn_atomic_dec:
873   case Intrinsic::amdgcn_ds_fadd:
874   case Intrinsic::amdgcn_ds_fmin:
875   case Intrinsic::amdgcn_ds_fmax:
876   case Intrinsic::amdgcn_is_shared:
877   case Intrinsic::amdgcn_is_private:
878     OpIndexes.push_back(0);
879     return true;
880   default:
881     return false;
882   }
883 }
884 
885 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
886                                                     Value *OldV,
887                                                     Value *NewV) const {
888   auto IntrID = II->getIntrinsicID();
889   switch (IntrID) {
890   case Intrinsic::amdgcn_atomic_inc:
891   case Intrinsic::amdgcn_atomic_dec:
892   case Intrinsic::amdgcn_ds_fadd:
893   case Intrinsic::amdgcn_ds_fmin:
894   case Intrinsic::amdgcn_ds_fmax: {
895     const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
896     if (!IsVolatile->isZero())
897       return nullptr;
898     Module *M = II->getParent()->getParent()->getParent();
899     Type *DestTy = II->getType();
900     Type *SrcTy = NewV->getType();
901     Function *NewDecl =
902         Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
903     II->setArgOperand(0, NewV);
904     II->setCalledFunction(NewDecl);
905     return II;
906   }
907   case Intrinsic::amdgcn_is_shared:
908   case Intrinsic::amdgcn_is_private: {
909     unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
910       AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
911     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
912     LLVMContext &Ctx = NewV->getType()->getContext();
913     ConstantInt *NewVal = (TrueAS == NewAS) ?
914       ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
915     return NewVal;
916   }
917   case Intrinsic::ptrmask: {
918     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
919     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
920     Value *MaskOp = II->getArgOperand(1);
921     Type *MaskTy = MaskOp->getType();
922 
923     bool DoTruncate = false;
924     if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
925       // All valid 64-bit to 32-bit casts work by chopping off the high
926       // bits. A mask that only clears low bits will therefore still apply in
927       // the new address space.
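      // For example, a mask such as 0xFFFF'FFFF'FFFF'F000 (at least 32 leading
      // ones) only clears low bits, so truncating it to 32 bits preserves its
      // effect in the narrower address space.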
928       if (DL.getPointerSizeInBits(OldAS) != 64 ||
929           DL.getPointerSizeInBits(NewAS) != 32)
930         return nullptr;
931 
932       // TODO: Do we need to thread more context in here?
933       KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
934       if (Known.countMinLeadingOnes() < 32)
935         return nullptr;
936 
937       DoTruncate = true;
938     }
939 
940     IRBuilder<> B(II);
941     if (DoTruncate) {
942       MaskTy = B.getInt32Ty();
943       MaskOp = B.CreateTrunc(MaskOp, MaskTy);
944     }
945 
946     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
947                              {NewV, MaskOp});
948   }
949   default:
950     return nullptr;
951   }
952 }
953 
954 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
955                                     int Index, VectorType *SubTp) {
956   if (ST->hasVOP3PInsts()) {
957     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
958         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
959       // With op_sel, VOP3P instructions can freely access either the low or
960       // the high half of a register, so any swizzle is free.
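      // For example, reversing a <2 x half> value needs no instructions; the
      // consuming packed instruction can swap halves via its op_sel bits.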
961 
962       switch (Kind) {
963       case TTI::SK_Broadcast:
964       case TTI::SK_Reverse:
965       case TTI::SK_PermuteSingleSrc:
966         return 0;
967       default:
968         break;
969       }
970     }
971   }
972 
973   return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
974 }
975 
976 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
977                                      const Function *Callee) const {
978   const TargetMachine &TM = getTLI()->getTargetMachine();
979   const GCNSubtarget *CallerST
980     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
981   const GCNSubtarget *CalleeST
982     = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
983 
984   const FeatureBitset &CallerBits = CallerST->getFeatureBits();
985   const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
986 
987   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
988   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
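  // Only inline if the callee's feature set is a subset of the caller's,
  // ignoring the features on the ignore list.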
989   if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
990     return false;
991 
992   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
993   // no way to support merge for backend defined attributes.
994   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
995   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
996   return CallerMode.isInlineCompatible(CalleeMode);
997 }
998 
999 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1000                                          TTI::UnrollingPreferences &UP) {
1001   CommonTTI.getUnrollingPreferences(L, SE, UP);
1002 }
1003 
1004 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1005                                        TTI::PeelingPreferences &PP) {
1006   CommonTTI.getPeelingPreferences(L, SE, PP);
1007 }
1008 
1009 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1010   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1011 }
1012 
1013 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1014   return getHardwareNumberOfRegisters(Vec);
1015 }
1016 
1017 unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
1018   return 32;
1019 }
1020 
1021 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1022   return 32;
1023 }
1024 
1025 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1026   if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1027       AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1028     return 128;
1029   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1030       AddrSpace == AMDGPUAS::REGION_ADDRESS)
1031     return 64;
1032   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1033     return 32;
1034 
1035   if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1036       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1037       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1038       AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1039     return 128;
1040   llvm_unreachable("unhandled address space");
1041 }
1042 
1043 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1044                                              Align Alignment,
1045                                              unsigned AddrSpace) const {
1046   // We allow vectorization of flat stores, even though we may need to decompose
1047   // them later if they may access private memory. We don't have enough context
1048   // here, and legalization can handle it.
1049   return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1050 }
1051 
1052 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1053                                               Align Alignment,
1054                                               unsigned AddrSpace) const {
1055   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1056 }
1057 
1058 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1059                                                Align Alignment,
1060                                                unsigned AddrSpace) const {
1061   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1062 }
1063 
1064 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1065   // Disable unrolling if the loop is not vectorized.
1066   // TODO: Enable this again.
1067   if (VF == 1)
1068     return 1;
1069 
1070   return 8;
1071 }
1072 
1073 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1074                                      TTI::TargetCostKind CostKind) {
1075   if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1076     return Opcode == Instruction::PHI ? 0 : 1;
1077 
1078   // XXX - For some reason this isn't called for switch.
1079   switch (Opcode) {
1080   case Instruction::Br:
1081   case Instruction::Ret:
1082     return 10;
1083   default:
1084     return BaseT::getCFInstrCost(Opcode, CostKind);
1085   }
1086 }
1087 
1088 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1089                                     unsigned Index) {
1090   switch (Opcode) {
1091   case Instruction::ExtractElement:
1092   case Instruction::InsertElement: {
1093     unsigned EltSize
1094       = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1095     if (EltSize < 32) {
1096       return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1097     }
1098 
1099     // Extracts are just reads of a subregister, so are free. Inserts are
1100     // considered free because we don't want to have any cost for scalarizing
1101     // operations, and we don't have to copy into a different register class.
1102 
1103     // Dynamic indexing isn't free and is best avoided.
1104     return Index == ~0u ? 2 : 0;
1105   }
1106   default:
1107     return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1108   }
1109 }
1110 
1111 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1112                                           TTI::UnrollingPreferences &UP) {
1113   CommonTTI.getUnrollingPreferences(L, SE, UP);
1114 }
1115 
1116 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1117                                         TTI::PeelingPreferences &PP) {
1118   CommonTTI.getPeelingPreferences(L, SE, PP);
1119 }
1120