Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric#include "AMDGPUTargetTransformInfo.h"
e8d8bef9SDimitry Andric#include "AMDGPUTargetMachine.h"
349cc55cSDimitry Andric#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
0b57cec5SDimitry Andric#include "llvm/Analysis/LoopInfo.h"
0b57cec5SDimitry Andric#include "llvm/Analysis/ValueTracking.h"
fe6060f1SDimitry Andric#include "llvm/IR/IRBuilder.h"
349cc55cSDimitry Andric#include "llvm/IR/IntrinsicsAMDGPU.h"
0b57cec5SDimitry Andric#include "llvm/IR/PatternMatch.h"
e8d8bef9SDimitry Andric#include "llvm/Support/KnownBits.h"
bdd1243dSDimitry Andric#include <optional>
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricusing namespace llvm;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric#define DEBUG_TYPE "AMDGPUtti"
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic cl::opt<unsigned> UnrollThresholdPrivate(
0b57cec5SDimitry Andric  "amdgpu-unroll-threshold-private",
0b57cec5SDimitry Andric  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
480093f4SDimitry Andric  cl::init(2700), cl::Hidden);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic cl::opt<unsigned> UnrollThresholdLocal(
0b57cec5SDimitry Andric  "amdgpu-unroll-threshold-local",
0b57cec5SDimitry Andric  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
0b57cec5SDimitry Andric  cl::init(1000), cl::Hidden);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricstatic cl::opt<unsigned> UnrollThresholdIf(
0b57cec5SDimitry Andric  "amdgpu-unroll-threshold-if",
0b57cec5SDimitry Andric  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
fe6060f1SDimitry Andric  cl::init(200), cl::Hidden);
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andricstatic cl::opt<bool> UnrollRuntimeLocal(
5ffd83dbSDimitry Andric  "amdgpu-unroll-runtime-local",
5ffd83dbSDimitry Andric  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
5ffd83dbSDimitry Andric  cl::init(true), cl::Hidden);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andricstatic cl::opt<bool> UseLegacyDA(
5ffd83dbSDimitry Andric  "amdgpu-use-legacy-divergence-analysis",
5ffd83dbSDimitry Andric  cl::desc("Enable legacy divergence analysis for AMDGPU"),
5ffd83dbSDimitry Andric  cl::init(false), cl::Hidden);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andricstatic cl::opt<unsigned> UnrollMaxBlockToAnalyze(
5ffd83dbSDimitry Andric    "amdgpu-unroll-max-block-to-analyze",
5ffd83dbSDimitry Andric    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
e8d8bef9SDimitry Andric    cl::init(32), cl::Hidden);
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andricstatic cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
e8d8bef9SDimitry Andric                                       cl::Hidden, cl::init(4000),
e8d8bef9SDimitry Andric                                       cl::desc("Cost of alloca argument"));
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric// If the amount of scratch memory to eliminate exceeds our ability to allocate
e8d8bef9SDimitry Andric// it into registers we gain nothing by aggressively inlining functions for that
e8d8bef9SDimitry Andric// heuristic.
e8d8bef9SDimitry Andricstatic cl::opt<unsigned>
e8d8bef9SDimitry Andric    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
e8d8bef9SDimitry Andric                    cl::init(256),
e8d8bef9SDimitry Andric                    cl::desc("Maximum alloca size to use for inline cost"));
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric// Inliner constraint to achieve reasonable compilation time.
e8d8bef9SDimitry Andricstatic cl::opt<size_t> InlineMaxBB(
e8d8bef9SDimitry Andric    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
e8d8bef9SDimitry Andric    cl::desc("Maximum number of BBs allowed in a function after inlining"
e8d8bef9SDimitry Andric             " (compile time constraint)"));
5ffd83dbSDimitry Andric
0b57cec5SDimitry Andricstatic bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
0b57cec5SDimitry Andric                              unsigned Depth = 0) {
0b57cec5SDimitry Andric  const Instruction *I = dyn_cast<Instruction>(Cond);
0b57cec5SDimitry Andric  if (!I)
0b57cec5SDimitry Andric    return false;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  for (const Value *V : I->operand_values()) {
0b57cec5SDimitry Andric    if (!L->contains(I))
0b57cec5SDimitry Andric      continue;
0b57cec5SDimitry Andric    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
0b57cec5SDimitry Andric      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
0b57cec5SDimitry Andric                  return SubLoop->contains(PHI); }))
0b57cec5SDimitry Andric        return true;
0b57cec5SDimitry Andric    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
0b57cec5SDimitry Andric      return true;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  return false;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
e8d8bef9SDimitry AndricAMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
e8d8bef9SDimitry Andric    : BaseT(TM, F.getParent()->getDataLayout()),
e8d8bef9SDimitry Andric      TargetTriple(TM->getTargetTriple()),
e8d8bef9SDimitry Andric      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
e8d8bef9SDimitry Andric      TLI(ST->getTargetLowering()) {}
e8d8bef9SDimitry Andric
0b57cec5SDimitry Andricvoid AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
349cc55cSDimitry Andric                                            TTI::UnrollingPreferences &UP,
349cc55cSDimitry Andric                                            OptimizationRemarkEmitter *ORE) {
480093f4SDimitry Andric  const Function &F = *L->getHeader()->getParent();
bdd1243dSDimitry Andric  UP.Threshold =
bdd1243dSDimitry Andric      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
0b57cec5SDimitry Andric  UP.MaxCount = std::numeric_limits<unsigned>::max();
0b57cec5SDimitry Andric  UP.Partial = true;
0b57cec5SDimitry Andric
fe6060f1SDimitry Andric  // Conditional branch in a loop back edge needs 3 additional exec
fe6060f1SDimitry Andric  // manipulations in average.
fe6060f1SDimitry Andric  UP.BEInsns += 3;
fe6060f1SDimitry Andric
0b57cec5SDimitry Andric  // TODO: Do we want runtime unrolling?
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Maximum alloca size than can fit registers. Reserve 16 registers.
0b57cec5SDimitry Andric  const unsigned MaxAlloca = (256 - 16) * 4;
0b57cec5SDimitry Andric  unsigned ThresholdPrivate = UnrollThresholdPrivate;
0b57cec5SDimitry Andric  unsigned ThresholdLocal = UnrollThresholdLocal;
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
e8d8bef9SDimitry Andric  // provided threshold value as the default for Threshold
e8d8bef9SDimitry Andric  if (MDNode *LoopUnrollThreshold =
e8d8bef9SDimitry Andric          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
e8d8bef9SDimitry Andric    if (LoopUnrollThreshold->getNumOperands() == 2) {
e8d8bef9SDimitry Andric      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
e8d8bef9SDimitry Andric          LoopUnrollThreshold->getOperand(1));
e8d8bef9SDimitry Andric      if (MetaThresholdValue) {
e8d8bef9SDimitry Andric        // We will also use the supplied value for PartialThreshold for now.
e8d8bef9SDimitry Andric        // We may introduce additional metadata if it becomes necessary in the
e8d8bef9SDimitry Andric        // future.
e8d8bef9SDimitry Andric        UP.Threshold = MetaThresholdValue->getSExtValue();
e8d8bef9SDimitry Andric        UP.PartialThreshold = UP.Threshold;
e8d8bef9SDimitry Andric        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
e8d8bef9SDimitry Andric        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
e8d8bef9SDimitry Andric      }
e8d8bef9SDimitry Andric    }
e8d8bef9SDimitry Andric  }
e8d8bef9SDimitry Andric
0b57cec5SDimitry Andric  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
0b57cec5SDimitry Andric  for (const BasicBlock *BB : L->getBlocks()) {
0b57cec5SDimitry Andric    const DataLayout &DL = BB->getModule()->getDataLayout();
0b57cec5SDimitry Andric    unsigned LocalGEPsSeen = 0;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
0b57cec5SDimitry Andric               return SubLoop->contains(BB); }))
0b57cec5SDimitry Andric        continue; // Block belongs to an inner loop.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    for (const Instruction &I : *BB) {
0b57cec5SDimitry Andric      // Unroll a loop which contains an "if" statement whose condition
0b57cec5SDimitry Andric      // defined by a PHI belonging to the loop. This may help to eliminate
0b57cec5SDimitry Andric      // if region and potentially even PHI itself, saving on both divergence
0b57cec5SDimitry Andric      // and registers used for the PHI.
0b57cec5SDimitry Andric      // Add a small bonus for each of such "if" statements.
0b57cec5SDimitry Andric      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
0b57cec5SDimitry Andric        if (UP.Threshold < MaxBoost && Br->isConditional()) {
0b57cec5SDimitry Andric          BasicBlock *Succ0 = Br->getSuccessor(0);
0b57cec5SDimitry Andric          BasicBlock *Succ1 = Br->getSuccessor(1);
0b57cec5SDimitry Andric          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
0b57cec5SDimitry Andric              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
0b57cec5SDimitry Andric            continue;
0b57cec5SDimitry Andric          if (dependsOnLocalPhi(L, Br->getCondition())) {
0b57cec5SDimitry Andric            UP.Threshold += UnrollThresholdIf;
0b57cec5SDimitry Andric            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
0b57cec5SDimitry Andric                              << " for loop:\n"
0b57cec5SDimitry Andric                              << *L << " due to " << *Br << '\n');
0b57cec5SDimitry Andric            if (UP.Threshold >= MaxBoost)
0b57cec5SDimitry Andric              return;
0b57cec5SDimitry Andric          }
0b57cec5SDimitry Andric        }
0b57cec5SDimitry Andric        continue;
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
0b57cec5SDimitry Andric      if (!GEP)
0b57cec5SDimitry Andric        continue;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      unsigned AS = GEP->getAddressSpace();
0b57cec5SDimitry Andric      unsigned Threshold = 0;
0b57cec5SDimitry Andric      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
0b57cec5SDimitry Andric        Threshold = ThresholdPrivate;
0b57cec5SDimitry Andric      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
0b57cec5SDimitry Andric        Threshold = ThresholdLocal;
0b57cec5SDimitry Andric      else
0b57cec5SDimitry Andric        continue;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      if (UP.Threshold >= Threshold)
0b57cec5SDimitry Andric        continue;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
0b57cec5SDimitry Andric        const Value *Ptr = GEP->getPointerOperand();
0b57cec5SDimitry Andric        const AllocaInst *Alloca =
e8d8bef9SDimitry Andric            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
0b57cec5SDimitry Andric        if (!Alloca || !Alloca->isStaticAlloca())
0b57cec5SDimitry Andric          continue;
0b57cec5SDimitry Andric        Type *Ty = Alloca->getAllocatedType();
0b57cec5SDimitry Andric        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
0b57cec5SDimitry Andric        if (AllocaSize > MaxAlloca)
0b57cec5SDimitry Andric          continue;
0b57cec5SDimitry Andric      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
0b57cec5SDimitry Andric                 AS == AMDGPUAS::REGION_ADDRESS) {
0b57cec5SDimitry Andric        LocalGEPsSeen++;
0b57cec5SDimitry Andric        // Inhibit unroll for local memory if we have seen addressing not to
0b57cec5SDimitry Andric        // a variable, most likely we will be unable to combine it.
0b57cec5SDimitry Andric        // Do not unroll too deep inner loops for local memory to give a chance
0b57cec5SDimitry Andric        // to unroll an outer loop for a more important reason.
0b57cec5SDimitry Andric        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
0b57cec5SDimitry Andric            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
0b57cec5SDimitry Andric             !isa<Argument>(GEP->getPointerOperand())))
0b57cec5SDimitry Andric          continue;
5ffd83dbSDimitry Andric        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
5ffd83dbSDimitry Andric                          << *L << " due to LDS use.\n");
5ffd83dbSDimitry Andric        UP.Runtime = UnrollRuntimeLocal;
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      // Check if GEP depends on a value defined by this loop itself.
0b57cec5SDimitry Andric      bool HasLoopDef = false;
0b57cec5SDimitry Andric      for (const Value *Op : GEP->operands()) {
0b57cec5SDimitry Andric        const Instruction *Inst = dyn_cast<Instruction>(Op);
0b57cec5SDimitry Andric        if (!Inst || L->isLoopInvariant(Op))
0b57cec5SDimitry Andric          continue;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
0b57cec5SDimitry Andric             return SubLoop->contains(Inst); }))
0b57cec5SDimitry Andric          continue;
0b57cec5SDimitry Andric        HasLoopDef = true;
0b57cec5SDimitry Andric        break;
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric      if (!HasLoopDef)
0b57cec5SDimitry Andric        continue;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      // We want to do whatever we can to limit the number of alloca
0b57cec5SDimitry Andric      // instructions that make it through to the code generator.  allocas
0b57cec5SDimitry Andric      // require us to use indirect addressing, which is slow and prone to
0b57cec5SDimitry Andric      // compiler bugs.  If this loop does an address calculation on an
0b57cec5SDimitry Andric      // alloca ptr, then we want to use a higher than normal loop unroll
0b57cec5SDimitry Andric      // threshold. This will give SROA a better chance to eliminate these
0b57cec5SDimitry Andric      // allocas.
0b57cec5SDimitry Andric      //
0b57cec5SDimitry Andric      // We also want to have more unrolling for local memory to let ds
0b57cec5SDimitry Andric      // instructions with different offsets combine.
0b57cec5SDimitry Andric      //
0b57cec5SDimitry Andric      // Don't use the maximum allowed value here as it will make some
0b57cec5SDimitry Andric      // programs way too big.
0b57cec5SDimitry Andric      UP.Threshold = Threshold;
0b57cec5SDimitry Andric      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
0b57cec5SDimitry Andric                        << " for loop:\n"
0b57cec5SDimitry Andric                        << *L << " due to " << *GEP << '\n');
0b57cec5SDimitry Andric      if (UP.Threshold >= MaxBoost)
0b57cec5SDimitry Andric        return;
0b57cec5SDimitry Andric    }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    // If we got a GEP in a small BB from inner loop then increase max trip
5ffd83dbSDimitry Andric    // count to analyze for better estimation cost in unroll
e8d8bef9SDimitry Andric    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
5ffd83dbSDimitry Andric      UP.MaxIterationsCountToAnalyze = 32;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andricvoid AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5ffd83dbSDimitry Andric                                          TTI::PeelingPreferences &PP) {
5ffd83dbSDimitry Andric  BaseT::getPeelingPreferences(L, SE, PP);
5ffd83dbSDimitry Andric}
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andricconst FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
e8d8bef9SDimitry Andric    // Codegen control options which don't matter.
e8d8bef9SDimitry Andric    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
e8d8bef9SDimitry Andric    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
e8d8bef9SDimitry Andric    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
e8d8bef9SDimitry Andric    AMDGPU::FeatureUnalignedAccessMode,
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    AMDGPU::FeatureAutoWaitcntBeforeBarrier,
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    // Property of the kernel/environment which can't actually differ.
e8d8bef9SDimitry Andric    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
e8d8bef9SDimitry Andric    AMDGPU::FeatureTrapHandler,
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    // The default assumption needs to be ecc is enabled, but no directly
e8d8bef9SDimitry Andric    // exposed operations depend on it, so it can be safely inlined.
e8d8bef9SDimitry Andric    AMDGPU::FeatureSRAMECC,
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    // Perf-tuning features
e8d8bef9SDimitry Andric    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
e8d8bef9SDimitry Andric
e8d8bef9SDimitry AndricGCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
e8d8bef9SDimitry Andric    : BaseT(TM, F.getParent()->getDataLayout()),
e8d8bef9SDimitry Andric      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
e8d8bef9SDimitry Andric      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
81ad6265SDimitry Andric      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
e8d8bef9SDimitry Andric  AMDGPU::SIModeRegisterDefaults Mode(F);
e8d8bef9SDimitry Andric  HasFP32Denormals = Mode.allFP32Denormals();
e8d8bef9SDimitry Andric  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
e8d8bef9SDimitry Andric}
e8d8bef9SDimitry Andric
81ad6265SDimitry Andricunsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
81ad6265SDimitry Andric  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
81ad6265SDimitry Andric  // registers. See getRegisterClassForType for the implementation.
81ad6265SDimitry Andric  // In this case vector registers are not vector in terms of
81ad6265SDimitry Andric  // VGPRs, but those which can hold multiple values.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // This is really the number of registers to fill when vectorizing /
0b57cec5SDimitry Andric  // interleaving loops, so we lie to avoid trying to use all registers.
81ad6265SDimitry Andric  return 4;
5ffd83dbSDimitry Andric}
5ffd83dbSDimitry Andric
fe6060f1SDimitry AndricTypeSize
fe6060f1SDimitry AndricGCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
fe6060f1SDimitry Andric  switch (K) {
fe6060f1SDimitry Andric  case TargetTransformInfo::RGK_Scalar:
fe6060f1SDimitry Andric    return TypeSize::getFixed(32);
fe6060f1SDimitry Andric  case TargetTransformInfo::RGK_FixedWidthVector:
fe6060f1SDimitry Andric    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
fe6060f1SDimitry Andric  case TargetTransformInfo::RGK_ScalableVector:
fe6060f1SDimitry Andric    return TypeSize::getScalable(0);
fe6060f1SDimitry Andric  }
fe6060f1SDimitry Andric  llvm_unreachable("Unsupported register kind");
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricunsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
0b57cec5SDimitry Andric  return 32;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
e8d8bef9SDimitry Andricunsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
e8d8bef9SDimitry Andric  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
e8d8bef9SDimitry Andric    return 32 * 4 / ElemWidth;
fe6060f1SDimitry Andric  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
fe6060f1SDimitry Andric       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
fe6060f1SDimitry Andric       : 1;
e8d8bef9SDimitry Andric}
e8d8bef9SDimitry Andric
0b57cec5SDimitry Andricunsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
0b57cec5SDimitry Andric                                         unsigned ChainSizeInBytes,
0b57cec5SDimitry Andric                                         VectorType *VecTy) const {
0b57cec5SDimitry Andric  unsigned VecRegBitWidth = VF * LoadSize;
0b57cec5SDimitry Andric  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
0b57cec5SDimitry Andric    // TODO: Support element-size less than 32bit?
0b57cec5SDimitry Andric    return 128 / LoadSize;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  return VF;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricunsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
0b57cec5SDimitry Andric                                             unsigned ChainSizeInBytes,
0b57cec5SDimitry Andric                                             VectorType *VecTy) const {
0b57cec5SDimitry Andric  unsigned VecRegBitWidth = VF * StoreSize;
0b57cec5SDimitry Andric  if (VecRegBitWidth > 128)
0b57cec5SDimitry Andric    return 128 / StoreSize;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  return VF;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricunsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
0b57cec5SDimitry Andric  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
0b57cec5SDimitry Andric      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
0b57cec5SDimitry Andric      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
0b57cec5SDimitry Andric      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
0b57cec5SDimitry Andric    return 512;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
0b57cec5SDimitry Andric    return 8 * ST->getMaxPrivateElementSize();
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andric  // Common to flat, global, local and region. Assume for unknown addrspace.
5ffd83dbSDimitry Andric  return 128;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
5ffd83dbSDimitry Andric                                            Align Alignment,
0b57cec5SDimitry Andric                                            unsigned AddrSpace) const {
0b57cec5SDimitry Andric  // We allow vectorization of flat stores, even though we may need to decompose
0b57cec5SDimitry Andric  // them later if they may access private memory. We don't have enough context
0b57cec5SDimitry Andric  // here, and legalization can handle it.
0b57cec5SDimitry Andric  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
0b57cec5SDimitry Andric    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
0b57cec5SDimitry Andric      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  return true;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
5ffd83dbSDimitry Andric                                             Align Alignment,
0b57cec5SDimitry Andric                                             unsigned AddrSpace) const {
0b57cec5SDimitry Andric  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
5ffd83dbSDimitry Andric                                              Align Alignment,
0b57cec5SDimitry Andric                                              unsigned AddrSpace) const {
0b57cec5SDimitry Andric  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andric// FIXME: Really we would like to issue multiple 128-bit loads and stores per
5ffd83dbSDimitry Andric// iteration. Should we report a larger size and let it legalize?
5ffd83dbSDimitry Andric//
5ffd83dbSDimitry Andric// FIXME: Should we use narrower types for local/region, or account for when
5ffd83dbSDimitry Andric// unaligned access is legal?
5ffd83dbSDimitry Andric//
5ffd83dbSDimitry Andric// FIXME: This could use fine tuning and microbenchmarks.
81ad6265SDimitry AndricType *GCNTTIImpl::getMemcpyLoopLoweringType(
81ad6265SDimitry Andric    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
81ad6265SDimitry Andric    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
bdd1243dSDimitry Andric    std::optional<uint32_t> AtomicElementSize) const {
81ad6265SDimitry Andric
81ad6265SDimitry Andric  if (AtomicElementSize)
81ad6265SDimitry Andric    return Type::getIntNTy(Context, *AtomicElementSize * 8);
81ad6265SDimitry Andric
5ffd83dbSDimitry Andric  unsigned MinAlign = std::min(SrcAlign, DestAlign);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
5ffd83dbSDimitry Andric  // hardware into byte accesses. If you assume all alignments are equally
5ffd83dbSDimitry Andric  // probable, it's more efficient on average to use short accesses for this
5ffd83dbSDimitry Andric  // case.
5ffd83dbSDimitry Andric  if (MinAlign == 2)
5ffd83dbSDimitry Andric    return Type::getInt16Ty(Context);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  // Not all subtargets have 128-bit DS instructions, and we currently don't
5ffd83dbSDimitry Andric  // form them by default.
5ffd83dbSDimitry Andric  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
5ffd83dbSDimitry Andric      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
5ffd83dbSDimitry Andric      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
5ffd83dbSDimitry Andric      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
5ffd83dbSDimitry Andric    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  // Global memory works best with 16-byte accesses. Private memory will also
5ffd83dbSDimitry Andric  // hit this, although they'll be decomposed.
5ffd83dbSDimitry Andric  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
5ffd83dbSDimitry Andric}
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andricvoid GCNTTIImpl::getMemcpyLoopResidualLoweringType(
5ffd83dbSDimitry Andric    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
5ffd83dbSDimitry Andric    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
81ad6265SDimitry Andric    unsigned SrcAlign, unsigned DestAlign,
bdd1243dSDimitry Andric    std::optional<uint32_t> AtomicCpySize) const {
5ffd83dbSDimitry Andric  assert(RemainingBytes < 16);
5ffd83dbSDimitry Andric
81ad6265SDimitry Andric  if (AtomicCpySize)
81ad6265SDimitry Andric    BaseT::getMemcpyLoopResidualLoweringType(
81ad6265SDimitry Andric        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
81ad6265SDimitry Andric        DestAlign, AtomicCpySize);
81ad6265SDimitry Andric
5ffd83dbSDimitry Andric  unsigned MinAlign = std::min(SrcAlign, DestAlign);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  if (MinAlign != 2) {
5ffd83dbSDimitry Andric    Type *I64Ty = Type::getInt64Ty(Context);
5ffd83dbSDimitry Andric    while (RemainingBytes >= 8) {
5ffd83dbSDimitry Andric      OpsOut.push_back(I64Ty);
5ffd83dbSDimitry Andric      RemainingBytes -= 8;
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    Type *I32Ty = Type::getInt32Ty(Context);
5ffd83dbSDimitry Andric    while (RemainingBytes >= 4) {
5ffd83dbSDimitry Andric      OpsOut.push_back(I32Ty);
5ffd83dbSDimitry Andric      RemainingBytes -= 4;
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  Type *I16Ty = Type::getInt16Ty(Context);
5ffd83dbSDimitry Andric  while (RemainingBytes >= 2) {
5ffd83dbSDimitry Andric    OpsOut.push_back(I16Ty);
5ffd83dbSDimitry Andric    RemainingBytes -= 2;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  Type *I8Ty = Type::getInt8Ty(Context);
5ffd83dbSDimitry Andric  while (RemainingBytes) {
5ffd83dbSDimitry Andric    OpsOut.push_back(I8Ty);
5ffd83dbSDimitry Andric    --RemainingBytes;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric}
5ffd83dbSDimitry Andric
0b57cec5SDimitry Andricunsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
0b57cec5SDimitry Andric  // Disable unrolling if the loop is not vectorized.
0b57cec5SDimitry Andric  // TODO: Enable this again.
0b57cec5SDimitry Andric  if (VF == 1)
0b57cec5SDimitry Andric    return 1;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  return 8;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
0b57cec5SDimitry Andric                                       MemIntrinsicInfo &Info) const {
0b57cec5SDimitry Andric  switch (Inst->getIntrinsicID()) {
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_atomic_inc:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_atomic_dec:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_ds_ordered_add:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_ds_ordered_swap:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_ds_fadd:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_ds_fmin:
0b57cec5SDimitry Andric  case Intrinsic::amdgcn_ds_fmax: {
0b57cec5SDimitry Andric    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
0b57cec5SDimitry Andric    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
0b57cec5SDimitry Andric    if (!Ordering || !Volatile)
0b57cec5SDimitry Andric      return false; // Invalid.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    unsigned OrderingVal = Ordering->getZExtValue();
0b57cec5SDimitry Andric    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
0b57cec5SDimitry Andric      return false;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    Info.PtrVal = Inst->getArgOperand(0);
0b57cec5SDimitry Andric    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
0b57cec5SDimitry Andric    Info.ReadMem = true;
0b57cec5SDimitry Andric    Info.WriteMem = true;
349cc55cSDimitry Andric    Info.IsVolatile = !Volatile->isZero();
0b57cec5SDimitry Andric    return true;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  default:
0b57cec5SDimitry Andric    return false;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
fe6060f1SDimitry AndricInstructionCost GCNTTIImpl::getArithmeticInstrCost(
fe6060f1SDimitry Andric    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
bdd1243dSDimitry Andric    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
bdd1243dSDimitry Andric    ArrayRef<const Value *> Args,
480093f4SDimitry Andric    const Instruction *CxtI) {
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Legalize the type.
bdd1243dSDimitry Andric  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
0b57cec5SDimitry Andric  int ISD = TLI->InstructionOpcodeToISD(Opcode);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Because we don't have any legal vector operations, but the legal types, we
0b57cec5SDimitry Andric  // need to account for split vectors.
0b57cec5SDimitry Andric  unsigned NElts = LT.second.isVector() ?
0b57cec5SDimitry Andric    LT.second.getVectorNumElements() : 1;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  switch (ISD) {
0b57cec5SDimitry Andric  case ISD::SHL:
0b57cec5SDimitry Andric  case ISD::SRL:
0b57cec5SDimitry Andric  case ISD::SRA:
0b57cec5SDimitry Andric    if (SLT == MVT::i64)
e8d8bef9SDimitry Andric      return get64BitInstrCost(CostKind) * LT.first * NElts;
0b57cec5SDimitry Andric
480093f4SDimitry Andric    if (ST->has16BitInsts() && SLT == MVT::i16)
480093f4SDimitry Andric      NElts = (NElts + 1) / 2;
480093f4SDimitry Andric
0b57cec5SDimitry Andric    // i32
0b57cec5SDimitry Andric    return getFullRateInstrCost() * LT.first * NElts;
0b57cec5SDimitry Andric  case ISD::ADD:
0b57cec5SDimitry Andric  case ISD::SUB:
0b57cec5SDimitry Andric  case ISD::AND:
0b57cec5SDimitry Andric  case ISD::OR:
0b57cec5SDimitry Andric  case ISD::XOR:
0b57cec5SDimitry Andric    if (SLT == MVT::i64) {
0b57cec5SDimitry Andric      // and, or and xor are typically split into 2 VALU instructions.
0b57cec5SDimitry Andric      return 2 * getFullRateInstrCost() * LT.first * NElts;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
480093f4SDimitry Andric    if (ST->has16BitInsts() && SLT == MVT::i16)
480093f4SDimitry Andric      NElts = (NElts + 1) / 2;
480093f4SDimitry Andric
0b57cec5SDimitry Andric    return LT.first * NElts * getFullRateInstrCost();
0b57cec5SDimitry Andric  case ISD::MUL: {
e8d8bef9SDimitry Andric    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
0b57cec5SDimitry Andric    if (SLT == MVT::i64) {
0b57cec5SDimitry Andric      const int FullRateCost = getFullRateInstrCost();
0b57cec5SDimitry Andric      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
480093f4SDimitry Andric    if (ST->has16BitInsts() && SLT == MVT::i16)
480093f4SDimitry Andric      NElts = (NElts + 1) / 2;
480093f4SDimitry Andric
0b57cec5SDimitry Andric    // i32
0b57cec5SDimitry Andric    return QuarterRateCost * NElts * LT.first;
0b57cec5SDimitry Andric  }
e8d8bef9SDimitry Andric  case ISD::FMUL:
e8d8bef9SDimitry Andric    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
e8d8bef9SDimitry Andric    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
e8d8bef9SDimitry Andric    // fused operation.
e8d8bef9SDimitry Andric    if (CxtI && CxtI->hasOneUse())
e8d8bef9SDimitry Andric      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
e8d8bef9SDimitry Andric        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
e8d8bef9SDimitry Andric        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
e8d8bef9SDimitry Andric          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
e8d8bef9SDimitry Andric            return TargetTransformInfo::TCC_Free;
e8d8bef9SDimitry Andric          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
e8d8bef9SDimitry Andric            return TargetTransformInfo::TCC_Free;
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric          // Estimate all types may be fused with contract/unsafe flags
e8d8bef9SDimitry Andric          const TargetOptions &Options = TLI->getTargetMachine().Options;
e8d8bef9SDimitry Andric          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
e8d8bef9SDimitry Andric              Options.UnsafeFPMath ||
e8d8bef9SDimitry Andric              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
e8d8bef9SDimitry Andric            return TargetTransformInfo::TCC_Free;
e8d8bef9SDimitry Andric        }
e8d8bef9SDimitry Andric      }
bdd1243dSDimitry Andric    [[fallthrough]];
0b57cec5SDimitry Andric  case ISD::FADD:
0b57cec5SDimitry Andric  case ISD::FSUB:
fe6060f1SDimitry Andric    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
fe6060f1SDimitry Andric      NElts = (NElts + 1) / 2;
0b57cec5SDimitry Andric    if (SLT == MVT::f64)
e8d8bef9SDimitry Andric      return LT.first * NElts * get64BitInstrCost(CostKind);
0b57cec5SDimitry Andric
480093f4SDimitry Andric    if (ST->has16BitInsts() && SLT == MVT::f16)
480093f4SDimitry Andric      NElts = (NElts + 1) / 2;
480093f4SDimitry Andric
0b57cec5SDimitry Andric    if (SLT == MVT::f32 || SLT == MVT::f16)
0b57cec5SDimitry Andric      return LT.first * NElts * getFullRateInstrCost();
0b57cec5SDimitry Andric    break;
0b57cec5SDimitry Andric  case ISD::FDIV:
0b57cec5SDimitry Andric  case ISD::FREM:
0b57cec5SDimitry Andric    // FIXME: frem should be handled separately. The fdiv in it is most of it,
0b57cec5SDimitry Andric    // but the current lowering is also not entirely correct.
0b57cec5SDimitry Andric    if (SLT == MVT::f64) {
e8d8bef9SDimitry Andric      int Cost = 7 * get64BitInstrCost(CostKind) +
e8d8bef9SDimitry Andric                 getQuarterRateInstrCost(CostKind) +
e8d8bef9SDimitry Andric                 3 * getHalfRateInstrCost(CostKind);
0b57cec5SDimitry Andric      // Add cost of workaround.
0b57cec5SDimitry Andric      if (!ST->hasUsableDivScaleConditionOutput())
0b57cec5SDimitry Andric        Cost += 3 * getFullRateInstrCost();
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      return LT.first * Cost * NElts;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
0b57cec5SDimitry Andric      // TODO: This is more complicated, unsafe flags etc.
480093f4SDimitry Andric      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
0b57cec5SDimitry Andric          (SLT == MVT::f16 && ST->has16BitInsts())) {
e8d8bef9SDimitry Andric        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    if (SLT == MVT::f16 && ST->has16BitInsts()) {
0b57cec5SDimitry Andric      // 2 x v_cvt_f32_f16
0b57cec5SDimitry Andric      // f32 rcp
0b57cec5SDimitry Andric      // f32 fmul
0b57cec5SDimitry Andric      // v_cvt_f16_f32
0b57cec5SDimitry Andric      // f16 div_fixup
e8d8bef9SDimitry Andric      int Cost =
e8d8bef9SDimitry Andric          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
0b57cec5SDimitry Andric      return LT.first * Cost * NElts;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    if (SLT == MVT::f32 || SLT == MVT::f16) {
e8d8bef9SDimitry Andric      // 4 more v_cvt_* insts without f16 insts support
e8d8bef9SDimitry Andric      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
e8d8bef9SDimitry Andric                 1 * getQuarterRateInstrCost(CostKind);
0b57cec5SDimitry Andric
480093f4SDimitry Andric      if (!HasFP32Denormals) {
0b57cec5SDimitry Andric        // FP mode switches.
0b57cec5SDimitry Andric        Cost += 2 * getFullRateInstrCost();
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      return LT.first * NElts * Cost;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric    break;
5ffd83dbSDimitry Andric  case ISD::FNEG:
5ffd83dbSDimitry Andric    // Use the backend' estimation. If fneg is not free each element will cost
5ffd83dbSDimitry Andric    // one additional instruction.
5ffd83dbSDimitry Andric    return TLI->isFNegFree(SLT) ? 0 : NElts;
0b57cec5SDimitry Andric  default:
0b57cec5SDimitry Andric    break;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric
bdd1243dSDimitry Andric  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
bdd1243dSDimitry Andric                                       Args, CxtI);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
e8d8bef9SDimitry Andric// Return true if there's a potential benefit from using v2f16/v2i16
e8d8bef9SDimitry Andric// instructions for an intrinsic, even if it requires nontrivial legalization.
5ffd83dbSDimitry Andricstatic bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
5ffd83dbSDimitry Andric  switch (ID) {
5ffd83dbSDimitry Andric  case Intrinsic::fma: // TODO: fmuladd
5ffd83dbSDimitry Andric  // There's a small benefit to using vector ops in the legalized code.
5ffd83dbSDimitry Andric  case Intrinsic::round:
e8d8bef9SDimitry Andric  case Intrinsic::uadd_sat:
e8d8bef9SDimitry Andric  case Intrinsic::usub_sat:
e8d8bef9SDimitry Andric  case Intrinsic::sadd_sat:
e8d8bef9SDimitry Andric  case Intrinsic::ssub_sat:
5ffd83dbSDimitry Andric    return true;
5ffd83dbSDimitry Andric  default:
5ffd83dbSDimitry Andric    return false;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric}
480093f4SDimitry Andric
fe6060f1SDimitry AndricInstructionCost
fe6060f1SDimitry AndricGCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
5ffd83dbSDimitry Andric                                  TTI::TargetCostKind CostKind) {
5ffd83dbSDimitry Andric  if (ICA.getID() == Intrinsic::fabs)
5ffd83dbSDimitry Andric    return 0;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
5ffd83dbSDimitry Andric    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  Type *RetTy = ICA.getReturnType();
480093f4SDimitry Andric
480093f4SDimitry Andric  // Legalize the type.
bdd1243dSDimitry Andric  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
480093f4SDimitry Andric
480093f4SDimitry Andric  unsigned NElts = LT.second.isVector() ?
480093f4SDimitry Andric    LT.second.getVectorNumElements() : 1;
480093f4SDimitry Andric
480093f4SDimitry Andric  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
480093f4SDimitry Andric
480093f4SDimitry Andric  if (SLT == MVT::f64)
e8d8bef9SDimitry Andric    return LT.first * NElts * get64BitInstrCost(CostKind);
480093f4SDimitry Andric
fe6060f1SDimitry Andric  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
fe6060f1SDimitry Andric      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
480093f4SDimitry Andric    NElts = (NElts + 1) / 2;
480093f4SDimitry Andric
5ffd83dbSDimitry Andric  // TODO: Get more refined intrinsic costs?
e8d8bef9SDimitry Andric  unsigned InstRate = getQuarterRateInstrCost(CostKind);
fe6060f1SDimitry Andric
fe6060f1SDimitry Andric  switch (ICA.getID()) {
fe6060f1SDimitry Andric  case Intrinsic::fma:
e8d8bef9SDimitry Andric    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
e8d8bef9SDimitry Andric                                   : getQuarterRateInstrCost(CostKind);
fe6060f1SDimitry Andric    break;
fe6060f1SDimitry Andric  case Intrinsic::uadd_sat:
fe6060f1SDimitry Andric  case Intrinsic::usub_sat:
fe6060f1SDimitry Andric  case Intrinsic::sadd_sat:
fe6060f1SDimitry Andric  case Intrinsic::ssub_sat:
fe6060f1SDimitry Andric    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
fe6060f1SDimitry Andric    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
fe6060f1SDimitry Andric      NElts = 1;
fe6060f1SDimitry Andric    break;
480093f4SDimitry Andric  }
480093f4SDimitry Andric
5ffd83dbSDimitry Andric  return LT.first * NElts * InstRate;
480093f4SDimitry Andric}
480093f4SDimitry Andric
fe6060f1SDimitry AndricInstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
fe6060f1SDimitry Andric                                           TTI::TargetCostKind CostKind,
fe6060f1SDimitry Andric                                           const Instruction *I) {
fe6060f1SDimitry Andric  assert((I == nullptr || I->getOpcode() == Opcode) &&
fe6060f1SDimitry Andric         "Opcode should reflect passed instruction.");
fe6060f1SDimitry Andric  const bool SCost =
fe6060f1SDimitry Andric      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
fe6060f1SDimitry Andric  const int CBrCost = SCost ? 5 : 7;
0b57cec5SDimitry Andric  switch (Opcode) {
fe6060f1SDimitry Andric  case Instruction::Br: {
fe6060f1SDimitry Andric    // Branch instruction takes about 4 slots on gfx900.
fe6060f1SDimitry Andric    auto BI = dyn_cast_or_null<BranchInst>(I);
fe6060f1SDimitry Andric    if (BI && BI->isUnconditional())
fe6060f1SDimitry Andric      return SCost ? 1 : 4;
fe6060f1SDimitry Andric    // Suppose conditional branch takes additional 3 exec manipulations
fe6060f1SDimitry Andric    // instructions in average.
fe6060f1SDimitry Andric    return CBrCost;
0b57cec5SDimitry Andric  }
fe6060f1SDimitry Andric  case Instruction::Switch: {
fe6060f1SDimitry Andric    auto SI = dyn_cast_or_null<SwitchInst>(I);
fe6060f1SDimitry Andric    // Each case (including default) takes 1 cmp + 1 cbr instructions in
fe6060f1SDimitry Andric    // average.
fe6060f1SDimitry Andric    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
fe6060f1SDimitry Andric  }
fe6060f1SDimitry Andric  case Instruction::Ret:
fe6060f1SDimitry Andric    return SCost ? 1 : 10;
fe6060f1SDimitry Andric  }
fe6060f1SDimitry Andric  return BaseT::getCFInstrCost(Opcode, CostKind, I);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
fe6060f1SDimitry AndricInstructionCost
fe6060f1SDimitry AndricGCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bdd1243dSDimitry Andric                                       std::optional<FastMathFlags> FMF,
5ffd83dbSDimitry Andric                                       TTI::TargetCostKind CostKind) {
fe6060f1SDimitry Andric  if (TTI::requiresOrderedReduction(FMF))
fe6060f1SDimitry Andric    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
fe6060f1SDimitry Andric
0b57cec5SDimitry Andric  EVT OrigTy = TLI->getValueType(DL, Ty);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Computes cost on targets that have packed math instructions(which support
0b57cec5SDimitry Andric  // 16-bit types only).
fe6060f1SDimitry Andric  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
fe6060f1SDimitry Andric    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
0b57cec5SDimitry Andric
bdd1243dSDimitry Andric  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
0b57cec5SDimitry Andric  return LT.first * getFullRateInstrCost();
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
fe6060f1SDimitry AndricInstructionCost
fe6060f1SDimitry AndricGCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
fe6060f1SDimitry Andric                                   bool IsUnsigned,
5ffd83dbSDimitry Andric                                   TTI::TargetCostKind CostKind) {
0b57cec5SDimitry Andric  EVT OrigTy = TLI->getValueType(DL, Ty);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Computes cost on targets that have packed math instructions(which support
0b57cec5SDimitry Andric  // 16-bit types only).
fe6060f1SDimitry Andric  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
fe6060f1SDimitry Andric    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
0b57cec5SDimitry Andric
bdd1243dSDimitry Andric  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
e8d8bef9SDimitry Andric  return LT.first * getHalfRateInstrCost(CostKind);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
fe6060f1SDimitry AndricInstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
bdd1243dSDimitry Andric                                               TTI::TargetCostKind CostKind,
bdd1243dSDimitry Andric                                               unsigned Index, Value *Op0,
bdd1243dSDimitry Andric                                               Value *Op1) {
0b57cec5SDimitry Andric  switch (Opcode) {
0b57cec5SDimitry Andric  case Instruction::ExtractElement:
0b57cec5SDimitry Andric  case Instruction::InsertElement: {
0b57cec5SDimitry Andric    unsigned EltSize
0b57cec5SDimitry Andric      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
0b57cec5SDimitry Andric    if (EltSize < 32) {
0b57cec5SDimitry Andric      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
0b57cec5SDimitry Andric        return 0;
bdd1243dSDimitry Andric      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
bdd1243dSDimitry Andric                                       Op1);
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    // Extracts are just reads of a subregister, so are free. Inserts are
0b57cec5SDimitry Andric    // considered free because we don't want to have any cost for scalarizing
0b57cec5SDimitry Andric    // operations, and we don't have to copy into a different register class.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric    // Dynamic indexing isn't free and is best avoided.
0b57cec5SDimitry Andric    return Index == ~0u ? 2 : 0;
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric  default:
bdd1243dSDimitry Andric    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andric/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
5ffd83dbSDimitry Andric/// this is analyzing the collective result of all output registers. Otherwise,
5ffd83dbSDimitry Andric/// this is only querying a specific result index if this returns multiple
5ffd83dbSDimitry Andric/// registers in a struct.
5ffd83dbSDimitry Andricbool GCNTTIImpl::isInlineAsmSourceOfDivergence(
5ffd83dbSDimitry Andric  const CallInst *CI, ArrayRef<unsigned> Indices) const {
5ffd83dbSDimitry Andric  // TODO: Handle complex extract indices
5ffd83dbSDimitry Andric  if (Indices.size() > 1)
5ffd83dbSDimitry Andric    return true;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  const DataLayout &DL = CI->getModule()->getDataLayout();
5ffd83dbSDimitry Andric  const SIRegisterInfo *TRI = ST->getRegisterInfo();
5ffd83dbSDimitry Andric  TargetLowering::AsmOperandInfoVector TargetConstraints =
5ffd83dbSDimitry Andric      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  int OutputIdx = 0;
5ffd83dbSDimitry Andric  for (auto &TC : TargetConstraints) {
5ffd83dbSDimitry Andric    if (TC.Type != InlineAsm::isOutput)
5ffd83dbSDimitry Andric      continue;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    // Skip outputs we don't care about.
5ffd83dbSDimitry Andric    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
5ffd83dbSDimitry Andric      continue;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    TLI->ComputeConstraintToUse(TC, SDValue());
5ffd83dbSDimitry Andric
04eeddc0SDimitry Andric    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
04eeddc0SDimitry Andric        TRI, TC.ConstraintCode, TC.ConstraintVT).second;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    // For AGPR constraints null is returned on subtargets without AGPRs, so
5ffd83dbSDimitry Andric    // assume divergent for null.
5ffd83dbSDimitry Andric    if (!RC || !TRI->isSGPRClass(RC))
5ffd83dbSDimitry Andric      return true;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  return false;
5ffd83dbSDimitry Andric}
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric/// \returns true if the new GPU divergence analysis is enabled.
5ffd83dbSDimitry Andricbool GCNTTIImpl::useGPUDivergenceAnalysis() const {
5ffd83dbSDimitry Andric  return !UseLegacyDA;
5ffd83dbSDimitry Andric}
5ffd83dbSDimitry Andric
bdd1243dSDimitry Andricbool GCNTTIImpl::isReadRegisterSourceOfDivergence(
bdd1243dSDimitry Andric    const IntrinsicInst *ReadReg) const {
bdd1243dSDimitry Andric  Metadata *MD =
bdd1243dSDimitry Andric      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
bdd1243dSDimitry Andric  StringRef RegName =
bdd1243dSDimitry Andric      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
bdd1243dSDimitry Andric
bdd1243dSDimitry Andric  // Special case registers that look like VCC.
bdd1243dSDimitry Andric  MVT VT = MVT::getVT(ReadReg->getType());
bdd1243dSDimitry Andric  if (VT == MVT::i1)
bdd1243dSDimitry Andric    return true;
bdd1243dSDimitry Andric
bdd1243dSDimitry Andric  // Special case scalar registers that start with 'v'.
bdd1243dSDimitry Andric  if (RegName.startswith("vcc") || RegName.empty())
bdd1243dSDimitry Andric    return false;
bdd1243dSDimitry Andric
bdd1243dSDimitry Andric  // VGPR or AGPR is divergent. There aren't any specially named vector
bdd1243dSDimitry Andric  // registers.
bdd1243dSDimitry Andric  return RegName[0] == 'v' || RegName[0] == 'a';
bdd1243dSDimitry Andric}
bdd1243dSDimitry Andric
0b57cec5SDimitry Andric/// \returns true if the result of the value could potentially be
0b57cec5SDimitry Andric/// different across workitems in a wavefront.
0b57cec5SDimitry Andricbool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
0b57cec5SDimitry Andric  if (const Argument *A = dyn_cast<Argument>(V))
e8d8bef9SDimitry Andric    return !AMDGPU::isArgPassedInSGPR(A);
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Loads from the private and flat address spaces are divergent, because
0b57cec5SDimitry Andric  // threads can execute the load instruction with the same inputs and get
0b57cec5SDimitry Andric  // different results.
0b57cec5SDimitry Andric  //
0b57cec5SDimitry Andric  // All other loads are not divergent, because if threads issue loads with the
0b57cec5SDimitry Andric  // same arguments, they will always get the same result.
0b57cec5SDimitry Andric  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
0b57cec5SDimitry Andric    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
0b57cec5SDimitry Andric           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Atomics are divergent because they are executed sequentially: when an
0b57cec5SDimitry Andric  // atomic operation refers to the same address in each thread, then each
0b57cec5SDimitry Andric  // thread after the first sees the value written by the previous thread as
0b57cec5SDimitry Andric  // original value.
0b57cec5SDimitry Andric  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
0b57cec5SDimitry Andric    return true;
0b57cec5SDimitry Andric
bdd1243dSDimitry Andric  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
bdd1243dSDimitry Andric    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
bdd1243dSDimitry Andric      return isReadRegisterSourceOfDivergence(Intrinsic);
bdd1243dSDimitry Andric
0b57cec5SDimitry Andric    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
bdd1243dSDimitry Andric  }
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // Assume all function calls are a source of divergence.
5ffd83dbSDimitry Andric  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
5ffd83dbSDimitry Andric    if (CI->isInlineAsm())
5ffd83dbSDimitry Andric      return isInlineAsmSourceOfDivergence(CI);
5ffd83dbSDimitry Andric    return true;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  // Assume all function calls are a source of divergence.
5ffd83dbSDimitry Andric  if (isa<InvokeInst>(V))
0b57cec5SDimitry Andric    return true;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  return false;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
0b57cec5SDimitry Andric  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
0b57cec5SDimitry Andric    switch (Intrinsic->getIntrinsicID()) {
0b57cec5SDimitry Andric    default:
0b57cec5SDimitry Andric      return false;
0b57cec5SDimitry Andric    case Intrinsic::amdgcn_readfirstlane:
0b57cec5SDimitry Andric    case Intrinsic::amdgcn_readlane:
0b57cec5SDimitry Andric    case Intrinsic::amdgcn_icmp:
0b57cec5SDimitry Andric    case Intrinsic::amdgcn_fcmp:
5ffd83dbSDimitry Andric    case Intrinsic::amdgcn_ballot:
5ffd83dbSDimitry Andric    case Intrinsic::amdgcn_if_break:
0b57cec5SDimitry Andric      return true;
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
5ffd83dbSDimitry Andric    if (CI->isInlineAsm())
5ffd83dbSDimitry Andric      return !isInlineAsmSourceOfDivergence(CI);
5ffd83dbSDimitry Andric    return false;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
bdd1243dSDimitry Andric  // In most cases TID / wavefrontsize is uniform.
bdd1243dSDimitry Andric  //
bdd1243dSDimitry Andric  // However, if a kernel has uneven dimesions we can have a value of
bdd1243dSDimitry Andric  // workitem-id-x divided by the wavefrontsize non-uniform. For example
bdd1243dSDimitry Andric  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
bdd1243dSDimitry Andric  // packed into a same wave which gives 1 and 0 after the division by 64
bdd1243dSDimitry Andric  // respectively.
bdd1243dSDimitry Andric  //
bdd1243dSDimitry Andric  // FIXME: limit it to 1D kernels only, although that shall be possible
bdd1243dSDimitry Andric  // to perform this optimization is the size of the X dimension is a power
bdd1243dSDimitry Andric  // of 2, we just do not currently have infrastructure to query it.
bdd1243dSDimitry Andric  using namespace llvm::PatternMatch;
bdd1243dSDimitry Andric  uint64_t C;
bdd1243dSDimitry Andric  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
bdd1243dSDimitry Andric                      m_ConstantInt(C))) ||
bdd1243dSDimitry Andric      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
bdd1243dSDimitry Andric                      m_ConstantInt(C)))) {
bdd1243dSDimitry Andric    const Function *F = cast<Instruction>(V)->getFunction();
bdd1243dSDimitry Andric    return C >= ST->getWavefrontSizeLog2() &&
bdd1243dSDimitry Andric           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
bdd1243dSDimitry Andric  }
bdd1243dSDimitry Andric
bdd1243dSDimitry Andric  Value *Mask;
bdd1243dSDimitry Andric  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
bdd1243dSDimitry Andric                       m_Value(Mask)))) {
bdd1243dSDimitry Andric    const Function *F = cast<Instruction>(V)->getFunction();
bdd1243dSDimitry Andric    const DataLayout &DL = F->getParent()->getDataLayout();
bdd1243dSDimitry Andric    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
bdd1243dSDimitry Andric               ST->getWavefrontSizeLog2() &&
bdd1243dSDimitry Andric           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
bdd1243dSDimitry Andric  }
bdd1243dSDimitry Andric
5ffd83dbSDimitry Andric  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
5ffd83dbSDimitry Andric  if (!ExtValue)
5ffd83dbSDimitry Andric    return false;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
5ffd83dbSDimitry Andric  if (!CI)
5ffd83dbSDimitry Andric    return false;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
5ffd83dbSDimitry Andric    switch (Intrinsic->getIntrinsicID()) {
5ffd83dbSDimitry Andric    default:
5ffd83dbSDimitry Andric      return false;
5ffd83dbSDimitry Andric    case Intrinsic::amdgcn_if:
5ffd83dbSDimitry Andric    case Intrinsic::amdgcn_else: {
5ffd83dbSDimitry Andric      ArrayRef<unsigned> Indices = ExtValue->getIndices();
5ffd83dbSDimitry Andric      return Indices.size() == 1 && Indices[0] == 1;
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
5ffd83dbSDimitry Andric  // divergent for the overall struct return. We need to override it in the
5ffd83dbSDimitry Andric  // case we're extracting an SGPR component here.
5ffd83dbSDimitry Andric  if (CI->isInlineAsm())
5ffd83dbSDimitry Andric    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
5ffd83dbSDimitry Andric
0b57cec5SDimitry Andric  return false;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
8bcb0991SDimitry Andricbool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
8bcb0991SDimitry Andric                                            Intrinsic::ID IID) const {
8bcb0991SDimitry Andric  switch (IID) {
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_atomic_inc:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_atomic_dec:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fadd:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fmin:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fmax:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_is_shared:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_is_private:
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fadd:
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fmax:
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fmin:
8bcb0991SDimitry Andric    OpIndexes.push_back(0);
8bcb0991SDimitry Andric    return true;
8bcb0991SDimitry Andric  default:
8bcb0991SDimitry Andric    return false;
8bcb0991SDimitry Andric  }
8bcb0991SDimitry Andric}
8bcb0991SDimitry Andric
5ffd83dbSDimitry AndricValue *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
5ffd83dbSDimitry Andric                                                    Value *OldV,
5ffd83dbSDimitry Andric                                                    Value *NewV) const {
8bcb0991SDimitry Andric  auto IntrID = II->getIntrinsicID();
8bcb0991SDimitry Andric  switch (IntrID) {
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_atomic_inc:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_atomic_dec:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fadd:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fmin:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_ds_fmax: {
8bcb0991SDimitry Andric    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
8bcb0991SDimitry Andric    if (!IsVolatile->isZero())
5ffd83dbSDimitry Andric      return nullptr;
8bcb0991SDimitry Andric    Module *M = II->getParent()->getParent()->getParent();
8bcb0991SDimitry Andric    Type *DestTy = II->getType();
8bcb0991SDimitry Andric    Type *SrcTy = NewV->getType();
8bcb0991SDimitry Andric    Function *NewDecl =
8bcb0991SDimitry Andric        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
8bcb0991SDimitry Andric    II->setArgOperand(0, NewV);
8bcb0991SDimitry Andric    II->setCalledFunction(NewDecl);
5ffd83dbSDimitry Andric    return II;
8bcb0991SDimitry Andric  }
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_is_shared:
8bcb0991SDimitry Andric  case Intrinsic::amdgcn_is_private: {
8bcb0991SDimitry Andric    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
8bcb0991SDimitry Andric      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8bcb0991SDimitry Andric    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
8bcb0991SDimitry Andric    LLVMContext &Ctx = NewV->getType()->getContext();
8bcb0991SDimitry Andric    ConstantInt *NewVal = (TrueAS == NewAS) ?
8bcb0991SDimitry Andric      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
5ffd83dbSDimitry Andric    return NewVal;
5ffd83dbSDimitry Andric  }
5ffd83dbSDimitry Andric  case Intrinsic::ptrmask: {
5ffd83dbSDimitry Andric    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
5ffd83dbSDimitry Andric    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
5ffd83dbSDimitry Andric    Value *MaskOp = II->getArgOperand(1);
5ffd83dbSDimitry Andric    Type *MaskTy = MaskOp->getType();
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    bool DoTruncate = false;
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    const GCNTargetMachine &TM =
e8d8bef9SDimitry Andric        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
e8d8bef9SDimitry Andric    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
5ffd83dbSDimitry Andric      // All valid 64-bit to 32-bit casts work by chopping off the high
5ffd83dbSDimitry Andric      // bits. Any masking only clearing the low bits will also apply in the new
5ffd83dbSDimitry Andric      // address space.
5ffd83dbSDimitry Andric      if (DL.getPointerSizeInBits(OldAS) != 64 ||
5ffd83dbSDimitry Andric          DL.getPointerSizeInBits(NewAS) != 32)
5ffd83dbSDimitry Andric        return nullptr;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric      // TODO: Do we need to thread more context in here?
5ffd83dbSDimitry Andric      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
5ffd83dbSDimitry Andric      if (Known.countMinLeadingOnes() < 32)
5ffd83dbSDimitry Andric        return nullptr;
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric      DoTruncate = true;
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    IRBuilder<> B(II);
5ffd83dbSDimitry Andric    if (DoTruncate) {
5ffd83dbSDimitry Andric      MaskTy = B.getInt32Ty();
5ffd83dbSDimitry Andric      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
5ffd83dbSDimitry Andric    }
5ffd83dbSDimitry Andric
5ffd83dbSDimitry Andric    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
5ffd83dbSDimitry Andric                             {NewV, MaskOp});
8bcb0991SDimitry Andric  }
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fadd:
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fmax:
bdd1243dSDimitry Andric  case Intrinsic::amdgcn_flat_atomic_fmin: {
bdd1243dSDimitry Andric    Module *M = II->getParent()->getParent()->getParent();
bdd1243dSDimitry Andric    Type *DestTy = II->getType();
bdd1243dSDimitry Andric    Type *SrcTy = NewV->getType();
bdd1243dSDimitry Andric    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
bdd1243dSDimitry Andric                                                  {DestTy, SrcTy, DestTy});
bdd1243dSDimitry Andric    II->setArgOperand(0, NewV);
bdd1243dSDimitry Andric    II->setCalledFunction(NewDecl);
bdd1243dSDimitry Andric    return II;
bdd1243dSDimitry Andric  }
8bcb0991SDimitry Andric  default:
5ffd83dbSDimitry Andric    return nullptr;
8bcb0991SDimitry Andric  }
8bcb0991SDimitry Andric}
8bcb0991SDimitry Andric
fe6060f1SDimitry AndricInstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
fe6060f1SDimitry Andric                                           VectorType *VT, ArrayRef<int> Mask,
bdd1243dSDimitry Andric                                           TTI::TargetCostKind CostKind,
81ad6265SDimitry Andric                                           int Index, VectorType *SubTp,
81ad6265SDimitry Andric                                           ArrayRef<const Value *> Args) {
fe6060f1SDimitry Andric  Kind = improveShuffleKindFromMask(Kind, Mask);
0b57cec5SDimitry Andric  if (ST->hasVOP3PInsts()) {
5ffd83dbSDimitry Andric    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
0b57cec5SDimitry Andric        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
0b57cec5SDimitry Andric      // With op_sel VOP3P instructions freely can access the low half or high
0b57cec5SDimitry Andric      // half of a register, so any swizzle is free.
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric      switch (Kind) {
0b57cec5SDimitry Andric      case TTI::SK_Broadcast:
0b57cec5SDimitry Andric      case TTI::SK_Reverse:
0b57cec5SDimitry Andric      case TTI::SK_PermuteSingleSrc:
0b57cec5SDimitry Andric        return 0;
0b57cec5SDimitry Andric      default:
0b57cec5SDimitry Andric        break;
0b57cec5SDimitry Andric      }
0b57cec5SDimitry Andric    }
0b57cec5SDimitry Andric  }
0b57cec5SDimitry Andric
bdd1243dSDimitry Andric  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricbool GCNTTIImpl::areInlineCompatible(const Function *Caller,
0b57cec5SDimitry Andric                                     const Function *Callee) const {
0b57cec5SDimitry Andric  const TargetMachine &TM = getTLI()->getTargetMachine();
480093f4SDimitry Andric  const GCNSubtarget *CallerST
480093f4SDimitry Andric    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
480093f4SDimitry Andric  const GCNSubtarget *CalleeST
480093f4SDimitry Andric    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
480093f4SDimitry Andric
480093f4SDimitry Andric  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
480093f4SDimitry Andric  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
0b57cec5SDimitry Andric  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
0b57cec5SDimitry Andric  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
0b57cec5SDimitry Andric    return false;
0b57cec5SDimitry Andric
0b57cec5SDimitry Andric  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
0b57cec5SDimitry Andric  // no way to support merge for backend defined attributes.
5ffd83dbSDimitry Andric  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
5ffd83dbSDimitry Andric  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
e8d8bef9SDimitry Andric  if (!CallerMode.isInlineCompatible(CalleeMode))
e8d8bef9SDimitry Andric    return false;
e8d8bef9SDimitry Andric
fe6060f1SDimitry Andric  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
fe6060f1SDimitry Andric      Callee->hasFnAttribute(Attribute::InlineHint))
fe6060f1SDimitry Andric    return true;
fe6060f1SDimitry Andric
e8d8bef9SDimitry Andric  // Hack to make compile times reasonable.
fe6060f1SDimitry Andric  if (InlineMaxBB) {
fe6060f1SDimitry Andric    // Single BB does not increase total BB amount.
fe6060f1SDimitry Andric    if (Callee->size() == 1)
fe6060f1SDimitry Andric      return true;
e8d8bef9SDimitry Andric    size_t BBSize = Caller->size() + Callee->size() - 1;
e8d8bef9SDimitry Andric    return BBSize <= InlineMaxBB;
e8d8bef9SDimitry Andric  }
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric  return true;
e8d8bef9SDimitry Andric}
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andricunsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
e8d8bef9SDimitry Andric  // If we have a pointer to private array passed into a function
e8d8bef9SDimitry Andric  // it will not be optimized out, leaving scratch usage.
e8d8bef9SDimitry Andric  // Increase the inline threshold to allow inlining in this case.
e8d8bef9SDimitry Andric  uint64_t AllocaSize = 0;
e8d8bef9SDimitry Andric  SmallPtrSet<const AllocaInst *, 8> AIVisited;
e8d8bef9SDimitry Andric  for (Value *PtrArg : CB->args()) {
e8d8bef9SDimitry Andric    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
e8d8bef9SDimitry Andric    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
e8d8bef9SDimitry Andric                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
e8d8bef9SDimitry Andric      continue;
e8d8bef9SDimitry Andric
e8d8bef9SDimitry Andric    PtrArg = getUnderlyingObject(PtrArg);
e8d8bef9SDimitry Andric    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
e8d8bef9SDimitry Andric      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
e8d8bef9SDimitry Andric        continue;
e8d8bef9SDimitry Andric      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
e8d8bef9SDimitry Andric      // If the amount of stack memory is excessive we will not be able
e8d8bef9SDimitry Andric      // to get rid of the scratch anyway, bail out.
e8d8bef9SDimitry Andric      if (AllocaSize > ArgAllocaCutoff) {
e8d8bef9SDimitry Andric        AllocaSize = 0;
e8d8bef9SDimitry Andric        break;
e8d8bef9SDimitry Andric      }
e8d8bef9SDimitry Andric    }
e8d8bef9SDimitry Andric  }
e8d8bef9SDimitry Andric  if (AllocaSize)
e8d8bef9SDimitry Andric    return ArgAllocaCost;
e8d8bef9SDimitry Andric  return 0;
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
0b57cec5SDimitry Andricvoid GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
349cc55cSDimitry Andric                                         TTI::UnrollingPreferences &UP,
349cc55cSDimitry Andric                                         OptimizationRemarkEmitter *ORE) {
349cc55cSDimitry Andric  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
0b57cec5SDimitry Andric}
0b57cec5SDimitry Andric
5ffd83dbSDimitry Andricvoid GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5ffd83dbSDimitry Andric                                       TTI::PeelingPreferences &PP) {
5ffd83dbSDimitry Andric  CommonTTI.getPeelingPreferences(L, SE, PP);
8bcb0991SDimitry Andric}
8bcb0991SDimitry Andric
e8d8bef9SDimitry Andricint GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
fe6060f1SDimitry Andric  return ST->hasFullRate64Ops()
fe6060f1SDimitry Andric             ? getFullRateInstrCost()
fe6060f1SDimitry Andric             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
e8d8bef9SDimitry Andric                                      : getQuarterRateInstrCost(CostKind);
e8d8bef9SDimitry Andric}
bdd1243dSDimitry Andric
bdd1243dSDimitry Andricstd::pair<InstructionCost, MVT>
bdd1243dSDimitry AndricGCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
bdd1243dSDimitry Andric  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
bdd1243dSDimitry Andric  auto Size = DL.getTypeSizeInBits(Ty);
bdd1243dSDimitry Andric  // Maximum load or store can handle 8 dwords for scalar and 4 for
bdd1243dSDimitry Andric  // vector ALU. Let's assume anything above 8 dwords is expensive
bdd1243dSDimitry Andric  // even if legal.
bdd1243dSDimitry Andric  if (Size <= 256)
bdd1243dSDimitry Andric    return Cost;
bdd1243dSDimitry Andric
bdd1243dSDimitry Andric  Cost.first += (Size + 255) / 256;
bdd1243dSDimitry Andric  return Cost;
bdd1243dSDimitry Andric}