//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(150), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
    "amdgpu-unroll-runtime-local",
    cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> UseLegacyDA(
    "amdgpu-use-legacy-divergence-analysis",
    cl::desc("Enable legacy divergence analysis for AMDGPU"),
    cl::init(false), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(20), cl::Hidden);

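// Returns true if \p Cond (looking through its operands, up to a small
// recursion depth) depends on a PHI node that belongs to \p L itself rather
// than to one of its subloops.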
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
           return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too-deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
        UP.Runtime = UnrollRuntimeLocal;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
               return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from an inner loop, then increase the max
    // trip count to analyze, for a better cost estimation of the unroll.
    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
      UP.MaxIterationsCountToAnalyze = 32;
  }
}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return MaxVGPRs;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

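// Estimate how many registers of class \p RCID the hardware provides by
// dividing the VGPR budget by the number of 32-bit registers each member of
// the class occupies.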
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
  return getHardwareNumberOfRegisters(false) / NumVGPRs;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                            unsigned SrcAddrSpace,
                                            unsigned DestAddrSpace,
                                            unsigned SrcAlign,
                                            unsigned DestAlign) const {
  unsigned MinAlign = std::min(SrcAlign, DestAlign);

  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
  // hardware into byte accesses. If you assume all alignments are equally
  // probable, it's more efficient on average to use short accesses for this
  // case.
  if (MinAlign == 2)
    return Type::getInt16Ty(Context);

  // Not all subtargets have 128-bit DS instructions, and we currently don't
  // form them by default.
  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
  }

  // Global memory works best with 16-byte accesses. Private memory will also
  // hit this, although they'll be decomposed.
  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
}

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign) const {
  assert(RemainingBytes < 16);

  unsigned MinAlign = std::min(SrcAlign, DestAlign);

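  // Greedily cover the residual with the widest pieces first: 8- and 4-byte
  // chunks unless the common alignment is exactly 2 (matching the type chosen
  // by getMemcpyLoopLoweringType above), then 2- and 1-byte chunks for
  // whatever remains.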
  if (MinAlign != 2) {
    Type *I64Ty = Type::getInt64Ty(Context);
    while (RemainingBytes >= 8) {
      OpsOut.push_back(I64Ty);
      RemainingBytes -= 8;
    }

    Type *I32Ty = Type::getInt32Ty(Context);
    while (RemainingBytes >= 4) {
      OpsOut.push_back(I32Ty);
      RemainingBytes -= 4;
    }
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::TargetCostKind CostKind,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    // FIXME: We're having to query the throughput cost so that the basic
    // implementation tries to generate legalize and scalarization costs. Maybe
    // we could hoist the scalarization code here?
    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
                                         Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
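      // The cost model treats a 64-bit multiply as four quarter-rate 32-bit
      // multiply pieces plus four full-rate instructions to combine the
      // partial products.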
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free, each element will
    // cost one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                       Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

// Return true if there's a potential benefit from using v2f16 instructions for
// an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma: // TODO: fmuladd
    // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
    return true;
  default:
    return false;
  }
}

int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  if (ICA.getID() == Intrinsic::fabs)
    return 0;

  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);

  Type *RetTy = ICA.getReturnType();
  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost();
  if (ICA.getID() == Intrinsic::fma) {
    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                   : getQuarterRateInstrCost();
  }

  return LT.first * NElts * InstRate;
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                    TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           bool IsPairwise,
                                           TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                       bool IsPairwise, bool IsUnsigned,
                                       TTI::TargetCostKind CostKind) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
                                         CostKind);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}

/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())
      return !isInlineAsmSourceOfDivergence(CI);
    return false;
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (CI->isInlineAsm())
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());

  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

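// Rewrite the given intrinsic call to operate directly on the address space of
// \p NewV instead of the flat pointer it currently takes, returning the
// replacement value (possibly a constant fold), or nullptr if the rewrite is
// not possible.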
Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return nullptr;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return II;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    return NewVal;
  }
  case Intrinsic::ptrmask: {
    unsigned OldAS = OldV->getType()->getPointerAddressSpace();
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    Value *MaskOp = II->getArgOperand(1);
    Type *MaskTy = MaskOp->getType();

    bool DoTruncate = false;
    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
      // All valid 64-bit to 32-bit casts work by chopping off the high
      // bits. Any masking only clearing the low bits will also apply in the
      // new address space.
      if (DL.getPointerSizeInBits(OldAS) != 64 ||
          DL.getPointerSizeInBits(NewAS) != 32)
        return nullptr;

      // TODO: Do we need to thread more context in here?
      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
      if (Known.countMinLeadingOnes() < 32)
        return nullptr;

      DoTruncate = true;
    }

    IRBuilder<> B(II);
    if (DoTruncate) {
      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
    }

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                             {NewV, MaskOp});
  }
  default:
    return nullptr;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
                                    int Index, VectorType *SubTp) {
  if (ST->hasVOP3PInsts()) {
    if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
      = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
      = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               Align Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
                                     TTI::TargetCostKind CostKind) {
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}