1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32
33 using namespace llvm;
34
35 #define DEBUG_TYPE "amdgpu-subtarget"
36
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45
// Command-line knobs for this subtarget implementation.

// When set, getPostRAMutations()'s FillMFMAShadowMutation becomes a no-op
// (see its apply() below).
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Consulted by useVGPRIndexMode(): prefer GPR indexing over movrel when the
// subtarget supports both.
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Consulted by enableFlatScratch(); off by default.
static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

// Returned verbatim by GCNSubtarget::useAA(); on by default.
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
  cl::desc("Enable the use of AA during codegen."),
  cl::init(true));

// Out-of-line so the vtable/key functions are emitted here.
GCNSubtarget::~GCNSubtarget() = default;
66
67 R600Subtarget &
initializeSubtargetDependencies(const Triple & TT,StringRef GPU,StringRef FS)68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69 StringRef GPU, StringRef FS) {
70 SmallString<256> FullFS("+promote-alloca,");
71 FullFS += FS;
72 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73
74 HasMulU24 = getGeneration() >= EVERGREEN;
75 HasMulI24 = hasCaymanISA();
76
77 return *this;
78 }
79
/// Parse the feature string and fix up feature-dependent state.
/// Called from the constructor's init list (via the InstrInfo initializer)
/// so that all feature bits are final before the other members are built.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits. Only the wavefront size the user asked
  // for survives; the other two are explicitly turned off before FS is
  // appended so the user's request wins.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices. WavefrontSizeLog2 == 5 means wave64.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Record the xnack/sramecc "any/on/off" settings from the raw user string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
176
/// Base-class constructor: establish conservative defaults for the feature
/// flags shared by the R600 and GCN subtargets. ParseSubtargetFeatures (run
/// by the derived constructors) overwrites these from the feature string.
/// NOTE: initializer order must continue to match the member declaration
/// order in AMDGPUSubtarget.h.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }
195
/// GCN subtarget constructor. Every feature bit defaults to off (or its
/// conservative value) here; the real values are filled in by
/// initializeSubtargetDependencies(), which deliberately runs inside the
/// InstrInfo initializer so that TLInfo/FrameLowering (and the ctor body)
/// observe fully-resolved features. Initializer order must match the member
/// declaration order in GCNSubtarget's class definition.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    // Hardware hazard/bug workaround flags, also set from the feature string.
    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    // Side effect: resolves all feature bits before the members below.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // GlobalISel scaffolding; the instruction selector needs the register bank
  // info object created just above it.
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
314
enableFlatScratch() const315 bool GCNSubtarget::enableFlatScratch() const {
316 return EnableFlatScratch && hasFlatScratchInsts();
317 }
318
getConstantBusLimit(unsigned Opcode) const319 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
320 if (getGeneration() < GFX10)
321 return 1;
322
323 switch (Opcode) {
324 case AMDGPU::V_LSHLREV_B64_e64:
325 case AMDGPU::V_LSHLREV_B64_gfx10:
326 case AMDGPU::V_LSHL_B64_e64:
327 case AMDGPU::V_LSHRREV_B64_e64:
328 case AMDGPU::V_LSHRREV_B64_gfx10:
329 case AMDGPU::V_LSHR_B64_e64:
330 case AMDGPU::V_ASHRREV_I64_e64:
331 case AMDGPU::V_ASHRREV_I64_gfx10:
332 case AMDGPU::V_ASHR_I64_e64:
333 return 1;
334 }
335
336 return 2;
337 }
338
getMaxLocalMemSizeWithWaveCount(unsigned NWaves,const Function & F) const339 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
340 const Function &F) const {
341 if (NWaves == 1)
342 return getLocalMemorySize();
343 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
344 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
345 if (!WorkGroupsPerCu)
346 return 0;
347 unsigned MaxWaves = getMaxWavesPerEU();
348 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
349 }
350
351 // FIXME: Should return min,max range.
getOccupancyWithLocalMemSize(uint32_t Bytes,const Function & F) const352 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
353 const Function &F) const {
354 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
355 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
356 if (!MaxWorkGroupsPerCu)
357 return 0;
358
359 const unsigned WaveSize = getWavefrontSize();
360
361 // FIXME: Do we need to account for alignment requirement of LDS rounding the
362 // size up?
363 // Compute restriction based on LDS usage
364 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
365
366 // This can be queried with more LDS than is possible, so just assume the
367 // worst.
368 if (NumGroups == 0)
369 return 1;
370
371 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
372
373 // Round to the number of waves.
374 const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
375 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
376
377 // Clamp to the maximum possible number of waves.
378 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
379
380 // FIXME: Needs to be a multiple of the group size?
381 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
382
383 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
384 "computed invalid occupancy");
385 return MaxWaves;
386 }
387
388 unsigned
getOccupancyWithLocalMemSize(const MachineFunction & MF) const389 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
390 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
391 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
392 }
393
394 std::pair<unsigned, unsigned>
getDefaultFlatWorkGroupSize(CallingConv::ID CC) const395 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
396 switch (CC) {
397 case CallingConv::AMDGPU_VS:
398 case CallingConv::AMDGPU_LS:
399 case CallingConv::AMDGPU_HS:
400 case CallingConv::AMDGPU_ES:
401 case CallingConv::AMDGPU_GS:
402 case CallingConv::AMDGPU_PS:
403 return std::make_pair(1, getWavefrontSize());
404 default:
405 return std::make_pair(1u, getMaxFlatWorkGroupSize());
406 }
407 }
408
getFlatWorkGroupSizes(const Function & F) const409 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
410 const Function &F) const {
411 // Default minimum/maximum flat work group sizes.
412 std::pair<unsigned, unsigned> Default =
413 getDefaultFlatWorkGroupSize(F.getCallingConv());
414
415 // Requested minimum/maximum flat work group sizes.
416 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
417 F, "amdgpu-flat-work-group-size", Default);
418
419 // Make sure requested minimum is less than requested maximum.
420 if (Requested.first > Requested.second)
421 return Default;
422
423 // Make sure requested values do not violate subtarget's specifications.
424 if (Requested.first < getMinFlatWorkGroupSize())
425 return Default;
426 if (Requested.second > getMaxFlatWorkGroupSize())
427 return Default;
428
429 return Requested;
430 }
431
getWavesPerEU(const Function & F) const432 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
433 const Function &F) const {
434 // Default minimum/maximum number of waves per execution unit.
435 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
436
437 // Default/requested minimum/maximum flat work group sizes.
438 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
439
440 // If minimum/maximum flat work group sizes were explicitly requested using
441 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
442 // number of waves per execution unit to values implied by requested
443 // minimum/maximum flat work group sizes.
444 unsigned MinImpliedByFlatWorkGroupSize =
445 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
446 Default.first = MinImpliedByFlatWorkGroupSize;
447 bool RequestedFlatWorkGroupSize =
448 F.hasFnAttribute("amdgpu-flat-work-group-size");
449
450 // Requested minimum/maximum number of waves per execution unit.
451 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
452 F, "amdgpu-waves-per-eu", Default, true);
453
454 // Make sure requested minimum is less than requested maximum.
455 if (Requested.second && Requested.first > Requested.second)
456 return Default;
457
458 // Make sure requested values do not violate subtarget's specifications.
459 if (Requested.first < getMinWavesPerEU() ||
460 Requested.second > getMaxWavesPerEU())
461 return Default;
462
463 // Make sure requested values are compatible with values implied by requested
464 // minimum/maximum flat work group sizes.
465 if (RequestedFlatWorkGroupSize &&
466 Requested.first < MinImpliedByFlatWorkGroupSize)
467 return Default;
468
469 return Requested;
470 }
471
getReqdWorkGroupSize(const Function & Kernel,unsigned Dim)472 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
473 auto Node = Kernel.getMetadata("reqd_work_group_size");
474 if (Node && Node->getNumOperands() == 3)
475 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
476 return std::numeric_limits<unsigned>::max();
477 }
478
isMesaKernel(const Function & F) const479 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
480 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
481 }
482
getMaxWorkitemID(const Function & Kernel,unsigned Dimension) const483 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
484 unsigned Dimension) const {
485 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
486 if (ReqdSize != std::numeric_limits<unsigned>::max())
487 return ReqdSize - 1;
488 return getFlatWorkGroupSizes(Kernel).second - 1;
489 }
490
/// Attach !range metadata to \p I describing the possible local id / local
/// size values, narrowed by reqd_work_group_size metadata when present.
/// Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // True for workitem-id queries (range is [0, size)), false for local-size
  // queries (range is [MinSize, MaxSize]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // The id intrinsics fall through to the size cases to pick up the
      // dimension index.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  // No usable upper bound; leave the instruction unannotated.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
552
getImplicitArgNumBytes(const Function & F) const553 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
554 if (isMesaKernel(F))
555 return 16;
556 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
557 }
558
/// Total size in bytes of the explicit kernel arguments of \p F, laid out
/// with each argument at its ABI (or explicitly requested) alignment.
/// \p MaxAlign is set to the largest alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are passed by pointer; size/alignment come from the
    // pointee type.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    // Pad to this argument's alignment, then append it.
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
582
/// Size of the whole kernarg segment for \p F: explicit args at their ABI
/// offset, plus the implicit argument area (if any) aligned for the implicit
/// arg pointer. \p MaxAlign is set as in getExplicitKernArgSize().
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE: intentionally realigns ExplicitArgBytes (without the explicit
    // offset) before appending the implicit area.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
599
getAMDGPUDwarfFlavour() const600 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
601 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
602 : AMDGPUDwarfFlavour::Wave64;
603 }
604
/// R600 subtarget constructor. Feature flags default to off and are resolved
/// by initializeSubtargetDependencies(), invoked inside the TLInfo
/// initializer so lowering sees final features. Initializer order must match
/// the member declaration order in R600Subtarget.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
621
overrideSchedPolicy(MachineSchedPolicy & Policy,unsigned NumRegionInstrs) const622 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
623 unsigned NumRegionInstrs) const {
624 // Track register pressure so the scheduler can try to decrease
625 // pressure once register usage is above the threshold defined by
626 // SIRegisterInfo::getRegPressureSetLimit()
627 Policy.ShouldTrackPressure = true;
628
629 // Enabling both top down and bottom up scheduling seems to give us less
630 // register spills than just using one of these approaches on its own.
631 Policy.OnlyTopDown = false;
632 Policy.OnlyBottomUp = false;
633
634 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
635 if (!enableSIScheduler())
636 Policy.ShouldTrackLaneMasks = true;
637 }
638
hasMadF16() const639 bool GCNSubtarget::hasMadF16() const {
640 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
641 }
642
useVGPRIndexMode() const643 bool GCNSubtarget::useVGPRIndexMode() const {
644 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
645 }
646
useAA() const647 bool GCNSubtarget::useAA() const { return UseAA; }
648
getOccupancyWithNumSGPRs(unsigned SGPRs) const649 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
650 if (getGeneration() >= AMDGPUSubtarget::GFX10)
651 return getMaxWavesPerEU();
652
653 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
654 if (SGPRs <= 80)
655 return 10;
656 if (SGPRs <= 88)
657 return 9;
658 if (SGPRs <= 100)
659 return 8;
660 return 7;
661 }
662 if (SGPRs <= 48)
663 return 10;
664 if (SGPRs <= 56)
665 return 9;
666 if (SGPRs <= 64)
667 return 8;
668 if (SGPRs <= 72)
669 return 7;
670 if (SGPRs <= 80)
671 return 6;
672 return 5;
673 }
674
getOccupancyWithNumVGPRs(unsigned VGPRs) const675 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
676 unsigned MaxWaves = getMaxWavesPerEU();
677 unsigned Granule = getVGPRAllocGranule();
678 if (VGPRs < Granule)
679 return MaxWaves;
680 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
681 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
682 }
683
getReservedNumSGPRs(const MachineFunction & MF) const684 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
685 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
686 if (getGeneration() >= AMDGPUSubtarget::GFX10)
687 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
688
689 if (MFI.hasFlatScratchInit()) {
690 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
691 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
692 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
693 return 4; // FLAT_SCRATCH, VCC (in that order).
694 }
695
696 if (isXNACKEnabled())
697 return 4; // XNACK, VCC (in that order).
698 return 2; // VCC.
699 }
700
computeOccupancy(const Function & F,unsigned LDSSize,unsigned NumSGPRs,unsigned NumVGPRs) const701 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
702 unsigned NumSGPRs,
703 unsigned NumVGPRs) const {
704 unsigned Occupancy =
705 std::min(getMaxWavesPerEU(),
706 getOccupancyWithLocalMemSize(LDSSize, F));
707 if (NumSGPRs)
708 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
709 if (NumVGPRs)
710 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
711 return Occupancy;
712 }
713
/// Maximum number of SGPRs \p MF may allocate, combining the waves-per-EU
/// budget, the optional "amdgpu-num-sgpr" attribute, reserved registers and
/// hardware bugs. The result excludes the reserved SGPRs.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "reject the request".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected chips must be allocated a fixed count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
763
/// Maximum number of VGPRs \p MF may allocate, combining the waves-per-EU
/// budget with the optional "amdgpu-num-vgpr" attribute.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // (Requested == 0 means "reject the request".)
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
793
/// Refine the latency of a data dependency when either endpoint is a bundle:
/// walk the bundled instructions to credit cycles that elapse inside the
/// bundle before the register is defined/read.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is a bundle header. Take the latency of the last instruction
    // in the bundle that writes the register, minus one cycle per bundled
    // instruction issued after that writer.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is a bundle header. Start from the def's latency and subtract
    // one cycle per bundled instruction issued before the reader.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
830
namespace {
/// DAG mutation that links independent SALU instructions behind long-latency
/// MFMA instructions so they can be scheduled into the MFMA shadow instead of
/// VALU work (see apply() for the rationale). Registered from
/// GCNSubtarget::getPostRAMutations() and gated by -amdgpu-disable-power-sched.
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // True for non-terminator SALU instructions.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  /// Conservatively check that adding Pred->Succ would not create a cycle:
  /// collect the transitive successors of Succ and verify no transitive
  /// predecessor of Pred is among them.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    // Node numbering follows program order, so a lower-numbered predecessor
    // can never be reached from the successor.
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as much SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Artificial edge: SU must now be scheduled after the MFMA.
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Keep VALU successors of the MFMA after the linked SALU so they do
      // not slip into the shadow instead.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Extend the chain through SALU successors of SU.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the cheap accvgpr copies.
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace
955
getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> & Mutations) const956 void GCNSubtarget::getPostRAMutations(
957 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
958 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
959 }
960
get(const MachineFunction & MF)961 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
962 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
963 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
964 else
965 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
966 }
967
get(const TargetMachine & TM,const Function & F)968 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
969 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
970 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
971 else
972 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
973 }
974