//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions, flat
  // operations, or both; otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

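  // GFX10 raised the constant bus limit to two operands for most VOP3
  // opcodes; the 64-bit shifts below are the exception and may still read
  // only a single SGPR or literal operand.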
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
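  // Worked example of the formula below: with 65536 bytes of LDS,
  // MaxWaves = 10, WorkGroupsPerCu = 40 and NWaves = 4, the budget is
  // 65536 * 10 / 40 / 4 = 4096 bytes.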
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
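  // (Ceiling division: e.g. a 96-lane work group at wave size 64 occupies
  // ceil(96 / 64) = 2 waves.)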
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
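  // The attribute value is a comma separated "min,max" pair, e.g.
  // "amdgpu-flat-work-group-size"="128,256".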
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
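  // Also a "min,max" attribute, e.g. "amdgpu-waves-per-eu"="2,4"; the trailing
  // 'true' below makes the maximum component optional.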
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass MaxSize + 1 as Hi.
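  // For example, a fixed 256-lane work group gets the range [0, 256) on an
  // ID query and [256, 257) on a size query.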
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

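    // Each argument starts at the next offset suitably aligned for its type,
    // e.g. an i32 followed by a byref <4 x float> (ABI alignment 16) occupies
    // alignTo(4, 16) + 16 = 32 bytes with MaxAlign = 16.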
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
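  // E.g. with a granule of 4 and 256 total VGPRs, a function using 65 VGPRs
  // rounds up to 68, limiting occupancy to min(256 / 68, MaxWaves) = 3 waves.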
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

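    // gfx90a allocates VGPRs and AGPRs from a unified register file, which is
    // presumably why the requested VGPR count is doubled here to cover the
    // AGPR half of the allocation.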
    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

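  // For a dependency into or out of an instruction bundle, derive an
  // effective latency from the position within the bundle: each bundled
  // instruction issued after the def (or before the first use) of the
  // register shortens the remaining latency by one.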
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

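  // Check whether adding an artificial edge Pred -> Succ could create a
  // cycle, i.e. whether Pred is already reachable from Succ through existing
  // successor edges.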
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long latency MFMA instructions and add artificial edges so
    // that available SALU instructions get a chance to fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions prevents
    // power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}