//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't currently support FP64 for EG/NI.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions or
  // support flat operations; otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

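// Returns true if scratch memory should be accessed through flat scratch
// instructions: either the ABI provides an architected flat scratch base, or
// flat scratch was explicitly enabled via -amdgpu-enable-flat-scratch on a
// target that has the instructions.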
bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

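// Number of operands that may read the scalar constant bus in a single VALU
// instruction. Pre-GFX10 allows one; GFX10+ allows two, except for the 64-bit
// shift opcodes listed below, which remain limited to one.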
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

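// Estimate how much LDS a workgroup can use while still allowing NWaves waves
// per EU to remain resident, based on the function's maximum flat workgroup
// size.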
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

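// Default flat workgroup size range when the function has no explicit
// attribute: graphics shader calling conventions default to a single
// wavefront, compute defaults to the subtarget maximum.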
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

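// Query the kernel's reqd_work_group_size metadata for the given dimension,
// or std::numeric_limits<unsigned>::max() if it is not specified.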
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

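// Attach !range metadata to a workitem-id or local-size intrinsic call, based
// on the kernel's flat workgroup size and, if present, its
// reqd_work_group_size metadata. Returns false if no useful bound is known.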
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

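// Total size in bytes of the explicit kernel arguments, laid out with their
// ABI alignments. Also reports the largest argument alignment via MaxAlign.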
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

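// Occupancy (waves per EU) as limited by SGPR usage. On GFX10+ the SGPR count
// does not limit occupancy.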
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

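// Occupancy (waves per EU) as limited by VGPR usage: round the VGPR count up
// to the allocation granule and divide it into the total number of VGPRs.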
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

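// Adjust the latency of a data dependence when either endpoint is a bundle:
// account for where inside the bundle the register is actually defined or
// first read, rather than charging the latency of the whole bundle.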
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
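// DAG mutation that links independent SALU instructions behind long-latency
// MFMA instructions, so the scheduler can fill the MFMA shadow with scalar
// work instead of VALU work (see apply() below).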
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

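  // Conservatively check that adding an edge from Pred to Succ cannot create
  // a cycle, i.e. Pred is not reachable from Succ in the current DAG.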
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}