//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions, flat
  // operations, or both; otherwise they cannot access a 64-bit global address
  // space.
  assert(hasAddr64() || hasFlat());

  // Unless flat-for-global is explicitly enabled or disabled in FS, turn on
  // FlatForGlobal for targets that do not support the ADDR64 variants of MUBUF
  // instructions. Such targets cannot use a 64-bit offset with a MUBUF
  // instruction to access the global address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless flat-for-global is explicitly enabled or disabled in FS, use MUBUF
  // instructions for global address space access if flat operations are not
  // available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
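
// Illustrative note (not exercised by the code above): for an amdhsa target
// given a user feature string FS of "+wavefrontsize32", the FullFS string
// handed to ParseSubtargetFeatures ends up as roughly
//
//   +promote-alloca,+load-store-opt,+enable-ds128,
//   +flat-for-global,+unaligned-access-mode,+trap-handler,
//   +enable-prt-strict-null,-wavefrontsize16,-wavefrontsize64,
//   +wavefrontsize32
//
// i.e. defaults first, then the competing wavefront sizes disabled, then the
// user string last so it can override any default.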

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  // GFX10+ allows two constant bus reads for most VALU instructions; the
  // 64-bit shifts below are the exception and still allow only one.
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
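
// Worked example (illustrative values only): with a 32768-byte LDS, a maximum
// of 10 waves per EU, 8 work groups per CU, and NWaves = 5, the budget is
// 32768 * 10 / 8 / 5 = 8192 bytes of LDS per work group.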

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
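
// Worked example (illustrative values only): with Bytes = 9000 of LDS used
// and a 32768-byte LDS, NumGroups = 3; with MaxWorkGroupSize = 256 and a wave
// size of 64, each group is at most 4 waves, so the result is
// min(3 * 4, 10) = 10 waves per EU.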

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
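
// For reference, a minimal IR sketch of how the attribute is requested
// (kernel name is illustrative):
//
//   define amdgpu_kernel void @kern() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
//
// A malformed request, e.g. "256,64" (min > max), silently falls back to the
// calling-convention default computed above.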

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
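
// For reference, the companion attribute takes the same "min,max" form (the
// maximum may be omitted, per the trailing OnlyFirstRequired argument above):
//
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
//
// A request below the minimum implied by the flat work group size is rejected
// in favor of the default, since both constraints must hold at once.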

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
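
// For reference, with a reqd_work_group_size of {256, 1, 1} this annotates an
// x-dimension ID query as
//
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}            ; half-open: IDs are 0..255
//
// while the corresponding size query would get the range [256, 257).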

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
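
// Worked example: for a kernel signature (i32, double, i8), the running total
// is 0 -> 4 (i32), aligned up to 8, -> 16 (double), -> 17 (i8), so
// getExplicitKernArgSize returns 17 and MaxAlign is 8.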

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}
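
// For reference: SGPRs do not limit occupancy on GFX10+, so the cap there is
// simply the wave limit. On VOLCANIC_ISLANDS and later pre-GFX10 targets, a
// function using, say, 96 SGPRs lands in the "<= 100" bucket and is capped at
// 8 waves per EU.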

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
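
// Worked example (illustrative values only): with a granule of 4, a total of
// 256 VGPRs, and a function using 70 VGPRs, usage rounds up to 72, giving
// min(max(256 / 72, 1), MaxWaves) = 3 waves per EU.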

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
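
// For reference, the register budget attributes take a single integer, e.g.
// (illustrative):
//
//   attributes #0 = { "amdgpu-num-sgpr"="64" "amdgpu-num-vgpr"="96" }
//
// A request that conflicts with the wave-count bounds or the reserved
// registers is dropped in favor of the computed maximum.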

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // The def is inside a bundle: the effective latency is that of the last
    // bundled instruction writing Reg, minus the instructions issued after it
    // within the bundle.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // The use is inside a bundle: shrink the def's latency by the number of
    // bundled instructions issued before the first reader of Reg.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Return true if an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. Succ cannot already reach Pred through existing
  // edges.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}