1 //===- AMDGPUAttributor.cpp -----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AMDGPU.h"
14 #include "GCNSubtarget.h"
15 #include "Utils/AMDGPUBaseInfo.h"
16 #include "llvm/Analysis/CycleAnalysis.h"
17 #include "llvm/CodeGen/TargetPassConfig.h"
18 #include "llvm/IR/IntrinsicsAMDGPU.h"
19 #include "llvm/IR/IntrinsicsR600.h"
20 #include "llvm/Target/TargetMachine.h"
21 #include "llvm/Transforms/IPO/Attributor.h"
22 
23 #define DEBUG_TYPE "amdgpu-attributor"
24 
25 namespace llvm {
26 void initializeCycleInfoWrapperPassPass(PassRegistry &);
27 }
28 
29 using namespace llvm;
30 
31 static cl::opt<unsigned> KernargPreloadCount(
32     "amdgpu-kernarg-preload-count",
33     cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
34 
35 #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
36 
37 enum ImplicitArgumentPositions {
38   #include "AMDGPUAttributes.def"
39   LAST_ARG_POS
40 };
41 
42 #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
43 
44 enum ImplicitArgumentMask {
45   NOT_IMPLICIT_INPUT = 0,
46   #include "AMDGPUAttributes.def"
47   ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
48 };
49 
50 #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
51 static constexpr std::pair<ImplicitArgumentMask,
52                            StringLiteral> ImplicitAttrs[] = {
53  #include "AMDGPUAttributes.def"
54 };
55 
56 // We do not need to note the x workitem or workgroup id because they are always
57 // initialized.
58 //
59 // TODO: We should not add the attributes if the known compile time workgroup
60 // size is 1 for y/z.
61 static ImplicitArgumentMask
62 intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
63                     bool HasApertureRegs, bool SupportsGetDoorBellID,
64                     unsigned CodeObjectVersion) {
65   switch (ID) {
66   case Intrinsic::amdgcn_workitem_id_x:
67     NonKernelOnly = true;
68     return WORKITEM_ID_X;
69   case Intrinsic::amdgcn_workgroup_id_x:
70     NonKernelOnly = true;
71     return WORKGROUP_ID_X;
72   case Intrinsic::amdgcn_workitem_id_y:
73   case Intrinsic::r600_read_tidig_y:
74     return WORKITEM_ID_Y;
75   case Intrinsic::amdgcn_workitem_id_z:
76   case Intrinsic::r600_read_tidig_z:
77     return WORKITEM_ID_Z;
78   case Intrinsic::amdgcn_workgroup_id_y:
79   case Intrinsic::r600_read_tgid_y:
80     return WORKGROUP_ID_Y;
81   case Intrinsic::amdgcn_workgroup_id_z:
82   case Intrinsic::r600_read_tgid_z:
83     return WORKGROUP_ID_Z;
84   case Intrinsic::amdgcn_lds_kernel_id:
85     return LDS_KERNEL_ID;
86   case Intrinsic::amdgcn_dispatch_ptr:
87     return DISPATCH_PTR;
88   case Intrinsic::amdgcn_dispatch_id:
89     return DISPATCH_ID;
90   case Intrinsic::amdgcn_implicitarg_ptr:
91     return IMPLICIT_ARG_PTR;
92   // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
93   // queue_ptr.
94   case Intrinsic::amdgcn_queue_ptr:
95     NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
96     return QUEUE_PTR;
97   case Intrinsic::amdgcn_is_shared:
98   case Intrinsic::amdgcn_is_private:
99     if (HasApertureRegs)
100       return NOT_IMPLICIT_INPUT;
101     // Under V5, we need implicitarg_ptr + offsets to access private_base or
102     // shared_base. For pre-V5, however, need to access them through queue_ptr +
103     // offsets.
104     return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR :
105                                                       QUEUE_PTR;
106   case Intrinsic::trap:
107     if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
108       return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT :
109                                                         QUEUE_PTR;
110     NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
111     return QUEUE_PTR;
112   default:
113     return NOT_IMPLICIT_INPUT;
114   }
115 }
116 
117 static bool castRequiresQueuePtr(unsigned SrcAS) {
118   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
119 }
120 
121 static bool isDSAddress(const Constant *C) {
122   const GlobalValue *GV = dyn_cast<GlobalValue>(C);
123   if (!GV)
124     return false;
125   unsigned AS = GV->getAddressSpace();
126   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
127 }
128 
129 /// Returns true if the function requires the implicit argument be passed
130 /// regardless of the function contents.
131 static bool funcRequiresHostcallPtr(const Function &F) {
132   // Sanitizers require the hostcall buffer passed in the implicit arguments.
133   return F.hasFnAttribute(Attribute::SanitizeAddress) ||
134          F.hasFnAttribute(Attribute::SanitizeThread) ||
135          F.hasFnAttribute(Attribute::SanitizeMemory) ||
136          F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
137          F.hasFnAttribute(Attribute::SanitizeMemTag);
138 }
139 
140 namespace {
141 class AMDGPUInformationCache : public InformationCache {
142 public:
143   AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
144                          BumpPtrAllocator &Allocator,
145                          SetVector<Function *> *CGSCC, TargetMachine &TM)
146       : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
147         CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {}
148 
149   TargetMachine &TM;
150 
151   enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };
152 
153   /// Check if the subtarget has aperture regs.
154   bool hasApertureRegs(Function &F) {
155     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
156     return ST.hasApertureRegs();
157   }
158 
159   /// Check if the subtarget supports GetDoorbellID.
160   bool supportsGetDoorbellID(Function &F) {
161     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
162     return ST.supportsGetDoorbellID();
163   }
164 
165   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
166     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
167     return ST.getFlatWorkGroupSizes(F);
168   }
169 
170   std::pair<unsigned, unsigned>
171   getMaximumFlatWorkGroupRange(const Function &F) {
172     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
173     return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
174   }
175 
176   /// Get code object version.
177   unsigned getCodeObjectVersion() const {
178     return CodeObjectVersion;
179   }
180 
181   /// Get the effective value of "amdgpu-waves-per-eu" for the function,
182   /// accounting for the interaction with the passed value to use for
183   /// "amdgpu-flat-work-group-size".
184   std::pair<unsigned, unsigned>
185   getWavesPerEU(const Function &F,
186                 std::pair<unsigned, unsigned> FlatWorkGroupSize) {
187     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
188     return ST.getWavesPerEU(F, FlatWorkGroupSize);
189   }
190 
191   std::pair<unsigned, unsigned>
192   getEffectiveWavesPerEU(const Function &F,
193                          std::pair<unsigned, unsigned> WavesPerEU,
194                          std::pair<unsigned, unsigned> FlatWorkGroupSize) {
195     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
196     return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
197   }
198 
199   unsigned getMaxWavesPerEU(const Function &F) {
200     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
201     return ST.getMaxWavesPerEU();
202   }
203 
204 private:
205   /// Check if the ConstantExpr \p CE requires the queue pointer.
206   static bool visitConstExpr(const ConstantExpr *CE) {
207     if (CE->getOpcode() == Instruction::AddrSpaceCast) {
208       unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
209       return castRequiresQueuePtr(SrcAS);
210     }
211     return false;
212   }
213 
214   /// Get the constant access bitmap for \p C.
215   uint8_t getConstantAccess(const Constant *C,
216                             SmallPtrSetImpl<const Constant *> &Visited) {
217     auto It = ConstantStatus.find(C);
218     if (It != ConstantStatus.end())
219       return It->second;
220 
221     uint8_t Result = 0;
222     if (isDSAddress(C))
223       Result = DS_GLOBAL;
224 
225     if (const auto *CE = dyn_cast<ConstantExpr>(C))
226       if (visitConstExpr(CE))
227         Result |= ADDR_SPACE_CAST;
228 
229     for (const Use &U : C->operands()) {
230       const auto *OpC = dyn_cast<Constant>(U);
231       if (!OpC || !Visited.insert(OpC).second)
232         continue;
233 
234       Result |= getConstantAccess(OpC, Visited);
235     }
236     return Result;
237   }
238 
239 public:
240   /// Returns true if \p Fn needs the queue pointer because of \p C.
241   bool needsQueuePtr(const Constant *C, Function &Fn) {
242     bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
243     bool HasAperture = hasApertureRegs(Fn);
244 
245     // No need to explore the constants.
246     if (!IsNonEntryFunc && HasAperture)
247       return false;
248 
249     SmallPtrSet<const Constant *, 8> Visited;
250     uint8_t Access = getConstantAccess(C, Visited);
251 
252     // We need to trap on DS globals in non-entry functions.
253     if (IsNonEntryFunc && (Access & DS_GLOBAL))
254       return true;
255 
256     return !HasAperture && (Access & ADDR_SPACE_CAST);
257   }
258 
259 private:
260   /// Used to determine if the Constant needs the queue pointer.
261   DenseMap<const Constant *, uint8_t> ConstantStatus;
262   const unsigned CodeObjectVersion;
263 };
264 
265 struct AAAMDAttributes
266     : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
267                           AbstractAttribute> {
268   using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
269                             AbstractAttribute>;
270 
271   AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
272 
273   /// Create an abstract attribute view for the position \p IRP.
274   static AAAMDAttributes &createForPosition(const IRPosition &IRP,
275                                             Attributor &A);
276 
277   /// See AbstractAttribute::getName().
278   const std::string getName() const override { return "AAAMDAttributes"; }
279 
280   /// See AbstractAttribute::getIdAddr().
281   const char *getIdAddr() const override { return &ID; }
282 
283   /// This function should return true if the type of the \p AA is
284   /// AAAMDAttributes.
285   static bool classof(const AbstractAttribute *AA) {
286     return (AA->getIdAddr() == &ID);
287   }
288 
289   /// Unique ID (due to the unique address)
290   static const char ID;
291 };
292 const char AAAMDAttributes::ID = 0;
293 
294 struct AAUniformWorkGroupSize
295     : public StateWrapper<BooleanState, AbstractAttribute> {
296   using Base = StateWrapper<BooleanState, AbstractAttribute>;
297   AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
298 
299   /// Create an abstract attribute view for the position \p IRP.
300   static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
301                                                    Attributor &A);
302 
303   /// See AbstractAttribute::getName().
304   const std::string getName() const override {
305     return "AAUniformWorkGroupSize";
306   }
307 
308   /// See AbstractAttribute::getIdAddr().
309   const char *getIdAddr() const override { return &ID; }
310 
311   /// This function should return true if the type of the \p AA is
312   /// AAAMDAttributes.
313   static bool classof(const AbstractAttribute *AA) {
314     return (AA->getIdAddr() == &ID);
315   }
316 
317   /// Unique ID (due to the unique address)
318   static const char ID;
319 };
320 const char AAUniformWorkGroupSize::ID = 0;
321 
322 struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
323   AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
324       : AAUniformWorkGroupSize(IRP, A) {}
325 
326   void initialize(Attributor &A) override {
327     Function *F = getAssociatedFunction();
328     CallingConv::ID CC = F->getCallingConv();
329 
330     if (CC != CallingConv::AMDGPU_KERNEL)
331       return;
332 
333     bool InitialValue = false;
334     if (F->hasFnAttribute("uniform-work-group-size"))
335       InitialValue = F->getFnAttribute("uniform-work-group-size")
336                          .getValueAsString()
337                          .equals("true");
338 
339     if (InitialValue)
340       indicateOptimisticFixpoint();
341     else
342       indicatePessimisticFixpoint();
343   }
344 
345   ChangeStatus updateImpl(Attributor &A) override {
346     ChangeStatus Change = ChangeStatus::UNCHANGED;
347 
348     auto CheckCallSite = [&](AbstractCallSite CS) {
349       Function *Caller = CS.getInstruction()->getFunction();
350       LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
351                         << "->" << getAssociatedFunction()->getName() << "\n");
352 
353       const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
354           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
355       if (!CallerInfo)
356         return false;
357 
358       Change = Change | clampStateAndIndicateChange(this->getState(),
359                                                     CallerInfo->getState());
360 
361       return true;
362     };
363 
364     bool AllCallSitesKnown = true;
365     if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
366       return indicatePessimisticFixpoint();
367 
368     return Change;
369   }
370 
371   ChangeStatus manifest(Attributor &A) override {
372     SmallVector<Attribute, 8> AttrList;
373     LLVMContext &Ctx = getAssociatedFunction()->getContext();
374 
375     AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
376                                       getAssumed() ? "true" : "false"));
377     return A.manifestAttrs(getIRPosition(), AttrList,
378                            /* ForceReplace */ true);
379   }
380 
381   bool isValidState() const override {
382     // This state is always valid, even when the state is false.
383     return true;
384   }
385 
386   const std::string getAsStr(Attributor *) const override {
387     return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
388   }
389 
390   /// See AbstractAttribute::trackStatistics()
391   void trackStatistics() const override {}
392 };
393 
394 AAUniformWorkGroupSize &
395 AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
396                                           Attributor &A) {
397   if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
398     return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
399   llvm_unreachable(
400       "AAUniformWorkGroupSize is only valid for function position");
401 }
402 
403 struct AAAMDAttributesFunction : public AAAMDAttributes {
404   AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
405       : AAAMDAttributes(IRP, A) {}
406 
407   void initialize(Attributor &A) override {
408     Function *F = getAssociatedFunction();
409 
410     // If the function requires the implicit arg pointer due to sanitizers,
411     // assume it's needed even if explicitly marked as not requiring it.
412     const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
413     if (NeedsHostcall) {
414       removeAssumedBits(IMPLICIT_ARG_PTR);
415       removeAssumedBits(HOSTCALL_PTR);
416     }
417 
418     for (auto Attr : ImplicitAttrs) {
419       if (NeedsHostcall &&
420           (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
421         continue;
422 
423       if (F->hasFnAttribute(Attr.second))
424         addKnownBits(Attr.first);
425     }
426 
427     if (F->isDeclaration())
428       return;
429 
430     // Ignore functions with graphics calling conventions, these are currently
431     // not allowed to have kernel arguments.
432     if (AMDGPU::isGraphics(F->getCallingConv())) {
433       indicatePessimisticFixpoint();
434       return;
435     }
436   }
437 
438   ChangeStatus updateImpl(Attributor &A) override {
439     Function *F = getAssociatedFunction();
440     // The current assumed state used to determine a change.
441     auto OrigAssumed = getAssumed();
442 
443     // Check for Intrinsics and propagate attributes.
444     const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
445         *this, this->getIRPosition(), DepClassTy::REQUIRED);
446     if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
447       return indicatePessimisticFixpoint();
448 
449     bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
450 
451     bool NeedsImplicit = false;
452     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
453     bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
454     bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
455     unsigned COV = InfoCache.getCodeObjectVersion();
456 
457     for (Function *Callee : AAEdges->getOptimisticEdges()) {
458       Intrinsic::ID IID = Callee->getIntrinsicID();
459       if (IID == Intrinsic::not_intrinsic) {
460         const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
461             *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
462         if (!AAAMD)
463           return indicatePessimisticFixpoint();
464         *this &= *AAAMD;
465         continue;
466       }
467 
468       bool NonKernelOnly = false;
469       ImplicitArgumentMask AttrMask =
470           intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
471                               HasApertureRegs, SupportsGetDoorbellID, COV);
472       if (AttrMask != NOT_IMPLICIT_INPUT) {
473         if ((IsNonEntryFunc || !NonKernelOnly))
474           removeAssumedBits(AttrMask);
475       }
476     }
477 
478     // Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
479     if (NeedsImplicit)
480       removeAssumedBits(IMPLICIT_ARG_PTR);
481 
482     if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
483       // Under V5, we need implicitarg_ptr + offsets to access private_base or
484       // shared_base. We do not actually need queue_ptr.
485       if (COV >= 5)
486         removeAssumedBits(IMPLICIT_ARG_PTR);
487       else
488         removeAssumedBits(QUEUE_PTR);
489     }
490 
491     if (funcRetrievesMultigridSyncArg(A, COV)) {
492       assert(!isAssumed(IMPLICIT_ARG_PTR) &&
493              "multigrid_sync_arg needs implicitarg_ptr");
494       removeAssumedBits(MULTIGRID_SYNC_ARG);
495     }
496 
497     if (funcRetrievesHostcallPtr(A, COV)) {
498       assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
499       removeAssumedBits(HOSTCALL_PTR);
500     }
501 
502     if (funcRetrievesHeapPtr(A, COV)) {
503       assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
504       removeAssumedBits(HEAP_PTR);
505     }
506 
507     if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
508       assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
509       removeAssumedBits(QUEUE_PTR);
510     }
511 
512     if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
513       removeAssumedBits(LDS_KERNEL_ID);
514     }
515 
516     if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
517       removeAssumedBits(DEFAULT_QUEUE);
518 
519     if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
520       removeAssumedBits(COMPLETION_ACTION);
521 
522     return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
523                                        : ChangeStatus::UNCHANGED;
524   }
525 
526   ChangeStatus manifest(Attributor &A) override {
527     SmallVector<Attribute, 8> AttrList;
528     LLVMContext &Ctx = getAssociatedFunction()->getContext();
529 
530     for (auto Attr : ImplicitAttrs) {
531       if (isKnown(Attr.first))
532         AttrList.push_back(Attribute::get(Ctx, Attr.second));
533     }
534 
535     return A.manifestAttrs(getIRPosition(), AttrList,
536                            /* ForceReplace */ true);
537   }
538 
539   const std::string getAsStr(Attributor *) const override {
540     std::string Str;
541     raw_string_ostream OS(Str);
542     OS << "AMDInfo[";
543     for (auto Attr : ImplicitAttrs)
544       if (isAssumed(Attr.first))
545         OS << ' ' << Attr.second;
546     OS << " ]";
547     return OS.str();
548   }
549 
550   /// See AbstractAttribute::trackStatistics()
551   void trackStatistics() const override {}
552 
553 private:
554   bool checkForQueuePtr(Attributor &A) {
555     Function *F = getAssociatedFunction();
556     bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
557 
558     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
559 
560     bool NeedsQueuePtr = false;
561 
562     auto CheckAddrSpaceCasts = [&](Instruction &I) {
563       unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
564       if (castRequiresQueuePtr(SrcAS)) {
565         NeedsQueuePtr = true;
566         return false;
567       }
568       return true;
569     };
570 
571     bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
572 
573     // `checkForAllInstructions` is much more cheaper than going through all
574     // instructions, try it first.
575 
576     // The queue pointer is not needed if aperture regs is present.
577     if (!HasApertureRegs) {
578       bool UsedAssumedInformation = false;
579       A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
580                                 {Instruction::AddrSpaceCast},
581                                 UsedAssumedInformation);
582     }
583 
584     // If we found  that we need the queue pointer, nothing else to do.
585     if (NeedsQueuePtr)
586       return true;
587 
588     if (!IsNonEntryFunc && HasApertureRegs)
589       return false;
590 
591     for (BasicBlock &BB : *F) {
592       for (Instruction &I : BB) {
593         for (const Use &U : I.operands()) {
594           if (const auto *C = dyn_cast<Constant>(U)) {
595             if (InfoCache.needsQueuePtr(C, *F))
596               return true;
597           }
598         }
599       }
600     }
601 
602     return false;
603   }
604 
605   bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
606     auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
607     AA::RangeTy Range(Pos, 8);
608     return funcRetrievesImplicitKernelArg(A, Range);
609   }
610 
611   bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
612     auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
613     AA::RangeTy Range(Pos, 8);
614     return funcRetrievesImplicitKernelArg(A, Range);
615   }
616 
617   bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
618     auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
619     AA::RangeTy Range(Pos, 8);
620     return funcRetrievesImplicitKernelArg(A, Range);
621   }
622 
623   bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
624     auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
625     AA::RangeTy Range(Pos, 8);
626     return funcRetrievesImplicitKernelArg(A, Range);
627   }
628 
629   bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
630     if (COV < 5)
631       return false;
632     AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
633     return funcRetrievesImplicitKernelArg(A, Range);
634   }
635 
636   bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
637     if (COV < 5)
638       return false;
639     AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
640     return funcRetrievesImplicitKernelArg(A, Range);
641   }
642 
643   bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
644     // Check if this is a call to the implicitarg_ptr builtin and it
645     // is used to retrieve the hostcall pointer. The implicit arg for
646     // hostcall is not used only if every use of the implicitarg_ptr
647     // is a load that clearly does not retrieve any byte of the
648     // hostcall pointer. We check this by tracing all the uses of the
649     // initial call to the implicitarg_ptr intrinsic.
650     auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
651       auto &Call = cast<CallBase>(I);
652       if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
653         return true;
654 
655       const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
656           *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
657       if (!PointerInfoAA)
658         return false;
659 
660       return PointerInfoAA->forallInterferingAccesses(
661           Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
662             return Acc.getRemoteInst()->isDroppable();
663           });
664     };
665 
666     bool UsedAssumedInformation = false;
667     return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
668                                               UsedAssumedInformation);
669   }
670 
671   bool funcRetrievesLDSKernelId(Attributor &A) {
672     auto DoesNotRetrieve = [&](Instruction &I) {
673       auto &Call = cast<CallBase>(I);
674       return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
675     };
676     bool UsedAssumedInformation = false;
677     return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
678                                               UsedAssumedInformation);
679   }
680 };
681 
682 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
683                                                     Attributor &A) {
684   if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
685     return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
686   llvm_unreachable("AAAMDAttributes is only valid for function position");
687 }
688 
689 /// Base class to derive different size ranges.
690 struct AAAMDSizeRangeAttribute
691     : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
692   using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
693 
694   StringRef AttrName;
695 
696   AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
697                           StringRef AttrName)
698       : Base(IRP, 32), AttrName(AttrName) {}
699 
700   /// See AbstractAttribute::trackStatistics()
701   void trackStatistics() const override {}
702 
703   template <class AttributeImpl>
704   ChangeStatus updateImplImpl(Attributor &A) {
705     ChangeStatus Change = ChangeStatus::UNCHANGED;
706 
707     auto CheckCallSite = [&](AbstractCallSite CS) {
708       Function *Caller = CS.getInstruction()->getFunction();
709       LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
710                         << "->" << getAssociatedFunction()->getName() << '\n');
711 
712       const auto *CallerInfo = A.getAAFor<AttributeImpl>(
713           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
714       if (!CallerInfo)
715         return false;
716 
717       Change |=
718           clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
719 
720       return true;
721     };
722 
723     bool AllCallSitesKnown = true;
724     if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
725       return indicatePessimisticFixpoint();
726 
727     return Change;
728   }
729 
730   ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
731                                          unsigned Max) {
732     // Don't add the attribute if it's the implied default.
733     if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
734       return ChangeStatus::UNCHANGED;
735 
736     Function *F = getAssociatedFunction();
737     LLVMContext &Ctx = F->getContext();
738     SmallString<10> Buffer;
739     raw_svector_ostream OS(Buffer);
740     OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
741     return A.manifestAttrs(getIRPosition(),
742                            {Attribute::get(Ctx, AttrName, OS.str())},
743                            /* ForceReplace */ true);
744   }
745 
746   const std::string getAsStr(Attributor *) const override {
747     std::string Str;
748     raw_string_ostream OS(Str);
749     OS << getName() << '[';
750     OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
751     OS << ']';
752     return OS.str();
753   }
754 };
755 
756 /// Propagate amdgpu-flat-work-group-size attribute.
757 struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
758   AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
759       : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
760 
761   void initialize(Attributor &A) override {
762     Function *F = getAssociatedFunction();
763     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
764     unsigned MinGroupSize, MaxGroupSize;
765     std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
766     intersectKnown(
767         ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
768 
769     if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
770       indicatePessimisticFixpoint();
771   }
772 
773   ChangeStatus updateImpl(Attributor &A) override {
774     return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
775   }
776 
777   /// Create an abstract attribute view for the position \p IRP.
778   static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
779                                                    Attributor &A);
780 
781   ChangeStatus manifest(Attributor &A) override {
782     Function *F = getAssociatedFunction();
783     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
784     unsigned Min, Max;
785     std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
786     return emitAttributeIfNotDefault(A, Min, Max);
787   }
788 
789   /// See AbstractAttribute::getName()
790   const std::string getName() const override {
791     return "AAAMDFlatWorkGroupSize";
792   }
793 
794   /// See AbstractAttribute::getIdAddr()
795   const char *getIdAddr() const override { return &ID; }
796 
797   /// This function should return true if the type of the \p AA is
798   /// AAAMDFlatWorkGroupSize
799   static bool classof(const AbstractAttribute *AA) {
800     return (AA->getIdAddr() == &ID);
801   }
802 
803   /// Unique ID (due to the unique address)
804   static const char ID;
805 };
806 
807 const char AAAMDFlatWorkGroupSize::ID = 0;
808 
809 AAAMDFlatWorkGroupSize &
810 AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
811                                           Attributor &A) {
812   if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
813     return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
814   llvm_unreachable(
815       "AAAMDFlatWorkGroupSize is only valid for function position");
816 }
817 
818 /// Propagate amdgpu-waves-per-eu attribute.
819 struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
820   AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
821       : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
822 
823   bool isValidState() const override {
824     return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
825   }
826 
827   void initialize(Attributor &A) override {
828     Function *F = getAssociatedFunction();
829     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
830 
831     if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
832             *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {
833 
834       unsigned Min, Max;
835       std::tie(Min, Max) = InfoCache.getWavesPerEU(
836           *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
837                AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
838 
839       ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
840       intersectKnown(Range);
841     }
842 
843     if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
844       indicatePessimisticFixpoint();
845   }
846 
847   ChangeStatus updateImpl(Attributor &A) override {
848     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
849     ChangeStatus Change = ChangeStatus::UNCHANGED;
850 
851     auto CheckCallSite = [&](AbstractCallSite CS) {
852       Function *Caller = CS.getInstruction()->getFunction();
853       Function *Func = getAssociatedFunction();
854       LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
855                         << "->" << Func->getName() << '\n');
856 
857       const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
858           *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
859       const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
860           *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
861       if (!CallerInfo || !AssumedGroupSize)
862         return false;
863 
864       unsigned Min, Max;
865       std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
866           *Caller,
867           {CallerInfo->getAssumed().getLower().getZExtValue(),
868            CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
869           {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
870            AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
871       ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
872       IntegerRangeState CallerRangeState(CallerRange);
873       Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
874 
875       return true;
876     };
877 
878     bool AllCallSitesKnown = true;
879     if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
880       return indicatePessimisticFixpoint();
881 
882     return Change;
883   }
884 
885   /// Create an abstract attribute view for the position \p IRP.
886   static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
887                                             Attributor &A);
888 
889   ChangeStatus manifest(Attributor &A) override {
890     Function *F = getAssociatedFunction();
891     auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
892     unsigned Max = InfoCache.getMaxWavesPerEU(*F);
893     return emitAttributeIfNotDefault(A, 1, Max);
894   }
895 
896   /// See AbstractAttribute::getName()
897   const std::string getName() const override { return "AAAMDWavesPerEU"; }
898 
899   /// See AbstractAttribute::getIdAddr()
900   const char *getIdAddr() const override { return &ID; }
901 
902   /// This function should return true if the type of the \p AA is
903   /// AAAMDWavesPerEU
904   static bool classof(const AbstractAttribute *AA) {
905     return (AA->getIdAddr() == &ID);
906   }
907 
908   /// Unique ID (due to the unique address)
909   static const char ID;
910 };
911 
// Out-of-class definition of the class identifier. classof() compares the
// address of this object, so the stored value itself is irrelevant.
const char AAAMDWavesPerEU::ID = 0;
913 
914 AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
915                                                     Attributor &A) {
916   if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
917     return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
918   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
919 }
920 
921 static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
922   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
923   for (unsigned I = 0;
924        I < F.arg_size() &&
925        I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
926        ++I) {
927     Argument &Arg = *F.getArg(I);
928     // Check for incompatible attributes.
929     if (Arg.hasByRefAttr() || Arg.hasNestAttr())
930       break;
931 
932     Arg.addAttr(Attribute::InReg);
933   }
934 }
935 
936 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
937   SetVector<Function *> Functions;
938   for (Function &F : M) {
939     if (!F.isIntrinsic())
940       Functions.insert(&F);
941   }
942 
943   CallGraphUpdater CGUpdater;
944   BumpPtrAllocator Allocator;
945   AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
946   DenseSet<const char *> Allowed(
947       {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
948        &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
949        &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
950        &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
951 
952   AttributorConfig AC(CGUpdater);
953   AC.Allowed = &Allowed;
954   AC.IsModulePass = true;
955   AC.DefaultInitializeLiveInternals = false;
956   AC.IPOAmendableCB = [](const Function &F) {
957     return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
958   };
959 
960   Attributor A(Functions, InfoCache, AC);
961 
962   for (Function &F : M) {
963     if (!F.isIntrinsic()) {
964       A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
965       A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
966       CallingConv::ID CC = F.getCallingConv();
967       if (!AMDGPU::isEntryFunctionCC(CC)) {
968         A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
969         A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
970       } else if (CC == CallingConv::AMDGPU_KERNEL) {
971         addPreloadKernArgHint(F, TM);
972       }
973     }
974   }
975 
976   ChangeStatus Change = A.run();
977   return Change == ChangeStatus::CHANGED;
978 }
979 
980 class AMDGPUAttributorLegacy : public ModulePass {
981 public:
982   AMDGPUAttributorLegacy() : ModulePass(ID) {}
983 
984   /// doInitialization - Virtual method overridden by subclasses to do
985   /// any necessary initialization before any pass is run.
986   bool doInitialization(Module &) override {
987     auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
988     if (!TPC)
989       report_fatal_error("TargetMachine is required");
990 
991     TM = &TPC->getTM<TargetMachine>();
992     return false;
993   }
994 
995   bool runOnModule(Module &M) override {
996     AnalysisGetter AG(this);
997     return runImpl(M, AG, *TM);
998   }
999 
1000   void getAnalysisUsage(AnalysisUsage &AU) const override {
1001     AU.addRequired<CycleInfoWrapperPass>();
1002   }
1003 
1004   StringRef getPassName() const override { return "AMDGPU Attributor"; }
1005   TargetMachine *TM;
1006   static char ID;
1007 };
1008 } // namespace
1009 
1010 PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
1011                                                   ModuleAnalysisManager &AM) {
1012 
1013   FunctionAnalysisManager &FAM =
1014       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1015   AnalysisGetter AG(FAM);
1016 
1017   // TODO: Probably preserves CFG
1018   return runImpl(M, AG, TM) ? PreservedAnalyses::none()
1019                             : PreservedAnalyses::all();
1020 }
1021 
// Out-of-class definition of the pass identifier that the constructor passes
// to ModulePass.
char AMDGPUAttributorLegacy::ID = 0;
1023 
1024 Pass *llvm::createAMDGPUAttributorLegacyPass() {
1025   return new AMDGPUAttributorLegacy();
1026 }
// Pass registration for AMDGPUAttributorLegacy under the DEBUG_TYPE argument
// ("amdgpu-attributor"), with CycleInfoWrapperPass listed as a dependency to
// match getAnalysisUsage(). The two trailing `false` arguments are presumably
// the CFG-only and is-analysis flags -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)
1032