//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
 #include "AMDGPUAttributes.def"
};
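
// The three expansions above use the X-macro pattern: AMDGPUAttributes.def is
// included once per definition of AMDGPU_ATTRIBUTE to build the position enum,
// the bit-mask enum, and the (mask, attribute-name) table. For illustration
// only (AMDGPUAttributes.def is the authoritative list), an entry such as
//
//   AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
//
// expands to DISPATCH_PTR_POS, DISPATCH_PTR = 1 << DISPATCH_PTR_POS, and the
// table entry {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}.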

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}
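
// Illustrative mapping (IR value names are made up): a callee such as
//
//   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
//
// yields DISPATCH_PTR here, which the caller uses to drop the corresponding
// assumed "amdgpu-no-*" bit; an intrinsic with no implicit-input requirement
// yields NOT_IMPLICIT_INPUT and leaves the assumed bits alone.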

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument to be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }
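
  // Illustrative example (the global name is hypothetical): an operand such as
  //
  //   addrspacecast (ptr addrspace(3) @lds.var to ptr)
  //
  // reports both DS_GLOBAL and ADDR_SPACE_CAST. In a non-entry function the
  // DS_GLOBAL bit alone makes needsQueuePtr() return true, because we need to
  // trap on DS globals there; the ADDR_SPACE_CAST bit only matters when the
  // subtarget lacks aperture registers.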

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}
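
// Propagation sketch (function names are hypothetical): if kernel @k carries
// "uniform-work-group-size"="true" and is the only caller of @helper, the
// AAUniformWorkGroupSize attached to @helper clamps against the caller's
// optimistic state in updateImpl(), and manifest() then writes
// "uniform-work-group-size"="true" on @helper. Any caller without the
// attribute, or an unknown call site, drives the state to the pessimistic
// "false" value instead.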

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the hostcall buffer (e.g. because a sanitizer
    // is enabled), it also needs the implicit arg pointer to reach it. Assume
    // both are needed, even if the function is explicitly marked as not
    // requiring them.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }
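
  // As an illustration of the initialization above: a function carrying the
  // sanitize_address attribute is assumed to need the hostcall buffer (and
  // therefore the implicit arg pointer), so this AA will never add
  // "amdgpu-no-hostcall-ptr" or "amdgpu-no-implicitarg-ptr" to it, regardless
  // of what its body contains.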

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }
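
  // Manifest sketch (the exact attribute set shown is illustrative; names come
  // from AMDGPUAttributes.def): a function that provably uses none of the
  // implicit inputs keeps every bit known and ends up with the full list of
  // "amdgpu-no-*" attributes, e.g.
  //
  //   attributes #0 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr"
  //                     "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" ... }
  //
  // while a function that calls @llvm.amdgcn.queue.ptr() loses (at least)
  // "amdgpu-no-queue-ptr".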

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions manually, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }
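
  // Illustrative trigger for the instruction scan above (value names are made
  // up): a flat access lowered from a private pointer,
  //
  //   %flat = addrspacecast ptr addrspace(5) %alloca to ptr
  //
  // has a PRIVATE_ADDRESS source, so CheckAddrSpaceCasts records that the
  // queue pointer is needed whenever aperture registers are unavailable.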

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A,
                                      AAPointerInfo::OffsetAndSize OAS) {
    // Check if this is a call to the implicitarg_ptr intrinsic whose result is
    // used to retrieve the implicit kernel argument described by the given
    // OAS. That argument is unused only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of it. We check this
    // by tracing all the uses of the initial call to the implicitarg_ptr
    // intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }
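
  // Illustrative IR for the check above (value names and the offset are
  // placeholders): a hostcall-pointer load typically looks like
  //
  //   %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  //   %gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 <offset>
  //   %hostcall = load ptr, ptr addrspace(4) %gep
  //
  // If <offset> falls inside the queried OffsetAndSize, the load is a
  // non-droppable interfering access, DoesNotLeadToKernelArgLoc returns false,
  // and funcRetrievesImplicitKernelArg() reports that the argument is used.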

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }
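
  // Manifest sketch (values are illustrative): if the assumed range converges
  // to [1, 256] and that differs from the subtarget default returned by
  // getMaximumFlatWorkGroupRange(), the function gets
  // "amdgpu-flat-work-group-size"="1,256"; if the range equals the default,
  // no attribute is emitted.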

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
         &AAPointerInfo::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
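
// Usage sketch (the exact invocation depends on the LLVM version and pass
// manager in use, so treat the command below as an assumption rather than a
// guarantee): the pass is normally scheduled by AMDGPUTargetMachine during
// codegen, and can typically be exercised in isolation with something like
//
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S input.ll
//
// after which the deduced "amdgpu-no-*", "uniform-work-group-size", and
// "amdgpu-flat-work-group-size" attributes appear on the function definitions.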