//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
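// Implicit-argument attributes managed by this pass: copied from a direct
// callee to its caller, and added wholesale to any function whose address is
// taken or that makes an indirect call.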
static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode *, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
    bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

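// Return true if this constant expression is an addrspacecast from a private
// or local pointer to flat. On subtargets without aperture registers the flat
// aperture bases are read via the queue pointer, so such casts require it.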
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

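// Walk a constant and all of its transitive constant operands with an
// explicit stack instead of recursion. ConstantExprVisited is shared across
// calls, so each constant in the function is inspected at most once even when
// constant expressions form a heavily shared DAG.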
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
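//
// For example, a function calling llvm.amdgcn.workitem.id.y is annotated with
// "amdgpu-work-item-id-y", which tells argument lowering that the y workitem
// id must be made available.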
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

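// Copy a single function attribute from Callee to Parent if Callee carries
// it. Returns true if the attribute was present and propagated.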
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}

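// NodeList is filled bottom-up in runOnSCC, so walking it in reverse visits
// callers before their callees and lets "uniform-work-group-size" propagate
// top-down through the call graph.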
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

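// Propagation rules for "uniform-work-group-size":
//   - Callee has no exact definition: force the callee (and the caller, if
//     unset) to "false".
//   - Caller is "true": the callee inherits "true" if it has no value yet.
//   - Caller is "false": the callee is forced to "false".
//   - Caller has no value: both default to "false".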
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

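// Scan the body of F once and derive every attribute this pass is
// responsible for: implicit-argument attributes copied from direct callees or
// mapped from intrinsic calls, "amdgpu-queue-ptr" for casts that need the
// flat apertures, plus "amdgpu-calls" and "amdgpu-stack-objects".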
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function has its address taken, it may be called indirectly from
  // anywhere, so add all attributes corresponding to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of an indirect call.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers if there is an
  // indirect call, and in such cases hasAddressTaken() is false for kernels
  // and for functions making an indirect call (if they are themselves not
  // indirectly called). We must tag all such kernels/functions with all
  // implicit attributes for correctness. For example:
  // 1. Kernel K1 makes an indirect call to function F1. Without detecting the
  //    indirect call in K1, this pass would not add all implicit args to K1
  //    (which is incorrect).
  // 2. Kernel K1 makes a direct call to F1, which makes an indirect call to
  //    function F2. Without detecting the indirect call in F1 (whose
  //    hasAddressTaken() is false), the pass would not add all implicit args
  //    to F1 (which is essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}

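// SCC passes visit callees before callers, so by the time a caller is
// processed here its direct callees already carry their final attributes.
// Referenced nodes are queued on NodeList; hitting an unreferenced node
// (e.g. a kernel entry) triggers uniform-work-group propagation over the
// queued list and resets it.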
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes.
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}