//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
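///
/// For example, a function that calls llvm.amdgcn.workgroup.id.y is given the
/// "amdgpu-work-group-id-y" attribute, and the attribute is copied to its
/// callers so that calling convention lowering can see which implicit inputs
/// are needed.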
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

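// Return true if the constant expression requires the queue pointer, i.e. it
// is an addrspacecast whose source is the local or private address space.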
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

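// Walk the constant graph rooted at EntryC and return true if any constant
// expression in it requires the queue pointer. Constants already seen are
// recorded in ConstantExprVisited so shared subexpressions are only processed
// once.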
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
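//
// Map an intrinsic ID to the name of the attribute that records its use.
// NonKernelOnly is set when the attribute only needs to be added to non-kernel
// functions; IsQueuePtr is set when the intrinsic also requires the queue ptr.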
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

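// If Callee carries the attribute Name, add it to Parent as well and return
// true; otherwise return false.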
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

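// Propagate the implicit-input attributes listed below from Callee to Parent,
// recording in NeedQueuePtr whether the callee requires the queue pointer.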
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // The X id attributes are unnecessarily propagated to kernels, where the x
  // ids are always initialized anyway.
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

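// For each queued call graph node, propagate the uniform-work-group-size
// attribute from the node's function to each of its callees. The list is
// walked in reverse so callers are processed before the functions they call.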
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

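// Propagate uniform-work-group-size from Caller to Callee. Callees without an
// exact definition, and callers that lack the attribute, are conservatively
// marked "false". Returns true if an attribute was added.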
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

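// Scan the body of F for intrinsic calls, addrspacecasts, and calls to other
// functions, and add the corresponding feature attributes to F. Returns true
// if any attribute was added.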
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

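      // Also check the instruction's constant operands for addrspacecasts that
      // were folded into constant expressions; these require the queue ptr too.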
      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

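// Process one call graph SCC: queue nodes that are still referenced so the
// uniform-work-group-size attribute can be propagated later, and add the
// feature attributes to every function that has a definition.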
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from the most uses to the least.
    if (I->getNumReferences()) {
      NodeList.push_back(I);
    } else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

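// Cache the TargetMachine from the TargetPassConfig; the subtarget is queried
// when adding the feature attributes.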
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}