//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AMDGPU LDS related helper utility functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULDSUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/ReplaceConstant.h"

using namespace llvm;

namespace llvm {

namespace AMDGPU {

// A helper class for collecting all reachable callees for each kernel defined
// within the module.
class CollectReachableCallees {
  Module &M;
  CallGraph CG;
  SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;

  // Collect all address taken functions within the module.
  void collectAddressTakenFunctions() {
    auto *ECNode = CG.getExternalCallingNode();

    for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
      auto *CGN = GI->second;
      auto *F = CGN->getFunction();
      if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
        continue;
      AddressTakenFunctions.insert(CGN);
    }
  }

  // For a given kernel, collect all its reachable non-kernel functions.
  SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
    SmallPtrSet<Function *, 8> ReachableCallees;

    // Call graph node which represents this kernel.
    auto *KCGN = CG[K];

    // Go through all call graph nodes reachable from the node representing
    // this kernel and visit all their call sites. If a call site is direct,
    // add the corresponding callee to the reachable-callee set. If it is
    // indirect, resolve it to the set of potentially reachable callees (the
    // address taken functions with a matching function type), add those to
    // the reachable-callee set, and repeat the process for the newly added
    // callee nodes.
    //
    // FIXME: Need to handle bit-casted function pointers.
    //
    SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
    while (!CGNStack.empty()) {
      auto *CGN = CGNStack.pop_back_val();

      if (!VisitedCGNodes.insert(CGN).second)
        continue;

      for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
        auto *RCB = cast<CallBase>(GI->first.getValue());
        auto *RCGN = GI->second;

        if (auto *DCallee = RCGN->getFunction()) {
          ReachableCallees.insert(DCallee);
        } else if (RCB->isIndirectCall()) {
          auto *RCBFTy = RCB->getFunctionType();
          for (auto *ACGN : AddressTakenFunctions) {
            auto *ACallee = ACGN->getFunction();
            if (ACallee->getFunctionType() == RCBFTy) {
              ReachableCallees.insert(ACallee);
              CGNStack.append(df_begin(ACGN), df_end(ACGN));
            }
          }
        }
      }
    }

    return ReachableCallees;
  }

public:
  explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
    // Collect address taken functions.
    collectAddressTakenFunctions();
  }

  void collectReachableCallees(
      DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
    // Collect reachable callee set for each kernel defined in the module.
    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Function *K = &F;
      KernelToCallees[K] = collectReachableCallees(K);
    }
  }
};

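// Collect the reachable callee set for each kernel defined in module M and
// record it in KernelToCallees.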
void collectReachableCallees(
    Module &M,
    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
  CollectReachableCallees CRC{M};
  CRC.collectReachableCallees(KernelToCallees);
}

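// Collect all the non-kernel functions which access the LDS global GV,
// looking through (possibly nested) constant expression uses. Returns an
// empty set if GV also appears within the initializer of some global.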
SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
  SmallPtrSet<Function *, 8> LDSAccessors;
  SmallVector<User *, 8> UserStack(GV->users());
  SmallPtrSet<User *, 8> VisitedUsers;

  while (!UserStack.empty()) {
    auto *U = UserStack.pop_back_val();

    // Skip `U` if it has already been visited.
    if (!VisitedUsers.insert(U).second)
      continue;

    // `U` is a global variable which is initialized with the LDS global; give
    // up and ignore this LDS global.
    if (isa<GlobalValue>(U))
      return SmallPtrSet<Function *, 8>();

    // Recursively explore constant users.
    if (isa<Constant>(U)) {
      append_range(UserStack, U->users());
      continue;
    }

    // `U` should be an instruction; if it belongs to a non-kernel function F,
    // then collect F.
    Function *F = cast<Instruction>(U)->getFunction();
    if (!AMDGPU::isKernelCC(F))
      LDSAccessors.insert(F);
  }

  return LDSAccessors;
}

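// Map each relevant function to the set of its instructions which (possibly
// via nested constant expressions) use U. If CollectKernelInsts is true,
// only kernel functions are collected; otherwise only non-kernel functions
// are.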
DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
  SmallVector<User *, 8> UserStack;
  SmallPtrSet<User *, 8> VisitedUsers;

  UserStack.push_back(U);

  while (!UserStack.empty()) {
    auto *UU = UserStack.pop_back_val();

    if (!VisitedUsers.insert(UU).second)
      continue;

    if (isa<GlobalValue>(UU))
      continue;

    if (isa<Constant>(UU)) {
      append_range(UserStack, UU->users());
      continue;
    }

    auto *I = cast<Instruction>(UU);
    Function *F = I->getFunction();
    // Keep only instructions from the requested kind of function: kernels
    // when CollectKernelInsts is true, non-kernels otherwise.
    if (AMDGPU::isKernelCC(F) != CollectKernelInsts)
      continue;

    FunctionToInsts[F].insert(I);
  }

  return FunctionToInsts;
}

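// Return true if Func is an AMDGPU module entry function (for example a
// kernel); such functions are treated as kernels by the LDS lowering.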
bool isKernelCC(const Function *Func) {
  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}

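// Return the alignment of the global GV: its specified alignment when
// present, otherwise the ABI type alignment of its value type.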
Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

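// Collect in InstUsers all instructions of function F which use U, looking
// through (possibly nested) constant expression uses.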
static void collectFunctionUses(User *U, const Function *F,
                                SetVector<Instruction *> &InstUsers) {
  SmallVector<User *> Stack{U};

  while (!Stack.empty()) {
    U = Stack.pop_back_val();

    if (auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction() == F)
        InstUsers.insert(I);
      continue;
    }

    if (!isa<ConstantExpr>(U))
      continue;

    append_range(Stack, U->users());
  }
}

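// For each instruction of function F which uses the constant expression C,
// rewrite the constant expression uses into equivalent instructions, so that
// they can be manipulated like ordinary instruction operands.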
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
  SetVector<Instruction *> InstUsers;

  collectFunctionUses(C, F, InstUsers);
  for (Instruction *I : InstUsers) {
    convertConstantExprsToInstructions(I, C);
  }
}

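// Return true if GV is used by at least one instruction, possibly through a
// chain of constant users.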
bool hasUserInstruction(const GlobalValue *GV) {
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV->users());

  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();

    if (!Visited.insert(U).second)
      continue;

    if (isa<Instruction>(U))
      return true;

    append_range(Stack, U->users());
  }

  return false;
}

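// Decide whether the LDS global GV should be lowered into a struct member.
// For kernel LDS lowering (F is a kernel), GV qualifies if it is used from F
// and every global user of GV is one of the llvm.used/llvm.compiler.used
// lists. For module LDS lowering (F is null), GV qualifies if it is used
// from a non-kernel function, or, conservatively, if one of its global users
// is itself reachable from an instruction.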
bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
  // We are not interested in kernel LDS lowering for module LDS itself.
  if (F && GV.getName() == "llvm.amdgcn.module.lds")
    return false;

  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());
  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    const User *V = Stack.pop_back_val();

    // Skip `V` if it has already been visited.
    if (!Visited.insert(V).second)
      continue;

    if (auto *G = dyn_cast<GlobalValue>(V)) {
      StringRef GName = G->getName();
      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
        // For kernel LDS lowering, if G is not the llvm.used or the
        // llvm.compiler.used list, then we cannot lower the LDS GV, since we
        // cannot replace the use of GV within G.
        return false;
      }
      GlobalUsers.insert(G);
      continue;
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        // For module LDS lowering, lowering is required if the user
        // instruction is from a non-kernel function.
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    // User V should be a constant; recursively visit the users of V.
    assert(isa<Constant>(V) && "Expected a constant.");
    append_range(Stack, V->users());
  }

  if (!F && !Ret) {
    // For module LDS lowering, we have not yet decided whether to lower GV.
    // Explore all global users of GV, and check if at least one of these
    // global users appears as a use within an instruction (possibly a nested
    // use via a constant expression); if so, conservatively lower LDS.
    for (auto *G : GlobalUsers)
      Ret |= hasUserInstruction(G);
  }

  return Ret;
}

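// Collect all the LDS (addrspace(3)) globals of module M which should be
// lowered for F: a kernel for kernel LDS lowering, or null for module LDS
// lowering.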
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                                   const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
      continue;
    }
    if (!GV.hasInitializer()) {
      // addrspace(3) without an initializer implies CUDA/HIP extern
      // __shared__. The semantics for such a variable appear to be that all
      // extern __shared__ variables alias one another, in which case this
      // transform is not required.
      continue;
    }
    if (!isa<UndefValue>(GV.getInitializer())) {
      // Initializers are unimplemented for local address space.
      // Leave such variables in place for consistent error reporting.
      continue;
    }
    if (GV.isConstant()) {
      // A constant undef variable can't be written to, and any load is
      // undef, so it should be eliminated by the optimizer. It could be
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (!shouldLowerLDSToStruct(GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
  }
  return LocalVars;
}

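// Return the union of the globals named by the llvm.used and
// llvm.compiler.used lists of module M.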
SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
  SmallPtrSet<GlobalValue *, 32> UsedList;

  SmallVector<GlobalValue *, 32> TmpVec;
  collectUsedGlobalVariables(M, TmpVec, true); // llvm.compiler.used
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  TmpVec.clear();
  collectUsedGlobalVariables(M, TmpVec, false); // llvm.used
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  return UsedList;
}

} // end namespace AMDGPU

} // end namespace llvm