1 //===- AMDGPULDSUtils.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // AMDGPU LDS related helper utility functions.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "AMDGPULDSUtils.h"
14 #include "Utils/AMDGPUBaseInfo.h"
15 #include "llvm/ADT/DepthFirstIterator.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/Analysis/CallGraph.h"
18 #include "llvm/IR/Constants.h"
19 #include "llvm/IR/ReplaceConstant.h"
20
21 using namespace llvm;
22
23 namespace llvm {
24
25 namespace AMDGPU {
26
27 // An helper class for collecting all reachable callees for each kernel defined
28 // within the module.
29 class CollectReachableCallees {
30 Module &M;
31 CallGraph CG;
32 SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
33
34 // Collect all address taken functions within the module.
collectAddressTakenFunctions()35 void collectAddressTakenFunctions() {
36 auto *ECNode = CG.getExternalCallingNode();
37
38 for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
39 auto *CGN = GI->second;
40 auto *F = CGN->getFunction();
41 if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
42 continue;
43 AddressTakenFunctions.insert(CGN);
44 }
45 }
46
47 // For given kernel, collect all its reachable non-kernel functions.
collectReachableCallees(Function * K)48 SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
49 SmallPtrSet<Function *, 8> ReachableCallees;
50
51 // Call graph node which represents this kernel.
52 auto *KCGN = CG[K];
53
54 // Go through all call graph nodes reachable from the node representing this
55 // kernel, visit all their call sites, if the call site is direct, add
56 // corresponding callee to reachable callee set, if it is indirect, resolve
57 // the indirect call site to potential reachable callees, add them to
58 // reachable callee set, and repeat the process for the newly added
59 // potential callee nodes.
60 //
61 // FIXME: Need to handle bit-casted function pointers.
62 //
63 SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
64 SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
65 while (!CGNStack.empty()) {
66 auto *CGN = CGNStack.pop_back_val();
67
68 if (!VisitedCGNodes.insert(CGN).second)
69 continue;
70
71 for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
72 auto *RCB = cast<CallBase>(GI->first.getValue());
73 auto *RCGN = GI->second;
74
75 if (auto *DCallee = RCGN->getFunction()) {
76 ReachableCallees.insert(DCallee);
77 } else if (RCB->isIndirectCall()) {
78 auto *RCBFTy = RCB->getFunctionType();
79 for (auto *ACGN : AddressTakenFunctions) {
80 auto *ACallee = ACGN->getFunction();
81 if (ACallee->getFunctionType() == RCBFTy) {
82 ReachableCallees.insert(ACallee);
83 CGNStack.append(df_begin(ACGN), df_end(ACGN));
84 }
85 }
86 }
87 }
88 }
89
90 return ReachableCallees;
91 }
92
93 public:
CollectReachableCallees(Module & M)94 explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
95 // Collect address taken functions.
96 collectAddressTakenFunctions();
97 }
98
collectReachableCallees(DenseMap<Function *,SmallPtrSet<Function *,8>> & KernelToCallees)99 void collectReachableCallees(
100 DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
101 // Collect reachable callee set for each kernel defined in the module.
102 for (Function &F : M.functions()) {
103 if (!AMDGPU::isKernelCC(&F))
104 continue;
105 Function *K = &F;
106 KernelToCallees[K] = collectReachableCallees(K);
107 }
108 }
109 };
110
collectReachableCallees(Module & M,DenseMap<Function *,SmallPtrSet<Function *,8>> & KernelToCallees)111 void collectReachableCallees(
112 Module &M,
113 DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
114 CollectReachableCallees CRC{M};
115 CRC.collectReachableCallees(KernelToCallees);
116 }
117
collectNonKernelAccessorsOfLDS(GlobalVariable * GV)118 SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
119 SmallPtrSet<Function *, 8> LDSAccessors;
120 SmallVector<User *, 8> UserStack(GV->users());
121 SmallPtrSet<User *, 8> VisitedUsers;
122
123 while (!UserStack.empty()) {
124 auto *U = UserStack.pop_back_val();
125
126 // `U` is already visited? continue to next one.
127 if (!VisitedUsers.insert(U).second)
128 continue;
129
130 // `U` is a global variable which is initialized with LDS. Ignore LDS.
131 if (isa<GlobalValue>(U))
132 return SmallPtrSet<Function *, 8>();
133
134 // Recursively explore constant users.
135 if (isa<Constant>(U)) {
136 append_range(UserStack, U->users());
137 continue;
138 }
139
140 // `U` should be an instruction, if it belongs to a non-kernel function F,
141 // then collect F.
142 Function *F = cast<Instruction>(U)->getFunction();
143 if (!AMDGPU::isKernelCC(F))
144 LDSAccessors.insert(F);
145 }
146
147 return LDSAccessors;
148 }
149
150 DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
getFunctionToInstsMap(User * U,bool CollectKernelInsts)151 getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
152 DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
153 SmallVector<User *, 8> UserStack;
154 SmallPtrSet<User *, 8> VisitedUsers;
155
156 UserStack.push_back(U);
157
158 while (!UserStack.empty()) {
159 auto *UU = UserStack.pop_back_val();
160
161 if (!VisitedUsers.insert(UU).second)
162 continue;
163
164 if (isa<GlobalValue>(UU))
165 continue;
166
167 if (isa<Constant>(UU)) {
168 append_range(UserStack, UU->users());
169 continue;
170 }
171
172 auto *I = cast<Instruction>(UU);
173 Function *F = I->getFunction();
174 if (CollectKernelInsts) {
175 if (!AMDGPU::isKernelCC(F)) {
176 continue;
177 }
178 } else {
179 if (AMDGPU::isKernelCC(F)) {
180 continue;
181 }
182 }
183
184 FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
185 FunctionToInsts[F].insert(I);
186 }
187
188 return FunctionToInsts;
189 }
190
isKernelCC(const Function * Func)191 bool isKernelCC(const Function *Func) {
192 return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
193 }
194
getAlign(DataLayout const & DL,const GlobalVariable * GV)195 Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
196 return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
197 GV->getValueType());
198 }
199
collectFunctionUses(User * U,const Function * F,SetVector<Instruction * > & InstUsers)200 static void collectFunctionUses(User *U, const Function *F,
201 SetVector<Instruction *> &InstUsers) {
202 SmallVector<User *> Stack{U};
203
204 while (!Stack.empty()) {
205 U = Stack.pop_back_val();
206
207 if (auto *I = dyn_cast<Instruction>(U)) {
208 if (I->getFunction() == F)
209 InstUsers.insert(I);
210 continue;
211 }
212
213 if (!isa<ConstantExpr>(U))
214 continue;
215
216 append_range(Stack, U->users());
217 }
218 }
219
replaceConstantUsesInFunction(ConstantExpr * C,const Function * F)220 void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
221 SetVector<Instruction *> InstUsers;
222
223 collectFunctionUses(C, F, InstUsers);
224 for (Instruction *I : InstUsers) {
225 convertConstantExprsToInstructions(I, C);
226 }
227 }
228
hasUserInstruction(const GlobalValue * GV)229 bool hasUserInstruction(const GlobalValue *GV) {
230 SmallPtrSet<const User *, 8> Visited;
231 SmallVector<const User *, 16> Stack(GV->users());
232
233 while (!Stack.empty()) {
234 const User *U = Stack.pop_back_val();
235
236 if (!Visited.insert(U).second)
237 continue;
238
239 if (isa<Instruction>(U))
240 return true;
241
242 append_range(Stack, U->users());
243 }
244
245 return false;
246 }
247
shouldLowerLDSToStruct(const GlobalVariable & GV,const Function * F)248 bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
249 // We are not interested in kernel LDS lowering for module LDS itself.
250 if (F && GV.getName() == "llvm.amdgcn.module.lds")
251 return false;
252
253 bool Ret = false;
254 SmallPtrSet<const User *, 8> Visited;
255 SmallVector<const User *, 16> Stack(GV.users());
256 SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
257
258 assert(!F || isKernelCC(F));
259
260 while (!Stack.empty()) {
261 const User *V = Stack.pop_back_val();
262 Visited.insert(V);
263
264 if (auto *G = dyn_cast<GlobalValue>(V)) {
265 StringRef GName = G->getName();
266 if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
267 // For kernel LDS lowering, if G is not a compiler.used list, then we
268 // cannot lower the lds GV since we cannot replace the use of GV within
269 // G.
270 return false;
271 }
272 GlobalUsers.insert(G);
273 continue;
274 }
275
276 if (auto *I = dyn_cast<Instruction>(V)) {
277 const Function *UF = I->getFunction();
278 if (UF == F) {
279 // Used from this kernel, we want to put it into the structure.
280 Ret = true;
281 } else if (!F) {
282 // For module LDS lowering, lowering is required if the user instruction
283 // is from non-kernel function.
284 Ret |= !isKernelCC(UF);
285 }
286 continue;
287 }
288
289 // User V should be a constant, recursively visit users of V.
290 assert(isa<Constant>(V) && "Expected a constant.");
291 append_range(Stack, V->users());
292 }
293
294 if (!F && !Ret) {
295 // For module LDS lowering, we have not yet decided if we should lower GV or
296 // not. Explore all global users of GV, and check if atleast one of these
297 // global users appear as an use within an instruction (possibly nested use
298 // via constant expression), if so, then conservately lower LDS.
299 for (auto *G : GlobalUsers)
300 Ret |= hasUserInstruction(G);
301 }
302
303 return Ret;
304 }
305
findVariablesToLower(Module & M,const Function * F)306 std::vector<GlobalVariable *> findVariablesToLower(Module &M,
307 const Function *F) {
308 std::vector<llvm::GlobalVariable *> LocalVars;
309 for (auto &GV : M.globals()) {
310 if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
311 continue;
312 }
313 if (!GV.hasInitializer()) {
314 // addrspace(3) without initializer implies cuda/hip extern __shared__
315 // the semantics for such a variable appears to be that all extern
316 // __shared__ variables alias one another, in which case this transform
317 // is not required
318 continue;
319 }
320 if (!isa<UndefValue>(GV.getInitializer())) {
321 // Initializers are unimplemented for local address space.
322 // Leave such variables in place for consistent error reporting.
323 continue;
324 }
325 if (GV.isConstant()) {
326 // A constant undef variable can't be written to, and any load is
327 // undef, so it should be eliminated by the optimizer. It could be
328 // dropped by the back end if not. This pass skips over it.
329 continue;
330 }
331 if (!shouldLowerLDSToStruct(GV, F)) {
332 continue;
333 }
334 LocalVars.push_back(&GV);
335 }
336 return LocalVars;
337 }
338
getUsedList(Module & M)339 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
340 SmallPtrSet<GlobalValue *, 32> UsedList;
341
342 SmallVector<GlobalValue *, 32> TmpVec;
343 collectUsedGlobalVariables(M, TmpVec, true);
344 UsedList.insert(TmpVec.begin(), TmpVec.end());
345
346 TmpVec.clear();
347 collectUsedGlobalVariables(M, TmpVec, false);
348 UsedList.insert(TmpVec.begin(), TmpVec.end());
349
350 return UsedList;
351 }
352
353 } // end namespace AMDGPU
354
355 } // end namespace llvm
356