1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
10 // runtime library.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "CGCUDARuntime.h"
15 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/Support/Format.h"
25
26 using namespace clang;
27 using namespace CodeGen;
28
29 namespace {
30 constexpr unsigned CudaFatMagic = 0x466243b1;
31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
32
33 class CGNVCUDARuntime : public CGCUDARuntime {
34
35 private:
36 llvm::IntegerType *IntTy, *SizeTy;
37 llvm::Type *VoidTy;
38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
39
40 /// Convenience reference to LLVM Context
41 llvm::LLVMContext &Context;
42 /// Convenience reference to the current module
43 llvm::Module &TheModule;
44 /// Keeps track of kernel launch stubs emitted in this module
45 struct KernelInfo {
46 llvm::Function *Kernel;
47 const Decl *D;
48 };
49 llvm::SmallVector<KernelInfo, 16> EmittedKernels;
50 struct VarInfo {
51 llvm::GlobalVariable *Var;
52 const VarDecl *D;
53 DeviceVarFlags Flags;
54 };
55 llvm::SmallVector<VarInfo, 16> DeviceVars;
56 /// Keeps track of variable containing handle of GPU binary. Populated by
57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in
58 /// ModuleDtorFunction()
59 llvm::GlobalVariable *GpuBinaryHandle = nullptr;
60 /// Whether we generate relocatable device code.
61 bool RelocatableDeviceCode;
62 /// Mangle context for device.
63 std::unique_ptr<MangleContext> DeviceMC;
64
65 llvm::FunctionCallee getSetupArgumentFn() const;
66 llvm::FunctionCallee getLaunchFn() const;
67
68 llvm::FunctionType *getRegisterGlobalsFnTy() const;
69 llvm::FunctionType *getCallbackFnTy() const;
70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
71 std::string addPrefixToName(StringRef FuncName) const;
72 std::string addUnderscoredPrefixToName(StringRef FuncName) const;
73
74 /// Creates a function to register all kernel stubs generated in this module.
75 llvm::Function *makeRegisterGlobalsFn();
76
77 /// Helper function that generates a constant string and returns a pointer to
78 /// the start of the string. The result of this function can be used anywhere
79 /// where the C code specifies const char*.
makeConstantString(const std::string & Str,const std::string & Name="",const std::string & SectionName="",unsigned Alignment=0)80 llvm::Constant *makeConstantString(const std::string &Str,
81 const std::string &Name = "",
82 const std::string &SectionName = "",
83 unsigned Alignment = 0) {
84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
85 llvm::ConstantInt::get(SizeTy, 0)};
86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
87 llvm::GlobalVariable *GV =
88 cast<llvm::GlobalVariable>(ConstStr.getPointer());
89 if (!SectionName.empty()) {
90 GV->setSection(SectionName);
91 // Mark the address as used which make sure that this section isn't
92 // merged and we will really have it in the object file.
93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
94 }
95 if (Alignment)
96 GV->setAlignment(llvm::Align(Alignment));
97
98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
99 ConstStr.getPointer(), Zeros);
100 }
101
102 /// Helper function that generates an empty dummy function returning void.
makeDummyFunction(llvm::FunctionType * FnTy)103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
104 assert(FnTy->getReturnType()->isVoidTy() &&
105 "Can only generate dummy functions returning void!");
106 llvm::Function *DummyFunc = llvm::Function::Create(
107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
108
109 llvm::BasicBlock *DummyBlock =
110 llvm::BasicBlock::Create(Context, "", DummyFunc);
111 CGBuilderTy FuncBuilder(CGM, Context);
112 FuncBuilder.SetInsertPoint(DummyBlock);
113 FuncBuilder.CreateRetVoid();
114
115 return DummyFunc;
116 }
117
118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
120 std::string getDeviceSideName(const NamedDecl *ND) override;
121
122 public:
123 CGNVCUDARuntime(CodeGenModule &CGM);
124
125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
registerDeviceVar(const VarDecl * VD,llvm::GlobalVariable & Var,bool Extern,bool Constant)126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
127 bool Extern, bool Constant) override {
128 DeviceVars.push_back({&Var,
129 VD,
130 {DeviceVarFlags::Variable, Extern, Constant,
131 /*Normalized*/ false, /*Type*/ 0}});
132 }
registerDeviceSurf(const VarDecl * VD,llvm::GlobalVariable & Var,bool Extern,int Type)133 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
134 bool Extern, int Type) override {
135 DeviceVars.push_back({&Var,
136 VD,
137 {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
138 /*Normalized*/ false, Type}});
139 }
registerDeviceTex(const VarDecl * VD,llvm::GlobalVariable & Var,bool Extern,int Type,bool Normalized)140 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
141 bool Extern, int Type, bool Normalized) override {
142 DeviceVars.push_back({&Var,
143 VD,
144 {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
145 Normalized, Type}});
146 }
147
148 /// Creates module constructor function
149 llvm::Function *makeModuleCtorFunction() override;
150 /// Creates module destructor function
151 llvm::Function *makeModuleDtorFunction() override;
152 };
153
154 }
155
addPrefixToName(StringRef FuncName) const156 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
157 if (CGM.getLangOpts().HIP)
158 return ((Twine("hip") + Twine(FuncName)).str());
159 return ((Twine("cuda") + Twine(FuncName)).str());
160 }
161 std::string
addUnderscoredPrefixToName(StringRef FuncName) const162 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
163 if (CGM.getLangOpts().HIP)
164 return ((Twine("__hip") + Twine(FuncName)).str());
165 return ((Twine("__cuda") + Twine(FuncName)).str());
166 }
167
CGNVCUDARuntime(CodeGenModule & CGM)168 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
169 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
170 TheModule(CGM.getModule()),
171 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
172 DeviceMC(CGM.getContext().createMangleContext(
173 CGM.getContext().getAuxTargetInfo())) {
174 CodeGen::CodeGenTypes &Types = CGM.getTypes();
175 ASTContext &Ctx = CGM.getContext();
176
177 IntTy = CGM.IntTy;
178 SizeTy = CGM.SizeTy;
179 VoidTy = CGM.VoidTy;
180
181 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
182 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
183 VoidPtrPtrTy = VoidPtrTy->getPointerTo();
184 }
185
getSetupArgumentFn() const186 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
187 // cudaError_t cudaSetupArgument(void *, size_t, size_t)
188 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
189 return CGM.CreateRuntimeFunction(
190 llvm::FunctionType::get(IntTy, Params, false),
191 addPrefixToName("SetupArgument"));
192 }
193
getLaunchFn() const194 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
195 if (CGM.getLangOpts().HIP) {
196 // hipError_t hipLaunchByPtr(char *);
197 return CGM.CreateRuntimeFunction(
198 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
199 } else {
200 // cudaError_t cudaLaunch(char *);
201 return CGM.CreateRuntimeFunction(
202 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
203 }
204 }
205
getRegisterGlobalsFnTy() const206 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
207 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
208 }
209
getCallbackFnTy() const210 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
211 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
212 }
213
getRegisterLinkedBinaryFnTy() const214 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
215 auto CallbackFnTy = getCallbackFnTy();
216 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
217 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
218 VoidPtrTy, CallbackFnTy->getPointerTo()};
219 return llvm::FunctionType::get(VoidTy, Params, false);
220 }
221
getDeviceSideName(const NamedDecl * ND)222 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
223 GlobalDecl GD;
224 // D could be either a kernel or a variable.
225 if (auto *FD = dyn_cast<FunctionDecl>(ND))
226 GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
227 else
228 GD = GlobalDecl(ND);
229 std::string DeviceSideName;
230 if (DeviceMC->shouldMangleDeclName(ND)) {
231 SmallString<256> Buffer;
232 llvm::raw_svector_ostream Out(Buffer);
233 DeviceMC->mangleName(GD, Out);
234 DeviceSideName = std::string(Out.str());
235 } else
236 DeviceSideName = std::string(ND->getIdentifier()->getName());
237 return DeviceSideName;
238 }
239
emitDeviceStub(CodeGenFunction & CGF,FunctionArgList & Args)240 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
241 FunctionArgList &Args) {
242 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
243 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
244 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
245 (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
246 emitDeviceStubBodyNew(CGF, Args);
247 else
248 emitDeviceStubBodyLegacy(CGF, Args);
249 }
250
251 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
252 // array and kernels are launched using cudaLaunchKernel().
emitDeviceStubBodyNew(CodeGenFunction & CGF,FunctionArgList & Args)253 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
254 FunctionArgList &Args) {
255 // Build the shadow stack entry at the very start of the function.
256
257 // Calculate amount of space we will need for all arguments. If we have no
258 // args, allocate a single pointer so we still have a valid pointer to the
259 // argument array that we can pass to runtime, even if it will be unused.
260 Address KernelArgs = CGF.CreateTempAlloca(
261 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
262 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
263 // Store pointers to the arguments in a locally allocated launch_args.
264 for (unsigned i = 0; i < Args.size(); ++i) {
265 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
266 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
267 CGF.Builder.CreateDefaultAlignedStore(
268 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
269 }
270
271 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
272
273 // Lookup cudaLaunchKernel/hipLaunchKernel function.
274 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
275 // void **args, size_t sharedMem,
276 // cudaStream_t stream);
277 // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
278 // void **args, size_t sharedMem,
279 // hipStream_t stream);
280 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
281 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
282 auto LaunchKernelName = addPrefixToName("LaunchKernel");
283 IdentifierInfo &cudaLaunchKernelII =
284 CGM.getContext().Idents.get(LaunchKernelName);
285 FunctionDecl *cudaLaunchKernelFD = nullptr;
286 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
287 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
288 cudaLaunchKernelFD = FD;
289 }
290
291 if (cudaLaunchKernelFD == nullptr) {
292 CGM.Error(CGF.CurFuncDecl->getLocation(),
293 "Can't find declaration for " + LaunchKernelName);
294 return;
295 }
296 // Create temporary dim3 grid_dim, block_dim.
297 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
298 QualType Dim3Ty = GridDimParam->getType();
299 Address GridDim =
300 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
301 Address BlockDim =
302 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
303 Address ShmemSize =
304 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
305 Address Stream =
306 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
307 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
308 llvm::FunctionType::get(IntTy,
309 {/*gridDim=*/GridDim.getType(),
310 /*blockDim=*/BlockDim.getType(),
311 /*ShmemSize=*/ShmemSize.getType(),
312 /*Stream=*/Stream.getType()},
313 /*isVarArg=*/false),
314 addUnderscoredPrefixToName("PopCallConfiguration"));
315
316 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
317 {GridDim.getPointer(), BlockDim.getPointer(),
318 ShmemSize.getPointer(), Stream.getPointer()});
319
320 // Emit the call to cudaLaunch
321 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
322 CallArgList LaunchKernelArgs;
323 LaunchKernelArgs.add(RValue::get(Kernel),
324 cudaLaunchKernelFD->getParamDecl(0)->getType());
325 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
326 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
327 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
328 cudaLaunchKernelFD->getParamDecl(3)->getType());
329 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
330 cudaLaunchKernelFD->getParamDecl(4)->getType());
331 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
332 cudaLaunchKernelFD->getParamDecl(5)->getType());
333
334 QualType QT = cudaLaunchKernelFD->getType();
335 QualType CQT = QT.getCanonicalType();
336 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
337 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
338
339 const CGFunctionInfo &FI =
340 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
341 llvm::FunctionCallee cudaLaunchKernelFn =
342 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
343 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
344 LaunchKernelArgs);
345 CGF.EmitBranch(EndBlock);
346
347 CGF.EmitBlock(EndBlock);
348 }
349
emitDeviceStubBodyLegacy(CodeGenFunction & CGF,FunctionArgList & Args)350 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
351 FunctionArgList &Args) {
352 // Emit a call to cudaSetupArgument for each arg in Args.
353 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
354 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
355 CharUnits Offset = CharUnits::Zero();
356 for (const VarDecl *A : Args) {
357 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
358 Offset = Offset.alignTo(TInfo.Align);
359 llvm::Value *Args[] = {
360 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
361 VoidPtrTy),
362 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
363 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
364 };
365 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
366 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
367 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
368 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
369 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
370 CGF.EmitBlock(NextBlock);
371 Offset += TInfo.Width;
372 }
373
374 // Emit the call to cudaLaunch
375 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
376 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
377 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
378 CGF.EmitBranch(EndBlock);
379
380 CGF.EmitBlock(EndBlock);
381 }
382
383 /// Creates a function that sets up state on the host side for CUDA objects that
384 /// have a presence on both the host and device sides. Specifically, registers
385 /// the host side of kernel functions and device global variables with the CUDA
386 /// runtime.
387 /// \code
388 /// void __cuda_register_globals(void** GpuBinaryHandle) {
389 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
390 /// ...
391 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
392 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
393 /// ...
394 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
395 /// }
396 /// \endcode
makeRegisterGlobalsFn()397 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
398 // No need to register anything
399 if (EmittedKernels.empty() && DeviceVars.empty())
400 return nullptr;
401
402 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
403 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
404 addUnderscoredPrefixToName("_register_globals"), &TheModule);
405 llvm::BasicBlock *EntryBB =
406 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
407 CGBuilderTy Builder(CGM, Context);
408 Builder.SetInsertPoint(EntryBB);
409
410 // void __cudaRegisterFunction(void **, const char *, char *, const char *,
411 // int, uint3*, uint3*, dim3*, dim3*, int*)
412 llvm::Type *RegisterFuncParams[] = {
413 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
414 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
415 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
416 llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
417 addUnderscoredPrefixToName("RegisterFunction"));
418
419 // Extract GpuBinaryHandle passed as the first argument passed to
420 // __cuda_register_globals() and generate __cudaRegisterFunction() call for
421 // each emitted kernel.
422 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
423 for (auto &&I : EmittedKernels) {
424 llvm::Constant *KernelName =
425 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
426 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
427 llvm::Value *Args[] = {
428 &GpuBinaryHandlePtr,
429 Builder.CreateBitCast(I.Kernel, VoidPtrTy),
430 KernelName,
431 KernelName,
432 llvm::ConstantInt::get(IntTy, -1),
433 NullPtr,
434 NullPtr,
435 NullPtr,
436 NullPtr,
437 llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
438 Builder.CreateCall(RegisterFunc, Args);
439 }
440
441 llvm::Type *VarSizeTy = IntTy;
442 // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
443 if (CGM.getLangOpts().HIP ||
444 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
445 VarSizeTy = SizeTy;
446
447 // void __cudaRegisterVar(void **, char *, char *, const char *,
448 // int, int, int, int)
449 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
450 CharPtrTy, IntTy, VarSizeTy,
451 IntTy, IntTy};
452 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
453 llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
454 addUnderscoredPrefixToName("RegisterVar"));
455 // void __cudaRegisterSurface(void **, const struct surfaceReference *,
456 // const void **, const char *, int, int);
457 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
458 llvm::FunctionType::get(
459 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
460 false),
461 addUnderscoredPrefixToName("RegisterSurface"));
462 // void __cudaRegisterTexture(void **, const struct textureReference *,
463 // const void **, const char *, int, int, int)
464 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
465 llvm::FunctionType::get(
466 VoidTy,
467 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
468 false),
469 addUnderscoredPrefixToName("RegisterTexture"));
470 for (auto &&Info : DeviceVars) {
471 llvm::GlobalVariable *Var = Info.Var;
472 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
473 switch (Info.Flags.getKind()) {
474 case DeviceVarFlags::Variable: {
475 uint64_t VarSize =
476 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
477 llvm::Value *Args[] = {
478 &GpuBinaryHandlePtr,
479 Builder.CreateBitCast(Var, VoidPtrTy),
480 VarName,
481 VarName,
482 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
483 llvm::ConstantInt::get(VarSizeTy, VarSize),
484 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
485 llvm::ConstantInt::get(IntTy, 0)};
486 Builder.CreateCall(RegisterVar, Args);
487 break;
488 }
489 case DeviceVarFlags::Surface:
490 Builder.CreateCall(
491 RegisterSurf,
492 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
493 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
494 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
495 break;
496 case DeviceVarFlags::Texture:
497 Builder.CreateCall(
498 RegisterTex,
499 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
500 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
501 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
502 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
503 break;
504 }
505 }
506
507 Builder.CreateRetVoid();
508 return RegisterKernelsFunc;
509 }
510
511 /// Creates a global constructor function for the module:
512 ///
513 /// For CUDA:
514 /// \code
515 /// void __cuda_module_ctor(void*) {
516 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
517 /// __cuda_register_globals(Handle);
518 /// }
519 /// \endcode
520 ///
521 /// For HIP:
522 /// \code
523 /// void __hip_module_ctor(void*) {
524 /// if (__hip_gpubin_handle == 0) {
525 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
526 /// __hip_register_globals(__hip_gpubin_handle);
527 /// }
528 /// }
529 /// \endcode
makeModuleCtorFunction()530 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
531 bool IsHIP = CGM.getLangOpts().HIP;
532 bool IsCUDA = CGM.getLangOpts().CUDA;
533 // No need to generate ctors/dtors if there is no GPU binary.
534 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
535 if (CudaGpuBinaryFileName.empty() && !IsHIP)
536 return nullptr;
537 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
538 DeviceVars.empty())
539 return nullptr;
540
541 // void __{cuda|hip}_register_globals(void* handle);
542 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
543 // We always need a function to pass in as callback. Create a dummy
544 // implementation if we don't need to register anything.
545 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
546 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
547
548 // void ** __{cuda|hip}RegisterFatBinary(void *);
549 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
550 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
551 addUnderscoredPrefixToName("RegisterFatBinary"));
552 // struct { int magic, int version, void * gpu_binary, void * dont_care };
553 llvm::StructType *FatbinWrapperTy =
554 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
555
556 // Register GPU binary with the CUDA runtime, store returned handle in a
557 // global variable and save a reference in GpuBinaryHandle to be cleaned up
558 // in destructor on exit. Then associate all known kernels with the GPU binary
559 // handle so CUDA runtime can figure out what to call on the GPU side.
560 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
561 if (!CudaGpuBinaryFileName.empty()) {
562 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
563 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
564 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
565 CGM.getDiags().Report(diag::err_cannot_open_file)
566 << CudaGpuBinaryFileName << EC.message();
567 return nullptr;
568 }
569 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
570 }
571
572 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
573 llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
574 llvm::GlobalValue::InternalLinkage,
575 addUnderscoredPrefixToName("_module_ctor"), &TheModule);
576 llvm::BasicBlock *CtorEntryBB =
577 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
578 CGBuilderTy CtorBuilder(CGM, Context);
579
580 CtorBuilder.SetInsertPoint(CtorEntryBB);
581
582 const char *FatbinConstantName;
583 const char *FatbinSectionName;
584 const char *ModuleIDSectionName;
585 StringRef ModuleIDPrefix;
586 llvm::Constant *FatBinStr;
587 unsigned FatMagic;
588 if (IsHIP) {
589 FatbinConstantName = ".hip_fatbin";
590 FatbinSectionName = ".hipFatBinSegment";
591
592 ModuleIDSectionName = "__hip_module_id";
593 ModuleIDPrefix = "__hip_";
594
595 if (CudaGpuBinary) {
596 // If fatbin is available from early finalization, create a string
597 // literal containing the fat binary loaded from the given file.
598 const unsigned HIPCodeObjectAlign = 4096;
599 FatBinStr =
600 makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
601 FatbinConstantName, HIPCodeObjectAlign);
602 } else {
603 // If fatbin is not available, create an external symbol
604 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
605 // to contain the fat binary but will be populated somewhere else,
606 // e.g. by lld through link script.
607 FatBinStr = new llvm::GlobalVariable(
608 CGM.getModule(), CGM.Int8Ty,
609 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
610 "__hip_fatbin", nullptr,
611 llvm::GlobalVariable::NotThreadLocal);
612 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
613 }
614
615 FatMagic = HIPFatMagic;
616 } else {
617 if (RelocatableDeviceCode)
618 FatbinConstantName = CGM.getTriple().isMacOSX()
619 ? "__NV_CUDA,__nv_relfatbin"
620 : "__nv_relfatbin";
621 else
622 FatbinConstantName =
623 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
624 // NVIDIA's cuobjdump looks for fatbins in this section.
625 FatbinSectionName =
626 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
627
628 ModuleIDSectionName = CGM.getTriple().isMacOSX()
629 ? "__NV_CUDA,__nv_module_id"
630 : "__nv_module_id";
631 ModuleIDPrefix = "__nv_";
632
633 // For CUDA, create a string literal containing the fat binary loaded from
634 // the given file.
635 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
636 FatbinConstantName, 8);
637 FatMagic = CudaFatMagic;
638 }
639
640 // Create initialized wrapper structure that points to the loaded GPU binary
641 ConstantInitBuilder Builder(CGM);
642 auto Values = Builder.beginStruct(FatbinWrapperTy);
643 // Fatbin wrapper magic.
644 Values.addInt(IntTy, FatMagic);
645 // Fatbin version.
646 Values.addInt(IntTy, 1);
647 // Data.
648 Values.add(FatBinStr);
649 // Unused in fatbin v1.
650 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
651 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
652 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
653 /*constant*/ true);
654 FatbinWrapper->setSection(FatbinSectionName);
655
656 // There is only one HIP fat binary per linked module, however there are
657 // multiple constructor functions. Make sure the fat binary is registered
658 // only once. The constructor functions are executed by the dynamic loader
659 // before the program gains control. The dynamic loader cannot execute the
660 // constructor functions concurrently since doing that would not guarantee
661 // thread safety of the loaded program. Therefore we can assume sequential
662 // execution of constructor functions here.
663 if (IsHIP) {
664 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
665 llvm::GlobalValue::LinkOnceAnyLinkage;
666 llvm::BasicBlock *IfBlock =
667 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
668 llvm::BasicBlock *ExitBlock =
669 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
670 // The name, size, and initialization pattern of this variable is part
671 // of HIP ABI.
672 GpuBinaryHandle = new llvm::GlobalVariable(
673 TheModule, VoidPtrPtrTy, /*isConstant=*/false,
674 Linkage,
675 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
676 "__hip_gpubin_handle");
677 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
678 // Prevent the weak symbol in different shared libraries being merged.
679 if (Linkage != llvm::GlobalValue::InternalLinkage)
680 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
681 Address GpuBinaryAddr(
682 GpuBinaryHandle,
683 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
684 {
685 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
686 llvm::Constant *Zero =
687 llvm::Constant::getNullValue(HandleValue->getType());
688 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
689 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
690 }
691 {
692 CtorBuilder.SetInsertPoint(IfBlock);
693 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
694 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
695 RegisterFatbinFunc,
696 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
697 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
698 CtorBuilder.CreateBr(ExitBlock);
699 }
700 {
701 CtorBuilder.SetInsertPoint(ExitBlock);
702 // Call __hip_register_globals(GpuBinaryHandle);
703 if (RegisterGlobalsFunc) {
704 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
705 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
706 }
707 }
708 } else if (!RelocatableDeviceCode) {
709 // Register binary with CUDA runtime. This is substantially different in
710 // default mode vs. separate compilation!
711 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
712 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
713 RegisterFatbinFunc,
714 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
715 GpuBinaryHandle = new llvm::GlobalVariable(
716 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
717 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
718 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
719 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
720 CGM.getPointerAlign());
721
722 // Call __cuda_register_globals(GpuBinaryHandle);
723 if (RegisterGlobalsFunc)
724 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
725
726 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
727 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
728 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
729 // void __cudaRegisterFatBinaryEnd(void **);
730 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
731 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
732 "__cudaRegisterFatBinaryEnd");
733 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
734 }
735 } else {
736 // Generate a unique module ID.
737 SmallString<64> ModuleID;
738 llvm::raw_svector_ostream OS(ModuleID);
739 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
740 llvm::Constant *ModuleIDConstant = makeConstantString(
741 std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
742
743 // Create an alias for the FatbinWrapper that nvcc will look for.
744 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
745 Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
746
747 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
748 // void *, void (*)(void **))
749 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
750 RegisterLinkedBinaryName += ModuleID;
751 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
752 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
753
754 assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
755 llvm::Value *Args[] = {RegisterGlobalsFunc,
756 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
757 ModuleIDConstant,
758 makeDummyFunction(getCallbackFnTy())};
759 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
760 }
761
762 // Create destructor and register it with atexit() the way NVCC does it. Doing
763 // it during regular destructor phase worked in CUDA before 9.2 but results in
764 // double-free in 9.2.
765 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
766 // extern "C" int atexit(void (*f)(void));
767 llvm::FunctionType *AtExitTy =
768 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
769 llvm::FunctionCallee AtExitFunc =
770 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
771 /*Local=*/true);
772 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
773 }
774
775 CtorBuilder.CreateRetVoid();
776 return ModuleCtorFunc;
777 }
778
779 /// Creates a global destructor function that unregisters the GPU code blob
780 /// registered by constructor.
781 ///
782 /// For CUDA:
783 /// \code
784 /// void __cuda_module_dtor(void*) {
785 /// __cudaUnregisterFatBinary(Handle);
786 /// }
787 /// \endcode
788 ///
789 /// For HIP:
790 /// \code
791 /// void __hip_module_dtor(void*) {
792 /// if (__hip_gpubin_handle) {
793 /// __hipUnregisterFatBinary(__hip_gpubin_handle);
794 /// __hip_gpubin_handle = 0;
795 /// }
796 /// }
797 /// \endcode
makeModuleDtorFunction()798 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
799 // No need for destructor if we don't have a handle to unregister.
800 if (!GpuBinaryHandle)
801 return nullptr;
802
803 // void __cudaUnregisterFatBinary(void ** handle);
804 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
805 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
806 addUnderscoredPrefixToName("UnregisterFatBinary"));
807
808 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
809 llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
810 llvm::GlobalValue::InternalLinkage,
811 addUnderscoredPrefixToName("_module_dtor"), &TheModule);
812
813 llvm::BasicBlock *DtorEntryBB =
814 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
815 CGBuilderTy DtorBuilder(CGM, Context);
816 DtorBuilder.SetInsertPoint(DtorEntryBB);
817
818 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
819 GpuBinaryHandle->getAlignment()));
820 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
821 // There is only one HIP fat binary per linked module, however there are
822 // multiple destructor functions. Make sure the fat binary is unregistered
823 // only once.
824 if (CGM.getLangOpts().HIP) {
825 llvm::BasicBlock *IfBlock =
826 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
827 llvm::BasicBlock *ExitBlock =
828 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
829 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
830 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
831 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
832
833 DtorBuilder.SetInsertPoint(IfBlock);
834 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
835 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
836 DtorBuilder.CreateBr(ExitBlock);
837
838 DtorBuilder.SetInsertPoint(ExitBlock);
839 } else {
840 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
841 }
842 DtorBuilder.CreateRetVoid();
843 return ModuleDtorFunc;
844 }
845
CreateNVCUDARuntime(CodeGenModule & CGM)846 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
847 return new CGNVCUDARuntime(CGM);
848 }
849