1 //===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
2 //
3 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass to convert gpu kernel functions into a
10 // corresponding binary blob that can be executed on a CUDA GPU. Currently
11 // only translates the function itself but no dependencies.
12 //
13 //===----------------------------------------------------------------------===//
14
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/NVVMIR.h"

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>
36
37 using namespace mlir;
38
namespace {
// Name of the module attribute under which the generated cubin blob is
// attached (see runOnModule below).
// TODO(herhut): Move to shared location.
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
42
43 /// A pass converting tagged kernel modules to cubin blobs.
44 ///
45 /// If tagged as a kernel module, each contained function is translated to NVVM
46 /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
47 /// GPU binary code, which is then attached as an attribute to the function. The
48 /// function body is erased.
49 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
50 public:
GpuKernelToCubinPass(CubinGenerator cubinGenerator=compilePtxToCubinForTesting)51 GpuKernelToCubinPass(
52 CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
53 : cubinGenerator(cubinGenerator) {}
54
runOnModule()55 void runOnModule() override {
56 ModuleOp module = getModule();
57 if (!module.getAttrOfType<UnitAttr>(
58 gpu::GPUDialect::getKernelModuleAttrName()) ||
59 !module.getName())
60 return;
61
62 // Make sure the NVPTX target is initialized.
63 LLVMInitializeNVPTXTarget();
64 LLVMInitializeNVPTXTargetInfo();
65 LLVMInitializeNVPTXTargetMC();
66 LLVMInitializeNVPTXAsmPrinter();
67
68 auto llvmModule = translateModuleToNVVMIR(module);
69 if (!llvmModule)
70 return signalPassFailure();
71
72 // Translate the module to CUBIN and attach the result as attribute to the
73 // module.
74 if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
75 *llvmModule, module.getLoc(), *module.getName()))
76 module.setAttr(kCubinAnnotation, cubinAttr);
77 else
78 signalPassFailure();
79 }
80
81 private:
82 static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
83 Location, StringRef);
84
85 std::string translateModuleToPtx(llvm::Module &module,
86 llvm::TargetMachine &target_machine);
87
88 /// Converts llvmModule to cubin using the user-provided generator. Location
89 /// is used for error reporting and name is forwarded to the CUBIN generator
90 /// to use in its logging mechanisms.
91 OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
92 StringRef name);
93
94 /// Translates llvmModule to cubin and returns the result as attribute.
95 StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
96 Location loc, StringRef name);
97
98 CubinGenerator cubinGenerator;
99 };
100
101 } // anonymous namespace
102
translateModuleToPtx(llvm::Module & module,llvm::TargetMachine & target_machine)103 std::string GpuKernelToCubinPass::translateModuleToPtx(
104 llvm::Module &module, llvm::TargetMachine &target_machine) {
105 std::string ptx;
106 {
107 llvm::raw_string_ostream stream(ptx);
108 llvm::buffer_ostream pstream(stream);
109 llvm::legacy::PassManager codegen_passes;
110 target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
111 llvm::CGFT_AssemblyFile);
112 codegen_passes.run(module);
113 }
114
115 return ptx;
116 }
117
118 OwnedCubin
compilePtxToCubinForTesting(const std::string & ptx,Location,StringRef)119 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
120 Location, StringRef) {
121 const char data[] = "CUBIN";
122 return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
123 }
124
convertModuleToCubin(llvm::Module & llvmModule,Location loc,StringRef name)125 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
126 Location loc,
127 StringRef name) {
128 std::unique_ptr<llvm::TargetMachine> targetMachine;
129 {
130 std::string error;
131 // TODO(herhut): Make triple configurable.
132 constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
133 llvm::Triple triple(cudaTriple);
134 const llvm::Target *target =
135 llvm::TargetRegistry::lookupTarget("", triple, error);
136 if (target == nullptr) {
137 emitError(loc, "cannot initialize target triple");
138 return {};
139 }
140 targetMachine.reset(
141 target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
142 }
143
144 // Set the data layout of the llvm module to match what the ptx target needs.
145 llvmModule.setDataLayout(targetMachine->createDataLayout());
146
147 auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
148
149 return cubinGenerator(ptx, loc, name);
150 }
151
translateGpuModuleToCubinAnnotation(llvm::Module & llvmModule,Location loc,StringRef name)152 StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
153 llvm::Module &llvmModule, Location loc, StringRef name) {
154 auto cubin = convertModuleToCubin(llvmModule, loc, name);
155 if (!cubin)
156 return {};
157 return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
158 }
159
160 std::unique_ptr<OpPassBase<ModuleOp>>
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator)161 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
162 return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
163 }
164
// Registers the pass under a test-oriented flag; with no explicit generator it
// falls back to the default (testing) cubin generator. Real pipelines are
// expected to construct the pass via createConvertGPUKernelToCubinPass.
static PassRegistration<GpuKernelToCubinPass>
    pass("test-kernel-to-cubin",
         "Convert all kernel functions to CUDA cubin blobs");
168