1 //===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
2 //
3 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass to convert gpu kernel functions into a
10 // corresponding binary blob that can be executed on a CUDA GPU. Currently
11 // only translates the function itself but no dependencies.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
16 
17 #include "mlir/Dialect/GPU/GPUDialect.h"
18 #include "mlir/IR/Attributes.h"
19 #include "mlir/IR/Builders.h"
20 #include "mlir/IR/Function.h"
21 #include "mlir/IR/Module.h"
22 #include "mlir/Pass/Pass.h"
23 #include "mlir/Pass/PassRegistry.h"
24 #include "mlir/Support/LogicalResult.h"
25 #include "mlir/Target/NVVMIR.h"
26 
27 #include "llvm/ADT/Optional.h"
28 #include "llvm/ADT/Twine.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/LegacyPassManager.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/Support/Error.h"
33 #include "llvm/Support/TargetRegistry.h"
34 #include "llvm/Support/TargetSelect.h"
35 #include "llvm/Target/TargetMachine.h"
36 
37 using namespace mlir;
38 
39 namespace {
40 // TODO(herhut): Move to shared location.
41 static constexpr const char *kCubinAnnotation = "nvvm.cubin";
42 
43 /// A pass converting tagged kernel modules to cubin blobs.
44 ///
45 /// If tagged as a kernel module, each contained function is translated to NVVM
46 /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to
47 /// GPU binary code, which is then attached as an attribute to the function. The
48 /// function body is erased.
49 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
50 public:
GpuKernelToCubinPass(CubinGenerator cubinGenerator=compilePtxToCubinForTesting)51   GpuKernelToCubinPass(
52       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
53       : cubinGenerator(cubinGenerator) {}
54 
runOnModule()55   void runOnModule() override {
56     ModuleOp module = getModule();
57     if (!module.getAttrOfType<UnitAttr>(
58             gpu::GPUDialect::getKernelModuleAttrName()) ||
59         !module.getName())
60       return;
61 
62     // Make sure the NVPTX target is initialized.
63     LLVMInitializeNVPTXTarget();
64     LLVMInitializeNVPTXTargetInfo();
65     LLVMInitializeNVPTXTargetMC();
66     LLVMInitializeNVPTXAsmPrinter();
67 
68     auto llvmModule = translateModuleToNVVMIR(module);
69     if (!llvmModule)
70       return signalPassFailure();
71 
72     // Translate the module to CUBIN and attach the result as attribute to the
73     // module.
74     if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
75             *llvmModule, module.getLoc(), *module.getName()))
76       module.setAttr(kCubinAnnotation, cubinAttr);
77     else
78       signalPassFailure();
79   }
80 
81 private:
82   static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
83                                                 Location, StringRef);
84 
85   std::string translateModuleToPtx(llvm::Module &module,
86                                    llvm::TargetMachine &target_machine);
87 
88   /// Converts llvmModule to cubin using the user-provided generator. Location
89   /// is used for error reporting and name is forwarded to the CUBIN generator
90   /// to use in its logging mechanisms.
91   OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
92                                   StringRef name);
93 
94   /// Translates llvmModule to cubin and returns the result as attribute.
95   StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
96                                                  Location loc, StringRef name);
97 
98   CubinGenerator cubinGenerator;
99 };
100 
101 } // anonymous namespace
102 
translateModuleToPtx(llvm::Module & module,llvm::TargetMachine & target_machine)103 std::string GpuKernelToCubinPass::translateModuleToPtx(
104     llvm::Module &module, llvm::TargetMachine &target_machine) {
105   std::string ptx;
106   {
107     llvm::raw_string_ostream stream(ptx);
108     llvm::buffer_ostream pstream(stream);
109     llvm::legacy::PassManager codegen_passes;
110     target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr,
111                                        llvm::CGFT_AssemblyFile);
112     codegen_passes.run(module);
113   }
114 
115   return ptx;
116 }
117 
118 OwnedCubin
compilePtxToCubinForTesting(const std::string & ptx,Location,StringRef)119 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
120                                                   Location, StringRef) {
121   const char data[] = "CUBIN";
122   return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
123 }
124 
convertModuleToCubin(llvm::Module & llvmModule,Location loc,StringRef name)125 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
126                                                       Location loc,
127                                                       StringRef name) {
128   std::unique_ptr<llvm::TargetMachine> targetMachine;
129   {
130     std::string error;
131     // TODO(herhut): Make triple configurable.
132     constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
133     llvm::Triple triple(cudaTriple);
134     const llvm::Target *target =
135         llvm::TargetRegistry::lookupTarget("", triple, error);
136     if (target == nullptr) {
137       emitError(loc, "cannot initialize target triple");
138       return {};
139     }
140     targetMachine.reset(
141         target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
142   }
143 
144   // Set the data layout of the llvm module to match what the ptx target needs.
145   llvmModule.setDataLayout(targetMachine->createDataLayout());
146 
147   auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
148 
149   return cubinGenerator(ptx, loc, name);
150 }
151 
translateGpuModuleToCubinAnnotation(llvm::Module & llvmModule,Location loc,StringRef name)152 StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
153     llvm::Module &llvmModule, Location loc, StringRef name) {
154   auto cubin = convertModuleToCubin(llvmModule, loc, name);
155   if (!cubin)
156     return {};
157   return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
158 }
159 
160 std::unique_ptr<OpPassBase<ModuleOp>>
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator)161 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
162   return std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
163 }
164 
// Register the pass under a test flag so it can be exercised from mlir-opt.
static PassRegistration<GpuKernelToCubinPass>
    pass("test-kernel-to-cubin",
         "Convert all kernel functions to CUDA cubin blobs");
168