//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===// // // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements a pass to convert gpu kernel functions into a // corresponding binary blob that can be executed on a CUDA GPU. Currently // only translates the function itself but no dependencies. // //===----------------------------------------------------------------------===// #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Module.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Target/NVVMIR.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" using namespace mlir; namespace { // TODO(herhut): Move to shared location. static constexpr const char *kCubinAnnotation = "nvvm.cubin"; /// A pass converting tagged kernel modules to cubin blobs. /// /// If tagged as a kernel module, each contained function is translated to NVVM /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to /// GPU binary code, which is then attached as an attribute to the function. The /// function body is erased. class GpuKernelToCubinPass : public ModulePass { public: GpuKernelToCubinPass( CubinGenerator cubinGenerator = compilePtxToCubinForTesting) : cubinGenerator(cubinGenerator) {} void runOnModule() override { ModuleOp module = getModule(); if (!module.getAttrOfType( gpu::GPUDialect::getKernelModuleAttrName()) || !module.getName()) return; // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTargetMC(); LLVMInitializeNVPTXAsmPrinter(); auto llvmModule = translateModuleToNVVMIR(module); if (!llvmModule) return signalPassFailure(); // Translate the module to CUBIN and attach the result as attribute to the // module. if (auto cubinAttr = translateGpuModuleToCubinAnnotation( *llvmModule, module.getLoc(), *module.getName())) module.setAttr(kCubinAnnotation, cubinAttr); else signalPassFailure(); } private: static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx, Location, StringRef); std::string translateModuleToPtx(llvm::Module &module, llvm::TargetMachine &target_machine); /// Converts llvmModule to cubin using the user-provided generator. Location /// is used for error reporting and name is forwarded to the CUBIN generator /// to use in its logging mechanisms. OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc, StringRef name); /// Translates llvmModule to cubin and returns the result as attribute. StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule, Location loc, StringRef name); CubinGenerator cubinGenerator; }; } // anonymous namespace std::string GpuKernelToCubinPass::translateModuleToPtx( llvm::Module &module, llvm::TargetMachine &target_machine) { std::string ptx; { llvm::raw_string_ostream stream(ptx); llvm::buffer_ostream pstream(stream); llvm::legacy::PassManager codegen_passes; target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr, llvm::CGFT_AssemblyFile); codegen_passes.run(module); } return ptx; } OwnedCubin GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx, Location, StringRef) { const char data[] = "CUBIN"; return std::make_unique>(data, data + sizeof(data) - 1); } OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, Location loc, StringRef name) { std::unique_ptr targetMachine; { std::string error; // TODO(herhut): Make triple configurable. constexpr const char *cudaTriple = "nvptx64-nvidia-cuda"; llvm::Triple triple(cudaTriple); const llvm::Target *target = llvm::TargetRegistry::lookupTarget("", triple, error); if (target == nullptr) { emitError(loc, "cannot initialize target triple"); return {}; } targetMachine.reset( target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {})); } // Set the data layout of the llvm module to match what the ptx target needs. llvmModule.setDataLayout(targetMachine->createDataLayout()); auto ptx = translateModuleToPtx(llvmModule, *targetMachine); return cubinGenerator(ptx, loc, name); } StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation( llvm::Module &llvmModule, Location loc, StringRef name) { auto cubin = convertModuleToCubin(llvmModule, loc, name); if (!cubin) return {}; return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext()); } std::unique_ptr> mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique(cubinGenerator); } static PassRegistration pass("test-kernel-to-cubin", "Convert all kernel functions to CUDA cubin blobs");