//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert the gpu.launch_func op into a
// sequence of GPU runtime calls. As most GPU runtimes do not have a stable
// published ABI, this pass uses a slim runtime layer that builds on top of
// the public API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
    if (!gpuBinaryAnnotation.empty())
      this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
  }

  // Run the dialect converter on the module.
  void runOnOperation() override;
};

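/// Helper to build calls to a runtime function with the given name and type,
/// declaring the function in the surrounding module first if no declaration
/// is present yet.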
class FunctionCallBuilder {
public:
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

private:
  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

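  // Builders for calls into the thin runtime layer. The mgpu* entry points
  // are assumed to be provided by a runtime wrapper library that forwards to
  // the CUDA or ROCm (HIP) driver APIs.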
  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name   */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void* f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
};

/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate      -- creates a new compute stream on the GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
                            ArrayRef<Value> operands, OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  OwningRewritePatternList patterns;
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

  LLVMConversionTarget target(getContext());
  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder(module.getBody()->getTerminator())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(
      loc, const_cast<LLVM::LLVMFunctionType &>(functionType).getReturnType(),
      builder.getSymbolRefAttr(function), arguments);
}

// Returns whether all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, operands, rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
                                                       operands, rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();
  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  auto adaptor =
      gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
// streams (i.e. void*). The converted op synchronizes the host with every
// stream and then destroys it. That is, it assumes that the stream is not used
// afterwards. In case this isn't correct, we will get a runtime error.
// Eventually, we will have a pass that guarantees this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto asyncDependency : operands)
    streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
  for (auto asyncDependency : operands)
    streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The result is a new stream that
// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
// void*). We create and record an event after the definition of each stream
// and make the new stream wait on that event before destroying it again. This
// assumes that there is no other use between the definition and this op, and
// the plan is to have a pass that guarantees this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

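  // Save the insertion point: events are recorded right after each
  // dependency's defining op, while the new stream itself is created back
  // here, at the position of the gpu.wait op.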
  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
    auto token = std::get<0>(pair);
    if (auto *defOp = token.getDefiningOp()) {
      rewriter.setInsertionPointAfter(defOp);
    } else {
      // If we can't find the defining op, we record the event at block start,
      // which is late and therefore misses parallelism, but still valid.
      rewriter.setInsertionPointToStart(waitOp->getBlock());
    }
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    auto stream = std::get<1>(pair);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    events.push_back(event);
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      operands.take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
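  // Store each argument into its struct field, then store a type-erased
  // pointer to that field into the kernel parameters array.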
  for (auto en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is no
  // use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
  auto adaptor =
      gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
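  // Reuse the stream of the (single) async dependency if present; otherwise
  // create a fresh stream just for this launch.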
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  launchKernelCallBuilder.create(loc, rewriter,
                                 {function.getResult(0), launchOp.gridSizeX(),
                                  launchOp.gridSizeY(), launchOp.gridSizeZ(),
                                  launchOp.blockSizeX(), launchOp.blockSizeY(),
                                  launchOp.blockSizeZ(),
                                  /*sharedMemBytes=*/zero, stream, kernelParams,
                                  /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the
    // synchronous version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();
  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

  MemRefDescriptor srcDesc(adaptor.src());

  Value numElements =
      memRefType.hasStaticShape()
          ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
          // For identity layouts (verified above), the number of elements is
          // stride[0] * size[0].
          : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
                                         srcDesc.size(rewriter, loc, 0));

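  // Compute the copy size in bytes via the usual null-pointer GEP idiom: the
  // address of element `numElements` relative to a null base is the size.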
  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
  return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
}

void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
                  ConvertDeallocOpToGpuRuntimeCallPattern,
                  ConvertHostRegisterOpToGpuRuntimeCallPattern,
                  ConvertMemcpyOpToGpuRuntimeCallPattern,
                  ConvertWaitAsyncOpToGpuRuntimeCallPattern,
                  ConvertWaitOpToGpuRuntimeCallPattern>(converter);
  patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
      converter, gpuBinaryAnnotation);
  patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
}