1 //===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements a pass to convert gpu.launch_func op into a sequence of
// GPU runtime calls. As most GPU runtimes do not have a stable published
// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
17
18 #include "../PassDetail.h"
19 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
20 #include "mlir/Dialect/GPU/GPUDialect.h"
21 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
22 #include "mlir/IR/Attributes.h"
23 #include "mlir/IR/Builders.h"
24 #include "mlir/IR/BuiltinOps.h"
25 #include "mlir/IR/BuiltinTypes.h"
26
27 #include "llvm/ADT/STLExtras.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Error.h"
33 #include "llvm/Support/FormatVariadic.h"
34
35 using namespace mlir;
36
/// Suffix appended to a GPU module's name to form the symbol name of the LLVM
/// global that holds the module's binary blob.
static constexpr const char kGpuBinaryStorageSuffix[] = "_gpubin_cst";
38
39 namespace {
40
41 class GpuToLLVMConversionPass
42 : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
43 public:
GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation)44 GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
45 if (!gpuBinaryAnnotation.empty())
46 this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
47 }
48
49 // Run the dialect converter on the module.
50 void runOnOperation() override;
51 };
52
53 class FunctionCallBuilder {
54 public:
FunctionCallBuilder(StringRef functionName,Type returnType,ArrayRef<Type> argumentTypes)55 FunctionCallBuilder(StringRef functionName, Type returnType,
56 ArrayRef<Type> argumentTypes)
57 : functionName(functionName),
58 functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
59 LLVM::CallOp create(Location loc, OpBuilder &builder,
60 ArrayRef<Value> arguments) const;
61
62 private:
63 StringRef functionName;
64 LLVM::LLVMFunctionType functionType;
65 };
66
67 template <typename OpTy>
68 class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
69 public:
ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)70 explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
71 : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}
72
73 protected:
74 MLIRContext *context = &this->getTypeConverter()->getContext();
75
76 Type llvmVoidType = LLVM::LLVMVoidType::get(context);
77 Type llvmPointerType =
78 LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
79 Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
80 Type llvmInt8Type = IntegerType::get(context, 8);
81 Type llvmInt32Type = IntegerType::get(context, 32);
82 Type llvmInt64Type = IntegerType::get(context, 64);
83 Type llvmIntPtrType = IntegerType::get(
84 context, this->getTypeConverter()->getPointerBitwidth(0));
85
86 FunctionCallBuilder moduleLoadCallBuilder = {
87 "mgpuModuleLoad",
88 llvmPointerType /* void *module */,
89 {llvmPointerType /* void *cubin */}};
90 FunctionCallBuilder moduleUnloadCallBuilder = {
91 "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
92 FunctionCallBuilder moduleGetFunctionCallBuilder = {
93 "mgpuModuleGetFunction",
94 llvmPointerType /* void *function */,
95 {
96 llvmPointerType, /* void *module */
97 llvmPointerType /* char *name */
98 }};
99 FunctionCallBuilder launchKernelCallBuilder = {
100 "mgpuLaunchKernel",
101 llvmVoidType,
102 {
103 llvmPointerType, /* void* f */
104 llvmIntPtrType, /* intptr_t gridXDim */
105 llvmIntPtrType, /* intptr_t gridyDim */
106 llvmIntPtrType, /* intptr_t gridZDim */
107 llvmIntPtrType, /* intptr_t blockXDim */
108 llvmIntPtrType, /* intptr_t blockYDim */
109 llvmIntPtrType, /* intptr_t blockZDim */
110 llvmInt32Type, /* unsigned int sharedMemBytes */
111 llvmPointerType, /* void *hstream */
112 llvmPointerPointerType, /* void **kernelParams */
113 llvmPointerPointerType /* void **extra */
114 }};
115 FunctionCallBuilder streamCreateCallBuilder = {
116 "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
117 FunctionCallBuilder streamDestroyCallBuilder = {
118 "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
119 FunctionCallBuilder streamSynchronizeCallBuilder = {
120 "mgpuStreamSynchronize",
121 llvmVoidType,
122 {llvmPointerType /* void *stream */}};
123 FunctionCallBuilder streamWaitEventCallBuilder = {
124 "mgpuStreamWaitEvent",
125 llvmVoidType,
126 {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
127 FunctionCallBuilder eventCreateCallBuilder = {
128 "mgpuEventCreate", llvmPointerType /* void *event */, {}};
129 FunctionCallBuilder eventDestroyCallBuilder = {
130 "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
131 FunctionCallBuilder eventSynchronizeCallBuilder = {
132 "mgpuEventSynchronize",
133 llvmVoidType,
134 {llvmPointerType /* void *event */}};
135 FunctionCallBuilder eventRecordCallBuilder = {
136 "mgpuEventRecord",
137 llvmVoidType,
138 {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
139 FunctionCallBuilder hostRegisterCallBuilder = {
140 "mgpuMemHostRegisterMemRef",
141 llvmVoidType,
142 {llvmIntPtrType /* intptr_t rank */,
143 llvmPointerType /* void *memrefDesc */,
144 llvmIntPtrType /* intptr_t elementSizeBytes */}};
145 FunctionCallBuilder allocCallBuilder = {
146 "mgpuMemAlloc",
147 llvmPointerType /* void * */,
148 {llvmIntPtrType /* intptr_t sizeBytes */,
149 llvmPointerType /* void *stream */}};
150 FunctionCallBuilder deallocCallBuilder = {
151 "mgpuMemFree",
152 llvmVoidType,
153 {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
154 FunctionCallBuilder memcpyCallBuilder = {
155 "mgpuMemcpy",
156 llvmVoidType,
157 {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
158 llvmIntPtrType /* intptr_t sizeBytes */,
159 llvmPointerType /* void *stream */}};
160 };
161
162 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
163 /// call. Currently it supports CUDA and ROCm (HIP).
164 class ConvertHostRegisterOpToGpuRuntimeCallPattern
165 : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
166 public:
ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)167 ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
168 : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}
169
170 private:
171 LogicalResult
172 matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
173 ConversionPatternRewriter &rewriter) const override;
174 };
175
176 /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
177 /// call. Currently it supports CUDA and ROCm (HIP).
178 class ConvertAllocOpToGpuRuntimeCallPattern
179 : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
180 public:
ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)181 ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
182 : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}
183
184 private:
185 LogicalResult
186 matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
187 ConversionPatternRewriter &rewriter) const override;
188 };
189
190 /// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
191 /// call. Currently it supports CUDA and ROCm (HIP).
192 class ConvertDeallocOpToGpuRuntimeCallPattern
193 : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
194 public:
ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)195 ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
196 : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}
197
198 private:
199 LogicalResult
200 matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
201 ConversionPatternRewriter &rewriter) const override;
202 };
203
204 /// A rewrite pattern to convert gpu.wait operations into a GPU runtime
205 /// call. Currently it supports CUDA and ROCm (HIP).
206 class ConvertWaitOpToGpuRuntimeCallPattern
207 : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
208 public:
ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)209 ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
210 : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}
211
212 private:
213 LogicalResult
214 matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
215 ConversionPatternRewriter &rewriter) const override;
216 };
217
218 /// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
219 /// call. Currently it supports CUDA and ROCm (HIP).
220 class ConvertWaitAsyncOpToGpuRuntimeCallPattern
221 : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
222 public:
ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)223 ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
224 : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}
225
226 private:
227 LogicalResult
228 matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
229 ConversionPatternRewriter &rewriter) const override;
230 };
231
232 /// A rewrite patter to convert gpu.launch_func operations into a sequence of
233 /// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
234 ///
235 /// In essence, a gpu.launch_func operations gets compiled into the following
236 /// sequence of runtime calls:
237 ///
238 /// * moduleLoad -- loads the module given the cubin / hsaco data
239 /// * moduleGetFunction -- gets a handle to the actual kernel function
240 /// * getStreamHelper -- initializes a new compute stream on GPU
241 /// * launchKernel -- launches the kernel on a stream
242 /// * streamSynchronize -- waits for operations on the stream to finish
243 ///
244 /// Intermediate data structures are allocated on the stack.
245 class ConvertLaunchFuncOpToGpuRuntimeCallPattern
246 : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
247 public:
ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter,StringRef gpuBinaryAnnotation)248 ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
249 StringRef gpuBinaryAnnotation)
250 : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
251 gpuBinaryAnnotation(gpuBinaryAnnotation) {}
252
253 private:
254 Value generateParamsArray(gpu::LaunchFuncOp launchOp,
255 ArrayRef<Value> operands, OpBuilder &builder) const;
256 Value generateKernelNameConstant(StringRef moduleName, StringRef name,
257 Location loc, OpBuilder &builder) const;
258
259 LogicalResult
260 matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
261 ConversionPatternRewriter &rewriter) const override;
262
263 llvm::SmallString<32> gpuBinaryAnnotation;
264 };
265
266 class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
267 using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
268
matchAndRewrite(gpu::GPUModuleOp op,PatternRewriter & rewriter) const269 LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
270 PatternRewriter &rewriter) const override {
271 // GPU kernel modules are no longer necessary since we have a global
272 // constant with the CUBIN, or HSACO data.
273 rewriter.eraseOp(op);
274 return success();
275 }
276 };
277
278 /// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
279 /// call. Currently it supports CUDA and ROCm (HIP).
280 class ConvertMemcpyOpToGpuRuntimeCallPattern
281 : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
282 public:
ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter & typeConverter)283 ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
284 : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}
285
286 private:
287 LogicalResult
288 matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
289 ConversionPatternRewriter &rewriter) const override;
290 };
291 } // namespace
292
runOnOperation()293 void GpuToLLVMConversionPass::runOnOperation() {
294 LLVMTypeConverter converter(&getContext());
295 OwningRewritePatternList patterns;
296 populateStdToLLVMConversionPatterns(converter, patterns);
297 populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
298
299 LLVMConversionTarget target(getContext());
300 if (failed(
301 applyPartialConversion(getOperation(), target, std::move(patterns))))
302 signalPassFailure();
303 }
304
create(Location loc,OpBuilder & builder,ArrayRef<Value> arguments) const305 LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
306 ArrayRef<Value> arguments) const {
307 auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
308 auto function = [&] {
309 if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
310 return function;
311 return OpBuilder(module.getBody()->getTerminator())
312 .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
313 }();
314 return builder.create<LLVM::CallOp>(
315 loc, const_cast<LLVM::LLVMFunctionType &>(functionType).getReturnType(),
316 builder.getSymbolRefAttr(function), arguments);
317 }
318
319 // Returns whether all operands are of LLVM type.
areAllLLVMTypes(Operation * op,ValueRange operands,ConversionPatternRewriter & rewriter)320 static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
321 ConversionPatternRewriter &rewriter) {
322 if (!llvm::all_of(operands, [](Value value) {
323 return LLVM::isCompatibleType(value.getType());
324 }))
325 return rewriter.notifyMatchFailure(
326 op, "Cannot convert if operands aren't of LLVM type.");
327 return success();
328 }
329
330 static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter & rewriter,gpu::AsyncOpInterface op)331 isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
332 gpu::AsyncOpInterface op) {
333 if (op.getAsyncDependencies().size() != 1)
334 return rewriter.notifyMatchFailure(
335 op, "Can only convert with exactly one async dependency.");
336
337 if (!op.getAsyncToken())
338 return rewriter.notifyMatchFailure(op, "Can convert only async version.");
339
340 return success();
341 }
342
matchAndRewrite(gpu::HostRegisterOp hostRegisterOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const343 LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
344 gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
345 ConversionPatternRewriter &rewriter) const {
346 auto *op = hostRegisterOp.getOperation();
347 if (failed(areAllLLVMTypes(op, operands, rewriter)))
348 return failure();
349
350 Location loc = op->getLoc();
351
352 auto memRefType = hostRegisterOp.value().getType();
353 auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
354 auto elementSize = getSizeInBytes(loc, elementType, rewriter);
355
356 auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
357 operands, rewriter);
358 arguments.push_back(elementSize);
359 hostRegisterCallBuilder.create(loc, rewriter, arguments);
360
361 rewriter.eraseOp(op);
362 return success();
363 }
364
matchAndRewrite(gpu::AllocOp allocOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const365 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
366 gpu::AllocOp allocOp, ArrayRef<Value> operands,
367 ConversionPatternRewriter &rewriter) const {
368 MemRefType memRefType = allocOp.getType();
369
370 if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
371 !isConvertibleAndHasIdentityMaps(memRefType) ||
372 failed(isAsyncWithOneDependency(rewriter, allocOp)))
373 return failure();
374
375 auto loc = allocOp.getLoc();
376 auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());
377
378 // Get shape of the memref as values: static sizes are constant
379 // values and dynamic sizes are passed to 'alloc' as operands.
380 SmallVector<Value, 4> shape;
381 SmallVector<Value, 4> strides;
382 Value sizeBytes;
383 getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
384 shape, strides, sizeBytes);
385
386 // Allocate the underlying buffer and store a pointer to it in the MemRef
387 // descriptor.
388 Type elementPtrType = this->getElementPtrType(memRefType);
389 auto stream = adaptor.asyncDependencies().front();
390 Value allocatedPtr =
391 allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
392 allocatedPtr =
393 rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
394
395 // No alignment.
396 Value alignedPtr = allocatedPtr;
397
398 // Create the MemRef descriptor.
399 auto memRefDescriptor = this->createMemRefDescriptor(
400 loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);
401
402 rewriter.replaceOp(allocOp, {memRefDescriptor, stream});
403
404 return success();
405 }
406
matchAndRewrite(gpu::DeallocOp deallocOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const407 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
408 gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
409 ConversionPatternRewriter &rewriter) const {
410 if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
411 failed(isAsyncWithOneDependency(rewriter, deallocOp)))
412 return failure();
413
414 Location loc = deallocOp.getLoc();
415
416 auto adaptor =
417 gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
418 Value pointer =
419 MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
420 auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
421 Value stream = adaptor.asyncDependencies().front();
422 deallocCallBuilder.create(loc, rewriter, {casted, stream});
423
424 rewriter.replaceOp(deallocOp, {stream});
425 return success();
426 }
427
428 // Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
429 // streams (i.e. void*). The converted op synchronizes the host with every
430 // stream and then destroys it. That is, it assumes that the stream is not used
431 // afterwards. In case this isn't correct, we will get a runtime error.
432 // Eventually, we will have a pass that guarantees this property.
matchAndRewrite(gpu::WaitOp waitOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const433 LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
434 gpu::WaitOp waitOp, ArrayRef<Value> operands,
435 ConversionPatternRewriter &rewriter) const {
436 if (waitOp.asyncToken())
437 return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");
438
439 Location loc = waitOp.getLoc();
440
441 for (auto asyncDependency : operands)
442 streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
443 for (auto asyncDependency : operands)
444 streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
445
446 rewriter.eraseOp(waitOp);
447 return success();
448 }
449
450 // Converts `gpu.wait async` to runtime calls. The result is a new stream that
451 // is synchronized with all operands, which are CUDA or ROCm streams (i.e.
452 // void*). We create and record an event after the definition of the stream
453 // and make the new stream wait on that event before destroying it again. This
454 // assumes that there is no other use between the definition and this op, and
455 // the plan is to have a pass that guarantees this property.
matchAndRewrite(gpu::WaitOp waitOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const456 LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
457 gpu::WaitOp waitOp, ArrayRef<Value> operands,
458 ConversionPatternRewriter &rewriter) const {
459 if (!waitOp.asyncToken())
460 return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");
461
462 Location loc = waitOp.getLoc();
463
464 auto insertionPoint = rewriter.saveInsertionPoint();
465 SmallVector<Value, 1> events;
466 for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
467 auto token = std::get<0>(pair);
468 if (auto *defOp = token.getDefiningOp()) {
469 rewriter.setInsertionPointAfter(defOp);
470 } else {
471 // If we can't find the defining op, we record the event at block start,
472 // which is late and therefore misses parallelism, but still valid.
473 rewriter.setInsertionPointToStart(waitOp->getBlock());
474 }
475 auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
476 auto stream = std::get<1>(pair);
477 eventRecordCallBuilder.create(loc, rewriter, {event, stream});
478 events.push_back(event);
479 }
480 rewriter.restoreInsertionPoint(insertionPoint);
481 auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
482 for (auto event : events)
483 streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
484 for (auto event : events)
485 eventDestroyCallBuilder.create(loc, rewriter, {event});
486 rewriter.replaceOp(waitOp, {stream});
487
488 return success();
489 }
490
491 // Creates a struct containing all kernel parameters on the stack and returns
492 // an array of type-erased pointers to the fields of the struct. The array can
493 // then be passed to the CUDA / ROCm (HIP) kernel launch calls.
494 // The generated code is essentially as follows:
495 //
496 // %struct = alloca(sizeof(struct { Parameters... }))
497 // %array = alloca(NumParameters * sizeof(void *))
498 // for (i : [0, NumParameters))
499 // %fieldPtr = llvm.getelementptr %struct[0, i]
500 // llvm.store parameters[i], %fieldPtr
501 // %elementPtr = llvm.getelementptr %array[i]
502 // llvm.store %fieldPtr, %elementPtr
503 // return %array
generateParamsArray(gpu::LaunchFuncOp launchOp,ArrayRef<Value> operands,OpBuilder & builder) const504 Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
505 gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
506 OpBuilder &builder) const {
507 auto loc = launchOp.getLoc();
508 auto numKernelOperands = launchOp.getNumKernelOperands();
509 auto arguments = getTypeConverter()->promoteOperands(
510 loc, launchOp.getOperands().take_back(numKernelOperands),
511 operands.take_back(numKernelOperands), builder);
512 auto numArguments = arguments.size();
513 SmallVector<Type, 4> argumentTypes;
514 argumentTypes.reserve(numArguments);
515 for (auto argument : arguments)
516 argumentTypes.push_back(argument.getType());
517 auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
518 argumentTypes);
519 auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
520 builder.getI32IntegerAttr(1));
521 auto structPtr = builder.create<LLVM::AllocaOp>(
522 loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
523 auto arraySize = builder.create<LLVM::ConstantOp>(
524 loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
525 auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
526 arraySize, /*alignment=*/0);
527 auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
528 builder.getI32IntegerAttr(0));
529 for (auto en : llvm::enumerate(arguments)) {
530 auto index = builder.create<LLVM::ConstantOp>(
531 loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
532 auto fieldPtr = builder.create<LLVM::GEPOp>(
533 loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
534 ArrayRef<Value>{zero, index.getResult()});
535 builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
536 auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
537 arrayPtr, index.getResult());
538 auto casted =
539 builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
540 builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
541 }
542 return arrayPtr;
543 }
544
545 // Generates an LLVM IR dialect global that contains the name of the given
546 // kernel function as a C string, and returns a pointer to its beginning.
547 // The code is essentially:
548 //
549 // llvm.global constant @kernel_name("function_name\00")
550 // func(...) {
551 // %0 = llvm.addressof @kernel_name
552 // %1 = llvm.constant (0 : index)
553 // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
554 // }
generateKernelNameConstant(StringRef moduleName,StringRef name,Location loc,OpBuilder & builder) const555 Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
556 StringRef moduleName, StringRef name, Location loc,
557 OpBuilder &builder) const {
558 // Make sure the trailing zero is included in the constant.
559 std::vector<char> kernelName(name.begin(), name.end());
560 kernelName.push_back('\0');
561
562 std::string globalName =
563 std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
564 return LLVM::createGlobalString(
565 loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
566 LLVM::Linkage::Internal);
567 }
568
569 // Emits LLVM IR to launch a kernel function. Expects the module that contains
570 // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
571 // hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
572 //
573 // %0 = call %binarygetter
574 // %1 = call %moduleLoad(%0)
575 // %2 = <see generateKernelNameConstant>
576 // %3 = call %moduleGetFunction(%1, %2)
577 // %4 = call %streamCreate()
578 // %5 = <see generateParamsArray>
579 // call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
580 // call %streamSynchronize(%4)
581 // call %streamDestroy(%4)
582 // call %moduleUnload(%1)
583 //
584 // If the op is async, the stream corresponds to the (single) async dependency
585 // as well as the async token the op produces.
matchAndRewrite(gpu::LaunchFuncOp launchOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const586 LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
587 gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
588 ConversionPatternRewriter &rewriter) const {
589 if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
590 return failure();
591
592 if (launchOp.asyncDependencies().size() > 1)
593 return rewriter.notifyMatchFailure(
594 launchOp, "Cannot convert with more than one async dependency.");
595
596 // Fail when the synchronous version of the op has async dependencies. The
597 // lowering destroys the stream, and we do not want to check that there is no
598 // use of the stream after this op.
599 if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
600 return rewriter.notifyMatchFailure(
601 launchOp, "Cannot convert non-async op with async dependencies.");
602
603 Location loc = launchOp.getLoc();
604
605 // Create an LLVM global with CUBIN extracted from the kernel annotation and
606 // obtain a pointer to the first byte in it.
607 auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
608 launchOp, launchOp.getKernelModuleName());
609 assert(kernelModule && "expected a kernel module");
610
611 auto binaryAttr =
612 kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
613 if (!binaryAttr) {
614 kernelModule.emitOpError()
615 << "missing " << gpuBinaryAnnotation << " attribute";
616 return failure();
617 }
618
619 SmallString<128> nameBuffer(kernelModule.getName());
620 nameBuffer.append(kGpuBinaryStorageSuffix);
621 Value data =
622 LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
623 binaryAttr.getValue(), LLVM::Linkage::Internal);
624
625 auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
626 // Get the function from the module. The name corresponds to the name of
627 // the kernel function.
628 auto kernelName = generateKernelNameConstant(
629 launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
630 auto function = moduleGetFunctionCallBuilder.create(
631 loc, rewriter, {module.getResult(0), kernelName});
632 auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
633 rewriter.getI32IntegerAttr(0));
634 auto adaptor =
635 gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
636 Value stream =
637 adaptor.asyncDependencies().empty()
638 ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
639 : adaptor.asyncDependencies().front();
640 // Create array of pointers to kernel arguments.
641 auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
642 auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
643 launchKernelCallBuilder.create(loc, rewriter,
644 {function.getResult(0), launchOp.gridSizeX(),
645 launchOp.gridSizeY(), launchOp.gridSizeZ(),
646 launchOp.blockSizeX(), launchOp.blockSizeY(),
647 launchOp.blockSizeZ(),
648 /*sharedMemBytes=*/zero, stream, kernelParams,
649 /*extra=*/nullpointer});
650
651 if (launchOp.asyncToken()) {
652 // Async launch: make dependent ops use the same stream.
653 rewriter.replaceOp(launchOp, {stream});
654 } else {
655 // Synchronize with host and destroy stream. This must be the stream created
656 // above (with no other uses) because we check that the synchronous version
657 // does not have any async dependencies.
658 streamSynchronizeCallBuilder.create(loc, rewriter, stream);
659 streamDestroyCallBuilder.create(loc, rewriter, stream);
660 rewriter.eraseOp(launchOp);
661 }
662 moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));
663
664 return success();
665 }
666
matchAndRewrite(gpu::MemcpyOp memcpyOp,ArrayRef<Value> operands,ConversionPatternRewriter & rewriter) const667 LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
668 gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
669 ConversionPatternRewriter &rewriter) const {
670 auto memRefType = memcpyOp.src().getType().cast<MemRefType>();
671
672 if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
673 !isConvertibleAndHasIdentityMaps(memRefType) ||
674 failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
675 return failure();
676
677 auto loc = memcpyOp.getLoc();
678 auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());
679
680 MemRefDescriptor srcDesc(adaptor.src());
681
682 Value numElements =
683 memRefType.hasStaticShape()
684 ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
685 // For identity layouts (verified above), the number of elements is
686 // stride[0] * size[0].
687 : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
688 srcDesc.size(rewriter, loc, 0));
689
690 Type elementPtrType = getElementPtrType(memRefType);
691 Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
692 Value gepPtr = rewriter.create<LLVM::GEPOp>(
693 loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
694 auto sizeBytes =
695 rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);
696
697 auto src = rewriter.create<LLVM::BitcastOp>(
698 loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
699 auto dst = rewriter.create<LLVM::BitcastOp>(
700 loc, llvmPointerType,
701 MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));
702
703 auto stream = adaptor.asyncDependencies().front();
704 memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});
705
706 rewriter.replaceOp(memcpyOp, {stream});
707
708 return success();
709 }
710
711 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation)712 mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
713 return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
714 }
715
populateGpuToLLVMConversionPatterns(LLVMTypeConverter & converter,OwningRewritePatternList & patterns,StringRef gpuBinaryAnnotation)716 void mlir::populateGpuToLLVMConversionPatterns(
717 LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
718 StringRef gpuBinaryAnnotation) {
719 converter.addConversion(
720 [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
721 return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
722 });
723 patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
724 ConvertDeallocOpToGpuRuntimeCallPattern,
725 ConvertHostRegisterOpToGpuRuntimeCallPattern,
726 ConvertMemcpyOpToGpuRuntimeCallPattern,
727 ConvertWaitAsyncOpToGpuRuntimeCallPattern,
728 ConvertWaitOpToGpuRuntimeCallPattern>(converter);
729 patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
730 converter, gpuBinaryAnnotation);
731 patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
732 }
733