1 //===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements the GPU dialect pattern rewriters that make GPU ops
// within a region execute asynchronously.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "PassDetail.h"
15 #include "mlir/Dialect/Async/IR/Async.h"
16 #include "mlir/Dialect/GPU/GPUDialect.h"
17 #include "mlir/Dialect/GPU/Passes.h"
18 #include "mlir/Dialect/GPU/Utils.h"
19 #include "mlir/Dialect/StandardOps/IR/Ops.h"
20 #include "mlir/IR/BlockAndValueMapping.h"
21 #include "mlir/IR/Builders.h"
22 #include "mlir/IR/PatternMatch.h"
23 #include "mlir/IR/SymbolTable.h"
24 #include "mlir/Support/LLVM.h"
25 #include "mlir/Transforms/RegionUtils.h"
26 #include "llvm/ADT/TypeSwitch.h"
27
28 using namespace mlir;
29 namespace {
30 class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
31 struct ThreadTokenCallback;
32 struct DeferWaitCallback;
33 struct SingleTokenUseCallback;
34 void runOnFunction() override;
35 };
36 } // namespace
37
isTerminator(Operation * op)38 static bool isTerminator(Operation *op) {
39 return op->mightHaveTrait<OpTrait::IsTerminator>();
40 }
hasSideEffects(Operation * op)41 static bool hasSideEffects(Operation *op) {
42 return !MemoryEffectOpInterface::hasNoEffect(op);
43 }
44
45 // Region walk callback which makes GPU ops implementing the AsyncOpInterface
46 // execute asynchronously.
47 struct GpuAsyncRegionPass::ThreadTokenCallback {
ThreadTokenCallbackGpuAsyncRegionPass::ThreadTokenCallback48 ThreadTokenCallback(MLIRContext &context) : builder(&context) {}
49
operator ()GpuAsyncRegionPass::ThreadTokenCallback50 WalkResult operator()(Block *block) {
51 for (Operation &op : make_early_inc_range(*block)) {
52 if (failed(visit(&op)))
53 return WalkResult::interrupt();
54 }
55 return WalkResult::advance();
56 }
57
58 private:
59 // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
60 // create a current token (unless it already exists), and 'thread' that token
61 // through the `op` so that it executes asynchronously.
62 //
63 // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
64 // host-synchronize execution. A `!gpu.async.token` will therefore only be
65 // used inside of its block and GPU execution will always synchronize with
66 // the host at block boundaries.
visitGpuAsyncRegionPass::ThreadTokenCallback67 LogicalResult visit(Operation *op) {
68 if (isa<gpu::LaunchOp>(op))
69 return op->emitOpError("replace with gpu.launch_func first");
70 if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
71 if (currentToken)
72 waitOp.addAsyncDependency(currentToken);
73 currentToken = waitOp.asyncToken();
74 return success();
75 }
76 builder.setInsertionPoint(op);
77 if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
78 return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
79 if (!currentToken)
80 return success();
81 // Insert host synchronization before terminator or op with side effects.
82 if (isTerminator(op) || hasSideEffects(op))
83 currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
84 return success();
85 }
86
87 // Replaces asyncOp with a clone that returns a token.
rewriteAsyncOpGpuAsyncRegionPass::ThreadTokenCallback88 LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
89 auto *op = asyncOp.getOperation();
90 auto tokenType = builder.getType<gpu::AsyncTokenType>();
91
92 // If there is no current token, insert a `gpu.wait async` without
93 // dependencies to create one.
94 if (!currentToken)
95 currentToken = createWaitOp(op->getLoc(), tokenType, {});
96 asyncOp.addAsyncDependency(currentToken);
97
98 // Return early if op returns a token already.
99 currentToken = asyncOp.getAsyncToken();
100 if (currentToken)
101 return success();
102
103 // Clone the op to return a token in addition to the other results.
104 SmallVector<Type, 1> resultTypes;
105 resultTypes.reserve(1 + op->getNumResults());
106 copy(op->getResultTypes(), std::back_inserter(resultTypes));
107 resultTypes.push_back(tokenType);
108 auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
109 op->getOperands(), op->getAttrDictionary(),
110 op->getSuccessors(), op->getNumRegions());
111
112 // Clone regions into new op.
113 BlockAndValueMapping mapping;
114 for (auto pair : llvm::zip_first(op->getRegions(), newOp->getRegions()))
115 std::get<0>(pair).cloneInto(&std::get<1>(pair), mapping);
116
117 // Replace the op with the async clone.
118 auto results = newOp->getResults();
119 currentToken = results.back();
120 builder.insert(newOp);
121 op->replaceAllUsesWith(results.drop_back());
122 op->erase();
123
124 return success();
125 }
126
createWaitOpGpuAsyncRegionPass::ThreadTokenCallback127 Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
128 return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
129 }
130
131 OpBuilder builder;
132
133 // The token that represents the current asynchronous dependency. It's valid
134 // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
135 // In between, each gpu::AsyncOpInterface depends on the current token and
136 // produces the new one.
137 Value currentToken = {};
138 };
139
140 /// Erases `executeOp` and returns a clone with additional `results`.
addExecuteResults(async::ExecuteOp executeOp,ValueRange results)141 async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
142 ValueRange results) {
143 // Add values to async.yield op.
144 Operation *yieldOp = executeOp.getBody()->getTerminator();
145 yieldOp->insertOperands(yieldOp->getNumOperands(), results);
146
147 // Construct new result type list with additional types.
148 SmallVector<Type, 2> resultTypes;
149 resultTypes.reserve(executeOp.getNumResults() + results.size());
150 transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
151 [](Type type) {
152 // Extract value type from !async.value.
153 if (auto valueType = type.dyn_cast<async::ValueType>())
154 return valueType.getValueType();
155 assert(type.isa<async::TokenType>() && "expected token type");
156 return type;
157 });
158 transform(results, std::back_inserter(resultTypes),
159 [](Value value) { return value.getType(); });
160
161 // Clone executeOp with the extra results.
162 OpBuilder builder(executeOp);
163 auto newOp = builder.create<async::ExecuteOp>(
164 executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
165 executeOp.dependencies(), executeOp.operands());
166 BlockAndValueMapping mapper;
167 newOp.getRegion().getBlocks().clear();
168 executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
169
170 // Replace executeOp with cloned one.
171 executeOp.getOperation()->replaceAllUsesWith(
172 newOp.getResults().drop_back(results.size()));
173 executeOp.erase();
174
175 return newOp;
176 }
177
178 // Callback for `async.execute` ops which tries to push the contained
179 // synchronous `gpu.wait` op to the dependencies of the `async.execute`.
180 struct GpuAsyncRegionPass::DeferWaitCallback {
181 // If the `executeOp`s token is used only in `async.execute` or `async.await`
182 // ops, add the region's last `gpu.wait` op to the worklist if it is
183 // synchronous and is the last op with side effects.
operator ()GpuAsyncRegionPass::DeferWaitCallback184 void operator()(async::ExecuteOp executeOp) {
185 if (!areAllUsersExecuteOrAwait(executeOp.token()))
186 return;
187 // async.execute's region is currently restricted to one block.
188 for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
189 if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
190 if (!waitOp.asyncToken())
191 worklist.push_back(waitOp);
192 return;
193 }
194 if (hasSideEffects(&op))
195 return;
196 }
197 }
198
199 // The destructor performs the actual rewrite work.
~DeferWaitCallbackGpuAsyncRegionPass::DeferWaitCallback200 ~DeferWaitCallback() {
201 for (size_t i = 0; i < worklist.size(); ++i) {
202 auto waitOp = worklist[i];
203 auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
204
205 // Erase `gpu.wait` and return async dependencies from execute op instead.
206 SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
207 waitOp.erase();
208 executeOp = addExecuteResults(executeOp, dependencies);
209
210 // Add the async dependency to each user of the `async.execute` token.
211 auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
212 for (Operation *user : executeOp.token().getUsers())
213 addAsyncDependencyAfter(asyncTokens, user);
214 }
215 }
216
217 private:
218 // Returns whether all token users are either 'async.execute' or 'async.await'
219 // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
220 // 'async.execute' body to it's users. Specifically, we do not allow
221 // terminator users, because it could mean that the `async.execute` is inside
222 // control flow code.
areAllUsersExecuteOrAwaitGpuAsyncRegionPass::DeferWaitCallback223 static bool areAllUsersExecuteOrAwait(Value token) {
224 return !token.use_empty() &&
225 llvm::all_of(token.getUsers(), [](Operation *user) {
226 return isa<async::ExecuteOp, async::AwaitOp>(user);
227 });
228 }
229
230 // Add the `asyncToken` as dependency as needed after `op`.
addAsyncDependencyAfterGpuAsyncRegionPass::DeferWaitCallback231 void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
232 OpBuilder builder(op->getContext());
233 auto loc = op->getLoc();
234
235 Block::iterator it;
236 SmallVector<Value, 1> tokens;
237 tokens.reserve(asyncTokens.size());
238 TypeSwitch<Operation *>(op)
239 .Case<async::AwaitOp>([&](auto awaitOp) {
240 // Add async.await ops to wait for the !gpu.async.tokens.
241 builder.setInsertionPointAfter(op);
242 for (auto asyncToken : asyncTokens)
243 tokens.push_back(
244 builder.create<async::AwaitOp>(loc, asyncToken).result());
245 // Set `it` after the inserted async.await ops.
246 it = builder.getInsertionPoint();
247 })
248 .Case<async::ExecuteOp>([&](auto executeOp) {
249 // Set `it` to the beginning of the region and add asyncTokens to the
250 // async.execute operands.
251 it = executeOp.getBody()->begin();
252 executeOp.operandsMutable().append(asyncTokens);
253 SmallVector<Type, 1> tokenTypes(
254 asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
255 copy(executeOp.getBody()->addArguments(tokenTypes),
256 std::back_inserter(tokens));
257 });
258
259 // Advance `it` to terminator or op with side-effects.
260 it = std::find_if(it, Block::iterator(), [](Operation &op) {
261 return isTerminator(&op) || hasSideEffects(&op);
262 });
263
264 // If `op` implements the AsyncOpInterface, add `token` to the list of async
265 // dependencies.
266 if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
267 for (auto token : tokens)
268 asyncOp.addAsyncDependency(token);
269 return;
270 }
271
272 // Otherwise, insert a gpu.wait before 'it'.
273 builder.setInsertionPoint(it->getBlock(), it);
274 auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);
275
276 // If the new waitOp is at the end of an async.execute region, add it to the
277 // worklist. 'operator()(executeOp)' would do the same, but this is faster.
278 auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
279 if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
280 !it->getNextNode())
281 worklist.push_back(waitOp);
282 }
283
284 SmallVector<gpu::WaitOp, 8> worklist;
285 };
286
287 // Callback for `async.execute` ops which repeats !gpu.async.token results
288 // so that each of them is only used once.
289 struct GpuAsyncRegionPass::SingleTokenUseCallback {
operator ()GpuAsyncRegionPass::SingleTokenUseCallback290 void operator()(async::ExecuteOp executeOp) {
291 // Extract !gpu.async.token results which have multiple uses.
292 auto multiUseResults =
293 llvm::make_filter_range(executeOp.results(), [](OpResult result) {
294 if (result.use_empty() || result.hasOneUse())
295 return false;
296 auto valueType = result.getType().dyn_cast<async::ValueType>();
297 return valueType &&
298 valueType.getValueType().isa<gpu::AsyncTokenType>();
299 });
300 if (multiUseResults.empty())
301 return;
302
303 // Indices within !async.execute results (i.e. without the async.token).
304 SmallVector<int, 4> indices;
305 transform(multiUseResults, std::back_inserter(indices),
306 [](OpResult result) {
307 return result.getResultNumber() - 1; // Index without token.
308 });
309
310 for (auto index : indices) {
311 assert(!executeOp.results()[index].getUses().empty());
312 // Repeat async.yield token result, one for each use after the first one.
313 auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
314 auto count = std::distance(uses.begin(), uses.end());
315 auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
316 SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
317 executeOp = addExecuteResults(executeOp, operands);
318 // Update 'uses' to refer to the new executeOp.
319 uses = llvm::drop_begin(executeOp.results()[index].getUses());
320 auto results = executeOp.results().take_back(count);
321 for (auto pair : llvm::zip(uses, results))
322 std::get<0>(pair).set(std::get<1>(pair));
323 }
324 }
325 };
326
327 // Replaces synchronous GPU ops in the op's region with asynchronous ones and
328 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
329 // execution semantics and that no GPU ops are asynchronous yet.
runOnFunction()330 void GpuAsyncRegionPass::runOnFunction() {
331 if (getFunction()->walk(ThreadTokenCallback(getContext())).wasInterrupted())
332 return signalPassFailure();
333
334 // Collect gpu.wait ops that we can move out of async.execute regions.
335 getFunction().getRegion().walk(DeferWaitCallback());
336 // Makes each !gpu.async.token returned from async.execute op have single use.
337 getFunction().getRegion().walk(SingleTokenUseCallback());
338 }
339
createGpuAsyncRegionPass()340 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
341 return std::make_unique<GpuAsyncRegionPass>();
342 }
343