//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Transforms/LoopUtils.h"

#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/RegionUtils.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "LoopUtils"

using namespace mlir;
using llvm::SmallMapVector;

namespace {
// This structure is used to pass and return sets of loop parameters without
// confusing their order.
struct LoopParams {
  Value lowerBound;
  Value upperBound;
  Value step;
};
} // namespace

/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
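/// As an illustrative sketch (not part of the original comment): unrolling
/// `affine.for %i = 0 to 39` (unit step) by a factor of 4 gives a trip count
/// of 39, so the cleanup lower bound is lb + (tr - tr % 4) = 0 + (39 - 3) =
/// 36; the main unrolled loop then covers [0, 36) and the cleanup loop
/// covers [36, 39).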
static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                                     AffineMap &map,
                                     SmallVectorImpl<Value> &operands) {
  auto lbMap = forOp.getLowerBoundMap();

  // Single result lower bound map only.
  if (lbMap.getNumResults() != 1) {
    map = AffineMap();
    return;
  }

  AffineMap tripCountMap;
  SmallVector<Value, 4> tripCountOperands;
  buildTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);

  // Sometimes the trip count cannot be expressed as an affine expression.
  if (!tripCountMap) {
    map = AffineMap();
    return;
  }

  OpBuilder b(forOp);
  auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
                                    forOp.getLowerBoundOperands());

  // For each upper bound expr, get the range.
  // Eg: affine.for %i = lb to min (ub1, ub2),
  // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
  // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
  // these affine.apply's make up the cleanup loop lower bound.
  SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
  SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
  int64_t step = forOp.getStep();
  for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
    auto tripCountExpr = tripCountMap.getResult(i);
    bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
    auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
                                  tripCountMap.getNumSymbols(), bumpExprs[i]);
    bumpValues[i] =
        b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
  }

  SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
  for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
    newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);

  operands.clear();
  operands.push_back(lb);
  operands.append(bumpValues.begin(), bumpValues.end());
  map = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
                       b.getContext());
  // Simplify the map + operands.
  fullyComposeAffineMapAndOperands(&map, &operands);
  map = simplifyAffineMap(map);
  canonicalizeMapAndOperands(&map, &operands);
  // Remove any affine.apply's that became dead from the simplification above.
  for (auto v : bumpValues)
    if (v.use_empty())
      v.getDefiningOp()->erase();

  if (lb.use_empty())
    lb.erase();
}

// Build the IR that performs ceil division of a positive value by a constant:
//    ceildiv(a, B) = divis(a + (B-1), B)
// where divis is rounding-to-zero division.
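// For example, ceildiv(10, 4) = divis(10 + 3, 4) = 3; a worked instance of
// the identity above, not part of the original comment.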
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
                             int64_t divisor) {
  assert(divisor > 0 && "expected positive divisor");
  assert(dividend.getType().isIndex() && "expected index-typed value");

  Value divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
  Value divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
  Value sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
  return builder.create<SignedDivIOp>(loc, sum, divisorCst);
}

// Build the IR that performs ceil division of a positive value by another
// positive value:
//    ceildiv(a, b) = divis(a + (b - 1), b)
// where divis is rounding-to-zero division.
static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
                             Value divisor) {
  assert(dividend.getType().isIndex() && "expected index-typed value");

  Value cstOne = builder.create<ConstantIndexOp>(loc, 1);
  Value divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
  Value sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
  return builder.create<SignedDivIOp>(loc, sum, divisor);
}

/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration affine.for and scf.for ops.
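/// For instance (an illustrative sketch, not from the original source), given
///   %r = scf.for %i = %c0 to %c1 step %c1 iter_args(%acc = %init) -> (f32) {
///     ...
///     scf.yield %v : f32
///   }
/// this helper replaces uses of %acc with %init and uses of %r with %v.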
template <typename AffineOrSCFForOp>
static void replaceIterArgsAndYieldResults(AffineOrSCFForOp forOp) {
  static_assert(
      llvm::is_one_of<AffineOrSCFForOp, AffineForOp, scf::ForOp>::value,
      "only for affine.for and scf.for ops");
  // Replace uses of iter arguments with iter operands (initial values).
  auto iterOperands = forOp.getIterOperands();
  auto iterArgs = forOp.getRegionIterArgs();
  for (auto e : llvm::zip(iterOperands, iterArgs))
    std::get<1>(e).replaceAllUsesWith(std::get<0>(e));

  // Replace uses of loop results with the values yielded by the loop.
  auto outerResults = forOp.getResults();
  auto innerResults = forOp.getBody()->getTerminator()->getOperands();
  for (auto e : llvm::zip(outerResults, innerResults))
    std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}

/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
// TODO: extend this for arbitrary affine bounds.
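// For example (illustrative only), `affine.for %i = 5 to 6 { "foo"(%i) }`
// is promoted to a plain `"foo"(%c5)` in the enclosing block, where %c5 is a
// constant index 5 hoisted to the top of the function.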
LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
  Optional<uint64_t> tripCount = getConstantTripCount(forOp);
  if (!tripCount || tripCount.getValue() != 1)
    return failure();

  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // Replace all uses of the IV with its single iteration value.
  auto iv = forOp.getInductionVar();
  auto *parentBlock = forOp->getBlock();
  if (!iv.use_empty()) {
    if (forOp.hasConstantLowerBound()) {
      OpBuilder topBuilder(forOp->getParentOfType<FuncOp>().getBody());
      auto constOp = topBuilder.create<ConstantIndexOp>(
          forOp.getLoc(), forOp.getConstantLowerBound());
      iv.replaceAllUsesWith(constOp);
    } else {
      auto lbOperands = forOp.getLowerBoundOperands();
      auto lbMap = forOp.getLowerBoundMap();
      OpBuilder builder(parentBlock, Block::iterator(forOp));
      if (lbMap == builder.getDimIdentityMap()) {
        // No need of generating an affine.apply.
        iv.replaceAllUsesWith(lbOperands[0]);
      } else {
        auto affineApplyOp =
            builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
        iv.replaceAllUsesWith(affineApplyOp);
      }
    }
  }

  replaceIterArgsAndYieldResults(forOp);

  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  forOp.getBody()->back().erase();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}

/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
  auto lbCstOp = forOp.lowerBound().getDefiningOp<ConstantIndexOp>();
  auto ubCstOp = forOp.upperBound().getDefiningOp<ConstantIndexOp>();
  auto stepCstOp = forOp.step().getDefiningOp<ConstantIndexOp>();
  if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.getValue() < 0 ||
      ubCstOp.getValue() < 0 || stepCstOp.getValue() < 0)
    return failure();
  int64_t tripCount = mlir::ceilDiv(ubCstOp.getValue() - lbCstOp.getValue(),
                                    stepCstOp.getValue());
  if (tripCount != 1)
    return failure();
  auto iv = forOp.getInductionVar();
  iv.replaceAllUsesWith(lbCstOp);

  replaceIterArgsAndYieldResults(forOp);

  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  auto *parentBlock = forOp->getBlock();
  forOp.getBody()->getTerminator()->erase();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}

/// Promotes all single iteration 'for' ops in `f`, i.e., moves
/// their body into the containing Block.
void mlir::promoteSingleIterationLoops(FuncOp f) {
  // Promote any single iteration loops found through a post order walk.
  f.walk([](Operation *op) {
    if (auto forOp = dyn_cast<AffineForOp>(op))
      (void)promoteIfSingleIteration(forOp);
    else if (auto forOp = dyn_cast<scf::ForOp>(op))
      (void)promoteIfSingleIteration(forOp);
  });
}

/// Generates an affine.for op with the specified lower and upper bounds
/// while generating the right IV remappings to realize shifts for operations in
/// its body. The operations that go into the loop body are specified in
/// opGroupQueue starting from the specified offset, and in that order. The
/// first element of the pair specifies the shift applied to that group of
/// operations; the shift is multiplied by the loop step before being applied.
/// Returns a null AffineForOp if the generated loop simplifies away to a
/// single iteration one.
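/// For example (an illustrative sketch): with a loop step of 2 and a group
/// shift of 3, the IV remapping below is remappedIV = newIV - 3 * 2, i.e. the
/// cloned operations see an IV lagging six behind the new loop's IV.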
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) {
  auto lbOperands = srcForOp.getLowerBoundOperands();
  auto ubOperands = srcForOp.getUpperBoundOperands();

  assert(lbMap.getNumInputs() == lbOperands.size());
  assert(ubMap.getNumInputs() == ubOperands.size());

  auto loopChunk = b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap,
                                         ubOperands, ubMap, srcForOp.getStep());
  auto loopChunkIV = loopChunk.getInductionVar();
  auto srcIV = srcForOp.getInductionVar();

  BlockAndValueMapping operandMap;

  auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
  for (auto it = opGroupQueue.begin() + offset, e = opGroupQueue.end(); it != e;
       ++it) {
    uint64_t shift = it->first;
    auto ops = it->second;
    // All 'same shift' operations get added with their operands being
    // remapped to results of cloned operations, and their IV used remapped.
    // Generate the remapping if the shift is not zero: remappedIV = newIV -
    // shift.
    if (!srcIV.use_empty() && shift != 0) {
      auto ivRemap = bodyBuilder.create<AffineApplyOp>(
          srcForOp.getLoc(),
          bodyBuilder.getSingleDimShiftAffineMap(
              -static_cast<int64_t>(srcForOp.getStep() * shift)),
          loopChunkIV);
      operandMap.map(srcIV, ivRemap);
    } else {
      operandMap.map(srcIV, loopChunkIV);
    }
    for (auto *op : ops)
      bodyBuilder.clone(*op, operandMap);
  }
  if (succeeded(promoteIfSingleIteration(loopChunk)))
    return AffineForOp();
  return loopChunk;
}


// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop distribution/fission.
// This method uses an algorithm that runs in time linear in the number of
// operations in the body of the for loop (using the 'sweep line' paradigm).
// This method asserts preservation of SSA dominance. Checking that
// memory-based dependences are preserved as well rests with the users of this
// method.
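// For instance (illustrative, assuming a unit step and constant trip count):
// with body {a; b} and shifts {0, 1}, the result is a prologue loop running
// only 'a' for the first iteration, a steady-state loop running 'b' from the
// previous iteration together with 'a' from the current one, and an epilogue
// loop running the last 'b'.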
LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
                                        ArrayRef<uint64_t> shifts,
                                        bool unrollPrologueEpilogue) {
  assert(forOp.getBody()->getOperations().size() == shifts.size() &&
         "too few/many shifts");
  if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
    return success();

  // If the trip counts aren't constant, we would need versioning and
  // conditional guards (or context information to prevent such versioning). The
  // better way to pipeline for such loops is to first tile them and extract
  // constant trip count "full tiles" before applying this.
  auto mayBeConstTripCount = getConstantTripCount(forOp);
  if (!mayBeConstTripCount.hasValue()) {
    LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
    return success();
  }
  uint64_t tripCount = mayBeConstTripCount.getValue();

  assert(isOpwiseShiftValid(forOp, shifts) &&
         "shifts will lead to an invalid transformation\n");

  int64_t step = forOp.getStep();

  unsigned numChildOps = shifts.size();

  // Do a linear time (counting) sort for the shifts.
  uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
  if (maxShift >= numChildOps) {
    // Large shifts are not the typical use case.
    forOp.emitWarning("not shifting because shifts are unrealistically large");
    return success();
  }

  // An array of operation groups sorted by shift amount; each group has all
  // operations with the same shift in the order in which they appear in the
  // body of the 'affine.for' op.
  std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
  unsigned pos = 0;
  for (auto &op : forOp.getBody()->without_terminator()) {
    auto shift = shifts[pos++];
    sortedOpGroups[shift].push_back(&op);
  }

  // Unless the shifts have a specific pattern (which actually would be the
  // common use case), prologue and epilogue are not meaningfully defined.
  // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
  // loop generated as the prologue and the last as epilogue and unroll these
  // fully.
  AffineForOp prologue, epilogue;

  // Do a sweep over the sorted shifts while storing open groups in a
  // vector, and generating loop portions as necessary during the sweep. A block
  // of operations is paired with its shift.
  std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;

  auto origLbMap = forOp.getLowerBoundMap();
  uint64_t lbShift = 0;
  OpBuilder b(forOp);
  for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
    // If nothing is shifted by d, continue.
    if (sortedOpGroups[d].empty())
      continue;
    if (!opGroupQueue.empty()) {
      assert(d > 0 &&
             "Queue expected to be empty when the first block is found");
      // The interval for which the loop needs to be generated here is:
      // [lbShift, min(lbShift + tripCount, d)) and the body of the
      // loop needs to have all operations in opQueue in that order.
      AffineForOp res;
      if (lbShift + tripCount * step < d * step) {
        res = generateShiftedLoop(
            b.getShiftedAffineMap(origLbMap, lbShift),
            b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
            opGroupQueue, /*offset=*/0, forOp, b);
        // Entire loop for the queued op groups generated, empty it.
        opGroupQueue.clear();
        lbShift += tripCount * step;
      } else {
        res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                  b.getShiftedAffineMap(origLbMap, d),
                                  opGroupQueue, /*offset=*/0, forOp, b);
        lbShift = d * step;
      }

      if (res) {
        // Simplify/canonicalize the affine.for.
        RewritePatternSet patterns(res.getContext());
        AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
        bool erased;
        (void)applyOpPatternsAndFold(res, std::move(patterns), &erased);

        if (!erased && !prologue)
          prologue = res;
        if (!erased)
          epilogue = res;
      }
    } else {
      // Start of first interval.
      lbShift = d * step;
    }
    // Augment the list of operations that get into the current open interval.
    opGroupQueue.push_back({d, sortedOpGroups[d]});
  }

  // Those operation groups left in the queue now need to be processed (FIFO)
  // and their loops completed.
  for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
    uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
    epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
                                   b.getShiftedAffineMap(origLbMap, ubShift),
                                   opGroupQueue, /*offset=*/i, forOp, b);
    lbShift = ubShift;
    if (!prologue)
      prologue = epilogue;
  }

  // Erase the original for op.
  forOp.erase();

  if (unrollPrologueEpilogue && prologue)
    (void)loopUnrollFull(prologue);
  if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
    (void)loopUnrollFull(epilogue);

  return success();
}

/// Checks the legality of tiling of a hyper-rectangular loop nest by simply
/// checking if there is a 'negative' dependence in the memrefs present in
/// the loop nest. If yes then tiling is invalid.
static bool
checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
  assert(!origLoops.empty() && "no original loops provided");

  // We first find out all dependences we intend to check.
  SmallVector<Operation *, 8> loadAndStoreOps;
  origLoops[0]->walk([&](Operation *op) {
    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
      loadAndStoreOps.push_back(op);
  });

  unsigned numOps = loadAndStoreOps.size();
  unsigned numLoops = origLoops.size();
  FlatAffineConstraints dependenceConstraints;
  for (unsigned d = 1; d <= numLoops + 1; ++d) {
    for (unsigned i = 0; i < numOps; ++i) {
      Operation *srcOp = loadAndStoreOps[i];
      MemRefAccess srcAccess(srcOp);
      for (unsigned j = 0; j < numOps; ++j) {
        Operation *dstOp = loadAndStoreOps[j];
        MemRefAccess dstAccess(dstOp);

        SmallVector<DependenceComponent, 2> depComps;
        dependenceConstraints.reset();
        DependenceResult result = checkMemrefAccessDependence(
            srcAccess, dstAccess, d, &dependenceConstraints, &depComps);

        // Skip if there is no dependence in this case.
        if (!hasDependence(result))
          continue;

        // Check whether there is any negative direction vector in the
        // dependence components found above, which means that dependence is
        // violated by the default hyper-rect tiling method.
        LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
                                   "for dependence at depth: "
                                << Twine(d) << " between:\n";);
        LLVM_DEBUG(srcAccess.opInst->dump(););
        LLVM_DEBUG(dstAccess.opInst->dump(););
        for (unsigned k = 0, e = depComps.size(); k < e; k++) {
          DependenceComponent depComp = depComps[k];
          if (depComp.lb.hasValue() && depComp.ub.hasValue() &&
              depComp.lb.getValue() < depComp.ub.getValue() &&
              depComp.ub.getValue() < 0) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Dependence component lb = "
                       << Twine(depComp.lb.getValue())
                       << " ub = " << Twine(depComp.ub.getValue())
                       << " is negative at depth: " << Twine(d)
                       << " and thus violates the legality rule.\n");
            return false;
          }
        }
      }
    }
  }

  return true;
}

/// Checks whether hyper-rectangular loop tiling of the nest
/// represented by `origLoops` is valid. The validity condition is from Irigoin
/// and Triolet, which states that two tiles cannot depend on each other. We
/// simplify this condition to just checking whether there is any negative
/// dependence direction, since we have the prior knowledge that the tiling
/// results will be hyper-rectangles, which are scheduled in the
/// lexicographically increasing order on the vector of loop indices. This
/// function will return failure when any dependence component is negative along
/// any of `origLoops`.
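/// As an illustrative example (not from the original source): in a 2-d nest
/// where an access pattern yields a dependence direction vector of (1, -1),
/// the -1 component means a tile could depend on a lexicographically later
/// tile along the second loop, so hyper-rectangular tiling would be invalid.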
LogicalResult
checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
  return success(checkTilingLegalityImpl(origLoops));
}

/// Check if the input data is valid and whether tiled code will be legal or
/// not.
template <typename T>
void performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                            ArrayRef<T> tileSizes) {
  // Check if the supplied for op's are all successively nested.
  assert(!input.empty() && "no loops in input band");
  assert(input.size() == tileSizes.size() && "too few/many tile sizes");

  assert(isPerfectlyNested(input) && "input loops not perfectly nested");

  // Perform tiling legality test.
  if (failed(checkTilingLegality(input)))
    input[0].emitRemark("tiled code is illegal due to dependences");
}

/// Move the loop body of AffineForOp 'src' from 'src' into the specified
/// location in destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) {
  auto &ops = src.getBody()->getOperations();
  dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
                                         std::prev(ops.end()));
}

/// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
/// body.
void moveLoopBody(AffineForOp src, AffineForOp dest) {
  moveLoopBodyImpl(src, dest, dest.getBody()->begin());
}

/// Constructs the tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                            AffineForOp rootAffineForOp, unsigned width,
                            MutableArrayRef<AffineForOp> tiledLoops) {
  Location loc = rootAffineForOp.getLoc();

  // The outermost among the loops as we add more.
  Operation *topLoop = rootAffineForOp.getOperation();
  AffineForOp innermostPointLoop;

  // Add intra-tile (or point) loops.
  for (unsigned i = 0; i < width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
    pointLoop.getBody()->getOperations().splice(
        pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - 1 - i] = pointLoop;
    topLoop = pointLoop.getOperation();
    if (i == 0)
      innermostPointLoop = pointLoop;
  }

  // Add tile space loops.
  for (unsigned i = width; i < 2 * width; i++) {
    OpBuilder b(topLoop);
    // Loop bounds will be set later.
    AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
    tileSpaceLoop.getBody()->getOperations().splice(
        tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
        topLoop);
    tiledLoops[2 * width - i - 1] = tileSpaceLoop;
    topLoop = tileSpaceLoop.getOperation();
  }

  // Move the loop body of the original nest to the new one.
  moveLoopBody(origLoops.back(), innermostPointLoop);
}

/// Checks whether a loop nest is hyper-rectangular or not.
LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input,
                                      AffineForOp rootAffineForOp,
                                      unsigned width) {
  FlatAffineConstraints cst;
  SmallVector<Operation *, 8> ops(input.begin(), input.end());
  (void)getIndexSet(ops, &cst);
  if (!cst.isHyperRectangular(0, width)) {
    rootAffineForOp.emitError("tiled code generation unimplemented for the "
                              "non-hyperrectangular case");
    return failure();
  }
  return success();
}

/// Set lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) {
  // The lower bound for the intra-tile loop is represented by an affine map
  // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
  // for the intra-tile loop is represented by an affine map as
  // (%i, %t0)->((%i - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb), where
  // %i is the loop IV of the corresponding inter-tile loop, %t0 is the
  // corresponding tiling parameter, %origlb is the lower bound and
  // %origLoopStep is the loop step of the corresponding inter-tile loop.
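  // For example (an illustrative sketch, not part of the original comment):
  // with %origlb = 0 and a unit step, these maps reduce to
  // lb: (%i, %t0) -> (%i * %t0) and ub: (%i, %t0) -> (%i * %t0 + %t0), i.e.
  // each intra-tile loop covers one tile of size %t0 starting at %i * %t0.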

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original lower/upper bound.
  SmallVector<Value, 4> lbOperands, ubOperands;
  AffineBound lb = origLoop.getLowerBound();
  AffineBound ub = origLoop.getUpperBound();
  lbOperands.reserve(lb.getNumOperands() + 2);
  ubOperands.reserve(ub.getNumOperands() + 2);
  AffineMap origLbMap = lb.getMap();
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(j));
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add a new dim operand in lb/ubOperands corresponding to the origLoop
  // IV.
  lbOperands.push_back(newInterTileLoop.getInductionVar());
  ubOperands.push_back(newInterTileLoop.getInductionVar());
  // Get loop IV as an affine expression for lower/upper bound. Size of
  // lb/ubOperands is guaranteed to be at least one.
  AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
  AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);

  // Add symbol operands from original lower/upper bound.
  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  lbOperands.push_back(tileSize);
  ubOperands.push_back(tileSize);

  SmallVector<AffineExpr, 4> lbBoundExprs;
  SmallVector<AffineExpr, 4> ubBoundExprs;
  lbBoundExprs.reserve(origLbMap.getNumResults());
  ubBoundExprs.reserve(origUbMap.getNumResults());

  // Get tiling parameter as an affine expression for lb/ub.
  AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
  AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
  lbBoundExprs.push_back(
      ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
      origLowerBoundExpr);

  // Get the origLoopStep as an affine expression.
  AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());

  // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
  // (tilingParameter * origLoopStep) + origlb.
  ubBoundExprs.push_back(
      ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
      (ubTileParameter * origLoopStep) + origLowerBoundExpr);

  ubBoundExprs.append(origUbMap.getResults().begin(),
                      origUbMap.getResults().end());

  AffineMap lbMap =
      AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
                     lbBoundExprs, b.getContext());
  newIntraTileLoop.setLowerBound(lbOperands, lbMap);

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
                     ubBoundExprs, b.getContext());
  newIntraTileLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newIntraTileLoop.setStep(origLoop.getStep());
}


/// Set lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop, Value tileSize) {
  OperandRange newLbOperands = origLoop.getLowerBoundOperands();

  // The lower bounds for inter-tile loops are same as the corresponding lower
  // bounds of original loops.
  newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());

  // The new upper bound map for inter-tile loops, assuming constant lower
  // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
  // originalLowerBound), tiling parameter), where the tiling parameter is the
  // respective tile size for that loop. For e.g. if the original ubmap was
  // ()->(1024), the new map will be
  // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
  // Therefore a new symbol operand is inserted in the map and the result
  // expression is overwritten.

  assert(origLoop.hasConstantLowerBound() &&
         "expected input loops to have constant lower bound.");

  // Get lower bound of original loop as an affine expression.
  AffineExpr origLowerBoundExpr;
  origLowerBoundExpr =
      b.getAffineConstantExpr(origLoop.getConstantLowerBound());

  // Add dim operands from original upper bound.
  SmallVector<Value, 4> ubOperands;
  AffineBound ub = origLoop.getUpperBound();
  ubOperands.reserve(ub.getNumOperands() + 1);
  AffineMap origUbMap = ub.getMap();
  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(j));

  // Add symbol operands from original upper bound.
  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

  // Add a new symbol operand which is the tile size for this loop.
  ubOperands.push_back(tileSize);

  // Get tiling parameter as an affine expression.
  AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());

  SmallVector<AffineExpr, 4> boundExprs;
  boundExprs.reserve(origUbMap.getNumResults());
  int64_t origUpperBound;
  AffineExpr origUpperBoundExpr;

  // If upper bound for the original loop is constant, then the constant can
  // be obtained as an affine expression straight away.
  if (origLoop.hasConstantUpperBound()) {
    origUpperBound = origLoop.getConstantUpperBound();

    // Get original constant upper bound as an affine expression.
    origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
    // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    boundExprs.push_back(
        origLowerBoundExpr +
        (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  } else {
    // If the upper bound for the original loop is not constant, then two
    // cases are possible, although their handling is the same. 1.) The ubmap
    // has only one result expression. For e.g.
    //    affine.for %i = 5 to %ub
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
    // where 's0' is the original upper bound and 's1' is the tiling
    // parameter. 2.) When ubMap has more than one result expression. For e.g.
    //    #map0 = affine_map<()[s0, s1] -> (s0, s1)>
    //    affine.for %i = 5 to min #map0()[%s0, %s1]
    //
    // A symbol operand is added which represents the tiling parameter. The
    // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
    // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.

    // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
    // originalLowerBound), tilingParameter).
    for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
      boundExprs.push_back(
          origLowerBoundExpr +
          (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
  }

  AffineMap ubMap =
      AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
                     boundExprs, b.getContext());
  newLoop.setUpperBound(ubOperands, ubMap);

  // Original loop step must be preserved.
  newLoop.setStep(origLoop.getStep());
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
    MutableArrayRef<AffineForOp> origLoops,
    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
  assert(!origLoops.empty() && "expected at least one loop in band");
  assert(origLoops.size() == tileSizes.size() &&
         "expected tiling parameter for each loop in band.");

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Set bounds for tile space loops.
  for (unsigned i = 0; i < width; ++i) {
    setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
  }

  // Set bounds for intra-tile loops.
  for (unsigned i = 0; i < width; ++i) {
    setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
                                 newLoops[i + width], tileSizes[i]);
  }
}

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) {
  assert(!origLoops.empty());
  assert(origLoops.size() == tileSizes.size());

  OpBuilder b(origLoops[0].getOperation());
  unsigned width = origLoops.size();

  // Bounds for tile space loops.
  for (unsigned i = 0; i < width; i++) {
    OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
    OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
    newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
    newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
    // If the step size of original loop is x and tileSize is y then after
    // tiling the tile space loops' step size becomes x*y.
    newLoops[i].setStep(tileSizes[i] * origLoops[i].getStep());
  }
  // Bounds for intra-tile loops.
  for (unsigned i = 0; i < width; i++) {
    int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
    Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
    // The lower bound is just the tile-space loop.
    AffineMap lbMap = b.getDimIdentityMap();
    newLoops[width + i].setLowerBound(
        /*operands=*/newLoops[i].getInductionVar(), lbMap);
    // The step size of an intra-tile loop is just the original loop's step
    // size.
    newLoops[width + i].setStep(origLoops[i].getStep());

    // Set the upper bound.
    if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
      // Trip count is less than the tile size: upper bound is lower bound +
      // trip count * stepSize.
      AffineMap ubMap = b.getSingleDimShiftAffineMap(
          mayBeConstantCount.getValue() * origLoops[i].getStep());
      newLoops[width + i].setUpperBound(
          /*operands=*/newLoops[i].getInductionVar(), ubMap);
    } else if (largestDiv % tileSizes[i] != 0) {
      // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
      // Construct the upper bound map; the operands are the original operands
      // with 'i' (tile-space loop) appended to it. The new upper bound map is
      // the original one with an additional expression i + tileSize * stepSize
      // appended.

      // Add dim operands from original upper bound.
      SmallVector<Value, 4> ubOperands;
      AffineBound ub = origLoops[i].getUpperBound();
      ubOperands.reserve(ub.getNumOperands() + 1);
      AffineMap origUbMap = ub.getMap();
      for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(j));

      // Add dim operand for new loop upper bound.
      ubOperands.push_back(newLoops[i].getInductionVar());

      // Add symbol operands from original upper bound.
      for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
        ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

      SmallVector<AffineExpr, 4> boundExprs;
      boundExprs.reserve(1 + origUbMap.getNumResults());
      AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
      // The new upper bound map is the original one with an additional
      // expression i + tileSize * stepSize (of original loop) appended.
      boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStep());
      boundExprs.append(origUbMap.getResults().begin(),
                        origUbMap.getResults().end());
      AffineMap ubMap =
          AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                         boundExprs, b.getContext());
      newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
    } else {
      // No need of the min expression.
      AffineExpr dim = b.getAffineDimExpr(0);
      AffineMap ubMap =
          AffineMap::get(1, 0, dim + tileSizes[i] * origLoops[i].getStep());
      newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
    }
  }
}

/// Tiles the specified band of perfectly nested loops creating tile-space loops
/// and intra-tile loops. A band is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
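// For example (an illustrative sketch), tiling a 2-d band with tile sizes
// {32, 32} yields a 4-d nest: two outer tile-space loops whose steps are 32x
// the original steps, and two inner intra-tile loops each spanning at most 32
// iterations of the corresponding original loop.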
LogicalResult
mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                          ArrayRef<unsigned> tileSizes,
                          SmallVectorImpl<AffineForOp> *tiledNest) {
  performPreTilingChecks(input, tileSizes);

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];
  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting the loop bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
    return failure();

  // Set loop bounds for the tiled loop nest.
  constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Tiles the specified band of perfectly nested loops creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
// TODO: handle non hyper-rectangular spaces.
LogicalResult
mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
                                    ArrayRef<Value> tileSizes,
                                    SmallVectorImpl<AffineForOp> *tiledNest) {
  performPreTilingChecks(input, tileSizes);

  MutableArrayRef<AffineForOp> origLoops = input;
  AffineForOp rootAffineForOp = origLoops[0];
  // Note that width is at least one since the band isn't empty.
  unsigned width = input.size();
  SmallVector<AffineForOp, 6> tiledLoops(2 * width);

  // Construct a tiled loop nest without setting the loop bounds. Bounds are
  // set later.
  constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);

  SmallVector<Value, 8> origLoopIVs;
  extractForInductionVars(input, &origLoopIVs);

  if (failed(checkIfHyperRectangular(input, rootAffineForOp, width)))
    return failure();

  // Set loop bounds for the tiled loop nest.
  constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
                                                tileSizes);

  // Replace original IVs with intra-tile loop IVs.
  for (unsigned i = 0; i < width; i++)
    origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

  // Erase the old loop nest.
  rootAffineForOp.erase();

  if (tiledNest)
    *tiledNest = std::move(tiledLoops);

  return success();
}

/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
    SmallVectorImpl<T> &forOps, T rootForOp,
    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
  for (unsigned i = 0; i < maxLoops; ++i) {
    forOps.push_back(rootForOp);
    Block &body = rootForOp.region().front();
    if (body.begin() != std::prev(body.end(), 2))
      return;

    rootForOp = dyn_cast<T>(&body.front());
    if (!rootForOp)
      return;
  }
}

/// Get the perfectly nested sequence of loops starting at the root of the loop
/// nest. A loop is perfectly nested iff the first op in its body is another
/// AffineForOp (resp. scf::ForOp) and the second op is a terminator.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                   AffineForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}

/// Identify valid and profitable bands of loops to tile. This is currently just
/// a temporary placeholder to test the mechanics of tiled code generation.
/// Returns all maximal outermost perfect loop nests to tile.
void mlir::getTileableBands(FuncOp f,
                            std::vector<SmallVector<AffineForOp, 6>> *bands) {
  // Get maximal perfect nest of 'affine.for' insts starting from root
  // (inclusive).
  for (AffineForOp forOp : f.getOps<AffineForOp>()) {
    SmallVector<AffineForOp, 6> band;
    getPerfectlyNestedLoops(band, forOp);
    bands->push_back(band);
  }
}

/// Unrolls this loop completely.
LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.hasValue()) {
    uint64_t tripCount = mayBeConstantTripCount.getValue();
    if (tripCount == 1)
      return promoteIfSingleIteration(forOp);
    return loopUnrollByFactor(forOp, tripCount);
  }
  return failure();
}


/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
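/// For example, requesting an unroll factor of 4 on a loop with a constant
/// trip count of 3 unrolls the loop by 3, i.e., fully.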
LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
                                         uint64_t unrollFactor) {
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.hasValue() &&
      mayBeConstantTripCount.getValue() < unrollFactor)
    return loopUnrollByFactor(forOp, mayBeConstantTripCount.getValue());
  return loopUnrollByFactor(forOp, unrollFactor);
}

/// Generates unrolled copies of the body of an AffineForOp or scf::ForOp,
/// given as 'loopBodyBlock' with associated induction variable 'forOpIV', by
/// 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each unrolled
/// body.
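/// For instance, the affine unroller below supplies an 'ivRemapFn' that maps
/// copy 'i' of the IV to iv + i * step via an affine.apply (see
/// loopUnrollByFactor for AffineForOp), while the scf unroller materializes
/// the same remapping with MulIOp/AddIOp.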
static void
generateUnrolledLoop(Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
                     function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
                     ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    BlockAndValueMapping operandMap;

    // Prepare operand map.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
      builder.clone(*it, operandMap);

    // Update yielded values.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}


/// Unrolls this loop by the specified factor. Returns success if the loop
/// is successfully unrolled.
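/// As a worked example (illustrative): unrolling `affine.for %i = 0 to 39` by
/// 4 rewrites it into a main loop from 0 to 36 with step 4 holding four body
/// copies (with the IV remapped to %i, %i + 1, %i + 2, %i + 3), followed by a
/// cleanup loop from 36 to 39 with the original step.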
LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp,
                                       uint64_t unrollFactor) {
  assert(unrollFactor > 0 && "unroll factor should be positive");

  if (unrollFactor == 1)
    return promoteIfSingleIteration(forOp);

  // Nothing in the loop body other than the terminator.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Loops where the lower bound is a max expression aren't supported for
  // unrolling since the trip count can't be expressed as an affine function
  // when both the lower bound and the upper bound are multi-result maps.
  // However, one meaningful way to do such unrolling would be to specialize
  // the loop for the 'hotspot' case and unroll that hotspot.
  if (forOp.getLowerBoundMap().getNumResults() != 1)
    return failure();

  // If the trip count is lower than the unroll factor, no unrolled body is
  // generated.
  // TODO: option to specify cleanup loop unrolling.
  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
  if (mayBeConstantTripCount.hasValue() &&
      mayBeConstantTripCount.getValue() < unrollFactor)
    return failure();

  // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
    OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
    auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));

    // Update users of loop results.
    auto results = forOp.getResults();
    auto cleanupResults = cleanupForOp.getResults();
    auto cleanupIterOperands = cleanupForOp.getIterOperands();

    for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
      cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
    }

    AffineMap cleanupMap;
    SmallVector<Value, 4> cleanupOperands;
    getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
    assert(cleanupMap &&
           "cleanup loop lower bound map for single result lower bound maps "
           "can always be determined");
    cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
    // Promote the loop body up if this has turned into a single iteration loop.
    (void)promoteIfSingleIteration(cleanupForOp);

    // Adjust upper bound of the original loop; this is the same as the lower
    // bound of the cleanup loop.
    forOp.setUpperBound(cleanupOperands, cleanupMap);
  }

  ValueRange iterArgs(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  // Scale the step of loop being unrolled by unroll factor.
  int64_t step = forOp.getStep();
  forOp.setStep(step * unrollFactor);
  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + i * step
        auto d0 = b.getAffineDimExpr(0);
        auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
        return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
      },
      /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);

  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}

/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(scf::ForOp forOp,
                                       uint64_t unrollFactor) {
  assert(unrollFactor > 0 && "expected positive unroll factor");
  if (unrollFactor == 1)
    return promoteIfSingleIteration(forOp);

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  auto loc = forOp.getLoc();
  auto step = forOp.step();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  auto lbCstOp = forOp.lowerBound().getDefiningOp<ConstantIndexOp>();
  auto ubCstOp = forOp.upperBound().getDefiningOp<ConstantIndexOp>();
  auto stepCstOp = forOp.step().getDefiningOp<ConstantIndexOp>();
  if (lbCstOp && ubCstOp && stepCstOp) {
    // Constant loop bounds computation.
    int64_t lbCst = lbCstOp.getValue();
    int64_t ubCst = ubCstOp.getValue();
    int64_t stepCst = stepCstOp.getValue();
    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
           "expected positive loop bounds and step");
    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);
    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    assert(upperBoundUnrolledCst <= ubCst);
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled =
          boundsBuilder.create<ConstantIndexOp>(loc, upperBoundUnrolledCst);
    else
      upperBoundUnrolled = ubCstOp;

    // Create constant for 'stepUnrolled'.
    stepUnrolled =
        stepCst == stepUnrolledCst
            ? step
            : boundsBuilder.create<ConstantIndexOp>(loc, stepUnrolledCst);
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.lowerBound();
    auto upperBound = forOp.upperBound();
    Value diff = boundsBuilder.create<SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst =
        boundsBuilder.create<ConstantIndexOp>(loc, unrollFactor);
    Value tripCountRem =
        boundsBuilder.create<SignedRemIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor).
    Value tripCountEvenMultiple =
        boundsBuilder.create<SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step.
    upperBoundUnrolled = boundsBuilder.create<AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled = boundsBuilder.create<MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();
    auto epilogueIterOperands = epilogueForOp.getIterOperands();

    for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
      epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
    }
    (void)promoteIfSingleIteration(epilogueForOp);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride =
            b.create<MulIOp>(loc, step, b.create<ConstantIndexOp>(loc, i));
        return b.create<AddIOp>(loc, iv, stride);
      },
      iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
1308
1309 LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
1310 uint64_t unrollJamFactor) {
1311 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1312 if (mayBeConstantTripCount.hasValue() &&
1313 mayBeConstantTripCount.getValue() < unrollJamFactor)
1314 return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue());
1315 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1316 }
1317
1318 /// Unrolls and jams this loop by the specified factor.
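/// As a hedged sketch (affine maps simplified), unroll-jamming the outer loop
/// of
///
/// ```mlir
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 {
///       "S"(%i, %j) : (index, index) -> ()
///     }
///   }
/// ```
///
/// by a factor of 2 scales the outer step and jams the unrolled copies into
/// the inner loop body:
///
/// ```mlir
///   affine.for %i = 0 to 8 step 2 {
///     affine.for %j = 0 to 8 {
///       "S"(%i, %j) : (index, index) -> ()
///       %i1 = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
///       "S"(%i1, %j) : (index, index) -> ()
///     }
///   }
/// ```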
1319 LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
1320 uint64_t unrollJamFactor) {
1321 // Gathers all maximal sub-blocks of operations that do not themselves
1322 // include a for op (an operation could still have a descendant for op
1323 // in its tree). Ignores the block terminators.
1324 struct JamBlockGatherer {
1325 // Store iterators to the first and last op of each sub-block found.
1326 std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
1327
1328 // This is a linear time walk.
1329 void walk(Operation *op) {
1330 for (auto &region : op->getRegions())
1331 for (auto &block : region)
1332 walk(block);
1333 }
1334
1335 void walk(Block &block) {
1336 for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
1337 auto subBlockStart = it;
1338 while (it != e && !isa<AffineForOp>(&*it))
1339 ++it;
1340 if (it != subBlockStart)
1341 subBlocks.push_back({subBlockStart, std::prev(it)});
1342 // Process all for ops that appear next.
1343 while (it != e && isa<AffineForOp>(&*it))
1344 walk(&*it++);
1345 }
1346 }
1347 };
1348
1349 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1350
1351 if (unrollJamFactor == 1)
1352 return promoteIfSingleIteration(forOp);
1353
1354 // Nothing in the loop body other than the terminator.
1355 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1356 return success();
1357
1358 // Loops where both lower and upper bounds are multi-result maps won't be
1359 // unrolled (since the trip count can't be expressed as an affine function
1360 // in general).
1361 // TODO: this may not be common, but we could support the case
1362 // where the lower bound is a multi-result map and the ub is a single result
1363 // one.
1364 if (forOp.getLowerBoundMap().getNumResults() != 1)
1365 return failure();
1366
1367 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1368 // If the trip count is lower than the unroll jam factor, no unroll jam.
1369 if (mayBeConstantTripCount.hasValue() &&
1370 mayBeConstantTripCount.getValue() < unrollJamFactor) {
1371 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1372 return failure();
1373 }
1374
1375 // Gather all sub-blocks to jam upon the loop being unrolled.
1376 JamBlockGatherer jbg;
1377 jbg.walk(forOp);
1378 auto &subBlocks = jbg.subBlocks;
1379
1380 // Generate the cleanup loop if trip count isn't a multiple of
1381 // unrollJamFactor.
1382 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1383 // Insert the cleanup loop right after 'forOp'.
1384 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
1385 auto cleanupAffineForOp = cast<AffineForOp>(builder.clone(*forOp));
1386 // Adjust the lower bound of the cleanup loop; its upper bound is the same
1387 // as the original loop's upper bound.
1388 AffineMap cleanupMap;
1389 SmallVector<Value, 4> cleanupOperands;
1390 getCleanupLoopLowerBound(forOp, unrollJamFactor, cleanupMap,
1391 cleanupOperands);
1392 cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap);
1393
1394 // Promote the cleanup loop if it has turned into a single iteration loop.
1395 (void)promoteIfSingleIteration(cleanupAffineForOp);
1396
1397 // Adjust the upper bound of the original loop - it will be the same as the
1398 // cleanup loop's lower bound. Its lower bound remains unchanged.
1399 forOp.setUpperBound(cleanupOperands, cleanupMap);
1400 }
1401
1402 // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
1403 int64_t step = forOp.getStep();
1404 forOp.setStep(step * unrollJamFactor);
1405
1406 auto forOpIV = forOp.getInductionVar();
1407 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1408 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1409 // Operand map persists across all sub-blocks.
1410 BlockAndValueMapping operandMap;
1411 for (auto &subBlock : subBlocks) {
1412 // Builder to insert unroll-jammed bodies. Insert right at the end of the
1413 // sub-block.
1414 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1415
1416 // If the induction variable is used, create a remapping to the value for
1417 // this unrolled instance.
1418 if (!forOpIV.use_empty()) {
1419 // iv' = iv + i, i = 1 to unrollJamFactor-1.
1420 auto d0 = builder.getAffineDimExpr(0);
1421 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1422 auto ivUnroll =
1423 builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1424 operandMap.map(forOpIV, ivUnroll);
1425 }
1426 // Clone the sub-block being unroll-jammed.
1427 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1428 builder.clone(*it, operandMap);
1429 }
1430 }
1431
1432 // Promote the loop body up if this has turned into a single iteration loop.
1433 (void)promoteIfSingleIteration(forOp);
1434 return success();
1435 }
1436
1437 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1438 /// nested within 'forOpA' as the only non-terminator operation in its block.
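/// For example (a sketch of the effect, assuming rectangular bounds):
///
/// ```mlir
///   affine.for %i = 0 to 16 {
///     affine.for %j = 0 to 32 {
///       "S"(%i, %j) : (index, index) -> ()
///     }
///   }
/// ```
///
/// becomes
///
/// ```mlir
///   affine.for %j = 0 to 32 {
///     affine.for %i = 0 to 16 {
///       "S"(%i, %j) : (index, index) -> ()
///     }
///   }
/// ```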
1439 void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1440 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1441 auto &forOpABody = forOpA.getBody()->getOperations();
1442 auto &forOpBBody = forOpB.getBody()->getOperations();
1443
1444 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1445 // before forOpA (in forOpA's parent's block); this should leave forOpA's
1446 // body containing only the terminator.
1447 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1448 forOpABody, forOpABody.begin(),
1449 std::prev(forOpABody.end()));
1450 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1451 // body (this leaves forOpB's body containing only the terminator).
1452 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1453 std::prev(forOpBBody.end()));
1454 // 3) Splice forOpA into the beginning of forOpB's body.
1455 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1456 Block::iterator(forOpA));
1457 }
1458
1459 // Checks each dependence component against the permutation to see if the
1460 // desired loop interchange would violate dependences by making the
1461 // dependence component lexicographically negative.
1462 static bool checkLoopInterchangeDependences(
1463 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1464 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1465 // Invert permutation map.
1466 unsigned maxLoopDepth = loops.size();
1467 SmallVector<unsigned, 4> loopPermMapInv;
1468 loopPermMapInv.resize(maxLoopDepth);
1469 for (unsigned i = 0; i < maxLoopDepth; ++i)
1470 loopPermMapInv[loopPermMap[i]] = i;
1471
1472 // Check each dependence component against the permutation to see if the
1473 // desired loop interchange permutation would make the dependence vectors
1474 // lexicographically negative.
1475 // Example 1: [-1, 1][0, 0]
1476 // Example 2: [0, 0][-1, 1]
1477 for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
1478 const SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
1479 assert(depComps.size() >= maxLoopDepth);
1480 // Check if the first non-zero dependence component is positive.
1481 // This iterates through loops in the desired order.
1482 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1483 unsigned permIndex = loopPermMapInv[j];
1484 assert(depComps[permIndex].lb.hasValue());
1485 int64_t depCompLb = depComps[permIndex].lb.getValue();
1486 if (depCompLb > 0)
1487 break;
1488 if (depCompLb < 0)
1489 return false;
1490 }
1491 }
1492 return true;
1493 }
1494
1495 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1496 /// nested sequence of loops in 'loops' would violate dependences.
1497 bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
1498 ArrayRef<unsigned> loopPermMap) {
1499 // Gather dependence components for dependences between all ops in loop nest
1500 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1501 assert(loopPermMap.size() == loops.size());
1502 unsigned maxLoopDepth = loops.size();
1503 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1504 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1505 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1506 }
1507
1508 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1509 /// in it from outermost to innermost.
1510 bool LLVM_ATTRIBUTE_UNUSED
1511 mlir::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1512 assert(!loops.empty() && "no loops provided");
1513
1514 // We already know that the block can't be empty.
1515 auto hasTwoElements = [](Block *block) {
1516 auto secondOpIt = std::next(block->begin());
1517 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1518 };
1519
1520 auto enclosingLoop = loops.front();
1521 for (auto loop : loops.drop_front()) {
1522 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1523 // parentForOp's body should be just this loop and the terminator.
1524 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1525 return false;
1526 enclosingLoop = loop;
1527 }
1528 return true;
1529 }
1530
1531 // input[i] should move from position i -> permMap[i]. Returns the position in
1532 // `input` that becomes the new outermost loop.
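// For example, with input = {%i, %j, %k} listed outermost first and
// permMap = {2, 0, 1}, %j becomes the outermost loop, %k the middle one, and
// %i the innermost one; the function returns 1, the position of %j in
// `input`.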
1533 unsigned mlir::permuteLoops(MutableArrayRef<AffineForOp> input,
1534 ArrayRef<unsigned> permMap) {
1535 assert(input.size() == permMap.size() && "invalid permutation map size");
1536 // Check whether the permutation spec is valid. This is a small vector - we'll
1537 // just sort and check if it's iota.
1538 SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1539 llvm::sort(checkPermMap);
1540 if (llvm::any_of(llvm::enumerate(checkPermMap),
1541 [](const auto &en) { return en.value() != en.index(); }))
1542 assert(false && "invalid permutation map");
1543
1544 // Nothing to do.
1545 if (input.size() < 2)
1546 return 0;
1547
1548 assert(isPerfectlyNested(input) && "input not perfectly nested");
1549
1550 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1551 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1552 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1553 for (unsigned i = 0, e = input.size(); i < e; ++i)
1554 invPermMap.push_back({permMap[i], i});
1555 llvm::sort(invPermMap);
1556
1557 // Move the innermost loop body to the loop that would be the innermost in the
1558 // permuted nest (only if the innermost loop is going to change).
1559 if (permMap.back() != input.size() - 1) {
1560 auto *destBody = input[invPermMap.back().second].getBody();
1561 auto *srcBody = input.back().getBody();
1562 destBody->getOperations().splice(destBody->begin(),
1563 srcBody->getOperations(), srcBody->begin(),
1564 std::prev(srcBody->end()));
1565 }
1566
1567 // We'll move each loop in `input` in the reverse order so that its body is
1568 // empty when we are moving it; this incurs zero copies and no erasing.
1569 for (int i = input.size() - 1; i >= 0; --i) {
1570 // If this has to become the outermost loop after permutation, move it to
1571 // the parent block of the original root.
1572 if (permMap[i] == 0) {
1573 // If the root remains the same, nothing to do.
1574 if (i == 0)
1575 continue;
1576 // Make input[i] the new outermost loop by moving it into parentBlock.
1577 auto *parentBlock = input[0]->getBlock();
1578 parentBlock->getOperations().splice(Block::iterator(input[0]),
1579 input[i]->getBlock()->getOperations(),
1580 Block::iterator(input[i]));
1581 continue;
1582 }
1583
1584 // If the parent in the permuted order is the same as in the original,
1585 // nothing to do.
1586 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1587 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1588 continue;
1589
1590 // Move input[i] to its surrounding loop in the transformed nest.
1591 auto *destBody = input[parentPosInInput].getBody();
1592 destBody->getOperations().splice(destBody->begin(),
1593 input[i]->getBlock()->getOperations(),
1594 Block::iterator(input[i]));
1595 }
1596
1597 return invPermMap[0].second;
1598 }
1599
1600 // Sinks all sequential loops to the innermost levels (while preserving
1601 // relative order among them) and moves all parallel loops to the
1602 // outermost levels (while again preserving relative order among them).
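// For example, in a 2-d nest where only the outer loop carries a dependence
// (i.e., the outer loop is sequential and the inner one parallel), this
// computes the permutation {1, 0} and, if it preserves dependences,
// interchanges the two loops so that the parallel loop becomes outermost.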
1603 AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) {
1604 SmallVector<AffineForOp, 4> loops;
1605 getPerfectlyNestedLoops(loops, forOp);
1606 if (loops.size() < 2)
1607 return forOp;
1608
1609 // Gather dependence components for dependences between all ops in loop nest
1610 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1611 unsigned maxLoopDepth = loops.size();
1612 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1613 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1614
1615 // Mark loops as either parallel or sequential.
1616 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1617 for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
1618 SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
1619 assert(depComps.size() >= maxLoopDepth);
1620 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1621 DependenceComponent &depComp = depComps[j];
1622 assert(depComp.lb.hasValue() && depComp.ub.hasValue());
1623 if (depComp.lb.getValue() != 0 || depComp.ub.getValue() != 0)
1624 isParallelLoop[j] = false;
1625 }
1626 }
1627
1628 // Count the number of parallel loops.
1629 unsigned numParallelLoops = 0;
1630 for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
1631 if (isParallelLoop[i])
1632 ++numParallelLoops;
1633
1634 // Compute permutation of loops that sinks sequential loops (and thus raises
1635 // parallel loops) while preserving relative order.
1636 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1637 unsigned nextSequentialLoop = numParallelLoops;
1638 unsigned nextParallelLoop = 0;
1639 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1640 if (isParallelLoop[i]) {
1641 loopPermMap[i] = nextParallelLoop++;
1642 } else {
1643 loopPermMap[i] = nextSequentialLoop++;
1644 }
1645 }
1646
1647 // Check if permutation 'loopPermMap' would violate dependences.
1648 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1649 return forOp;
1650 // Perform loop interchange according to permutation 'loopPermMap'.
1651 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1652 return loops[loopNestRootIndex];
1653 }
1654
1655 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1656 // lower (resp. upper) loop bound. When called for both the lower and upper
1657 // bounds, the resulting IR resembles:
1658 //
1659 // ```mlir
1660 // affine.for %i = max (`iv`, ...) to min (`iv` + `offset`, ...) {
1661 // ...
1662 // }
1663 // ```
1664 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1665 SmallVector<Value, 4> *operands,
1666 int64_t offset = 0) {
1667 auto bounds = llvm::to_vector<4>(map->getResults());
1668 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1669 operands->insert(operands->begin() + map->getNumDims(), iv);
1670 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1671 b.getContext());
1672 canonicalizeMapAndOperands(map, operands);
1673 }
1674
1675 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1676 // Stripmine-sink is a primitive building block for generalized tiling of
1677 // imperfectly nested loops.
1678 // This transformation is purely mechanical and does not check legality,
1679 // profitability or even structural correctness. It is the user's
1680 // responsibility to specify `targets` that are dominated by `forOp`.
1681 // Returns the new AffineForOps, one per target, nested immediately under
1682 // each of the `targets`.
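// As a rough sketch (bound maps elided), strip-mining %i by 32 and sinking it
// under the target %j in
//
// ```mlir
//   affine.for %i = 0 to 256 {
//     affine.for %j = 0 to 128 {
//       "S"(%i, %j) : (index, index) -> ()
//     }
//   }
// ```
//
// scales the step of %i and creates a new intra-tile loop under %j:
//
// ```mlir
//   affine.for %i = 0 to 256 step 32 {
//     affine.for %j = 0 to 128 {
//       affine.for %ii = max(0, %i) to min(256, %i + 32) {
//         "S"(%ii, %j) : (index, index) -> ()
//       }
//     }
//   }
// ```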
1683 static SmallVector<AffineForOp, 8>
1684 stripmineSink(AffineForOp forOp, uint64_t factor,
1685 ArrayRef<AffineForOp> targets) {
1686 auto originalStep = forOp.getStep();
1687 auto scaledStep = originalStep * factor;
1688 forOp.setStep(scaledStep);
1689
1690 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1691
1692 // Lower-bound map creation.
1693 auto lbMap = forOp.getLowerBoundMap();
1694 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1695 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1696
1697 // Upper-bound map creation.
1698 auto ubMap = forOp.getUpperBoundMap();
1699 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1700 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1701 /*offset=*/scaledStep);
1702
1703 auto iv = forOp.getInductionVar();
1704 SmallVector<AffineForOp, 8> innerLoops;
1705 for (auto t : targets) {
1706 // Insert newForOp before the terminator of `t`.
1707 auto b = OpBuilder::atBlockTerminator(t.getBody());
1708 auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1709 ubOperands, ubMap, originalStep);
1710 auto begin = t.getBody()->begin();
1711 // Skip terminator and `newForOp` which is just before the terminator.
1712 auto nOps = t.getBody()->getOperations().size() - 2;
1713 newForOp.getBody()->getOperations().splice(
1714 newForOp.getBody()->getOperations().begin(),
1715 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1716 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1717 newForOp.region());
1718 innerLoops.push_back(newForOp);
1719 }
1720
1721 return innerLoops;
1722 }
1723
1724 static Loops stripmineSink(scf::ForOp forOp, Value factor,
1725 ArrayRef<scf::ForOp> targets) {
1726 auto originalStep = forOp.step();
1727 auto iv = forOp.getInductionVar();
1728
1729 OpBuilder b(forOp);
1730 forOp.setStep(b.create<MulIOp>(forOp.getLoc(), originalStep, factor));
1731
1732 Loops innerLoops;
1733 for (auto t : targets) {
1734 // Save information for splicing ops out of `t` when done.
1735 auto begin = t.getBody()->begin();
1736 auto nOps = t.getBody()->getOperations().size();
1737
1738 // Insert newForOp before the terminator of `t`.
1739 auto b = OpBuilder::atBlockTerminator((t.getBody()));
1740 Value stepped = b.create<AddIOp>(t.getLoc(), iv, forOp.step());
1741 Value less = b.create<CmpIOp>(t.getLoc(), CmpIPredicate::slt,
1742 forOp.upperBound(), stepped);
1743 Value ub =
1744 b.create<SelectOp>(t.getLoc(), less, forOp.upperBound(), stepped);
1745
1746 // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
1747 auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
1748 newForOp.getBody()->getOperations().splice(
1749 newForOp.getBody()->getOperations().begin(),
1750 t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
1751 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1752 newForOp.region());
1753
1754 innerLoops.push_back(newForOp);
1755 }
1756
1757 return innerLoops;
1758 }
1759
1760 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1761 // Returns the new loop, nested immediately under `target`.
1762 template <typename ForType, typename SizeType>
1763 static ForType stripmineSink(ForType forOp, SizeType factor, ForType target) {
1764 // TODO: Use cheap structural assertions that targets are nested under
1765 // forOp and that targets are not nested under each other when DominanceInfo
1766 // exposes the capability. It seems overkill to construct a whole function
1767 // dominance tree at this point.
1768 auto res = stripmineSink(forOp, factor, ArrayRef<ForType>{target});
1769 assert(res.size() == 1 && "Expected 1 inner forOp");
1770 return res[0];
1771 }
1772
1773 template <typename ForType, typename SizeType>
1774 static SmallVector<SmallVector<ForType, 8>, 8>
1775 tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes,
1776 ArrayRef<ForType> targets) {
1777 SmallVector<SmallVector<ForType, 8>, 8> res;
1778 SmallVector<ForType, 8> currentTargets(targets.begin(), targets.end());
1779 for (auto it : llvm::zip(forOps, sizes)) {
1780 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1781 res.push_back(step);
1782 currentTargets = step;
1783 }
1784 return res;
1785 }
1786
1787 SmallVector<SmallVector<AffineForOp, 8>, 8>
1788 mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1789 ArrayRef<AffineForOp> targets) {
1790 return tileImpl(forOps, sizes, targets);
1791 }
1792
1793 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
1794 ArrayRef<Value> sizes,
1795 ArrayRef<scf::ForOp> targets) {
1796 return tileImpl(forOps, sizes, targets);
1797 }
1798
1799 template <typename ForType, typename SizeType>
1800 static SmallVector<ForType, 8>
1801 tileImpl(ArrayRef<ForType> forOps, ArrayRef<SizeType> sizes, ForType target) {
1802 SmallVector<ForType, 8> res;
1803 for (auto loops : tile(forOps, sizes, ArrayRef<ForType>{target})) {
1804 assert(loops.size() == 1);
1805 res.push_back(loops[0]);
1806 }
1807 return res;
1808 }
1809
1810 SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
1811 ArrayRef<uint64_t> sizes,
1812 AffineForOp target) {
1813 return tileImpl(forOps, sizes, target);
1814 }
1815
1816 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
1817 scf::ForOp target) {
1818 return tileImpl(forOps, sizes, target);
1819 }
1820
1821 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
1822 // Collect perfectly nested loops. If more size values are provided than
1823 // nested loops are available, truncate `sizes`.
1824 SmallVector<scf::ForOp, 4> forOps;
1825 forOps.reserve(sizes.size());
1826 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1827 if (forOps.size() < sizes.size())
1828 sizes = sizes.take_front(forOps.size());
1829
1830 return ::tile(forOps, sizes, forOps.back());
1831 }
1832
1833 // Hoist the ops within `outer` that appear before `inner`.
1834 // Such ops include the ops that have been introduced by parametric tiling.
1835 // Ops that come from triangular loops (i.e. that belong to the program slice
1836 // rooted at `outer`) and ops that have side effects cannot be hoisted.
1837 // Return failure when any op fails to hoist.
1838 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
1839 SetVector<Operation *> forwardSlice;
1840 getForwardSlice(
1841 outer.getInductionVar(), &forwardSlice,
1842 [&inner](Operation *op) { return op != inner.getOperation(); });
1843 LogicalResult status = success();
1844 SmallVector<Operation *, 8> toHoist;
1845 for (auto &op : outer.getBody()->without_terminator()) {
1846 // Stop when encountering the inner loop.
1847 if (&op == inner.getOperation())
1848 break;
1849 // Skip over non-hoistable ops.
1850 if (forwardSlice.count(&op) > 0) {
1851 status = failure();
1852 continue;
1853 }
1854 // Skip intermediate scf::ForOps; these are not considered a failure.
1855 if (isa<scf::ForOp>(op))
1856 continue;
1857 // Skip other ops with regions.
1858 if (op.getNumRegions() > 0) {
1859 status = failure();
1860 continue;
1861 }
1862 // Skip if op has side effects.
1863 // TODO: loads from immutable memory regions are ok.
1864 if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
1865 status = failure();
1866 continue;
1867 }
1868 toHoist.push_back(&op);
1869 }
1870 auto *outerForOp = outer.getOperation();
1871 for (auto *op : toHoist)
1872 op->moveBefore(outerForOp);
1873 return status;
1874 }
1875
1876 // Traverse the interTile and intraTile loops and try to hoist ops such that
1877 // bands of perfectly nested loops are isolated.
1878 // Return failure if either perfect interTile or perfect intraTile bands cannot
1879 // be formed.
1880 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
1881 LogicalResult status = success();
1882 const Loops &interTile = tileLoops.first;
1883 const Loops &intraTile = tileLoops.second;
1884 auto size = interTile.size();
1885 assert(size == intraTile.size());
1886 if (size <= 1)
1887 return success();
1888 for (unsigned s = 1; s < size; ++s)
1889 status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
1890 : failure();
1891 for (unsigned s = 1; s < size; ++s)
1892 status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
1893 : failure();
1894 return status;
1895 }
1896
1897 TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
1898 ArrayRef<int64_t> sizes) {
1899 // Collect perfectly nested loops. If more size values are provided than
1900 // nested loops are available, truncate `sizes`.
1901 SmallVector<scf::ForOp, 4> forOps;
1902 forOps.reserve(sizes.size());
1903 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
1904 if (forOps.size() < sizes.size())
1905 sizes = sizes.take_front(forOps.size());
1906
1907 // Compute the tile sizes such that the i-th outer loop executes size[i]
1908 // iterations. Given that the loop currently executes
1909 // numIterations = ceildiv((upperBound - lowerBound), step)
1910 // iterations, we need to tile with size ceildiv(numIterations, size[i]).
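// For example, a loop executing 100 iterations that is to be tiled with
// sizes[i] == 4 gets the tile size ceildiv(100, 4) == 25, so the resulting
// outer (inter-tile) loop executes 4 iterations of 25 iterations each.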
1911 SmallVector<Value, 4> tileSizes;
1912 tileSizes.reserve(sizes.size());
1913 for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
1914 assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
1915
1916 auto forOp = forOps[i];
1917 OpBuilder builder(forOp);
1918 auto loc = forOp.getLoc();
1919 Value diff =
1920 builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
1921 Value numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
1922 Value iterationsPerBlock =
1923 ceilDivPositive(builder, loc, numIterations, sizes[i]);
1924 tileSizes.push_back(iterationsPerBlock);
1925 }
1926
1927 // Call parametric tiling with the given sizes.
1928 auto intraTile = tile(forOps, tileSizes, forOps.back());
1929 TileLoops tileLoops = std::make_pair(forOps, intraTile);
1930
1931 // TODO: for now we just ignore the result of band isolation.
1932 // In the future, mapping decisions may be impacted by the ability to
1933 // isolate perfectly nested bands.
1934 (void)tryIsolateBands(tileLoops);
1935
1936 return tileLoops;
1937 }
1938
1939 /// Returns the new lower bound, upper bound, and step, in that order. Inserts
1940 /// any additional bounds calculations using `boundsBuilder` and the conversion
1941 /// back to the original loop induction value using `insideLoopBuilder`.
1942 static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
1943 OpBuilder &insideLoopBuilder, Location loc,
1944 Value lowerBound, Value upperBound, Value step,
1945 Value inductionVar) {
1946 // Check if the loop is already known to have a constant zero lower bound or
1947 // a constant one step.
1948 bool isZeroBased = false;
1949 if (auto lbCst = lowerBound.getDefiningOp<ConstantIndexOp>())
1950 isZeroBased = lbCst.getValue() == 0;
1951
1952 bool isStepOne = false;
1953 if (auto stepCst = step.getDefiningOp<ConstantIndexOp>())
1954 isStepOne = stepCst.getValue() == 1;
1955
1956 // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
1957 // assuming the step is strictly positive. Update the bounds and the step
1958 // of the loop to go from 0 to the number of iterations, if necessary.
1959 // TODO: introduce support for negative steps or emit dynamic asserts
1960 // on step positivity, whatever gets implemented first.
1961 if (isZeroBased && isStepOne)
1962 return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
1963 /*step=*/step};
1964
1965 Value diff = boundsBuilder.create<SubIOp>(loc, upperBound, lowerBound);
1966 Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
1967
1968 Value newLowerBound =
1969 isZeroBased ? lowerBound : boundsBuilder.create<ConstantIndexOp>(loc, 0);
1970 Value newStep =
1971 isStepOne ? step : boundsBuilder.create<ConstantIndexOp>(loc, 1);
1972
1973 // Insert code computing the value of the original loop induction variable
1974 // from the "normalized" one.
1975 Value scaled =
1976 isStepOne ? inductionVar
1977 : insideLoopBuilder.create<MulIOp>(loc, inductionVar, step);
1978 Value shifted =
1979 isZeroBased ? scaled
1980 : insideLoopBuilder.create<AddIOp>(loc, scaled, lowerBound);
1981
1982 SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
1983 shifted.getDefiningOp()};
1984 inductionVar.replaceAllUsesExcept(shifted, preserve);
1985 return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
1986 /*step=*/newStep};
1987 }
1988
1989 /// Transform a loop with a strictly positive step
1990 /// for %i = %lb to %ub step %s
1991 /// into a 0-based loop with step 1
1992 /// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
1993 /// %i = %ii * %s + %lb
1994 /// Insert the induction variable remapping in the body of `inner`, which is
1995 /// expected to be either `loop` or another loop perfectly nested under `loop`.
1996 /// Insert the definition of new bounds immediately before `outer`, which is
1997 /// expected to be either `loop` or its parent in the loop nest.
1998 static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
1999 OpBuilder builder(outer);
2000 OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
2001 auto loopPieces =
2002 normalizeLoop(builder, innerBuilder, loop.getLoc(), loop.lowerBound(),
2003 loop.upperBound(), loop.step(), loop.getInductionVar());
2004
2005 loop.setLowerBound(loopPieces.lowerBound);
2006 loop.setUpperBound(loopPieces.upperBound);
2007 loop.setStep(loopPieces.step);
2008 }
2009
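/// Coalesces the perfect loop nest `loops`, given outermost to innermost, into
/// its single outermost loop. A hedged sketch of the result for a 2-d nest
/// whose normalized iteration ranges are %n0 and %n1:
///
/// ```mlir
///   %ub = muli %n0, %n1 : index
///   scf.for %iv = %c0 to %ub step %c1 {
///     %j = remi_signed %iv, %n1 : index
///     %i = divi_signed %iv, %n1 : index
///     ... // original body, with %i and %j substituted
///   }
/// ```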
2010 void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
2011 if (loops.size() < 2)
2012 return;
2013
2014 scf::ForOp innermost = loops.back();
2015 scf::ForOp outermost = loops.front();
2016
2017 // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
2018 // allows the following code to assume upperBound is the number of iterations.
2019 for (auto loop : loops)
2020 normalizeLoop(loop, outermost, innermost);
2021
2022 // 2. Emit code computing the upper bound of the coalesced loop as product
2023 // of the number of iterations of all loops.
2024 OpBuilder builder(outermost);
2025 Location loc = outermost.getLoc();
2026 Value upperBound = outermost.upperBound();
2027 for (auto loop : loops.drop_front())
2028 upperBound = builder.create<MulIOp>(loc, upperBound, loop.upperBound());
2029 outermost.setUpperBound(upperBound);
2030
2031 builder.setInsertionPointToStart(outermost.getBody());
2032
2033 // 3. Remap induction variables. For each original loop, the value of the
2034 // induction variable can be obtained by dividing the induction variable of
2035 // the linearized loop by the total number of iterations of the loops nested
2036 // in it modulo the number of iterations in this loop (remove the values
2037 // related to the outer loops):
2038 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
2039 // Compute these iteratively from the innermost loop by creating a "running
2040 // quotient" of division by the range.
2041 Value previous = outermost.getInductionVar();
2042 for (unsigned i = 0, e = loops.size(); i < e; ++i) {
2043 unsigned idx = loops.size() - i - 1;
2044 if (i != 0)
2045 previous = builder.create<SignedDivIOp>(loc, previous,
2046 loops[idx + 1].upperBound());
2047
2048 Value iv = (i == e - 1) ? previous
2049 : builder.create<SignedRemIOp>(
2050 loc, previous, loops[idx].upperBound());
2051 replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
2052 loops.back().region());
2053 }
2054
2055 // 4. Move the operations from the innermost loop to just above the
2056 // second-outermost loop; delete the extra terminator and that loop.
2057 scf::ForOp second = loops[1];
2058 innermost.getBody()->back().erase();
2059 outermost.getBody()->getOperations().splice(
2060 Block::iterator(second.getOperation()),
2061 innermost.getBody()->getOperations());
2062 second.erase();
2063 }
2064
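/// Collapses the dimensions of `loops` as prescribed by `combinedDimensions`,
/// e.g. {{0, 1}, {2}} merges the first two dimensions of a 3-d scf.parallel
/// into one. A hedged sketch for merging both dimensions of a 2-d loop with
/// normalized iteration ranges %n0 and %n1:
///
/// ```mlir
///   %ub = muli %n0, %n1 : index
///   scf.parallel (%iv) = (%c0) to (%ub) step (%c1) {
///     %i1 = remi_signed %iv, %n1 : index
///     %i0 = divi_signed %iv, %n1 : index
///     ... // original body, with %i0 and %i1 substituted
///   }
/// ```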
2065 void mlir::collapseParallelLoops(
2066 scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
2067 OpBuilder outsideBuilder(loops);
2068 Location loc = loops.getLoc();
2069
2070 // Presort combined dimensions.
2071 auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
2072 for (auto &dims : sortedDimensions)
2073 std::sort(dims.begin(), dims.end());
2074
2075 // Normalize ParallelOp's iteration pattern.
2076 SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
2077 normalizedUpperBounds;
2078 for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
2079 OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
2080 auto resultBounds =
2081 normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
2082 loops.lowerBound()[i], loops.upperBound()[i],
2083 loops.step()[i], loops.getBody()->getArgument(i));
2084
2085 normalizedLowerBounds.push_back(resultBounds.lowerBound);
2086 normalizedUpperBounds.push_back(resultBounds.upperBound);
2087 normalizedSteps.push_back(resultBounds.step);
2088 }
2089
2090 // Combine iteration spaces.
2091 SmallVector<Value, 3> lowerBounds, upperBounds, steps;
2092 auto cst0 = outsideBuilder.create<ConstantIndexOp>(loc, 0);
2093 auto cst1 = outsideBuilder.create<ConstantIndexOp>(loc, 1);
2094 for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
2095 Value newUpperBound = outsideBuilder.create<ConstantIndexOp>(loc, 1);
2096 for (auto idx : sortedDimensions[i]) {
2097 newUpperBound = outsideBuilder.create<MulIOp>(loc, newUpperBound,
2098 normalizedUpperBounds[idx]);
2099 }
2100 lowerBounds.push_back(cst0);
2101 steps.push_back(cst1);
2102 upperBounds.push_back(newUpperBound);
2103 }
2104
2105 // Create the new ParallelOp with conversions to the original induction
2106 // values. The loop below uses divisions to extract the range of values in
2107 // the new induction value that corresponds to each range of the original
2108 // induction value. The remainders then determine, within that range, which
2109 // iteration of the original induction value this represents. This is a
2110 // normalized value that the logic above has already un-normalized.
2111 auto newPloop = outsideBuilder.create<scf::ParallelOp>(
2112 loc, lowerBounds, upperBounds, steps,
2113 [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
2114 for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
2115 Value previous = ploopIVs[i];
2116 unsigned numberCombinedDimensions = combinedDimensions[i].size();
2117 // Iterate over all except the last induction value.
2118 for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
2119 unsigned idx = combinedDimensions[i][j];
2120
2121 // Determine the current induction value's loop iteration.
2122 Value iv = insideBuilder.create<SignedRemIOp>(
2123 loc, previous, normalizedUpperBounds[idx]);
2124 replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
2125 loops.region());
2126
2127 // Remove the effect of the current induction value to prepare for
2128 // the next value.
2129 previous = insideBuilder.create<SignedDivIOp>(
2130 loc, previous, normalizedUpperBounds[idx]);
2131 }
2132
2133 // The final induction value is just the remaining value.
2134 unsigned idx = combinedDimensions[i][0];
2135 replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
2136 previous, loops.region());
2137 }
2138 });
2139
2140 // Replace the old loop with the new loop.
2141 loops.getBody()->back().erase();
2142 newPloop.getBody()->getOperations().splice(
2143 Block::iterator(newPloop.getBody()->back()),
2144 loops.getBody()->getOperations());
2145 loops.erase();
2146 }
2147
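/// Distributes the iterations of `forOp` across the processor grid described
/// by `processorId` and `numProcessors` by offsetting the lower bound and
/// scaling the step; schematically (matching the affine.apply ops below):
///
///   lb' = lb + linearizedProcId * step
///   step' = step * product(numProcessors)
///
/// so that each processor executes an interleaved subset of the iterations.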
2148 void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
2149 ArrayRef<Value> numProcessors) {
2150 assert(processorId.size() == numProcessors.size());
2151 if (processorId.empty())
2152 return;
2153
2154 OpBuilder b(forOp);
2155 Location loc(forOp.getLoc());
2156 AffineExpr lhs, rhs;
2157 bindSymbols(forOp.getContext(), lhs, rhs);
2158 auto mulMap = AffineMap::get(0, 2, lhs * rhs);
2159 auto addMap = AffineMap::get(0, 2, lhs + rhs);
2160
2161 Value linearIndex = processorId.front();
2162 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
2163 auto mulApplyOp = b.create<AffineApplyOp>(
2164 loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
2165 linearIndex = b.create<AffineApplyOp>(
2166 loc, addMap, ValueRange{mulApplyOp, processorId[i]});
2167 }
2168
2169 auto mulApplyOp = b.create<AffineApplyOp>(
2170 loc, mulMap, ValueRange{linearIndex, forOp.step()});
2171 Value lb = b.create<AffineApplyOp>(
2172 loc, addMap, ValueRange{mulApplyOp, forOp.lowerBound()});
2173 forOp.setLowerBound(lb);
2174
2175 Value step = forOp.step();
2176 for (auto numProcs : numProcessors)
2177 step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
2178 forOp.setStep(step);
2179 }
2180
2181 /// Given a memref region, determine the lowest depth at which transfers can be
2182 /// placed for it, and return the corresponding block, start and end positions
2183 /// in the block for placing incoming (read) and outgoing (write) copies
2184 /// respectively. The lowest depth depends on whether the region being accessed
2185 /// is hoistable with respect to one or more immediately surrounding loops.
2186 static void
2187 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
2188 Block::iterator &begin, Block::iterator &end,
2189 Block **copyPlacementBlock,
2190 Block::iterator *copyInPlacementStart,
2191 Block::iterator *copyOutPlacementStart) {
2192 const auto *cst = region.getConstraints();
2193 SmallVector<Value, 4> symbols;
2194 cst->getIdValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols);
2195
2196 SmallVector<AffineForOp, 4> enclosingFors;
2197 getLoopIVs(*block.begin(), &enclosingFors);
2198 // Walk up loop parents till we find an IV on which this region is
2199 // symbolic/variant.
2200 auto it = enclosingFors.rbegin();
2201 for (auto e = enclosingFors.rend(); it != e; ++it) {
2202 // TODO: also need to be checking this for region symbols that
2203 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
2204 if (llvm::is_contained(symbols, it->getInductionVar()))
2205 break;
2206 }
2207
2208 if (it != enclosingFors.rbegin()) {
2209 auto lastInvariantIV = *std::prev(it);
2210 *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
2211 *copyOutPlacementStart = std::next(*copyInPlacementStart);
2212 *copyPlacementBlock = lastInvariantIV->getBlock();
2213 } else {
2214 *copyInPlacementStart = begin;
2215 *copyOutPlacementStart = end;
2216 *copyPlacementBlock = &block;
2217 }
2218 }
2219
2220 // Info comprising stride and number of elements transferred every stride.
2221 struct StrideInfo {
2222 int64_t stride;
2223 int64_t numEltPerStride;
2224 };
2225
2226 /// Returns striding information for a copy/transfer of this region with
2227 /// potentially multiple striding levels from outermost to innermost. For an
2228 /// n-dimensional region, there can be at most n-1 levels of striding
2229 /// successively nested.
2230 // TODO: make this work with non-identity layout maps.
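// For example, for a 64x32 region of a row-major 64x128 memref, the copy
// transfers 32 contiguous elements every 128 elements, i.e., a single
// striding level {stride = 128, numEltPerStride = 32}.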
2231 static void getMultiLevelStrides(const MemRefRegion &region,
2232 ArrayRef<int64_t> bufferShape,
2233 SmallVectorImpl<StrideInfo> *strideInfos) {
2234 if (bufferShape.size() <= 1)
2235 return;
2236
2237 int64_t numEltPerStride = 1;
2238 int64_t stride = 1;
2239 for (int d = bufferShape.size() - 1; d >= 1; d--) {
2240 int64_t dimSize = region.memref.getType().cast<MemRefType>().getDimSize(d);
2241 stride *= dimSize;
2242 numEltPerStride *= bufferShape[d];
2243 // A stride is needed only if the region has a shorter extent than the
2244 // memref along the dimension *and* has an extent greater than one along the
2245 // next major dimension.
2246 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
2247 strideInfos->push_back({stride, numEltPerStride});
2248 }
2249 }
2250 }
2251
2252 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
2253 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
2254 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
2255 /// bound information for the copy loop nest. `fastBufOffsets` contain the
2256 /// expressions to be subtracted out from the respective copy loop iterators in
2257 /// order to index the fast buffer. If `isCopyOut` is true, generates a
2258 /// copy-out; otherwise a copy-in. Builder `b` should be set to the point at
2259 /// which the copy nest is to be inserted.
2260 ///
2261 /// The copy-in nest is generated as follows, as an example for a 2-d region:
2262 /// for x = ...
2263 /// for y = ...
2264 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
2265 ///
2266 static AffineForOp
2267 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
2268 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
2269 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
2270 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
2271 OpBuilder b) {
2272 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
2273 return lbMap.getNumInputs() == lbOperands.size();
2274 }));
2275 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
2276 return ubMap.getNumInputs() == ubOperands.size();
2277 }));
2278
2279 unsigned rank = memref.getType().cast<MemRefType>().getRank();
2280 assert(lbMaps.size() == rank && "wrong number of lb maps");
2281 assert(ubMaps.size() == rank && "wrong number of ub maps");
2282
2283 SmallVector<Value, 4> memIndices;
2284 SmallVector<AffineExpr, 4> fastBufExprs;
2285 SmallVector<Value, 4> fastBufMapOperands;
2286 AffineForOp copyNestRoot;
2287 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
2288 for (unsigned d = 0; d < rank; ++d) {
2289 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
2290 ubOperands, ubMaps[d]);
2291 if (d == 0)
2292 copyNestRoot = forOp;
2293
2294 b = OpBuilder::atBlockTerminator(forOp.getBody());
2295
2296 auto fastBufOffsetMap =
2297 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
2298 auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
2299
2300 // Construct the subscript for the fast memref being copied into/from:
2301 // x - offset_x.
2302 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
2303 b.getAffineDimExpr(2 * d));
2304 fastBufMapOperands.push_back(offset);
2305 fastBufMapOperands.push_back(forOp.getInductionVar());
2306 mayBeDeadApplys.push_back(offset);
2307
2308 // Subscript for the slow memref being copied.
2309 memIndices.push_back(forOp.getInductionVar());
2310 }
2311
2312 auto fastBufMap =
2313 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
2314 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
2315 fastBufMap = simplifyAffineMap(fastBufMap);
2316 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
2317
2318 // Drop any dead affine.applys.
2319 for (auto applyOp : mayBeDeadApplys)
2320 if (applyOp.use_empty())
2321 applyOp.erase();
2322
2323 if (!isCopyOut) {
2324 // Copy in.
2325 auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
2326 b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
2327 fastBufMapOperands);
2328 return copyNestRoot;
2329 }
2330
2331 // Copy out.
2332 auto load =
2333 b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
2334 b.create<AffineStoreOp>(loc, load, memref, memIndices);
2335 return copyNestRoot;
2336 }
2337
2338 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
2339 emitRemarkForBlock(Block &block) {
2340 return block.getParentOp()->emitRemark();
2341 }
2342
2343 /// Creates a buffer in the faster memory space for the specified memref region;
2344 /// generates a copy from the lower memory space to this one, and replaces all
2345 /// loads/stores in the block range [`begin', `end') of `block' to load/store
2346 /// from that buffer. Returns failure if copies could not be generated due to
2347 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
2348 /// in copyPlacementBlock specify the insertion points where the incoming copies
2349 /// and outgoing copies, respectively, should be inserted (the insertion happens
2350 /// right before the insertion point). Since `begin` can itself be invalidated
2351 /// due to the memref rewriting done from this method, the output argument
2352 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
2353 /// happens). Since outgoing copies could have been inserted at `end`, the
2354 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
2355 /// size of the fast buffer allocated.
2356 static LogicalResult generateCopy(
2357 const MemRefRegion ®ion, Block *block, Block::iterator begin,
2358 Block::iterator end, Block *copyPlacementBlock,
2359 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
2360 AffineCopyOptions copyOptions, DenseMap<Value, Value> &fastBufferMap,
2361 DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
2362 Block::iterator *nBegin, Block::iterator *nEnd) {
2363 *nBegin = begin;
2364 *nEnd = end;
2365
2366 FuncOp f = begin->getParentOfType<FuncOp>();
2367 OpBuilder topBuilder(f.getBody());
2368 Value zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
2369
2370 if (begin == end)
2371 return success();
2372
2373 // Check whether the copy-out point is at the end of the block where we are
2374 // doing explicit copying.
2375 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
2376
2377 // Copies for read regions are going to be inserted at 'begin'.
2378 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
2379 // Copies for write regions are going to be inserted at 'end'.
2380 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
2381 OpBuilder &b = region.isWrite() ? epilogue : prologue;
2382
2383 // Builder to create constants at the top level.
2384 auto func = copyPlacementBlock->getParent()->getParentOfType<FuncOp>();
2385 OpBuilder top(func.getBody());
2386
2387 auto loc = region.loc;
2388 auto memref = region.memref;
2389 auto memRefType = memref.getType().cast<MemRefType>();
2390
2391 auto layoutMaps = memRefType.getAffineMaps();
2392 if (layoutMaps.size() > 1 ||
2393 (layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
2394 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2395 return failure();
2396 }
2397
2398 // Indices to use for the copying.
2399 // Indices for the original memref being copied from/to.
2400 SmallVector<Value, 4> memIndices;
2401 // Indices for the faster buffer being copied into/from.
2402 SmallVector<Value, 4> bufIndices;
2403
2404 unsigned rank = memRefType.getRank();
2405 SmallVector<int64_t, 4> fastBufferShape;
2406
2407 // Compute the extents of the buffer.
2408 std::vector<SmallVector<int64_t, 4>> lbs;
2409 SmallVector<int64_t, 8> lbDivisors;
2410 lbs.reserve(rank);
2411 Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
2412 &fastBufferShape, &lbs, &lbDivisors);
2413 if (!numElements.hasValue()) {
2414 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2415 return failure();
2416 }
2417
2418 if (numElements.getValue() == 0) {
2419 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2420 *sizeInBytes = 0;
2421 return success();
2422 }
2423
2424 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2425 for (unsigned i = 0; i < rank; ++i)
2426 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2427
2428 const FlatAffineConstraints *cst = region.getConstraints();
2429 // 'regionSymbols' holds values that this memory region is symbolic/parametric
2430 // on; these typically include loop IVs surrounding the level at which the
2431 // copy generation is being done or other valid symbols in MLIR.
2432 SmallVector<Value, 8> regionSymbols;
2433 cst->getIdValues(rank, cst->getNumIds(), ®ionSymbols);
2434
2435 // Construct the index expressions for the fast memory buffer. The index
2436 // expression for a particular dimension of the fast buffer is obtained by
2437 // subtracting out the lower bound on the original memref's data region
2438 // along the corresponding dimension.
2439
2440 // Index start offsets for faster memory buffer relative to the original.
2441 SmallVector<AffineExpr, 4> fastBufOffsets;
2442 fastBufOffsets.reserve(rank);
2443 for (unsigned d = 0; d < rank; d++) {
2444 assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2445
2446 AffineExpr offset = top.getAffineConstantExpr(0);
2447 for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2448 offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2449 assert(lbDivisors[d] > 0);
2450 offset =
2451 (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2452
2453 // Set copy start location for this dimension in the lower memory space
2454 // memref.
2455 if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
2456 auto indexVal = caf.getValue();
2457 if (indexVal == 0) {
2458 memIndices.push_back(zeroIndex);
2459 } else {
2460 memIndices.push_back(
2461 top.create<ConstantIndexOp>(loc, indexVal).getResult());
2462 }
2463 } else {
2464 // The coordinate for the start location is just the lower bound along the
2465 // corresponding dimension on the memory region (stored in 'offset').
2466 auto map = AffineMap::get(
2467 cst->getNumDimIds() + cst->getNumSymbolIds() - rank, 0, offset);
2468 memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2469 }
2470 // The fast buffer is copied into at location zero; addressing is relative.
2471 bufIndices.push_back(zeroIndex);
2472
2473 // Record the offsets since they are needed to remap the memory accesses of
2474 // the original memref further below.
2475 fastBufOffsets.push_back(offset);
2476 }
2477
2478 // The faster memory space buffer.
2479 Value fastMemRef;
2480
2481 // Check if a buffer was already created.
2482 bool existingBuf = fastBufferMap.count(memref) > 0;
2483 if (!existingBuf) {
2484 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2485 auto fastMemRefType =
2486 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2487 fastBufferLayout, copyOptions.fastMemorySpace);
2488
2489 // Create the fast memory space buffer just before the 'affine.for'
2490 // operation.
2491 fastMemRef =
2492 prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2493 // Record it.
2494 fastBufferMap[memref] = fastMemRef;
2495 // fastMemRefType is a constant shaped memref.
2496 *sizeInBytes = getMemRefSizeInBytes(fastMemRefType).getValue();
2497 LLVM_DEBUG(emitRemarkForBlock(*block)
2498 << "Creating fast buffer of type " << fastMemRefType
2499 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2500 << " KiB\n");
2501 } else {
2502 // Reuse the one already created.
2503 fastMemRef = fastBufferMap[memref];
2504 *sizeInBytes = 0;
2505 }
2506
2507 auto numElementsSSA =
2508 top.create<ConstantIndexOp>(loc, numElements.getValue());
2509
2510 Value dmaStride = nullptr;
2511 Value numEltPerDmaStride = nullptr;
2512 if (copyOptions.generateDma) {
2513 SmallVector<StrideInfo, 4> dmaStrideInfos;
2514 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2515
2516 // TODO: use all stride levels once DmaStartOp is extended for
2517 // multi-level strides.
2518 if (dmaStrideInfos.size() > 1) {
2519 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2520 return failure();
2521 }
2522
2523 if (!dmaStrideInfos.empty()) {
2524 dmaStride = top.create<ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2525 numEltPerDmaStride =
2526 top.create<ConstantIndexOp>(loc, dmaStrideInfos[0].numEltPerStride);
2527 }
2528 }

  // Record the last operation where we want the memref replacement to end.
  // We later do the memref replacement only in [begin, postDomFilter] so
  // that the uses of the original memref in the data movement code itself
  // don't get replaced.
  auto postDomFilter = std::prev(end);

  // Create fully composed affine maps for each memref.
  auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
  fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
  auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
  fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);

  if (!copyOptions.generateDma) {
    // Point-wise copy generation.
    auto copyNest =
        generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
                              /*lbOperands=*/regionSymbols, ubMaps,
                              /*ubOperands=*/regionSymbols, fastBufOffsets,
                              /*isCopyOut=*/region.isWrite(), b);
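    // A sketch of a generated copy-in nest (shapes and values hypothetical),
    // for a 64x64 region of %A at offsets (%iT, %jT):
    //   affine.for %i = 0 to 64 {
    //     affine.for %j = 0 to 64 {
    //       %v = affine.load %A[%iT + %i, %jT + %j] : memref<1024x1024xf32>
    //       affine.store %v, %fastbuf[%i, %j] : memref<64x64xf32, 2>
    //     }
    //   }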

    // Record this so that we can skip it from yet another copy.
    copyNests.insert(copyNest);

    // Since new ops are being appended (for copy out's), adjust the end to
    // mark end of block range being processed if necessary.
    if (region.isWrite() && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(copyNest.getOperation());
  } else {
    // DMA generation.
    // Create a tag (single element 1-d memref) for the DMA.
    auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
                                         copyOptions.tagMemorySpace);
    auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);

    SmallVector<Value, 4> tagIndices({zeroIndex});
    auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
    fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
    if (!region.isWrite()) {
      // DMA non-blocking read from original buffer to fast buffer.
      b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
                                 fastMemRef, bufAffineMap, bufIndices,
                                 tagMemRef, tagAffineMap, tagIndices,
                                 numElementsSSA, dmaStride, numEltPerDmaStride);
    } else {
      // DMA non-blocking write from fast buffer to the original memref.
      auto op = b.create<AffineDmaStartOp>(
          loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
          memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
          dmaStride, numEltPerDmaStride);
      // Since new ops may be appended at 'end' (for outgoing DMAs), adjust
      // the end to mark end of block range being processed.
      if (isCopyOutAtEndOfBlock)
        *nEnd = Block::iterator(op.getOperation());
    }

    // Matching DMA wait to block on completion; tag always has a 0 index.
    b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
                              numElementsSSA);
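    // Sketch of the resulting IR for a copy-in (types and values
    // hypothetical):
    //   affine.dma_start %A[%m0, %m1], %fastbuf[%c0, %c0], %tag[%c0],
    //       %numElts : memref<1024x1024xf32>, memref<64x64xf32, 2>,
    //       memref<1xi32, 4>
    //   affine.dma_wait %tag[%c0], %numElts : memref<1xi32, 4>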

    // Generate dealloc for the tag.
    auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
    if (*nEnd == end && isCopyOutAtEndOfBlock)
      // Since new ops are being appended (for outgoing DMAs), adjust the end
      // to mark end of range of the original.
      *nEnd = Block::iterator(tagDeallocOp.getOperation());
  }

  // Generate dealloc for the buffer.
  if (!existingBuf) {
    auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
    // When generating pointwise copies, `nEnd' has to be set to deallocOp on
    // the fast buffer (since it marks the new end insertion point).
    if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
      *nEnd = Block::iterator(bufDeallocOp.getOperation());
  }

  // Replace all uses of the old memref with the faster one while remapping
  // access indices (subtracting out lower bound offsets for each dimension).
  // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
  // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
  // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
  // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
  // d2, d3 correspond to the original indices (%i, %j).
  SmallVector<AffineExpr, 4> remapExprs;
  remapExprs.reserve(rank);
  for (unsigned i = 0; i < rank; i++) {
    // The starting operands of indexRemap will be regionSymbols (the symbols
    // on which the memref region is parametric); then those corresponding to
    // the memref's original indices follow.
    auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
    remapExprs.push_back(dimExpr - fastBufOffsets[i]);
  }
  auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
                                   b.getContext());

  // Record the begin since it may be invalidated by memref replacement.
  Block::iterator prevOfBegin;
  bool isBeginAtStartOfBlock = (begin == block->begin());
  if (!isBeginAtStartOfBlock)
    prevOfBegin = std::prev(begin);

  // *Only* those uses within the range [begin, end) of 'block' are replaced.
  (void)replaceAllMemRefUsesWith(memref, fastMemRef,
                                 /*extraIndices=*/{}, indexRemap,
                                 /*extraOperands=*/regionSymbols,
                                 /*symbolOperands=*/{},
                                 /*domInstFilter=*/&*begin,
                                 /*postDomInstFilter=*/&*postDomFilter);

  *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);

  return success();
}

/// Construct the memref region to just include the entire memref. Returns
/// false for dynamic-shaped memrefs for now. `numParamLoopIVs` is the number
/// of enclosing loop IVs of `op` (starting from the outermost) that the
/// region is parametric on.
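/// For example (a hypothetical case), for a store to a static
/// memref<1024x1024xf32> with `numParamLoopIVs` = 0, the region's constraint
/// system becomes 0 <= d0 <= 1023, 0 <= d1 <= 1023.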
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
                                  MemRefRegion *region) {
  unsigned rank;
  if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
    rank = loadOp.getMemRefType().getRank();
    region->memref = loadOp.getMemRef();
    region->setWrite(false);
  } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
    rank = storeOp.getMemRefType().getRank();
    region->memref = storeOp.getMemRef();
    region->setWrite(true);
  } else {
    assert(false && "expected load or store op");
    return false;
  }
  auto memRefType = region->memref.getType().cast<MemRefType>();
  if (!memRefType.hasStaticShape())
    return false;

  auto *regionCst = region->getConstraints();

  // Just get the first numSymbols IVs, which the memref region is parametric
  // on.
  SmallVector<AffineForOp, 4> ivs;
  getLoopIVs(*op, &ivs);
  ivs.resize(numParamLoopIVs);
  SmallVector<Value, 4> symbols;
  extractForInductionVars(ivs, &symbols);
  regionCst->reset(rank, numParamLoopIVs, 0);
  regionCst->setIdValues(rank, rank + numParamLoopIVs, symbols);

  // Memref dim sizes provide the bounds.
  for (unsigned d = 0; d < rank; d++) {
    auto dimSize = memRefType.getDimSize(d);
    assert(dimSize > 0 && "filtered dynamic shapes above");
    regionCst->addConstantLowerBound(d, 0);
    regionCst->addConstantUpperBound(d, dimSize - 1);
  }
  return true;
}

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin', `end'), where `end' can't be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). Returns the total size of fast memory space
/// buffers used. `copyOptions` provides various parameters, and the output
/// argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since we generate alloc's and
/// dealloc's for all fast buffers (before and after the range of operations
/// resp. or at a hoisted position), all of the fast memory capacity is
/// assumed to be available for processing this block range. When
/// 'filterMemRef' is specified, copies are only generated for the provided
/// MemRef.
uint64_t mlir::affineDataCopyGenerate(Block::iterator begin,
                                      Block::iterator end,
                                      const AffineCopyOptions &copyOptions,
                                      Optional<Value> filterMemRef,
                                      DenseSet<Operation *> &copyNests) {
  if (begin == end)
    return 0;

  assert(begin->getBlock() == std::prev(end)->getBlock() &&
         "Inconsistent block begin/end args");
  assert(end != end->getBlock()->end() && "end can't be the block terminator");

  Block *block = begin->getBlock();

  // Copies will be generated for this depth, i.e., symbolic in all loops
  // surrounding this block range.
  unsigned copyDepth = getNestingDepth(&*begin);

  LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
                          << "\n");
  LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
  LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");

  // List of memory regions to copy for. We need a map vector to have a
  // guaranteed iteration order to write test cases. CHECK-DAG doesn't help
  // here since the alloc's for example are identical except for the SSA id.
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
  SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;

  // Map from original memref's to the fast buffers that their accesses are
  // replaced with.
  DenseMap<Value, Value> fastBufferMap;

  // To check for errors when walking the block.
  bool error = false;

  // Walk this range of operations to gather all memory regions.
  block->walk(begin, end, [&](Operation *opInst) {
    // Gather regions to allocate to buffers in faster memory space.
    if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
      if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) ||
          (loadOp.getMemRefType().getMemorySpaceAsInt() !=
           copyOptions.slowMemorySpace))
        return;
    } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
      if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) ||
          storeOp.getMemRefType().getMemorySpaceAsInt() !=
              copyOptions.slowMemorySpace)
        return;
    } else {
      // Neither load nor a store op.
      return;
    }

    // Compute the MemRefRegion accessed.
    auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
    if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
                               /*addMemRefDimBounds=*/false))) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Error obtaining memory region: semi-affine maps?\n");
      LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
      if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
        LLVM_DEBUG(
            opInst->emitError("non-constant memref sizes not yet supported"));
        error = true;
        return;
      }
    }

    // Each memref has a single buffer associated with it irrespective of how
    // many load's and store's happen on it.
    // TODO: in the future, when regions don't intersect and satisfy
    // other properties (based on load/store regions), we could consider
    // multiple buffers per memref.

    // Add to the appropriate region if it's not already in it, or take a
    // bounding box union with the existing one if it's already in there.
    // Note that a memref may have both read and write regions - so update
    // the region in the other list if one exists (write in case of read and
    // vice versa) since there is a single bounding box for a memref across
    // all reads and writes that happen on it.

    // Attempts to update; returns true if 'region' exists in targetRegions.
    auto updateRegion =
        [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
                &targetRegions) {
          const auto it = targetRegions.find(region->memref);
          if (it == targetRegions.end())
            return false;

          // Perform a union with the existing region.
          if (failed(it->second->unionBoundingBox(*region))) {
            LLVM_DEBUG(llvm::dbgs()
                       << "Memory region bounding box failed; "
                          "over-approximating to the entire memref\n");
            // If the union fails, we will overapproximate.
            if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
              LLVM_DEBUG(opInst->emitError(
                  "non-constant memref sizes not yet supported"));
              error = true;
              return true;
            }
            it->second->getConstraints()->clearAndCopyFrom(
                *region->getConstraints());
          } else {
            // Union was computed and stored in 'it->second': copy to
            // 'region'.
            region->getConstraints()->clearAndCopyFrom(
                *it->second->getConstraints());
          }
          return true;
        };

    bool existsInRead = updateRegion(readRegions);
    if (error)
      return;
    bool existsInWrite = updateRegion(writeRegions);
    if (error)
      return;

    // Finally add it to the region list.
    if (region->isWrite() && !existsInWrite) {
      writeRegions[region->memref] = std::move(region);
    } else if (!region->isWrite() && !existsInRead) {
      readRegions[region->memref] = std::move(region);
    }
  });

  if (error) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return 0;
  }

  uint64_t totalCopyBuffersSizeInBytes = 0;
  bool ret = true;
  auto processRegions =
      [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
              &regions) {
        for (const auto &regionEntry : regions) {
          // For each region, hoist copy in/out past all hoistable
          // 'affine.for's.
          Block::iterator copyInPlacementStart, copyOutPlacementStart;
          Block *copyPlacementBlock;
          findHighestBlockForPlacement(
              *regionEntry.second, *block, begin, end, &copyPlacementBlock,
              &copyInPlacementStart, &copyOutPlacementStart);

          uint64_t sizeInBytes;
          Block::iterator nBegin, nEnd;
          LogicalResult iRet = generateCopy(
              *regionEntry.second, block, begin, end, copyPlacementBlock,
              copyInPlacementStart, copyOutPlacementStart, copyOptions,
              fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
          if (succeeded(iRet)) {
            // begin/end could have been invalidated, and need update.
            begin = nBegin;
            end = nEnd;
            totalCopyBuffersSizeInBytes += sizeInBytes;
          }
          ret = ret & succeeded(iRet);
        }
      };
  processRegions(readRegions);
  processRegions(writeRegions);

  if (!ret) {
    begin->emitError(
        "copy generation failed for one or more memref's in this block\n");
    return totalCopyBuffersSizeInBytes;
  }

  // For a range of operations, a note will be emitted at the caller.
  AffineForOp forOp;
  uint64_t sizeInKib = llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024);
  if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
    forOp.emitRemark()
        << sizeInKib
        << " KiB of copy buffers in fast memory space for this block\n";
  }

  if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
    StringRef str = "Total size of all copy buffers for this block "
                    "exceeds fast memory capacity\n";
    block->getParentOp()->emitWarning(str);
  }

  return totalCopyBuffersSizeInBytes;
}

// A convenience version of affineDataCopyGenerate for all ops in the body of
// an AffineForOp.
uint64_t mlir::affineDataCopyGenerate(AffineForOp forOp,
                                      const AffineCopyOptions &copyOptions,
                                      Optional<Value> filterMemRef,
                                      DenseSet<Operation *> &copyNests) {
  return affineDataCopyGenerate(forOp.getBody()->begin(),
                                std::prev(forOp.getBody()->end()), copyOptions,
                                filterMemRef, copyNests);
}
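// Example usage of the convenience overload above (a sketch; the option
// values are hypothetical, and the field order is assumed to follow the
// AffineCopyOptions declaration in LoopUtils.h):
//   AffineCopyOptions options = {/*generateDma=*/false,
//                                /*slowMemorySpace=*/0,
//                                /*fastMemorySpace=*/2,
//                                /*tagMemorySpace=*/4,
//                                /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> nests;
//   uint64_t bytes = affineDataCopyGenerate(
//       forOp, options, /*filterMemRef=*/llvm::None, nests);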

LogicalResult mlir::generateCopyForMemRegion(
    const MemRefRegion &memrefRegion, Operation *analyzedOp,
    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
  Block *block = analyzedOp->getBlock();
  auto begin = analyzedOp->getIterator();
  auto end = std::next(begin);
  DenseMap<Value, Value> fastBufferMap;
  DenseSet<Operation *> copyNests;

  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
                          copyOptions, fastBufferMap, copyNests,
                          &result.sizeInBytes, &begin, &end);
  if (failed(err))
    return err;

  const auto &en = fastBufferMap.find(memrefRegion.memref);
  // In some cases (empty loops), no copy generation would have happened.
  if (en == fastBufferMap.end())
    return failure();
  result.alloc = en->second.getDefiningOp();
  assert(result.alloc && "fast buffer expected to be locally allocated");
  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
  return success();
}

/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void
gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
                   std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  // Add a new empty level to the output if it doesn't already exist.
  assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
  if (currLoopDepth == depthToLoops.size())
    depthToLoops.push_back(SmallVector<AffineForOp, 2>());

  for (auto &op : *block) {
    if (auto forOp = dyn_cast<AffineForOp>(op)) {
      depthToLoops[currLoopDepth].push_back(forOp);
      gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
    }
  }
}

/// Gathers all AffineForOps in 'func' grouped by loop depth.
void mlir::gatherLoops(FuncOp func,
                       std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
  for (auto &block : func)
    gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);

  // Remove last loop level from output since it's empty.
  if (!depthToLoops.empty()) {
    assert(depthToLoops.back().empty() && "Last loop level is not empty?");
    depthToLoops.pop_back();
  }
}
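// For illustration (a hypothetical function body): given
//   affine.for %i ... { affine.for %j ... { ... } }
//   affine.for %k ... { ... }
// depthToLoops[0] holds the %i and %k loops, and depthToLoops[1] holds the
// %j loop.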

// TODO: if necessary, this can be extended to also compose in any
// affine.applys, fold to constant if all result dimensions of the map are
// constant (canonicalizeMapAndOperands below already does this for single
// result bound maps), and use simplifyMap to perform algebraic
// simplification.
AffineForOp mlir::createCanonicalizedAffineForOp(
    OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
    ValueRange ubOperands, AffineMap ubMap, int64_t step) {
  SmallVector<Value, 4> lowerOperands(lbOperands);
  SmallVector<Value, 4> upperOperands(ubOperands);

  fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
  canonicalizeMapAndOperands(&lbMap, &lowerOperands);
  lbMap = removeDuplicateExprs(lbMap);
  fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
  canonicalizeMapAndOperands(&ubMap, &upperOperands);
  ubMap = removeDuplicateExprs(ubMap);

  return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
                               step);
}
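// E.g. (a hypothetical input): after composition and canonicalization, an
// upper bound map such as (d0) -> (d0 + 128, d0 + 128) has its repeated
// result expression dropped by removeDuplicateExprs, yielding
// (d0) -> (d0 + 128).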

/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles if `loops`
/// has the intra-tile loops. The affine.if op is inserted at the builder
/// insertion point of `b`.
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) {
  if (loops.empty())
    return nullptr;

  auto *context = loops[0].getContext();

  FlatAffineConstraints cst;
  SmallVector<Operation *, 8> ops;
  ops.reserve(loops.size());
  for (AffineForOp forOp : loops)
    ops.push_back(forOp);
  (void)getIndexSet(ops, &cst);

  // Remove constraints that are independent of these loop IVs.
  cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());

  // Construct the constraint set representing the guard for full tiles. The
  // lower bound (and upper bound) corresponding to the full tile should be
  // larger (and resp. smaller) than any other lower (or upper bound).
  SmallVector<int64_t, 8> fullTileLb, fullTileUb;
  for (auto loop : loops) {
    (void)loop;
    // TODO: Non-unit stride is not an issue to generalize to.
    assert(loop.getStep() == 1 && "point loop step expected to be one");
    // Mark everything symbols for the purpose of finding a constant diff
    // pair.
    cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolIds() -
                               1);
    unsigned fullTileLbPos, fullTileUbPos;
    if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
                                       /*lbFloorDivisor=*/nullptr,
                                       /*ub=*/nullptr, &fullTileLbPos,
                                       &fullTileUbPos)) {
      LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
      return nullptr;
    }

    SmallVector<unsigned, 4> lbIndices, ubIndices;
    cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);

    auto fLb = cst.getInequality(fullTileLbPos);
    auto fUb = cst.getInequality(fullTileUbPos);
    fullTileLb.assign(fLb.begin(), fLb.end());
    fullTileUb.assign(fUb.begin(), fUb.end());

    // Full tile lower bound should be >= any other lower bound.
    for (auto lbIndex : lbIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);

    // Full tile upper bound should be <= any other upper bound.
    for (auto ubIndex : ubIndices)
      for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
        cst.atIneq(ubIndex, i) -= fullTileUb[i];

    cst.removeId(0);
  }

  // The previous step leads to all zeros for the full tile lb and ub position
  // itself; remove those and any other duplicates / trivial redundancies.
  cst.removeTrivialRedundancy();

  // Turn everything into dims conservatively since we earlier turned all
  // trailing ids past point loop IV into symbols. Some of these could be
  // outer loop IVs; we'll canonicalize anyway.
  cst.setDimSymbolSeparation(0);

  IntegerSet ifCondSet = cst.getAsIntegerSet(context);
  // ifCondSet can be null if cst was empty -- this can happen if all loops
  // in the nest have constant trip counts.
  if (!ifCondSet)
    return nullptr;

  SmallVector<Value, 4> setOperands;
  cst.getIdValues(0, cst.getNumDimAndSymbolIds(), &setOperands);
  canonicalizeSetAndOperands(&ifCondSet, &setOperands);
  return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
                              /*withElseRegion=*/true);
}
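// As a sketch (tile size and bounds hypothetical): for a point loop
//   affine.for %i = %iT * 32 to min(%iT * 32 + 32, %N)
// the guard generated here would be of the form
//   affine.if affine_set<(d0)[s0] : (d0 * -32 + s0 - 32 >= 0)>(%iT)[%N]
// i.e., the tile starting at %iT * 32 lies entirely below %N.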

/// Create the full tile loop nest (along with its body).
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
  fullTileLoops.reserve(inputNest.size());

  // For each loop in the original nest identify a lower/upper bound pair such
  // that their difference is a constant.
  FlatAffineConstraints cst;
  for (auto loop : inputNest) {
    // TODO: straightforward to generalize to a non-unit stride.
    if (loop.getStep() != 1) {
      LLVM_DEBUG(llvm::dbgs()
                 << "[tile separation] non-unit stride not implemented\n");
      return failure();
    }
    SmallVector<Operation *, 1> loopOp{loop.getOperation()};
    (void)getIndexSet(loopOp, &cst);
    // We will mark everything other than this loop IV as symbol for getting
    // a pair of <lb, ub> with a constant difference.
    cst.setDimSymbolSeparation(cst.getNumDimAndSymbolIds() - 1);
    unsigned lbPos, ubPos;
    if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
                                       /*lbDivisor=*/nullptr, /*ub=*/nullptr,
                                       &lbPos, &ubPos) ||
        lbPos == ubPos) {
      LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
                                 "equalities not yet handled\n");
      return failure();
    }

    // Set all identifiers as dimensions uniformly since some of those marked
    // as symbols above could be outer loop IVs (corresponding tile space
    // IVs).
    cst.setDimSymbolSeparation(/*newSymbolCount=*/0);

    AffineValueMap lbVmap, ubVmap;
    cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
    cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
    AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
        b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
        ubVmap.getOperands(), ubVmap.getAffineMap());
    b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
    fullTileLoops.push_back(fullTileLoop);
  }

  // Add the body for the full tile loop nest.
  BlockAndValueMapping operandMap;
  for (auto loopEn : llvm::enumerate(inputNest))
    operandMap.map(loopEn.value().getInductionVar(),
                   fullTileLoops[loopEn.index()].getInductionVar());
  b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
  for (auto &op : inputNest.back().getBody()->without_terminator())
    b.clone(op, operandMap);
  return success();
}

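// Sketch of the overall effect of separateFullTiles (tile size and bounds
// hypothetical): a nest
//   affine.for %i = %iT * 32 to min(%iT * 32 + 32, %N) { ... }
// is rewritten to
//   affine.if #full_tile(%iT)[%N] {
//     affine.for %i = %iT * 32 to %iT * 32 + 32 { ... }  // full tile
//   } else {
//     affine.for %i = %iT * 32 to min(%iT * 32 + 32, %N) { ... }  // partial
//   }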
LogicalResult
mlir::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                        SmallVectorImpl<AffineForOp> *fullTileNest) {
  if (inputNest.empty())
    return success();

  auto firstLoop = inputNest[0];

  // Each successive for op has to be nested in the other.
  auto prevLoop = firstLoop;
  for (auto loop : inputNest.drop_front(1)) {
    assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
    prevLoop = loop;
  }

  // Create the full tile loop nest.
  SmallVector<AffineForOp, 4> fullTileLoops;
  OpBuilder b(firstLoop);
  if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
    if (!fullTileLoops.empty())
      fullTileLoops.front().erase();
    return failure();
  }

  // Create and insert the version select right before the root of the nest.
  b = OpBuilder(firstLoop);
  AffineIfOp ifOp = createSeparationCondition(inputNest, b);
  if (!ifOp) {
    fullTileLoops.front().erase();
    LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
                               "separation condition\n");
    return failure();
  }

  // Move the full tile into the then block.
  Block *thenBlock = ifOp.getThenBlock();
  AffineForOp outermostFullTileLoop = fullTileLoops[0];
  thenBlock->getOperations().splice(
      std::prev(thenBlock->end()),
      outermostFullTileLoop->getBlock()->getOperations(),
      Block::iterator(outermostFullTileLoop));

  // Move the partial tile into the else block. The partial tile is the same
  // as the original loop nest.
  Block *elseBlock = ifOp.getElseBlock();
  elseBlock->getOperations().splice(std::prev(elseBlock->end()),
                                    firstLoop->getBlock()->getOperations(),
                                    Block::iterator(firstLoop));

  if (fullTileNest)
    *fullTileNest = std::move(fullTileLoops);

  return success();
}
