//===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes for various loop transformation utility
// methods: these are not passes by themselves but are used either by passes,
// optimization sequences, or in turn by other transformation utilities.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_TRANSFORMS_LOOP_UTILS_H
#define MLIR_TRANSFORMS_LOOP_UTILS_H

#include "mlir/IR/Block.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"

namespace mlir {
class AffineForOp;
class AffineMap;
class FuncOp;
class OpBuilder;
class Value;

namespace loop {
class ForOp;
} // end namespace loop

/// Unrolls this for operation completely if the trip count is known to be
/// constant. Returns failure otherwise.
LogicalResult loopUnrollFull(AffineForOp forOp);

/// Unrolls this for operation by the specified unroll factor. Returns failure
/// if the loop cannot be unrolled either due to restrictions or due to invalid
/// unroll factors.
LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor);

/// Unrolls this loop by the specified unroll factor or its trip count,
/// whichever is lower.
LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor);
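
// A minimal usage sketch for the unroll entry points above, assuming `func`
// is a FuncOp containing affine loops; the walk and the remark below are
// illustrative, not part of this header:
//
// ```
//   func.walk([](AffineForOp forOp) {
//     if (failed(loopUnrollByFactor(forOp, /*unrollFactor=*/4)))
//       forOp.emitRemark("loop not unrolled");
//   });
// ```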

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest into `nestedLoops`. A loop is perfectly nested iff the first op
/// in its body is another AffineForOp (resp. loop::ForOp) and the second op
/// is the terminator.
void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                             AffineForOp root);
void getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops,
                             loop::ForOp root);

/// Unrolls and jams this loop by the specified factor. Returns success if the
/// loop is successfully unroll-jammed.
LogicalResult loopUnrollJamByFactor(AffineForOp forOp,
                                    uint64_t unrollJamFactor);

/// Unrolls and jams this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor);

/// Promotes the loop body of an AffineForOp to its containing block if the
/// AffineForOp is known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);

/// Promotes all single-iteration AffineForOps in the function, i.e., moves
/// their bodies into the containing Block.
void promoteSingleIterationLoops(FuncOp f);
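
// For illustration, a single-iteration loop such as:
//
// ```
//    affine.for %i = 0 to 1 {
//      "foo"(%i) : (index) -> ()
//    }
// ```
//
// is promoted by moving its body into the enclosing block, with uses of %i
// replaced by the (constant) lower bound.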

/// Computes the cleanup loop lower bound for a loop being unrolled with the
/// specified unroll factor; this bound is also the upper bound of the main
/// part of the unrolled loop. The bound is returned as an AffineMap together
/// with its operands, or as a null map when the trip count can't be expressed
/// as an affine expression.
void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                              AffineMap *map, SmallVectorImpl<Value> *operands,
                              OpBuilder &builder);

/// Skews the operations in the body of an 'affine.for' operation with the
/// specified operation-wise shifts. The shifts are with respect to the
/// original execution order, and are multiplied by the loop 'step' before
/// being applied.
LLVM_NODISCARD
LogicalResult instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
                           bool unrollPrologueEpilogue = false);

/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops. A band is a contiguous set of loops.
LLVM_NODISCARD
LogicalResult tileCodeGen(MutableArrayRef<AffineForOp> band,
                          ArrayRef<unsigned> tileSizes);
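
// A sketch of tiling a band, assuming `rootForOp` is the outermost loop of a
// three-deep perfect nest (identifiers here are illustrative):
//
// ```
//   SmallVector<AffineForOp, 6> band;
//   getPerfectlyNestedLoops(band, rootForOp);
//   if (failed(tileCodeGen(band, /*tileSizes=*/{32, 32, 32})))
//     rootForOp.emitRemark("tiling failed");
// ```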

/// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA'
/// and 'forOpB' are part of a perfectly nested sequence of loops.
void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB);

/// Checks if the loop interchange permutation 'loopPermMap', applied to the
/// perfectly nested sequence of loops in 'loops', would violate dependences
/// (loop 'i' in 'loops' is mapped to location 'loopPermMap[i]' in the
/// interchange).
bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
                                       ArrayRef<unsigned> loopPermMap);

/// Performs a sequence of loop interchanges on the perfectly nested 'loops',
/// as specified by the permutation 'loopPermMap' (loop 'i' in 'loops' is
/// mapped to location 'loopPermMap[i]' after the interchanges).
unsigned interchangeLoops(ArrayRef<AffineForOp> loops,
                          ArrayRef<unsigned> loopPermMap);
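
// A sketch of validating and applying a permutation on a three-deep perfect
// nest rooted at `rootForOp` (identifiers here are illustrative):
//
// ```
//   SmallVector<AffineForOp, 6> loops;
//   getPerfectlyNestedLoops(loops, rootForOp);
//   SmallVector<unsigned, 6> perm = {2, 0, 1};
//   if (isValidLoopInterchangePermutation(loops, perm))
//     interchangeLoops(loops, perm);
// ```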

/// Sinks all sequential loops to the innermost levels (while preserving
/// relative order among them) and moves all parallel loops to the outermost
/// levels (again, while preserving relative order among them). Returns the
/// AffineForOp at the root of the new loop nest after the loop interchanges.
AffineForOp sinkSequentialLoops(AffineForOp forOp);

/// Sinks 'forOp' by 'loopDepth' levels by performing a series of loop
/// interchanges. Requires that 'forOp' is part of a perfect nest with
/// 'loopDepth' AffineForOps consecutively nested under it.
void sinkLoop(AffineForOp forOp, unsigned loopDepth);

using Loops = SmallVector<loop::ForOp, 8>;
using TileLoops = std::pair<Loops, Loops>;

/// Performs tiling of imperfectly nested loops (with interchange) by
/// strip-mining the `forOps` by `sizes` and sinking them, in their order of
/// occurrence in `forOps`, under each of the `targets`.
/// Returns the new AffineForOps, one per (`forOps`, `targets`) pair, nested
/// immediately under each of the `targets`.
SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
                                                 ArrayRef<uint64_t> sizes,
                                                 ArrayRef<AffineForOp> targets);
SmallVector<Loops, 8> tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value> sizes,
                           ArrayRef<loop::ForOp> targets);

/// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes`
/// and sinking them, in their order of occurrence in `forOps`, under `target`.
/// Returns the new AffineForOps, one per `forOps`, nested immediately under
/// `target`.
SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                 ArrayRef<uint64_t> sizes, AffineForOp target);
Loops tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value> sizes,
           loop::ForOp target);

/// Tiles a nest of loop::ForOp loops rooted at `rootForOp` with the given
/// (parametric) sizes. Sizes are expected to be strictly positive values at
/// runtime. If more sizes than loops are provided, the trailing values in
/// `sizes` are discarded. Assumes the loop nest is permutable.
/// Returns the newly created intra-tile loops.
Loops tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef<Value> sizes);
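
// A sketch of parametric tiling of a two-deep loop::ForOp nest, assuming the
// standard dialect's ConstantIndexOp is available to materialize the tile
// sizes (identifiers here are illustrative):
//
// ```
//   OpBuilder b(rootForOp.getOperation());
//   Value ts = b.create<ConstantIndexOp>(rootForOp.getLoc(), 32);
//   Loops intraTileLoops = tilePerfectlyNested(rootForOp, {ts, ts});
// ```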

/// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate.
struct AffineCopyOptions {
  // True if DMAs should be generated instead of point-wise copies.
  bool generateDma;
  // The slower memory space from which data is to be moved.
  unsigned slowMemorySpace;
  // The faster memory space to which data is moved (typically a scratchpad).
  unsigned fastMemorySpace;
  // Memory space to place tags in; only meaningful for DMAs.
  unsigned tagMemorySpace;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;
};

/// Performs explicit copying for the contiguous sequence of operations in the
/// block iterator range [`begin`, `end`), where `end` cannot be past the
/// terminator of the block (since additional operations are potentially
/// inserted right before `end`). Returns the total size of the fast memory
/// space buffers used. `copyOptions` provides various parameters, and the
/// output argument `copyNests` is the set of all copy nests inserted, each
/// represented by its root affine.for. Since we generate alloc's and
/// dealloc's for all fast buffers (before and after the range of operations
/// resp., or at a hoisted position), all of the fast memory capacity is
/// assumed to be available for processing this block range.
uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                const AffineCopyOptions &copyOptions,
                                DenseSet<Operation *> &copyNests);
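
// A sketch of generating copies into a faster memory space for the body of an
// affine loop `forOp`; the memory-space numbers and the capacity below are
// illustrative:
//
// ```
//   AffineCopyOptions options = {/*generateDma=*/false,
//                                /*slowMemorySpace=*/0,
//                                /*fastMemorySpace=*/1,
//                                /*tagMemorySpace=*/0,
//                                /*fastMemCapacityBytes=*/32 * 1024};
//   DenseSet<Operation *> copyNests;
//   uint64_t fastBufSize = affineDataCopyGenerate(
//       forOp.getBody()->begin(),
//       Block::iterator(forOp.getBody()->getTerminator()), options, copyNests);
// ```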

/// Tiles a nest of standard for loops rooted at `rootForOp` by finding
/// parametric tile sizes such that the outer loops have the fixed numbers of
/// iterations given in `sizes`.
TileLoops extractFixedOuterLoops(loop::ForOp rootForOp,
                                 ArrayRef<int64_t> sizes);

/// Replaces a perfect nest of "for" loops with a single linearized loop.
/// Assumes `loops` contains a list of perfectly nested loops with bounds and
/// steps independent of any loop induction variable involved in the nest.
void coalesceLoops(MutableArrayRef<loop::ForOp> loops);
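
// A sketch of coalescing a perfect loop::ForOp nest rooted at `outer`
// (identifiers here are illustrative):
//
// ```
//   SmallVector<loop::ForOp, 4> nest;
//   getPerfectlyNestedLoops(nest, outer);
//   coalesceLoops(nest);
// ```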

/// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of
/// size given by `numProcessors`. This is achieved by embedding the SSA values
/// corresponding to `processorIds` and `numProcessors` into the bounds and
/// step of the `forOp`. No check is performed on the legality of the rewrite;
/// it is the caller's responsibility to ensure legality.
///
/// Requires that `processorIds` and `numProcessors` have the same size and
/// that for each idx, `processorIds`[idx] takes, at runtime, all values
/// between 0 and `numProcessors`[idx] - 1. This corresponds to traditional use
/// cases for:
///   1. GPU (threadIdx, get_local_id(), ...)
///   2. MPI (MPI_Comm_rank)
///   3. OpenMP (omp_get_thread_num)
///
/// Example:
/// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and
/// numProcessors = [gridDim.x, blockDim.x], the loop:
///
/// ```
///    loop.for %i = %lb to %ub step %step {
///      ...
///    }
/// ```
///
/// is rewritten into a version resembling the following pseudo-IR:
///
/// ```
///    loop.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x)
///       to %ub step %gridDim.x * blockDim.x * %step {
///      ...
///    }
/// ```
void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorIds,
                           ArrayRef<Value> numProcessors);
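
// For instance, with the GPU mapping described above, the call would resemble
// the following, assuming `blockIdX`, `threadIdX`, `gridDimX` and `blockDimX`
// are Values obtained from the surrounding GPU launch (e.g. via gpu.block_id,
// gpu.thread_id, gpu.grid_dim and gpu.block_dim):
//
// ```
//   mapLoopToProcessorIds(forOp, {blockIdX, threadIdX}, {gridDimX, blockDimX});
// ```
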
} // end namespace mlir

#endif // MLIR_TRANSFORMS_LOOP_UTILS_H