1 //===- LoopUtils.h - Loop transformation utilities --------------*- C++ -*-===// 2 // 3 // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This header file defines prototypes for various loop transformation utility 10 // methods: these are not passes by themselves but are used either by passes, 11 // optimization sequences, or in turn by other transformation utilities. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifndef MLIR_TRANSFORMS_LOOP_UTILS_H 16 #define MLIR_TRANSFORMS_LOOP_UTILS_H 17 18 #include "mlir/IR/Block.h" 19 #include "mlir/Support/LLVM.h" 20 #include "mlir/Support/LogicalResult.h" 21 22 namespace mlir { 23 class AffineForOp; 24 class FuncOp; 25 class OpBuilder; 26 class Value; 27 28 namespace loop { 29 class ForOp; 30 } // end namespace loop 31 32 /// Unrolls this for operation completely if the trip count is known to be 33 /// constant. Returns failure otherwise. 34 LogicalResult loopUnrollFull(AffineForOp forOp); 35 36 /// Unrolls this for operation by the specified unroll factor. Returns failure 37 /// if the loop cannot be unrolled either due to restrictions or due to invalid 38 /// unroll factors. 39 LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor); 40 41 /// Unrolls this loop by the specified unroll factor or its trip count, 42 /// whichever is lower. 43 LogicalResult loopUnrollUpToFactor(AffineForOp forOp, uint64_t unrollFactor); 44 45 /// Get perfectly nested sequence of loops starting at root of loop nest 46 /// (the first op being another AffineFor, and the second op - a terminator). 47 /// A loop is perfectly nested iff: the first op in the loop's body is another 48 /// AffineForOp, and the second op is a terminator). 49 void getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops, 50 AffineForOp root); 51 void getPerfectlyNestedLoops(SmallVectorImpl<loop::ForOp> &nestedLoops, 52 loop::ForOp root); 53 54 /// Unrolls and jams this loop by the specified factor. Returns success if the 55 /// loop is successfully unroll-jammed. 56 LogicalResult loopUnrollJamByFactor(AffineForOp forOp, 57 uint64_t unrollJamFactor); 58 59 /// Unrolls and jams this loop by the specified factor or by the trip count (if 60 /// constant), whichever is lower. 61 LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, 62 uint64_t unrollJamFactor); 63 64 /// Promotes the loop body of a AffineForOp to its containing block if the 65 /// AffineForOp was known to have a single iteration. 66 LogicalResult promoteIfSingleIteration(AffineForOp forOp); 67 68 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves 69 /// their body into the containing Block. 70 void promoteSingleIterationLoops(FuncOp f); 71 72 /// Computes the cleanup loop lower bound of the loop being unrolled with 73 /// the specified unroll factor; this bound will also be upper bound of the main 74 /// part of the unrolled loop. Computes the bound as an AffineMap with its 75 /// operands or a null map when the trip count can't be expressed as an affine 76 /// expression. 77 void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, 78 AffineMap *map, SmallVectorImpl<Value> *operands, 79 OpBuilder &builder); 80 81 /// Skew the operations in the body of a 'affine.for' operation with the 82 /// specified operation-wise shifts. The shifts are with respect to the 83 /// original execution order, and are multiplied by the loop 'step' before being 84 /// applied. 85 LLVM_NODISCARD 86 LogicalResult instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts, 87 bool unrollPrologueEpilogue = false); 88 89 /// Tiles the specified band of perfectly nested loops creating tile-space loops 90 /// and intra-tile loops. A band is a contiguous set of loops. 91 LLVM_NODISCARD 92 LogicalResult tileCodeGen(MutableArrayRef<AffineForOp> band, 93 ArrayRef<unsigned> tileSizes); 94 95 /// Performs loop interchange on 'forOpA' and 'forOpB'. Requires that 'forOpA' 96 /// and 'forOpB' are part of a perfectly nested sequence of loops. 97 void interchangeLoops(AffineForOp forOpA, AffineForOp forOpB); 98 99 /// Checks if the loop interchange permutation 'loopPermMap', of the perfectly 100 /// nested sequence of loops in 'loops', would violate dependences (loop 'i' in 101 /// 'loops' is mapped to location 'j = 'loopPermMap[i]' in the interchange). 102 bool isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops, 103 ArrayRef<unsigned> loopPermMap); 104 105 /// Performs a sequence of loop interchanges on perfectly nested 'loops', as 106 /// specified by permutation 'loopPermMap' (loop 'i' in 'loops' is mapped to 107 /// location 'j = 'loopPermMap[i]' after the loop interchange). 108 unsigned interchangeLoops(ArrayRef<AffineForOp> loops, 109 ArrayRef<unsigned> loopPermMap); 110 111 // Sinks all sequential loops to the innermost levels (while preserving 112 // relative order among them) and moves all parallel loops to the 113 // outermost (while again preserving relative order among them). 114 // Returns AffineForOp of the root of the new loop nest after loop interchanges. 115 AffineForOp sinkSequentialLoops(AffineForOp forOp); 116 117 /// Sinks 'forOp' by 'loopDepth' levels by performing a series of loop 118 /// interchanges. Requires that 'forOp' is part of a perfect nest with 119 /// 'loopDepth' AffineForOps consecutively nested under it. 120 void sinkLoop(AffineForOp forOp, unsigned loopDepth); 121 122 /// Performs tiling fo imperfectly nested loops (with interchange) by 123 /// strip-mining the `forOps` by `sizes` and sinking them, in their order of 124 /// occurrence in `forOps`, under each of the `targets`. 125 /// Returns the new AffineForOps, one per each of (`forOps`, `targets`) pair, 126 /// nested immediately under each of `targets`. 127 using Loops = SmallVector<loop::ForOp, 8>; 128 using TileLoops = std::pair<Loops, Loops>; 129 SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps, 130 ArrayRef<uint64_t> sizes, 131 ArrayRef<AffineForOp> targets); 132 SmallVector<Loops, 8> tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value> sizes, 133 ArrayRef<loop::ForOp> targets); 134 135 /// Performs tiling (with interchange) by strip-mining the `forOps` by `sizes` 136 /// and sinking them, in their order of occurrence in `forOps`, under `target`. 137 /// Returns the new AffineForOps, one per `forOps`, nested immediately under 138 /// `target`. 139 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps, 140 ArrayRef<uint64_t> sizes, AffineForOp target); 141 Loops tile(ArrayRef<loop::ForOp> forOps, ArrayRef<Value> sizes, 142 loop::ForOp target); 143 144 /// Tile a nest of loop::ForOp loops rooted at `rootForOp` with the given 145 /// (parametric) sizes. Sizes are expected to be strictly positive values at 146 /// runtime. If more sizes than loops are provided, discard the trailing values 147 /// in sizes. Assumes the loop nest is permutable. 148 /// Returns the newly created intra-tile loops. 149 Loops tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef<Value> sizes); 150 151 /// Explicit copy / DMA generation options for mlir::affineDataCopyGenerate. 152 struct AffineCopyOptions { 153 // True if DMAs should be generated instead of point-wise copies. 154 bool generateDma; 155 // The slower memory space from which data is to be moved. 156 unsigned slowMemorySpace; 157 // Memory space of the faster one (typically a scratchpad). 158 unsigned fastMemorySpace; 159 // Memory space to place tags in: only meaningful for DMAs. 160 unsigned tagMemorySpace; 161 // Capacity of the fast memory space in bytes. 162 uint64_t fastMemCapacityBytes; 163 }; 164 165 /// Performs explicit copying for the contiguous sequence of operations in the 166 /// block iterator range [`begin', `end'), where `end' can't be past the 167 /// terminator of the block (since additional operations are potentially 168 /// inserted right before `end`. Returns the total size of fast memory space 169 /// buffers used. `copyOptions` provides various parameters, and the output 170 /// argument `copyNests` is the set of all copy nests inserted, each represented 171 /// by its root affine.for. Since we generate alloc's and dealloc's for all fast 172 /// buffers (before and after the range of operations resp. or at a hoisted 173 /// position), all of the fast memory capacity is assumed to be available for 174 /// processing this block range. 175 uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end, 176 const AffineCopyOptions ©Options, 177 DenseSet<Operation *> ©Nests); 178 179 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such 180 /// parametric tile sizes that the outer loops have a fixed number of iterations 181 /// as defined in `sizes`. 182 TileLoops extractFixedOuterLoops(loop::ForOp rootFOrOp, 183 ArrayRef<int64_t> sizes); 184 185 /// Replace a perfect nest of "for" loops with a single linearized loop. Assumes 186 /// `loops` contains a list of perfectly nested loops with bounds and steps 187 /// independent of any loop induction variable involved in the nest. 188 void coalesceLoops(MutableArrayRef<loop::ForOp> loops); 189 190 /// Maps `forOp` for execution on a parallel grid of virtual `processorIds` of 191 /// size given by `numProcessors`. This is achieved by embedding the SSA values 192 /// corresponding to `processorIds` and `numProcessors` into the bounds and step 193 /// of the `forOp`. No check is performed on the legality of the rewrite, it is 194 /// the caller's responsibility to ensure legality. 195 /// 196 /// Requires that `processorIds` and `numProcessors` have the same size and that 197 /// for each idx, `processorIds`[idx] takes, at runtime, all values between 0 198 /// and `numProcessors`[idx] - 1. This corresponds to traditional use cases for: 199 /// 1. GPU (threadIdx, get_local_id(), ...) 200 /// 2. MPI (MPI_Comm_rank) 201 /// 3. OpenMP (omp_get_thread_num) 202 /// 203 /// Example: 204 /// Assuming a 2-d grid with processorIds = [blockIdx.x, threadIdx.x] and 205 /// numProcessors = [gridDim.x, blockDim.x], the loop: 206 /// 207 /// ``` 208 /// loop.for %i = %lb to %ub step %step { 209 /// ... 210 /// } 211 /// ``` 212 /// 213 /// is rewritten into a version resembling the following pseudo-IR: 214 /// 215 /// ``` 216 /// loop.for %i = %lb + %step * (threadIdx.x + blockIdx.x * blockDim.x) 217 /// to %ub step %gridDim.x * blockDim.x * %step { 218 /// ... 219 /// } 220 /// ``` 221 void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId, 222 ArrayRef<Value> numProcessors); 223 } // end namespace mlir 224 225 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H 226