//===- Utils.h - Utilities to support the Linalg dialect --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_UTILS_H_
#define MLIR_DIALECT_LINALG_UTILS_H_

#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "mlir/Dialect/Linalg/EDSC/Builders.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"

using mlir::edsc::intrinsics::AffineIndexedValue;
using mlir::edsc::intrinsics::StdIndexedValue;

namespace mlir {
class AffineExpr;
class AffineForOp;
class AffineMap;
class OperationFolder;
class PatternRewriter;

namespace linalg {
class LinalgDependenceGraph;

/// A struct containing the Linalg producer before and after fusion.
/// When operating on tensors, `fusedProducer` may feed into a `tensor.cast`
/// op before the consumer Linalg op, until enough canonicalizations have been
/// applied.
struct FusionInfo {
  LinalgOp originalProducer;
  LinalgOp fusedProducer;
};

/// A struct containing common matchers over a linalg op's region.
struct RegionMatcher {
  enum class BinaryOpKind {
    IAdd,
  };

  /// Matches the given linalg op if its body is performing a binary operation
  /// on int or float scalar values and returns the binary op kind.
  ///
  /// The linalg op's region is expected to be
  /// ```
  /// {
  ///   ^bb(%a: <scalar-type>, %b: <scalar-type>):
  ///     %0 = <binary-op> %a, %b: <scalar-type>
  ///     linalg.yield %0: <scalar-type>
  /// }
  /// ```
  static Optional<BinaryOpKind> matchAsScalarBinaryOp(GenericOp op);
};
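// A minimal usage sketch (illustrative only; assumes `genericOp` is a
// `linalg.generic` whose region has the shape documented above):
//
// ```
//   if (auto kind = RegionMatcher::matchAsScalarBinaryOp(genericOp))
//     if (*kind == RegionMatcher::BinaryOpKind::IAdd)
//       ; // The body reduces to a single scalar integer addition.
// ```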

/// Checks if an iterator_type attribute is parallel.
bool isParallelIteratorType(Attribute attr);

/// Checks if an iterator_type attribute is reduction.
bool isReductionIteratorType(Attribute attr);

/// Checks if an iterator_type attribute is window.
bool isWindowIteratorType(Attribute attr);
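// For instance (illustrative sketch, assuming `linalgOp` is a structured op in
// scope), the number of parallel loops can be derived from the op's
// `iterator_types` attribute:
//
// ```
//   unsigned numParallelLoops = llvm::count_if(
//       linalgOp.iterator_types().getValue(),
//       [](Attribute attr) { return isParallelIteratorType(attr); });
// ```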

/// Checks whether the specific `producer` is the last write to exactly the
/// whole `consumedView`. This checks structural dominance, i.e. that the
/// dependence is a RAW without any interleaved write to any piece of
/// `consumedView`.
bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph,
                               LinalgOp consumer, Value consumedView,
                               LinalgOp producer);

/// Checks whether fusing the specific `producer` of the `consumedView` is
/// feasible. This checks that `producer` is the last write of `consumedView`
/// and that no interleaved dependence would be violated (RAW, WAR or WAW).
bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
                   Value consumedView, LinalgOp producer);

using FusableOpDependencesTy = llvm::MapVector<
    Operation *,
    SmallVector<LinalgDependenceGraph::LinalgDependenceGraphElem, 1>>;
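/// Finds, for each of the given `ops`, the dependences in `dependenceGraph`
/// that are candidates for fusion, grouped by the producer operation involved
/// in the dependence.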
FusableOpDependencesTy
findAllFusableDependences(ArrayRef<LinalgOp> ops,
                          const LinalgDependenceGraph &dependenceGraph);

/// Fuses producer into consumer if the producer is structurally feasible and
/// the fusion would not violate dependencies.
/// Implements the fusion part of the "tileAndFuse on buffers" transformation
/// and thus requires the `consumerOpOperand` value to be defined by a
/// `subview` op (generally obtained by applying the tiling transformation).
Optional<FusionInfo> fuseProducerOfBuffer(OpBuilder &b,
                                          OpOperand &consumerOpOperand,
                                          const LinalgDependenceGraph &graph);
/// Tensor counterpart of `fuseProducerOfBuffer`.
/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` value to be
/// defined by a `subtensor` op (generally obtained by applying the tiling
/// transformation).
Optional<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                          OpOperand &consumerOpOperand);
/// Tensor counterpart of `fuseProducerOfBuffer`.
/// This implements the fusion part of the "tileAndFuse on tensors"
/// transformation and thus requires the `consumerOpOperand` value to be
/// defined by a `subtensor` op (generally obtained by applying the tiling
/// transformation).
/// Assumes `producerOpResult` is the result of a Linalg op that (possibly
/// transitively) produces `consumerOpOperand`.
Optional<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
                                          OpResult producerOpResult,
                                          OpOperand &consumerOpOperand);

/// Fuses a linalg operation on tensors with the producer of the consumer's
/// operand `consumerOpOperand`.
Optional<SmallVector<Value, 1>> fuseTensorOps(PatternRewriter &rewriter,
                                              OpOperand &consumerOpOperand);

/// Like `getShape`, but only returns statically-known information, without
/// generating any new IR. For each shape dimension, returns >=0 if that
/// dimension is statically known, or -1 otherwise.
SmallVector<int64_t, 8> getStaticShape(LinalgOp linalgOp);

/// Returns the statically-known loop ranges of the `linalgOp`. Composes
/// `linalgOp.getShapesToLoopsMap()` with the result of `getStaticShape`.
/// Returns None if `linalgOp.getShapesToLoopsMap()` fails. Returns -1
/// for non-statically-known loop ranges.
Optional<SmallVector<int64_t, 4>> getStaticLoopRanges(LinalgOp linalgOp);
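// For example (illustrative): for a `linalg.matmul` on buffers of types
// `memref<4x?xf32>`, `memref<?x8xf32>` and `memref<4x8xf32>`, the static
// (m, n, k) loop ranges would come back as `[4, 8, -1]`, since only the
// reduction dimension is dynamic.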

/// Apply the permutation defined by `permutation` to `inVec`.
/// After the call, element `i` of `inVec` holds the value that was at index
/// `permutation[i]` before the call.
/// E.g.: for an input vector `inVec = ['a', 'b', 'c']` and a permutation vector
/// `permutation = [2, 0, 1]`, this function leaves `inVec = ['c', 'a', 'b']`.
template <typename T, unsigned N>
void applyPermutationToVector(SmallVector<T, N> &inVec,
                              ArrayRef<unsigned> permutation) {
  SmallVector<T, N> auxVec(inVec.size());
  for (unsigned i = 0; i < permutation.size(); ++i)
    auxVec[i] = inVec[permutation[i]];
  inVec = auxVec;
}
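// A typical use (illustrative sketch) is applying a loop-interchange vector to
// a list of tile sizes:
//
// ```
//   SmallVector<int64_t, 4> tileSizes = {32, 64, 16};
//   applyPermutationToVector(tileSizes, /*permutation=*/{2, 0, 1});
//   // tileSizes is now {16, 32, 64}.
// ```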

/// If `size` is the result of an AffineMinOp and one of the AffineMinOp's map
/// results is a constant, returns an IntegerAttr holding the smallest such
/// constant. If `size` is defined by a ConstantOp, returns its value as an
/// IntegerAttr. Otherwise returns a null IntegerAttr.
IntegerAttr getSmallestBoundingIndex(Value size);

/// Scheme used to distribute loops to processors.
enum class DistributionMethod {
  /// Cyclic distribution where no assumption is made about the dynamic
  /// relationship between number of processors and number of iterations of the
  /// distributed loop. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// scf.parallel(%iv)= (%lb + %procId * %step) to (%ub) step (%step * %nprocs)
  Cyclic = 0,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// greater than or equal to the number of iterations of the distributed
  /// loop. In such cases, a simple in-bounds check is enough (instead of
  /// materializing a loop). Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  /// %cond = cmpi "slt", %iv, %ub
  /// scf.if %cond {
  ///   ...
  /// }
  CyclicNumProcsGeNumIters = 1,

  /// Cyclic distribution where the number of processors can be assumed to be
  /// equal to the number of iterations of the distributed loop. In such cases,
  /// no bounds check is needed. Distributes the following loop
  ///
  /// scf.parallel (%iv) = (%lb) to (%ub) step (%step)
  ///
  /// to
  ///
  /// %iv = %lb + %procId * %step
  CyclicNumProcsEqNumIters = 2
};

/// Holds the Values of the processor ID and the total number of processors
/// used to distribute a generated parallel loop.
struct ProcInfo {
  Value procId;
  Value nprocs;
};
/// Callback function type used to get the processor ID and number of
/// processors used for distribution for all the parallel loops generated.
using ProcInfoCallBackFn = std::function<SmallVector<ProcInfo, 2>(
    OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges)>;

/// Options that allow distribution of loops generated in Linalg transforms to
/// processors while generating the loops.
struct LinalgLoopDistributionOptions {
  /// Callback function that returns the Values for processor ID (`procId`) and
  /// number of processors (`nprocs`) used to execute the parallel loops. The
  /// number of `{procId, nprocs}` pairs returned must be equal to the number of
  /// `parallelLoopRanges` passed into the callback, which in turn is the same
  /// as the number of parallel loops for which the `distributionMethod` is
  /// specified below.
  ProcInfoCallBackFn procInfo;
  /// Specification of how to distribute the `scf.parallel` loops that are
  /// generated. As each `scf.parallel` loop is generated, the elements of this
  /// vector are used (from left to right) and the specified distribution is
  /// applied. If the vector holds fewer entries than the number of
  /// `scf.parallel` loops generated, then no distribution is applied.
  SmallVector<DistributionMethod, 0> distributionMethod = {};
};
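// Example (illustrative sketch): distribute the two outermost generated
// `scf.parallel` loops cyclically. `getProcessorId` and `getNumProcessors` are
// hypothetical helpers standing in for whatever mechanism the target uses to
// materialize these Values (e.g. gpu block ids and grid sizes).
//
// ```
//   LinalgLoopDistributionOptions options;
//   options.distributionMethod = {DistributionMethod::Cyclic,
//                                 DistributionMethod::Cyclic};
//   options.procInfo = [](OpBuilder &b, Location loc,
//                         ArrayRef<Range> parallelLoopRanges) {
//     SmallVector<ProcInfo, 2> procInfo;
//     for (unsigned i = 0, e = parallelLoopRanges.size(); i != e; ++i)
//       procInfo.push_back({/*procId=*/getProcessorId(b, loc, i),
//                           /*nprocs=*/getNumProcessors(b, loc, i)});
//     return procInfo;
//   };
// ```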

/// Utility class used to generate nested loops with ranges described by
/// `loopRanges` and loop type described by `iteratorTypes`. `bodyBuilderFn`
/// is used to generate the body of the innermost loop. It is passed the
/// ranges of loop induction variables and of the carried iteration arguments
/// (initialized from `iterArgInitValues`).
template <typename LoopTy>
struct GenerateLoopNest {
  using IndexedValueTy =
      typename std::conditional<std::is_same<LoopTy, AffineForOp>::value,
                                AffineIndexedValue, StdIndexedValue>::type;

  static void
  doit(ArrayRef<Range> loopRanges, ValueRange iterArgInitValues,
       ArrayRef<Attribute> iteratorTypes,
       function_ref<scf::ValueVector(ValueRange, ValueRange)> bodyBuilderFn,
       Optional<LinalgLoopDistributionOptions> = None);
};
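// A minimal usage sketch (illustrative only; `loopRanges` and `iteratorTypes`
// are assumed to be in scope, and in this version of the code base the loop
// builders typically expect an active edsc::ScopedContext):
//
// ```
//   GenerateLoopNest<scf::ForOp>::doit(
//       loopRanges, /*iterArgInitValues=*/ValueRange{}, iteratorTypes,
//       [](ValueRange ivs, ValueRange iterArgs) -> scf::ValueVector {
//         // Emit the innermost body here using `ivs`.
//         return scf::ValueVector{};
//       });
// ```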

} // namespace linalg
} // namespace mlir

#endif // MLIR_DIALECT_LINALG_UTILS_H_