1 //===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements vectorization of loops, operations and data types to
10 // a target-independent, n-D super-vector abstraction.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "PassDetail.h"
15 #include "mlir/Analysis/LoopAnalysis.h"
16 #include "mlir/Analysis/NestedMatcher.h"
17 #include "mlir/Analysis/SliceAnalysis.h"
18 #include "mlir/Analysis/Utils.h"
19 #include "mlir/Dialect/Affine/IR/AffineOps.h"
20 #include "mlir/Dialect/Affine/Passes.h"
21 #include "mlir/Dialect/StandardOps/IR/Ops.h"
22 #include "mlir/Dialect/Vector/VectorOps.h"
23 #include "mlir/Dialect/Vector/VectorUtils.h"
24 #include "mlir/IR/AffineExpr.h"
25 #include "mlir/IR/Builders.h"
26 #include "mlir/IR/Location.h"
27 #include "mlir/IR/Types.h"
28 #include "mlir/Support/LLVM.h"
29 #include "mlir/Transforms/FoldUtils.h"
30
31 #include "llvm/ADT/DenseMap.h"
32 #include "llvm/ADT/DenseSet.h"
33 #include "llvm/ADT/SetVector.h"
34 #include "llvm/ADT/SmallString.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/Debug.h"
38
39 using namespace mlir;
40
41 ///
42 /// Implements a high-level vectorization strategy on a Function.
43 /// The abstraction used is that of super-vectors, which provide a single,
44 /// compact, representation in the vector types, information that is expected
45 /// to reduce the impact of the phase ordering problem
46 ///
47 /// Vector granularity:
48 /// ===================
49 /// This pass is designed to perform vectorization at a super-vector
50 /// granularity. A super-vector is loosely defined as a vector type that is a
51 /// multiple of a "good" vector size so the HW can efficiently implement a set
52 /// of high-level primitives. Multiple is understood along any dimension; e.g.
53 /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a
54 /// vector<8xf32> HW vector. Note that a "good vector size so the HW can
55 /// efficiently implement a set of high-level primitives" is not necessarily an
56 /// integer multiple of actual hardware registers. We leave details of this
57 /// distinction unspecified for now.
58 ///
59 /// Some may prefer the terminology a "tile of HW vectors". In this case, one
60 /// should note that super-vectors implement an "always full tile" abstraction.
61 /// They guarantee no partial-tile separation is necessary by relying on a
62 /// high-level copy-reshape abstraction that we call vector.transfer. This
63 /// copy-reshape operations is also responsible for performing layout
64 /// transposition if necessary. In the general case this will require a scoped
65 /// allocation in some notional local memory.
66 ///
67 /// Whatever the mental model one prefers to use for this abstraction, the key
68 /// point is that we burn into a single, compact, representation in the vector
69 /// types, information that is expected to reduce the impact of the phase
70 /// ordering problem. Indeed, a vector type conveys information that:
71 /// 1. the associated loops have dependency semantics that do not prevent
72 /// vectorization;
73 /// 2. the associate loops have been sliced in chunks of static sizes that are
74 /// compatible with vector sizes (i.e. similar to unroll-and-jam);
75 /// 3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
76 /// the
77 /// vector type and no vectorization hampering transformations can be
78 /// applied to them anymore;
79 /// 4. the underlying memrefs are accessed in some notional contiguous way
80 /// that allows loading into vectors with some amount of spatial locality;
81 /// In other words, super-vectorization provides a level of separation of
82 /// concern by way of opacity to subsequent passes. This has the effect of
83 /// encapsulating and propagating vectorization constraints down the list of
84 /// passes until we are ready to lower further.
85 ///
86 /// For a particular target, a notion of minimal n-d vector size will be
87 /// specified and vectorization targets a multiple of those. In the following
88 /// paragraph, let "k ." represent "a multiple of", to be understood as a
89 /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
90 /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
91 ///
92 /// Some non-exhaustive notable super-vector sizes of interest include:
93 /// - CPU: vector<k . HW_vector_size>,
94 /// vector<k' . core_count x k . HW_vector_size>,
95 /// vector<socket_count x k' . core_count x k . HW_vector_size>;
96 /// - GPU: vector<k . warp_size>,
97 /// vector<k . warp_size x float2>,
98 /// vector<k . warp_size x float4>,
99 /// vector<k . warp_size x 4 x 4x 4> (for tensor_core sizes).
100 ///
101 /// Loops and operations are emitted that operate on those super-vector shapes.
102 /// Subsequent lowering passes will materialize to actual HW vector sizes. These
103 /// passes are expected to be (gradually) more target-specific.
104 ///
105 /// At a high level, a vectorized load in a loop will resemble:
106 /// ```mlir
107 /// affine.for %i = ? to ? step ? {
108 /// %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
109 /// }
110 /// ```
111 /// It is the responsibility of the implementation of vector.transfer_read to
112 /// materialize vector registers from the original scalar memrefs. A later (more
113 /// target-dependent) lowering pass will materialize to actual HW vector sizes.
114 /// This lowering may be occur at different times:
115 /// 1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
116 /// DmaWaitOp + vectorized operations for data transformations and shuffle;
117 /// thus opening opportunities for unrolling and pipelining. This is an
118 /// instance of library call "whiteboxing"; or
119 /// 2. later in the a target-specific lowering pass or hand-written library
120 /// call; achieving full separation of concerns. This is an instance of
121 /// library call; or
122 /// 3. a mix of both, e.g. based on a model.
123 /// In the future, these operations will expose a contract to constrain the
124 /// search on vectorization patterns and sizes.
125 ///
126 /// Occurrence of super-vectorization in the compiler flow:
127 /// =======================================================
128 /// This is an active area of investigation. We start with 2 remarks to position
129 /// super-vectorization in the context of existing ongoing work: LLVM VPLAN
130 /// and LLVM SLP Vectorizer.
131 ///
132 /// LLVM VPLAN:
133 /// -----------
134 /// The astute reader may have noticed that in the limit, super-vectorization
135 /// can be applied at a similar time and with similar objectives than VPLAN.
136 /// For instance, in the case of a traditional, polyhedral compilation-flow (for
137 /// instance, the PPCG project uses ISL to provide dependence analysis,
138 /// multi-level(scheduling + tiling), lifting footprint to fast memory,
139 /// communication synthesis, mapping, register optimizations) and before
140 /// unrolling. When vectorization is applied at this *late* level in a typical
141 /// polyhedral flow, and is instantiated with actual hardware vector sizes,
142 /// super-vectorization is expected to match (or subsume) the type of patterns
143 /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
144 /// is higher level and our implementation should be significantly simpler. Also
145 /// note that in this mode, recursive patterns are probably a bit of an overkill
146 /// although it is reasonable to expect that mixing a bit of outer loop and
147 /// inner loop vectorization + unrolling will provide interesting choices to
148 /// MLIR.
149 ///
150 /// LLVM SLP Vectorizer:
151 /// --------------------
152 /// Super-vectorization however is not meant to be usable in a similar fashion
153 /// to the SLP vectorizer. The main difference lies in the information that
154 /// both vectorizers use: super-vectorization examines contiguity of memory
155 /// references along fastest varying dimensions and loops with recursive nested
156 /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on
157 /// the other hand, performs flat pattern matching inside a single unrolled loop
158 /// body and stitches together pieces of load and store operations into full
159 /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture
160 /// innermost loop, control-flow dependent patterns that super-vectorization may
161 /// not be able to capture easily. In other words, super-vectorization does not
162 /// aim at replacing the SLP vectorizer and the two solutions are complementary.
163 ///
164 /// Ongoing investigations:
165 /// -----------------------
166 /// We discuss the following *early* places where super-vectorization is
167 /// applicable and touch on the expected benefits and risks . We list the
168 /// opportunities in the context of the traditional polyhedral compiler flow
169 /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
170 /// we expect to experiment with super-vectorization:
171 /// 1. Right after language lowering to MLIR: this is the earliest time where
172 /// super-vectorization is expected to be applied. At this level, all the
173 /// language/user/library-level annotations are available and can be fully
174 /// exploited. Examples include loop-type annotations (such as parallel,
175 /// reduction, scan, dependence distance vector, vectorizable) as well as
176 /// memory access annotations (such as non-aliasing writes guaranteed,
177 /// indirect accesses that are permutations by construction) accesses or
178 /// that a particular operation is prescribed atomic by the user. At this
179 /// level, anything that enriches what dependence analysis can do should be
180 /// aggressively exploited. At this level we are close to having explicit
181 /// vector types in the language, except we do not impose that burden on the
182 /// programmer/library: we derive information from scalar code + annotations.
183 /// 2. After dependence analysis and before polyhedral scheduling: the
184 /// information that supports vectorization does not need to be supplied by a
185 /// higher level of abstraction. Traditional dependence analysis is available
186 /// in MLIR and will be used to drive vectorization and cost models.
187 ///
188 /// Let's pause here and remark that applying super-vectorization as described
189 /// in 1. and 2. presents clear opportunities and risks:
190 /// - the opportunity is that vectorization is burned in the type system and
191 /// is protected from the adverse effect of loop scheduling, tiling, loop
192 /// interchange and all passes downstream. Provided that subsequent passes are
193 /// able to operate on vector types; the vector shapes, associated loop
194 /// iterator properties, alignment, and contiguity of fastest varying
195 /// dimensions are preserved until we lower the super-vector types. We expect
196 /// this to significantly rein in on the adverse effects of phase ordering.
197 /// - the risks are that a. all passes after super-vectorization have to work
198 /// on elemental vector types (not that this is always true, wherever
199 /// vectorization is applied) and b. that imposing vectorization constraints
200 /// too early may be overall detrimental to loop fusion, tiling and other
201 /// transformations because the dependence distances are coarsened when
202 /// operating on elemental vector types. For this reason, the pattern
203 /// profitability analysis should include a component that also captures the
204 /// maximal amount of fusion available under a particular pattern. This is
205 /// still at the stage of rough ideas but in this context, search is our
206 /// friend as the Tensor Comprehensions and auto-TVM contributions
207 /// demonstrated previously.
208 /// Bottom-line is we do not yet have good answers for the above but aim at
209 /// making it easy to answer such questions.
210 ///
211 /// Back to our listing, the last places where early super-vectorization makes
212 /// sense are:
213 /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known
214 /// to improve locality, parallelism and be configurable (e.g. max-fuse,
215 /// smart-fuse etc). They can also have adverse effects on contiguity
216 /// properties that are required for vectorization but the vector.transfer
217 /// copy-reshape-pad-transpose abstraction is expected to help recapture
218 /// these properties.
219 /// 4. right after polyhedral-style scheduling+tiling;
220 /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
221 /// probably the most promising places because applying tiling achieves a
222 /// separation of concerns that allows rescheduling to worry less about
223 /// locality and more about parallelism and distribution (e.g. min-fuse).
224 ///
225 /// At these levels the risk-reward looks different: on one hand we probably
226 /// lost a good deal of language/user/library-level annotation; on the other
227 /// hand we gained parallelism and locality through scheduling and tiling.
228 /// However we probably want to ensure tiling is compatible with the
229 /// full-tile-only abstraction used in super-vectorization or suffer the
230 /// consequences. It is too early to place bets on what will win but we expect
231 /// super-vectorization to be the right abstraction to allow exploring at all
232 /// these levels. And again, search is our friend.
233 ///
234 /// Lastly, we mention it again here:
235 /// 6. as a MLIR-based alternative to VPLAN.
236 ///
237 /// Lowering, unrolling, pipelining:
238 /// ================================
239 /// TODO: point to the proper places.
240 ///
241 /// Algorithm:
242 /// ==========
243 /// The algorithm proceeds in a few steps:
244 /// 1. defining super-vectorization patterns and matching them on the tree of
245 /// AffineForOp. A super-vectorization pattern is defined as a recursive
246 /// data structures that matches and captures nested, imperfectly-nested
247 /// loops that have a. conformable loop annotations attached (e.g. parallel,
248 /// reduction, vectorizable, ...) as well as b. all contiguous load/store
249 /// operations along a specified minor dimension (not necessarily the
250 /// fastest varying) ;
251 /// 2. analyzing those patterns for profitability (TODO: and
252 /// interference);
253 /// 3. Then, for each pattern in order:
254 /// a. applying iterative rewriting of the loop and the load operations in
255 /// DFS postorder. Rewriting is implemented by coarsening the loops and
256 /// turning load operations into opaque vector.transfer_read ops;
257 /// b. keeping track of the load operations encountered as "roots" and the
258 /// store operations as "terminals";
259 /// c. traversing the use-def chains starting from the roots and iteratively
260 /// propagating vectorized values. Scalar values that are encountered
261 /// during this process must come from outside the scope of the current
262 /// pattern (TODO: enforce this and generalize). Such a scalar value
263 /// is vectorized only if it is a constant (into a vector splat). The
264 /// non-constant case is not supported for now and results in the pattern
265 /// failing to vectorize;
266 /// d. performing a second traversal on the terminals (store ops) to
267 /// rewriting the scalar value they write to memory into vector form.
268 /// If the scalar value has been vectorized previously, we simply replace
269 /// it by its vector form. Otherwise, if the scalar value is a constant,
270 /// it is vectorized into a splat. In all other cases, vectorization for
271 /// the pattern currently fails.
272 /// e. if everything under the root AffineForOp in the current pattern
273 /// vectorizes properly, we commit that loop to the IR. Otherwise we
274 /// discard it and restore a previously cloned version of the loop. Thanks
275 /// to the recursive scoping nature of matchers and captured patterns,
276 /// this is transparently achieved by a simple RAII implementation.
277 /// f. vectorization is applied on the next pattern in the list. Because
278 /// pattern interference avoidance is not yet implemented and that we do
279 /// not support further vectorizing an already vector load we need to
280 /// re-verify that the pattern is still vectorizable. This is expected to
281 /// make cost models more difficult to write and is subject to improvement
282 /// in the future.
283 ///
284 /// Points c. and d. above are worth additional comment. In most passes that
285 /// do not change the type of operands, it is usually preferred to eagerly
286 /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization
287 /// because during the use-def chain traversal, all the operands of an operation
288 /// must be available in vector form. Trying to propagate eagerly makes the IR
289 /// temporarily invalid and results in errors such as:
290 /// `vectorize.mlir:308:13: error: 'addf' op requires the same type for all
291 /// operands and results
292 /// %s5 = addf %a5, %b5 : f32`
293 ///
294 /// Lastly, we show a minimal example for which use-def chains rooted in load /
295 /// vector.transfer_read are not enough. This is what motivated splitting
296 /// terminal processing out of the use-def chains starting from loads. In the
297 /// following snippet, there is simply no load::
298 /// ```mlir
299 /// func @fill(%A : memref<128xf32>) -> () {
300 /// %f1 = constant 1.0 : f32
301 /// affine.for %i0 = 0 to 32 {
302 /// affine.store %f1, %A[%i0] : memref<128xf32, 0>
303 /// }
304 /// return
305 /// }
306 /// ```
307 ///
308 /// Choice of loop transformation to support the algorithm:
309 /// =======================================================
310 /// The choice of loop transformation to apply for coarsening vectorized loops
311 /// is still subject to exploratory tradeoffs. In particular, say we want to
312 /// vectorize by a factor 128, we want to transform the following input:
313 /// ```mlir
314 /// affine.for %i = %M to %N {
315 /// %a = affine.load %A[%i] : memref<?xf32>
316 /// }
317 /// ```
318 ///
319 /// Traditionally, one would vectorize late (after scheduling, tiling,
320 /// memory promotion etc) say after stripmining (and potentially unrolling in
321 /// the case of LLVM's SLP vectorizer):
322 /// ```mlir
323 /// affine.for %i = floor(%M, 128) to ceil(%N, 128) {
324 /// affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
325 /// %a = affine.load %A[%ii] : memref<?xf32>
326 /// }
327 /// }
328 /// ```
329 ///
330 /// Instead, we seek to vectorize early and freeze vector types before
331 /// scheduling, so we want to generate a pattern that resembles:
332 /// ```mlir
333 /// affine.for %i = ? to ? step ? {
334 /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
335 /// }
336 /// ```
337 ///
338 /// i. simply dividing the lower / upper bounds by 128 creates issues
339 /// when representing expressions such as ii + 1 because now we only
340 /// have access to original values that have been divided. Additional
341 /// information is needed to specify accesses at below-128 granularity;
342 /// ii. another alternative is to coarsen the loop step but this may have
343 /// consequences on dependence analysis and fusability of loops: fusable
344 /// loops probably need to have the same step (because we don't want to
345 /// stripmine/unroll to enable fusion).
346 /// As a consequence, we choose to represent the coarsening using the loop
347 /// step for now and reevaluate in the future. Note that we can renormalize
348 /// loop steps later if/when we have evidence that they are problematic.
349 ///
350 /// For the simple strawman example above, vectorizing for a 1-D vector
351 /// abstraction of size 128 returns code similar to:
352 /// ```mlir
353 /// affine.for %i = %M to %N step 128 {
354 /// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
355 /// }
356 /// ```
357 ///
358 /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
359 /// ========================================================================
360 /// 1. lowering to concrete vector types for various HW;
361 /// 2. reduction support;
362 /// 3. non-effecting padding during vector.transfer_read and filter during
363 /// vector.transfer_write;
364 /// 4. misalignment support vector.transfer_read / vector.transfer_write
365 /// (hopefully without read-modify-writes);
366 /// 5. control-flow support;
367 /// 6. cost-models, heuristics and search;
368 /// 7. Op implementation, extensions and implication on memref views;
369 /// 8. many TODOs left around.
370 ///
371 /// Examples:
372 /// =========
373 /// Consider the following Function:
374 /// ```mlir
375 /// func @vector_add_2d(%M : index, %N : index) -> f32 {
376 /// %A = alloc (%M, %N) : memref<?x?xf32, 0>
377 /// %B = alloc (%M, %N) : memref<?x?xf32, 0>
378 /// %C = alloc (%M, %N) : memref<?x?xf32, 0>
379 /// %f1 = constant 1.0 : f32
380 /// %f2 = constant 2.0 : f32
381 /// affine.for %i0 = 0 to %M {
382 /// affine.for %i1 = 0 to %N {
383 /// // non-scoped %f1
384 /// affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
385 /// }
386 /// }
387 /// affine.for %i2 = 0 to %M {
388 /// affine.for %i3 = 0 to %N {
389 /// // non-scoped %f2
390 /// affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
391 /// }
392 /// }
393 /// affine.for %i4 = 0 to %M {
394 /// affine.for %i5 = 0 to %N {
395 /// %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
396 /// %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
397 /// %s5 = addf %a5, %b5 : f32
398 /// // non-scoped %f1
399 /// %s6 = addf %s5, %f1 : f32
400 /// // non-scoped %f2
401 /// %s7 = addf %s5, %f2 : f32
402 /// // diamond dependency.
403 /// %s8 = addf %s7, %s6 : f32
404 /// affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
405 /// }
406 /// }
407 /// %c7 = constant 7 : index
408 /// %c42 = constant 42 : index
409 /// %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
410 /// return %res : f32
411 /// }
412 /// ```
413 ///
414 /// The -affine-vectorize pass with the following arguments:
415 /// ```
416 /// -affine-vectorize="virtual-vector-size=256 test-fastest-varying=0"
417 /// ```
418 ///
419 /// produces this standard innermost-loop vectorized code:
420 /// ```mlir
421 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
422 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
423 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
424 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
425 /// %cst = constant 1.0 : f32
426 /// %cst_0 = constant 2.0 : f32
427 /// affine.for %i0 = 0 to %arg0 {
428 /// affine.for %i1 = 0 to %arg1 step 256 {
429 /// %cst_1 = constant dense<vector<256xf32>, 1.0> :
430 /// vector<256xf32>
431 /// vector.transfer_write %cst_1, %0[%i0, %i1] :
432 /// vector<256xf32>, memref<?x?xf32>
433 /// }
434 /// }
435 /// affine.for %i2 = 0 to %arg0 {
436 /// affine.for %i3 = 0 to %arg1 step 256 {
437 /// %cst_2 = constant dense<vector<256xf32>, 2.0> :
438 /// vector<256xf32>
439 /// vector.transfer_write %cst_2, %1[%i2, %i3] :
440 /// vector<256xf32>, memref<?x?xf32>
441 /// }
442 /// }
443 /// affine.for %i4 = 0 to %arg0 {
444 /// affine.for %i5 = 0 to %arg1 step 256 {
445 /// %3 = vector.transfer_read %0[%i4, %i5] :
446 /// memref<?x?xf32>, vector<256xf32>
447 /// %4 = vector.transfer_read %1[%i4, %i5] :
448 /// memref<?x?xf32>, vector<256xf32>
449 /// %5 = addf %3, %4 : vector<256xf32>
450 /// %cst_3 = constant dense<vector<256xf32>, 1.0> :
451 /// vector<256xf32>
452 /// %6 = addf %5, %cst_3 : vector<256xf32>
453 /// %cst_4 = constant dense<vector<256xf32>, 2.0> :
454 /// vector<256xf32>
455 /// %7 = addf %5, %cst_4 : vector<256xf32>
456 /// %8 = addf %7, %6 : vector<256xf32>
457 /// vector.transfer_write %8, %2[%i4, %i5] :
458 /// vector<256xf32>, memref<?x?xf32>
459 /// }
460 /// }
461 /// %c7 = constant 7 : index
462 /// %c42 = constant 42 : index
463 /// %9 = load %2[%c7, %c42] : memref<?x?xf32>
464 /// return %9 : f32
465 /// }
466 /// ```
467 ///
468 /// The -affine-vectorize pass with the following arguments:
469 /// ```
470 /// -affine-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0"
471 /// ```
472 ///
473 /// produces this more interesting mixed outer-innermost-loop vectorized code:
474 /// ```mlir
475 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
476 /// %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
477 /// %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
478 /// %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
479 /// %cst = constant 1.0 : f32
480 /// %cst_0 = constant 2.0 : f32
481 /// affine.for %i0 = 0 to %arg0 step 32 {
482 /// affine.for %i1 = 0 to %arg1 step 256 {
483 /// %cst_1 = constant dense<vector<32x256xf32>, 1.0> :
484 /// vector<32x256xf32>
485 /// vector.transfer_write %cst_1, %0[%i0, %i1] :
486 /// vector<32x256xf32>, memref<?x?xf32>
487 /// }
488 /// }
489 /// affine.for %i2 = 0 to %arg0 step 32 {
490 /// affine.for %i3 = 0 to %arg1 step 256 {
491 /// %cst_2 = constant dense<vector<32x256xf32>, 2.0> :
492 /// vector<32x256xf32>
493 /// vector.transfer_write %cst_2, %1[%i2, %i3] :
494 /// vector<32x256xf32>, memref<?x?xf32>
495 /// }
496 /// }
497 /// affine.for %i4 = 0 to %arg0 step 32 {
498 /// affine.for %i5 = 0 to %arg1 step 256 {
499 /// %3 = vector.transfer_read %0[%i4, %i5] :
500 /// memref<?x?xf32> vector<32x256xf32>
501 /// %4 = vector.transfer_read %1[%i4, %i5] :
502 /// memref<?x?xf32>, vector<32x256xf32>
503 /// %5 = addf %3, %4 : vector<32x256xf32>
504 /// %cst_3 = constant dense<vector<32x256xf32>, 1.0> :
505 /// vector<32x256xf32>
506 /// %6 = addf %5, %cst_3 : vector<32x256xf32>
507 /// %cst_4 = constant dense<vector<32x256xf32>, 2.0> :
508 /// vector<32x256xf32>
509 /// %7 = addf %5, %cst_4 : vector<32x256xf32>
510 /// %8 = addf %7, %6 : vector<32x256xf32>
511 /// vector.transfer_write %8, %2[%i4, %i5] :
512 /// vector<32x256xf32>, memref<?x?xf32>
513 /// }
514 /// }
515 /// %c7 = constant 7 : index
516 /// %c42 = constant 42 : index
517 /// %9 = load %2[%c7, %c42] : memref<?x?xf32>
518 /// return %9 : f32
519 /// }
520 /// ```
521 ///
522 /// Of course, much more intricate n-D imperfectly-nested patterns can be
523 /// vectorized too and specified in a fully declarative fashion.
524
525 #define DEBUG_TYPE "early-vect"
526
527 using llvm::dbgs;
528 using llvm::SetVector;
529
530 /// Forward declaration.
531 static FilterFunctionType
532 isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops,
533 int fastestVaryingMemRefDimension);
534
535 /// Creates a vectorization pattern from the command line arguments.
536 /// Up to 3-D patterns are supported.
537 /// If the command line argument requests a pattern of higher order, returns an
538 /// empty pattern list which will conservatively result in no vectorization.
539 static std::vector<NestedPattern>
makePatterns(const DenseSet<Operation * > & parallelLoops,int vectorRank,ArrayRef<int64_t> fastestVaryingPattern)540 makePatterns(const DenseSet<Operation *> ¶llelLoops, int vectorRank,
541 ArrayRef<int64_t> fastestVaryingPattern) {
542 using matcher::For;
543 int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
544 int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
545 int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
546 switch (vectorRank) {
547 case 1:
548 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
549 case 2:
550 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
551 For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
552 case 3:
553 return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
554 For(isVectorizableLoopPtrFactory(parallelLoops, d1),
555 For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
556 default: {
557 return std::vector<NestedPattern>();
558 }
559 }
560 }
561
vectorTransferPattern()562 static NestedPattern &vectorTransferPattern() {
563 static auto pattern = matcher::Op([](Operation &op) {
564 return isa<vector::TransferReadOp, vector::TransferWriteOp>(op);
565 });
566 return pattern;
567 }
568
569 namespace {
570
571 /// Base state for the vectorize pass.
572 /// Command line arguments are preempted by non-empty pass arguments.
573 struct Vectorize : public AffineVectorizeBase<Vectorize> {
574 Vectorize() = default;
575 Vectorize(ArrayRef<int64_t> virtualVectorSize);
576 void runOnFunction() override;
577 };
578
579 } // end anonymous namespace
580
Vectorize(ArrayRef<int64_t> virtualVectorSize)581 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
582 vectorSizes = virtualVectorSize;
583 }
584
585 /////// TODO: Hoist to a VectorizationStrategy.cpp when appropriate.
586 /////////
587 namespace {
588
589 struct VectorizationStrategy {
590 SmallVector<int64_t, 8> vectorSizes;
591 DenseMap<Operation *, unsigned> loopToVectorDim;
592 };
593
594 } // end anonymous namespace
595
vectorizeLoopIfProfitable(Operation * loop,unsigned depthInPattern,unsigned patternDepth,VectorizationStrategy * strategy)596 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
597 unsigned patternDepth,
598 VectorizationStrategy *strategy) {
599 assert(patternDepth > depthInPattern &&
600 "patternDepth is greater than depthInPattern");
601 if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
602 // Don't vectorize this loop
603 return;
604 }
605 strategy->loopToVectorDim[loop] =
606 strategy->vectorSizes.size() - (patternDepth - depthInPattern);
607 }
608
609 /// Implements a simple strawman strategy for vectorization.
610 /// Given a matched pattern `matches` of depth `patternDepth`, this strategy
611 /// greedily assigns the fastest varying dimension ** of the vector ** to the
612 /// innermost loop in the pattern.
613 /// When coupled with a pattern that looks for the fastest varying dimension in
614 /// load/store MemRefs, this creates a generic vectorization strategy that works
615 /// for any loop in a hierarchy (outermost, innermost or intermediate).
616 ///
617 /// TODO: In the future we should additionally increase the power of the
618 /// profitability analysis along 3 directions:
619 /// 1. account for loop extents (both static and parametric + annotations);
620 /// 2. account for data layout permutations;
621 /// 3. account for impact of vectorization on maximal loop fusion.
622 /// Then we can quantify the above to build a cost model and search over
623 /// strategies.
analyzeProfitability(ArrayRef<NestedMatch> matches,unsigned depthInPattern,unsigned patternDepth,VectorizationStrategy * strategy)624 static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
625 unsigned depthInPattern,
626 unsigned patternDepth,
627 VectorizationStrategy *strategy) {
628 for (auto m : matches) {
629 if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1,
630 patternDepth, strategy))) {
631 return failure();
632 }
633 vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern,
634 patternDepth, strategy);
635 }
636 return success();
637 }
638
639 ///// end TODO: Hoist to a VectorizationStrategy.cpp when appropriate /////
640
641 namespace {
642
643 struct VectorizationState {
644 /// Adds an entry of pre/post vectorization operations in the state.
645 void registerReplacement(Operation *key, Operation *value);
646 /// When the current vectorization pattern is successful, this erases the
647 /// operations that were marked for erasure in the proper order and resets
648 /// the internal state for the next pattern.
649 void finishVectorizationPattern();
650
651 // In-order tracking of original Operation that have been vectorized.
652 // Erase in reverse order.
653 SmallVector<Operation *, 16> toErase;
654 // Set of Operation that have been vectorized (the values in the
655 // vectorizationMap for hashed access). The vectorizedSet is used in
656 // particular to filter the operations that have already been vectorized by
657 // this pattern, when iterating over nested loops in this pattern.
658 DenseSet<Operation *> vectorizedSet;
659 // Map of old scalar Operation to new vectorized Operation.
660 DenseMap<Operation *, Operation *> vectorizationMap;
661 // Map of old scalar Value to new vectorized Value.
662 DenseMap<Value, Value> replacementMap;
663 // The strategy drives which loop to vectorize by which amount.
664 const VectorizationStrategy *strategy;
665 // Use-def roots. These represent the starting points for the worklist in the
666 // vectorizeNonTerminals function. They consist of the subset of load
667 // operations that have been vectorized. They can be retrieved from
668 // `vectorizationMap` but it is convenient to keep track of them in a separate
669 // data structure.
670 DenseSet<Operation *> roots;
671 // Terminal operations for the worklist in the vectorizeNonTerminals
672 // function. They consist of the subset of store operations that have been
673 // vectorized. They can be retrieved from `vectorizationMap` but it is
674 // convenient to keep track of them in a separate data structure. Since they
675 // do not necessarily belong to use-def chains starting from loads (e.g
676 // storing a constant), we need to handle them in a post-pass.
677 DenseSet<Operation *> terminals;
678 // Checks that the type of `op` is AffineStoreOp and adds it to the terminals
679 // set.
680 void registerTerminal(Operation *op);
681 // Folder used to factor out constant creation.
682 OperationFolder *folder;
683
684 private:
685 void registerReplacement(Value key, Value value);
686 };
687
688 } // end namespace
689
registerReplacement(Operation * key,Operation * value)690 void VectorizationState::registerReplacement(Operation *key, Operation *value) {
691 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: ");
692 LLVM_DEBUG(key->print(dbgs()));
693 LLVM_DEBUG(dbgs() << " into ");
694 LLVM_DEBUG(value->print(dbgs()));
695 assert(key->getNumResults() == 1 && "already registered");
696 assert(value->getNumResults() == 1 && "already registered");
697 assert(vectorizedSet.count(value) == 0 && "already registered");
698 assert(vectorizationMap.count(key) == 0 && "already registered");
699 toErase.push_back(key);
700 vectorizedSet.insert(value);
701 vectorizationMap.insert(std::make_pair(key, value));
702 registerReplacement(key->getResult(0), value->getResult(0));
703 if (isa<AffineLoadOp>(key)) {
704 assert(roots.count(key) == 0 && "root was already inserted previously");
705 roots.insert(key);
706 }
707 }
708
registerTerminal(Operation * op)709 void VectorizationState::registerTerminal(Operation *op) {
710 assert(isa<AffineStoreOp>(op) && "terminal must be a AffineStoreOp");
711 assert(terminals.count(op) == 0 &&
712 "terminal was already inserted previously");
713 terminals.insert(op);
714 }
715
finishVectorizationPattern()716 void VectorizationState::finishVectorizationPattern() {
717 while (!toErase.empty()) {
718 auto *op = toErase.pop_back_val();
719 LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: ");
720 LLVM_DEBUG(op->print(dbgs()));
721 op->erase();
722 }
723 }
724
registerReplacement(Value key,Value value)725 void VectorizationState::registerReplacement(Value key, Value value) {
726 assert(replacementMap.count(key) == 0 && "replacement already registered");
727 replacementMap.insert(std::make_pair(key, value));
728 }
729
730 // Apply 'map' with 'mapOperands' returning resulting values in 'results'.
computeMemoryOpIndices(Operation * op,AffineMap map,ValueRange mapOperands,SmallVectorImpl<Value> & results)731 static void computeMemoryOpIndices(Operation *op, AffineMap map,
732 ValueRange mapOperands,
733 SmallVectorImpl<Value> &results) {
734 OpBuilder builder(op);
735 for (auto resultExpr : map.getResults()) {
736 auto singleResMap =
737 AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
738 auto afOp =
739 builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands);
740 results.push_back(afOp);
741 }
742 }
743
744 ////// TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ////
745
746 /// Handles the vectorization of load and store MLIR operations.
747 ///
748 /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call.
749 /// They are vectorized immediately. The resulting vector.transfer_read is
750 /// immediately registered to replace all uses of the AffineLoadOp in this
751 /// pattern's scope.
752 ///
753 /// AffineStoreOp are the terminals of the vectorizeNonTerminals call. They
754 /// need to be vectorized late once all the use-def chains have been traversed.
755 /// Additionally, they may have ssa-values operands which come from outside the
756 /// scope of the current pattern.
757 /// Such special cases force us to delay the vectorization of the stores until
758 /// the last step. Here we merely register the store operation.
759 template <typename LoadOrStoreOpPointer>
vectorizeRootOrTerminal(Value iv,LoadOrStoreOpPointer memoryOp,VectorizationState * state)760 static LogicalResult vectorizeRootOrTerminal(Value iv,
761 LoadOrStoreOpPointer memoryOp,
762 VectorizationState *state) {
763 auto memRefType = memoryOp.getMemRef().getType().template cast<MemRefType>();
764
765 auto elementType = memRefType.getElementType();
766 // TODO: ponder whether we want to further vectorize a vector value.
767 assert(VectorType::isValidElementType(elementType) &&
768 "Not a valid vector element type");
769 auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);
770
771 // Materialize a MemRef with 1 vector.
772 auto *opInst = memoryOp.getOperation();
773 // For now, vector.transfers must be aligned, operate only on indices with an
774 // identity subset of AffineMap and do not change layout.
775 // TODO: increase the expressiveness power of vector.transfer operations
776 // as needed by various targets.
777 if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
778 OpBuilder b(opInst);
779 ValueRange mapOperands = load.getMapOperands();
780 SmallVector<Value, 8> indices;
781 indices.reserve(load.getMemRefType().getRank());
782 if (load.getAffineMap() !=
783 b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
784 computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices);
785 } else {
786 indices.append(mapOperands.begin(), mapOperands.end());
787 }
788 auto permutationMap =
789 makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
790 if (!permutationMap)
791 return LogicalResult::Failure;
792 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
793 LLVM_DEBUG(permutationMap.print(dbgs()));
794 auto transfer = b.create<vector::TransferReadOp>(
795 opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices,
796 permutationMap);
797 state->registerReplacement(opInst, transfer.getOperation());
798 } else {
799 state->registerTerminal(opInst);
800 }
801 return success();
802 }
803 /// end TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ///
804
805 /// Coarsens the loops bounds and transforms all remaining load and store
806 /// operations into the appropriate vector.transfer.
vectorizeAffineForOp(AffineForOp loop,int64_t step,VectorizationState * state)807 static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
808 VectorizationState *state) {
809 loop.setStep(step);
810
811 FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
812 if (!matcher::isLoadOrStore(op)) {
813 return false;
814 }
815 return state->vectorizationMap.count(&op) == 0 &&
816 state->vectorizedSet.count(&op) == 0 &&
817 state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
818 };
819 auto loadAndStores = matcher::Op(notVectorizedThisPattern);
820 SmallVector<NestedMatch, 8> loadAndStoresMatches;
821 loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
822 for (auto ls : loadAndStoresMatches) {
823 auto *opInst = ls.getMatchedOperation();
824 auto load = dyn_cast<AffineLoadOp>(opInst);
825 auto store = dyn_cast<AffineStoreOp>(opInst);
826 LLVM_DEBUG(opInst->print(dbgs()));
827 LogicalResult result =
828 load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
829 : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
830 if (failed(result)) {
831 return failure();
832 }
833 }
834 return success();
835 }
836
837 /// Returns a FilterFunctionType that can be used in NestedPattern to match a
838 /// loop whose underlying load/store accesses are either invariant or all
839 // varying along the `fastestVaryingMemRefDimension`.
840 static FilterFunctionType
isVectorizableLoopPtrFactory(const DenseSet<Operation * > & parallelLoops,int fastestVaryingMemRefDimension)841 isVectorizableLoopPtrFactory(const DenseSet<Operation *> ¶llelLoops,
842 int fastestVaryingMemRefDimension) {
843 return [¶llelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
844 auto loop = cast<AffineForOp>(forOp);
845 auto parallelIt = parallelLoops.find(loop);
846 if (parallelIt == parallelLoops.end())
847 return false;
848 int memRefDim = -1;
849 auto vectorizableBody =
850 isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern());
851 if (!vectorizableBody)
852 return false;
853 return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
854 memRefDim == fastestVaryingMemRefDimension;
855 };
856 }
857
858 /// Apply vectorization of `loop` according to `state`. This is only triggered
859 /// if all vectorizations in `childrenMatches` have already succeeded
860 /// recursively in DFS post-order.
861 static LogicalResult
vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,VectorizationState * state)862 vectorizeLoopsAndLoadsRecursively(NestedMatch oneMatch,
863 VectorizationState *state) {
864 auto *loopInst = oneMatch.getMatchedOperation();
865 auto loop = cast<AffineForOp>(loopInst);
866 auto childrenMatches = oneMatch.getMatchedChildren();
867
868 // 1. DFS postorder recursion, if any of my children fails, I fail too.
869 for (auto m : childrenMatches) {
870 if (failed(vectorizeLoopsAndLoadsRecursively(m, state))) {
871 return failure();
872 }
873 }
874
875 // 2. This loop may have been omitted from vectorization for various reasons
876 // (e.g. due to the performance model or pattern depth > vector size).
877 auto it = state->strategy->loopToVectorDim.find(loopInst);
878 if (it == state->strategy->loopToVectorDim.end()) {
879 return success();
880 }
881
882 // 3. Actual post-order transformation.
883 auto vectorDim = it->second;
884 assert(vectorDim < state->strategy->vectorSizes.size() &&
885 "vector dim overflow");
886 // a. get actual vector size
887 auto vectorSize = state->strategy->vectorSizes[vectorDim];
888 // b. loop transformation for early vectorization is still subject to
889 // exploratory tradeoffs (see top of the file). Apply coarsening, i.e.:
890 // | ub -> ub
891 // | step -> step * vectorSize
892 LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
893 << " : ");
894 LLVM_DEBUG(loopInst->print(dbgs()));
895 return vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state);
896 }
897
898 /// Tries to transform a scalar constant into a vector splat of that constant.
899 /// Returns the vectorized splat operation if the constant is a valid vector
900 /// element type.
901 /// If `type` is not a valid vector type or if the scalar constant is not a
902 /// valid vector element type, returns nullptr.
vectorizeConstant(Operation * op,ConstantOp constant,Type type)903 static Value vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
904 if (!type || !type.isa<VectorType>() ||
905 !VectorType::isValidElementType(constant.getType())) {
906 return nullptr;
907 }
908 OpBuilder b(op);
909 Location loc = op->getLoc();
910 auto vectorType = type.cast<VectorType>();
911 auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
912 auto *constantOpInst = constant.getOperation();
913
914 OperationState state(loc, constantOpInst->getName().getStringRef(), {},
915 {vectorType}, {b.getNamedAttr("value", attr)});
916
917 return b.createOperation(state)->getResult(0);
918 }
919
920 /// Tries to vectorize a given operand `op` of Operation `op` during
921 /// def-chain propagation or during terminal vectorization, by applying the
922 /// following logic:
923 /// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized
924 /// useby -def propagation), `op` is already in the proper vector form;
925 /// 2. otherwise, the `op` may be in some other vector form that fails to
926 /// vectorize atm (i.e. broadcasting required), returns nullptr to indicate
927 /// failure;
928 /// 3. if the `op` is a constant, returns the vectorized form of the constant;
929 /// 4. non-constant scalars are currently non-vectorizable, in particular to
930 /// guard against vectorizing an index which may be loop-variant and needs
931 /// special handling.
932 ///
933 /// In particular this logic captures some of the use cases where definitions
934 /// that are not scoped under the current pattern are needed to vectorize.
935 /// One such example is top level function constants that need to be splatted.
936 ///
937 /// Returns an operand that has been vectorized to match `state`'s strategy if
938 /// vectorization is possible with the above logic. Returns nullptr otherwise.
939 ///
940 /// TODO: handle more complex cases.
vectorizeOperand(Value operand,Operation * op,VectorizationState * state)941 static Value vectorizeOperand(Value operand, Operation *op,
942 VectorizationState *state) {
943 LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: " << operand);
944 // 1. If this value has already been vectorized this round, we are done.
945 if (state->vectorizedSet.count(operand.getDefiningOp()) > 0) {
946 LLVM_DEBUG(dbgs() << " -> already vector operand");
947 return operand;
948 }
949 // 1.b. Delayed on-demand replacement of a use.
950 // Note that we cannot just call replaceAllUsesWith because it may result
951 // in ops with mixed types, for ops whose operands have not all yet
952 // been vectorized. This would be invalid IR.
953 auto it = state->replacementMap.find(operand);
954 if (it != state->replacementMap.end()) {
955 auto res = it->second;
956 LLVM_DEBUG(dbgs() << "-> delayed replacement by: " << res);
957 return res;
958 }
959 // 2. TODO: broadcast needed.
960 if (operand.getType().isa<VectorType>()) {
961 LLVM_DEBUG(dbgs() << "-> non-vectorizable");
962 return nullptr;
963 }
964 // 3. vectorize constant.
965 if (auto constant = operand.getDefiningOp<ConstantOp>()) {
966 return vectorizeConstant(
967 op, constant,
968 VectorType::get(state->strategy->vectorSizes, operand.getType()));
969 }
970 // 4. currently non-vectorizable.
971 LLVM_DEBUG(dbgs() << "-> non-vectorizable: " << operand);
972 return nullptr;
973 }
974
975 /// Encodes Operation-specific behavior for vectorization. In general we assume
976 /// that all operands of an op must be vectorized but this is not always true.
977 /// In the future, it would be nice to have a trait that describes how a
978 /// particular operation vectorizes. For now we implement the case distinction
979 /// here.
980 /// Returns a vectorized form of an operation or nullptr if vectorization fails.
981 // TODO: consider adding a trait to Op to describe how it gets vectorized.
982 // Maybe some Ops are not vectorizable or require some tricky logic, we cannot
983 // do one-off logic here; ideally it would be TableGen'd.
vectorizeOneOperation(Operation * opInst,VectorizationState * state)984 static Operation *vectorizeOneOperation(Operation *opInst,
985 VectorizationState *state) {
986 // Sanity checks.
987 assert(!isa<AffineLoadOp>(opInst) &&
988 "all loads must have already been fully vectorized independently");
989 assert(!isa<vector::TransferReadOp>(opInst) &&
990 "vector.transfer_read cannot be further vectorized");
991 assert(!isa<vector::TransferWriteOp>(opInst) &&
992 "vector.transfer_write cannot be further vectorized");
993
994 if (auto store = dyn_cast<AffineStoreOp>(opInst)) {
995 OpBuilder b(opInst);
996 auto memRef = store.getMemRef();
997 auto value = store.getValueToStore();
998 auto vectorValue = vectorizeOperand(value, opInst, state);
999 if (!vectorValue)
1000 return nullptr;
1001
1002 ValueRange mapOperands = store.getMapOperands();
1003 SmallVector<Value, 8> indices;
1004 indices.reserve(store.getMemRefType().getRank());
1005 if (store.getAffineMap() !=
1006 b.getMultiDimIdentityMap(store.getMemRefType().getRank())) {
1007 computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands,
1008 indices);
1009 } else {
1010 indices.append(mapOperands.begin(), mapOperands.end());
1011 }
1012
1013 auto permutationMap =
1014 makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
1015 if (!permutationMap)
1016 return nullptr;
1017 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1018 LLVM_DEBUG(permutationMap.print(dbgs()));
1019 auto transfer = b.create<vector::TransferWriteOp>(
1020 opInst->getLoc(), vectorValue, memRef, indices, permutationMap);
1021 auto *res = transfer.getOperation();
1022 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
1023 // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
1024 opInst->erase();
1025 return res;
1026 }
1027 if (opInst->getNumRegions() != 0)
1028 return nullptr;
1029
1030 SmallVector<Type, 8> vectorTypes;
1031 for (auto v : opInst->getResults()) {
1032 vectorTypes.push_back(
1033 VectorType::get(state->strategy->vectorSizes, v.getType()));
1034 }
1035 SmallVector<Value, 8> vectorOperands;
1036 for (auto v : opInst->getOperands()) {
1037 vectorOperands.push_back(vectorizeOperand(v, opInst, state));
1038 }
1039 // Check whether a single operand is null. If so, vectorization failed.
1040 bool success = llvm::all_of(vectorOperands, [](Value op) { return op; });
1041 if (!success) {
1042 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
1043 return nullptr;
1044 }
1045
1046 // Create a clone of the op with the proper operands and return types.
1047 // TODO: The following assumes there is always an op with a fixed
1048 // name that works both in scalar mode and vector mode.
1049 // TODO: Is it worth considering an Operation.clone operation which
1050 // changes the type so we can promote an Operation with less boilerplate?
1051 OpBuilder b(opInst);
1052 OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
1053 vectorOperands, vectorTypes, opInst->getAttrs(),
1054 /*successors=*/{}, /*regions=*/{});
1055 return b.createOperation(newOp);
1056 }
1057
1058 /// Iterates over the forward slice from the loads in the vectorization pattern
1059 /// and rewrites them using their vectorized counterpart by:
1060 /// 1. Create the forward slice starting from the loads in the vectorization
1061 /// pattern.
1062 /// 2. Topologically sorts the forward slice.
1063 /// 3. For each operation in the slice, create the vector form of this
1064 /// operation, replacing each operand by a replacement operands retrieved from
1065 /// replacementMap. If any such replacement is missing, vectorization fails.
vectorizeNonTerminals(VectorizationState * state)1066 static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
1067 // 1. create initial worklist with the uses of the roots.
1068 SetVector<Operation *> worklist;
1069 // Note: state->roots have already been vectorized and must not be vectorized
1070 // again. This fits `getForwardSlice` which does not insert `op` in the
1071 // result.
1072 // Note: we have to exclude terminals because some of their defs may not be
1073 // nested under the vectorization pattern (e.g. constants defined in an
1074 // encompassing scope).
1075 // TODO: Use a backward slice for terminals, avoid special casing and
1076 // merge implementations.
1077 for (auto *op : state->roots) {
1078 getForwardSlice(op, &worklist, [state](Operation *op) {
1079 return state->terminals.count(op) == 0; // propagate if not terminal
1080 });
1081 }
1082 // We merged multiple slices, topological order may not hold anymore.
1083 worklist = topologicalSort(worklist);
1084
1085 for (unsigned i = 0; i < worklist.size(); ++i) {
1086 auto *op = worklist[i];
1087 LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
1088 LLVM_DEBUG(op->print(dbgs()));
1089
1090 // Create vector form of the operation.
1091 // Insert it just before op, on success register op as replaced.
1092 auto *vectorizedInst = vectorizeOneOperation(op, state);
1093 if (!vectorizedInst) {
1094 return failure();
1095 }
1096
1097 // 3. Register replacement for future uses in the scope.
1098 // Note that we cannot just call replaceAllUsesWith because it may
1099 // result in ops with mixed types, for ops whose operands have not all
1100 // yet been vectorized. This would be invalid IR.
1101 state->registerReplacement(op, vectorizedInst);
1102 }
1103 return success();
1104 }
1105
1106 /// Vectorization is a recursive procedure where anything below can fail.
1107 /// The root match thus needs to maintain a clone for handling failure.
1108 /// Each root may succeed independently but will otherwise clean after itself if
1109 /// anything below it fails.
vectorizeRootMatch(NestedMatch m,VectorizationStrategy * strategy)1110 static LogicalResult vectorizeRootMatch(NestedMatch m,
1111 VectorizationStrategy *strategy) {
1112 auto loop = cast<AffineForOp>(m.getMatchedOperation());
1113 OperationFolder folder(loop.getContext());
1114 VectorizationState state;
1115 state.strategy = strategy;
1116 state.folder = &folder;
1117
1118 // Since patterns are recursive, they can very well intersect.
1119 // Since we do not want a fully greedy strategy in general, we decouple
1120 // pattern matching, from profitability analysis, from application.
1121 // As a consequence we must check that each root pattern is still
1122 // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
1123 // TODO: implement a non-greedy profitability analysis that keeps only
1124 // non-intersecting patterns.
1125 if (!isVectorizableLoopBody(loop, vectorTransferPattern())) {
1126 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
1127 return failure();
1128 }
1129
1130 /// Sets up error handling for this root loop. This is how the root match
1131 /// maintains a clone for handling failure and restores the proper state via
1132 /// RAII.
1133 auto *loopInst = loop.getOperation();
1134 OpBuilder builder(loopInst);
1135 auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
1136 struct Guard {
1137 LogicalResult failure() {
1138 loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar());
1139 loop.erase();
1140 return mlir::failure();
1141 }
1142 LogicalResult success() {
1143 clonedLoop.erase();
1144 return mlir::success();
1145 }
1146 AffineForOp loop;
1147 AffineForOp clonedLoop;
1148 } guard{loop, clonedLoop};
1149
1150 //////////////////////////////////////////////////////////////////////////////
1151 // Start vectorizing.
1152 // From now on, any error triggers the scope guard above.
1153 //////////////////////////////////////////////////////////////////////////////
1154 // 1. Vectorize all the loops matched by the pattern, recursively.
1155 // This also vectorizes the roots (AffineLoadOp) as well as registers the
1156 // terminals (AffineStoreOp) for post-processing vectorization (we need to
1157 // wait for all use-def chains into them to be vectorized first).
1158 if (failed(vectorizeLoopsAndLoadsRecursively(m, &state))) {
1159 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
1160 return guard.failure();
1161 }
1162
1163 // 2. Vectorize operations reached by use-def chains from root except the
1164 // terminals (store operations) that need to be post-processed separately.
1165 // TODO: add more as we expand.
1166 if (failed(vectorizeNonTerminals(&state))) {
1167 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
1168 return guard.failure();
1169 }
1170
1171 // 3. Post-process terminals.
1172 // Note: we have to post-process terminals because some of their defs may not
1173 // be nested under the vectorization pattern (e.g. constants defined in an
1174 // encompassing scope).
1175 // TODO: Use a backward slice for terminals, avoid special casing and
1176 // merge implementations.
1177 for (auto *op : state.terminals) {
1178 if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
1179 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals");
1180 return guard.failure();
1181 }
1182 }
1183
1184 // 4. Finish this vectorization pattern.
1185 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
1186 state.finishVectorizationPattern();
1187 return guard.success();
1188 }
1189
1190 /// Applies vectorization to the current Function by searching over a bunch of
1191 /// predetermined patterns.
runOnFunction()1192 void Vectorize::runOnFunction() {
1193 FuncOp f = getFunction();
1194 if (!fastestVaryingPattern.empty() &&
1195 fastestVaryingPattern.size() != vectorSizes.size()) {
1196 f.emitRemark("Fastest varying pattern specified with different size than "
1197 "the vector size.");
1198 return signalPassFailure();
1199 }
1200
1201 // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1202 NestedPatternContext mlContext;
1203
1204 DenseSet<Operation *> parallelLoops;
1205 f.walk([¶llelLoops](AffineForOp loop) {
1206 if (isLoopParallel(loop))
1207 parallelLoops.insert(loop);
1208 });
1209
1210 for (auto &pat :
1211 makePatterns(parallelLoops, vectorSizes.size(), fastestVaryingPattern)) {
1212 LLVM_DEBUG(dbgs() << "\n******************************************");
1213 LLVM_DEBUG(dbgs() << "\n******************************************");
1214 LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on Function\n");
1215 LLVM_DEBUG(f.print(dbgs()));
1216 unsigned patternDepth = pat.getDepth();
1217
1218 SmallVector<NestedMatch, 8> matches;
1219 pat.match(f, &matches);
1220 // Iterate over all the top-level matches and vectorize eagerly.
1221 // This automatically prunes intersecting matches.
1222 for (auto m : matches) {
1223 VectorizationStrategy strategy;
1224 // TODO: depending on profitability, elect to reduce the vector size.
1225 strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
1226 if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth,
1227 &strategy))) {
1228 continue;
1229 }
1230 vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth,
1231 &strategy);
1232 // TODO: if pattern does not apply, report it; alter the
1233 // cost/benefit.
1234 vectorizeRootMatch(m, &strategy);
1235 // TODO: some diagnostics if failure to vectorize occurs.
1236 }
1237 }
1238 LLVM_DEBUG(dbgs() << "\n");
1239 }
1240
1241 std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize)1242 mlir::createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
1243 return std::make_unique<Vectorize>(virtualVectorSize);
1244 }
createSuperVectorizePass()1245 std::unique_ptr<OperationPass<FuncOp>> mlir::createSuperVectorizePass() {
1246 return std::make_unique<Vectorize>();
1247 }
1248