1 //===- polly/ScheduleOptimizer.h - The Schedule Optimizer -------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef POLLY_SCHEDULEOPTIMIZER_H
10 #define POLLY_SCHEDULEOPTIMIZER_H
11 
12 #include "llvm/ADT/ArrayRef.h"
13 #include "isl/isl-noexceptions.h"
14 
15 namespace llvm {
16 
17 class TargetTransformInfo;
18 } // namespace llvm
19 
20 struct isl_schedule_node;
21 
/// Parameters of the micro kernel.
///
/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
/// used in the optimized matrix multiplication.
struct MicroKernelParamsTy {
  // Zero-initialized so a default-constructed instance is well-defined,
  // matching the MatMulInfoTy convention in this file. Aggregate
  // initialization (e.g. {Mr, Nr}) keeps working in C++14 and later.
  int Mr = 0; // Extent of the first (M) dimension of the rank-1 update.
  int Nr = 0; // Extent of the second (N) dimension of the rank-1 update.
};
30 
/// Parameters of the macro kernel.
///
/// Parameters, which determine sizes of blocks of partitioned matrices
/// used in the optimized matrix multiplication.
struct MacroKernelParamsTy {
  // Zero-initialized so a default-constructed instance is well-defined,
  // matching the MatMulInfoTy convention in this file. Aggregate
  // initialization (e.g. {Mc, Nc, Kc}) keeps working in C++14 and later.
  int Mc = 0; // Block size for the M dimension of the partitioned matrices.
  int Nc = 0; // Block size for the N dimension of the partitioned matrices.
  int Kc = 0; // Block size for the K dimension of the partitioned matrices.
};
40 
41 namespace polly {
42 
43 struct Dependences;
44 class MemoryAccess;
45 class Scop;
46 
47 /// Additional parameters of the schedule optimizer.
48 ///
49 /// Target Transform Info and the SCoP dependencies used by the schedule
50 /// optimizer.
51 struct OptimizerAdditionalInfoTy {
52   const llvm::TargetTransformInfo *TTI;
53   const Dependences *D;
54 };
55 
/// Parameters of the matrix multiplication operands.
///
/// Parameters, which describe access relations that represent operands of the
/// matrix multiplication.
struct MatMulInfoTy {
  MemoryAccess *A = nullptr;        // Read access to the first operand.
  MemoryAccess *B = nullptr;        // Read access to the second operand.
  MemoryAccess *ReadFromC = nullptr;  // Read access to the result matrix C.
  MemoryAccess *WriteToC = nullptr;   // Write access to the result matrix C.
  // Indexes identifying the three loop dimensions of the matmul pattern
  // (presumably filled in by ScheduleTreeOptimizer::isMatrMultPattern, which
  // takes this struct by non-const reference); -1 means "not yet identified".
  int i = -1;
  int j = -1;
  int k = -1;
};
69 
70 extern bool DisablePollyTiling;
71 } // namespace polly
72 
class ScheduleTreeOptimizer {
public:
  /// Apply schedule tree transformations.
  ///
  /// This function takes a (possibly already optimized) schedule tree and
  /// applies a set of additional optimizations on the schedule tree. The
  /// transformations applied include:
  ///
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Schedule The schedule object the transformations will be applied
  ///                 to.
  /// @param OAI      Target Transform Info and the SCoP dependencies.
  /// @returns        The transformed schedule.
  static isl::schedule
  optimizeSchedule(isl::schedule Schedule,
                   const polly::OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Apply schedule tree transformations.
  ///
  /// This function takes a node in a (possibly already optimized) schedule
  /// tree and applies a set of additional optimizations on this schedule tree
  /// node and its descendants. The transformations applied include:
  ///
  ///   - Tiling
  ///   - Prevectorization
  ///
  /// @param Node The schedule object post-transformations will be applied to.
  /// @param OAI  Target Transform Info and the SCoP dependencies.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeScheduleNode(isl::schedule_node Node,
                       const polly::OptimizerAdditionalInfoTy *OAI = nullptr);

  /// Decide if the @p NewSchedule is profitable for @p S.
  ///
  /// @param S           The SCoP we optimize.
  /// @param NewSchedule The new schedule we computed.
  ///
  /// @return True, if we believe @p NewSchedule is an improvement for @p S.
  static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);

  /// Isolate a set of partial tile prefixes.
  ///
  /// This set should ensure that it contains only partial tile prefixes that
  /// have exactly VectorWidth iterations.
  ///
  /// @param Node        A schedule node band, which is a parent of a band
  ///                    node, that contains a vector loop.
  /// @param VectorWidth The number of iterations that the isolated partial
  ///                    tile prefixes are required to have.
  /// @return Modified isl_schedule_node.
  static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
                                                    int VectorWidth);

private:
  /// Tile a schedule node.
  ///
  /// @param Node            The node to tile.
  /// @param Identifier      A name that identifies this kind of tiling and
  ///                        that is used to mark the tiled loops in the
  ///                        generated AST.
  /// @param TileSizes       A vector of tile sizes that should be used for
  ///                        tiling.
  /// @param DefaultTileSize A default tile size that is used for dimensions
  ///                        that are not covered by the TileSizes vector.
  static isl::schedule_node tileNode(isl::schedule_node Node,
                                     const char *Identifier,
                                     llvm::ArrayRef<int> TileSizes,
                                     int DefaultTileSize);

  /// Tile a schedule node and unroll point loops.
  ///
  /// @param Node            The node to register tile.
  /// @param TileSizes       A vector of tile sizes that should be used for
  ///                        tiling.
  /// @param DefaultTileSize A default tile size that is used for dimensions
  ///                        that are not covered by the TileSizes vector.
  static isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
                                                llvm::ArrayRef<int> TileSizes,
                                                int DefaultTileSize);

  /// Apply the BLIS matmul optimization pattern.
  ///
  /// Make the loops containing the matrix multiplication be the innermost
  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
  /// gemm as three nested loops around a macro-kernel, plus two packing
  /// routines. The macro-kernel is implemented in terms of two additional
  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
  /// (i.e., outer product) update.
  ///
  /// For a detailed description please see [1].
  ///
  /// The order of the loops defines the data reused in the BLIS implementation
  /// of gemm ([1]). In particular, elements of the matrix B, the second
  /// operand of matrix multiplication, are reused between iterations of the
  /// innermost loop. To keep the reused data in cache, only elements of matrix
  /// A, the first operand of matrix multiplication, should be evicted during
  /// an iteration of the innermost loop. To provide such a cache replacement
  /// policy, elements of the matrix A can, in particular, be loaded first and,
  /// consequently, be least-recently-used.
  ///
  /// In our case matrices are stored in row-major order instead of
  /// column-major order used in the BLIS implementation ([1]). It affects only
  /// on the form of the BLIS micro kernel and the computation of its
  /// parameters. In particular, reused elements of the matrix B are
  /// successively multiplied by specific elements of the matrix A.
  ///
  /// Refs.:
  /// [1] - Analytical Modeling is Enough for High Performance BLIS
  /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
  /// Technical Report, 2014
  /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
  ///
  /// @see ScheduleTreeOptimizer::createMicroKernel
  /// @see ScheduleTreeOptimizer::createMacroKernel
  /// @see getMicroKernelParams
  /// @see getMacroKernelParams
  ///
  /// TODO: Implement the packing transformation.
  ///
  /// @param Node The node that contains a band to be optimized. The node
  ///             is required to successfully pass
  ///             ScheduleTreeOptimizer::isMatrMultPattern.
  /// @param TTI  Target Transform Info.
  /// @param MMI  Parameters of the matrix multiplication operands.
  /// @returns    The transformed schedule.
  static isl::schedule_node
  optimizeMatMulPattern(isl::schedule_node Node,
                        const llvm::TargetTransformInfo *TTI,
                        polly::MatMulInfoTy &MMI);

  /// Check if this node is a band node we want to tile.
  ///
  /// We look for innermost band nodes where individual dimensions are marked as
  /// permutable.
  ///
  /// @param Node The node to check.
  static bool isTileableBandNode(isl::schedule_node Node);

  /// Pre-vectorizes one scheduling dimension of a schedule band.
  ///
  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
  /// sinks the resulting point loop.
  ///
  /// Example (DimToVectorize=0, VectorWidth=4):
  ///
  /// | Before transformation:
  /// |
  /// | A[i,j] -> [i,j]
  /// |
  /// | for (i = 0; i < 128; i++)
  /// |    for (j = 0; j < 128; j++)
  /// |      A(i,j);
  ///
  /// | After transformation:
  /// |
  /// | for (it = 0; it < 32; it+=1)
  /// |    for (j = 0; j < 128; j++)
  /// |      for (ip = 0; ip <= 3; ip++)
  /// |        A(4 * it + ip,j);
  ///
  /// The goal of this transformation is to create a trivially vectorizable
  /// loop.  This means a parallel loop at the innermost level that has a
  /// constant number of iterations corresponding to the target vector width.
  ///
  /// This transformation creates a loop at the innermost level. The loop has
  /// a constant number of iterations, if the number of loop iterations at
  /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
  /// currently constant and not yet target specific. This function does not
  /// reason about parallelism.
  ///
  /// @param Node           The schedule node that contains the band to be
  ///                       prevectorized.
  /// @param DimToVectorize The band dimension that is split out and tiled.
  /// @param VectorWidth    The tile size used for the split-out dimension,
  ///                       i.e. the target number of vector iterations.
  static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
                                             unsigned DimToVectorize,
                                             int VectorWidth);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We are looking for an innermost band node and apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///      - if the band is tileable
  ///      - if the band has more than one loop dimension
  ///
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User A pointer to forward some use information
  ///        (currently unused).
  static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);

  /// Apply additional optimizations on the bands in the schedule tree.
  ///
  /// We apply the following
  /// transformations:
  ///
  ///  - Tile the band
  ///  - Prevectorize the schedule of the band (or the point loop in case of
  ///    tiling).
  ///      - if vectorization is enabled
  ///
  /// @param Node The schedule node to (possibly) optimize.
  /// @param User A pointer to forward some use information
  ///        (currently unused).
  static isl::schedule_node standardBandOpts(isl::schedule_node Node,
                                             void *User);

  /// Check if this node contains a partial schedule that could
  ///        probably be optimized with analytical modeling.
  ///
  /// isMatrMultPattern tries to determine whether the following conditions
  /// are true:
  /// 1. the partial schedule contains only one statement.
  /// 2. there are exactly three input dimensions.
  /// 3. all memory accesses of the statement will have stride 0 or 1, if we
  ///    interchange loops (switch the variable used in the inner loop to
  ///    the outer loop).
  /// 4. all memory accesses of the statement except from the last one, are
  ///    read memory access and the last one is write memory access.
  /// 5. all subscripts of the last memory access of the statement don't
  ///    contain the variable used in the inner loop.
  /// If this is the case, we could try to use an approach that is similar to
  /// the one used to get close-to-peak performance of matrix multiplications.
  ///
  /// @param Node The node to check.
  /// @param D    The SCoP dependencies.
  /// @param MMI  Parameters of the matrix multiplication operands.
  static bool isMatrMultPattern(isl::schedule_node Node,
                                const polly::Dependences *D,
                                polly::MatMulInfoTy &MMI);

  /// Create the BLIS macro-kernel.
  ///
  /// We create the BLIS macro-kernel by applying a combination of tiling
  /// of dimensions of the band node and interchanging of two innermost
  /// modified dimensions. The values of MacroKernelParams's fields are used
  /// as tile sizes.
  ///
  /// @param Node The schedule node to be modified.
  /// @param MacroKernelParams Parameters of the macro kernel
  ///                          to be used as tile sizes.
  static isl::schedule_node
  createMacroKernel(isl::schedule_node Node,
                    MacroKernelParamsTy MacroKernelParams);

  /// Create the BLIS micro-kernel.
  ///
  /// We create the BLIS micro-kernel by applying a combination of tiling
  /// of dimensions of the band node and interchanging of two innermost
  /// modified dimensions. The values passed in MicroKernelParams are used
  /// as tile sizes.
  ///
  /// @param Node The schedule node to be modified.
  /// @param MicroKernelParams Parameters of the micro kernel
  ///                          to be used as tile sizes.
  /// @see MicroKernelParamsTy
  static isl::schedule_node
  createMicroKernel(isl::schedule_node Node,
                    MicroKernelParamsTy MicroKernelParams);
};
333 
/// Build the desired set of partial tile prefixes.
///
/// We build a set of partial tile prefixes, which are prefixes of the vector
/// loop that have exactly VectorWidth iterations.
///
/// 1. Drop all constraints involving the dimension that represents the
///    vector loop.
/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
///    iterations.
/// 3. Subtract loop domain from it, project out the vector loop dimension and
///    get a set that contains prefixes, which do not have exactly VectorWidth
///    iterations.
/// 4. Project out the vector loop dimension of the set that was built in the
///    first step and subtract the set built in the previous step to get the
///    desired set of prefixes.
///
/// @param ScheduleRange A range of a map, which describes a prefix schedule
///                      relation.
/// @param VectorWidth   The number of iterations that the prefixes in the
///                      returned set are required to have.
/// @returns The set of partial tile prefixes with exactly VectorWidth
///          iterations.
isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
353 #endif // POLLY_SCHEDULEOPTIMIZER_H
354