1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * Something specific for BLAS generators
20  *
21  * NOTE:
22  *      1) All the blas kernel generators should
23  *         interpret the fields of the SubproblemDim
24  *         structure as follows:
25  *         'y' - rows of matrix A, i. e. M dimension
26  *               of matrix C
27  *         'x' - columns of matrix B and C
28  *         'bwidth' - block width in K dimension
29  *
30  *      2) When generating copying functions and their calls, one should
31  *         keep in mind that all the matrix blocks are copied to
32  *         the local memory such that sequentially accessed elements
33  *         are located in memory sequentially. In this context
34  *         transposing means transposing while copying
35  *         to/from the local memory, not the way the matrix is stored in
36  *         the array passed to the kernel.
37  */
38 
39 #ifndef BLAS_KGEN_H_
40 #define BLAS_KGEN_H_
41 
42 #include <clBLAS.h>
43 
44 #include <cltypes.h>
45 #include <kerngen.h>
46 #include <mempat.h>
47 #include <dblock_kgen.h>
48 
49 #include <blas_funcs.h>
50 #include <matrix_props.h>
51 
52 #include "tile.h"
53 #include "fetch.h"
54 
/* NOTE(review): presumably the version of the generated kernel format; its users are not visible in this header -- confirm */
55 #define BLAS_KGEN_FORMAT 1
56 
/* Shorthand that expands to kgenEndBranch() with a NULL second argument; by its name it closes an internal loop */
57 #define genInternalLoopEnd(ctx) kgenEndBranch(ctx, NULL)
58 
/*
 * Upper bound on the OpenCL vector length, as the name says.
 * NOTE(review): 16 presumably corresponds to the widest OpenCL built-in
 * vector types (e.g. float16) -- confirm against the users of this constant.
 */
59 enum {
60     MAX_OPENCL_VECTOR_LENGTH = 16
61 };
62 
/*
 * Result type of checkForTailFetches(). The non-zero values are distinct
 * bits, so they can be OR-ed together.
 * NOTE(review): presumably tells along which tile dimension (rows, columns)
 * a tail has to be fetched -- confirm against the implementation.
 */
63 typedef enum TailFetch {
64     FETCH_NO_TAILS = 0,
65     FETCH_TAIL_ROW = 0x01,
66     FETCH_TAIL_COL = 0x02
67 } TailFetch;
68 
69 /**
70  * @internal
71  * @brief Blas generator flags
72  * @ingroup GEN_SETTINGS
    *
    * Each value is a single bit; the flags are kept in the
    * BlasGenSettings::flags field.
73  */
74 typedef enum BlasGenFlags {
75     BGF_EXPLICIT_INLINE = 0x01,
76     BGF_DISTINCT_VECLEN = 0x02,
77     // TODO: replace with a flag with inverse semantics
78     BGF_WHOLE_A = 0x04,
79     /** Leading dimensions are in vectors rather than in elements */
80     BGF_LD_IN_VECTORS = 0x08,
81     /**
82      * Objects in the global memory are accessed through the unified pointers.
83      * This feature is deprecated and should not be used in new generators.
84      * It is left for backward compatibility
85      */
86     BGF_UPTRS = 0x10
87 } BlasGenFlags;
88 
89 /**
90  * @internal
91  * @brief Flags showing how problem tails are handled
92  * @ingroup TAILS_HANDLING
    *
    * Each value is a single bit. A status of this type is returned by
    * checkGenAdjustTailCoords() and taken by checkGenRestoreTailCoords()
    * and tailStatusToUpresFlags().
93  */
94 typedef enum TailStatus {
95     /** Tail of the matrix A is raised */
96     TAIL_A_RAISED = 0x01,
97     /** Tail of the matrix B is raised */
98     TAIL_B_RAISED = 0x02
99 } TailStatus;
100 
101 /**
102  * @internal
103  * @brief Tiles multiplier flags
104  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Bit flags kept in the TileMulOpts::flags field; kextraToTilemulFlags()
     * returns a value of this type.
105  */
106 typedef enum TileMulFlags {
107     TILEMUL_NO_FLAGS = 0,              /**< No flags */
108     TILEMUL_TRA = 0x01,                /**< Transposed matrix A */
109     TILEMUL_TRB = 0x02,                /**< Transposed matrix B */
110     TILEMUL_CONJA = 0x04,              /**< Conjugated elements of A */
111     TILEMUL_CONJB = 0x08,              /**< Conjugated elements of B */
112     TILEMUL_C_COLUMN_MAJOR = 0x10,     /**< Column major block for matrix C */
113     TILEMUL_NOT_FETCH_B = 0x20,        /**< Do not fetch matrix B block */
114     TILEMUL_EXTERN_RDECL = 0x40,       /**< External register tiles declaration,
115                                           the generator must not declare them
116                                           itself */
117 
118     /**
119      * Deprecated. Use the respective mode being a part of the FetchAddr mode.
120      * It is left just for backward compatibility, so as not to break working
121      * code, and will be removed soon
122      */
123     TILEMUL_WRAP_AROUND_TAIL = 0x80,   /**< Sizes used for column skew are
124                                             rounded to next vecLen bound */
125     /** Use global cyclic along subproblem A coordinate.
126      * Deprecated. Don't use it */
127     TILEMUL_GLOBAL_CYCLIC_A = 0x100,
128     /** Use global cyclic along subproblem B coordinate.
129      * Deprecated. Don't use it */
130     TILEMUL_GLOBAL_CYCLIC_B = 0x200,
131     /* Deprecated. Don't use it */
132     TILEMUL_GLOBAL_CYCLIC_K = 0x400,   /**< Use global cyclic along K */
133     /** Use skew along subproblem A coordinate */
134     TILEMUL_SKEW_A = 0x800,
135     /** Use skew along subproblem B coordinate. Deprecated */
136     TILEMUL_SKEW_B = 0x1000,
137     /* Deprecated */
138     TILEMUL_SKEW_K = 0x2000,           /**< Use skew along K */
139     /** Use size of whole matrix for cyclic addressing. Deprecated */
140     TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A |
141                             TILEMUL_GLOBAL_CYCLIC_B |
142                             TILEMUL_GLOBAL_CYCLIC_K,
143     // Deprecated. Union of all the skew flags
144     TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K,
145     /** Optimize coordinates calculations by storing coordinates values */
146     // Deprecated
147     TILEMUL_OPTIMIZE_COORD_CALC = 0x4000,
148     /** Use bwidth0 stride */
149     TILEMUL_BW_STRIDE = 0x8000,
150     /** Optimize coordinates calculations by using vectors
151      *  and pointer increments */
152     // Deprecated
153     TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000,
154     /** Do not increment K */
155     TILEMUL_NOT_INC_K = 0x20000,
156     /**
157      * Use variants with explicit vectorization. Useful on platforms with
158      * true SIMD.
159      */
160     TILEMUL_FORCE_VECTORIZATION = 0x40000
161 } TileMulFlags;
162 
163 
164 /**
165  * @internal
166  * @brief Tiles multiplier core
167  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Selected through the TileMulOpts::core field and passed to
     * sprintfComplexMulUpdate() as its \b core argument.
168  */
169 typedef enum TileMulCore {
170     /** Use multiplication and addition operations */
171     TILEMUL_MULADD,
172     /** Use the 'dot' function where possible */
173     TILEMUL_DOT,
174     /** Use the 'mad' function */
175     TILEMUL_MAD
176 } TileMulCore;
177 
178 /**
179  * @internal
180  * @brief Update result operations
181  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Passed as the \b op argument of updateResultGen() and
     * genUpdateResultSingle().
182  */
183 typedef enum UpdateResultOp {
184     /** Just set the values stored in a target buffer */
185     UPRES_SET,
186     /** Add the temporary result to the values stored in a target buffer */
187     UPRES_SUM
188 } UpdateResultOp;
189 
190 /**
191  * @internal
192  * @brief Update result generator flags
193  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Each value is a single bit. Passed as the \b flags argument of
     * updateResultGen() and genUpdateResultSingle(); tailStatusToUpresFlags()
     * returns a value of this type.
194  */
195 typedef enum UpdateResultFlags {
196     /** Resulting matrix is stored in the column major form */
197     UPRES_COLUMN_MAJOR = 0x01,
198     /** Generic version, non optimal sizes */
199     UPRES_GENERIC = 0x02,
200     /** Multiply result by beta */
201     UPRES_WITH_BETA = 0x04,
202     /** Do not multiply by the alpha scalar */
203     UPRES_WITHOUT_ALPHA = 0x08,
204     /**
205      * Destination is private memory;
206      * if not set destination is in the global one
207      */
208     UPRES_PRIV_DEST = 0x10,
209     /** Use the local memory instead of the global memory */
210     UPRES_USE_LDS = 0x20,
211     /** Generate the inline version */
212     UPRES_INLINE = 0x40,
213     /** Disable vectorization at memory access */
214     UPRES_NO_VECTORIZATION = 0x80,
215     /** For the generic version useful data reside at the tile rows' tail */
216     UPRES_TAIL_ROW = 0x100,
217     /** For the generic version useful data reside at the tile columns' tail */
218     UPRES_TAIL_COL = 0x200,
219     /** Generate condition whether coordinates don't exceed problem bounds */
220     UPRES_EXCEED_PROBLEM_CONDITION = 0x400,
221     /** NOTE(review): not documented in this header; meaning to be confirmed against the implementation */
222     UPRES_INDEXING_WITH_CONSTANTS = 0x800,
223     /** Write result to C instead of B for functions with triangular matrix */
224     UPRES_TRIANG_WRITE_C = 0x1000
225 } UpdateResultFlags;
226 
/*
 * Description of a private area. It is the output argument of
 * getPrivateAreaInfo() and an input argument of declarePrivateArea().
 *
 * NOTE(review): field meanings below are inferred from the names only;
 * confirm against the implementation.
 */
227 typedef struct PrivateArea {
228     const char *typeName;   /* presumably name of the element type */
229     unsigned int vecLen;    /* presumably vector length of that type */
230     unsigned int size;      /* presumably area size; units not visible here */
231 } PrivateArea;
232 
233 /**
234  * @internal
235  * @defgroup GEN_SETTINGS Generator settings
236  * @ingroup BLAS_GENERATORS
237  */
238 /*@{*/
239 
240 /**
241  * @internal
242  * @brief Kernel variable and argument names
     *
     * Every member is a string pointer. An instance is embedded in
     * BlasGenSettings as the \b varNames field.
243  */
244 typedef struct KernelVarNames {
245     const char *A;          /**< Matrix A variable name */
246     const char *B;          /**< Matrix B variable name */
247     const char *C;          /**< Presumably matrix C variable name, by analogy with A and B */
248     const char *LDS;		/**< LDS pointer name */
249     const char *coordA;     /**< Variable for subproblem A coordinate */
250     const char *coordB;     /**< Variable for subproblem B coordinate */
251     const char *k;          /**< Variable for incrementable K offset value */
252     const char *skewA;      /**< Variable for skews along A */
253     const char *skewB;      /**< Variable for skews along B */
254     const char *skewK;      /**< Variable for skews along K */
255     const char *sizeM;      /**< Matrix A size M */
256     const char *sizeN;      /**< Matrix B size N */
257     const char *sizeK;      /**< Matrices size K */
258     const char *lda;        /**< Leading dimension of matrix A */
259     const char *ldb;        /**< Leading dimension of matrix B */
260     const char *ldc;        /**< Leading dimension of matrix C, in vectors */
261     const char *vectCoordA; /**< Vector containing indexes of tile a elements
262                                  in matrix A */
263     const char *vectCoordB; /**< Vector containing indexes of tile b elements
264                                  in matrix B */
265     const char *startM;     /**< NOTE(review): undocumented; presumably start offset along M */
266     const char *startN;     /**< NOTE(review): undocumented; presumably start offset along N */
267     const char *startK;     /**< NOTE(review): undocumented; presumably start offset along K */
268     const char *alpha;      /**< NOTE(review): undocumented; presumably the alpha scalar name */
269     const char *beta;       /**< NOTE(review): undocumented; presumably the beta scalar name */
270 } KernelVarNames;
271 
272 /**
273  * @internal
274  * @brief Blas generator settings
275  *
276  * This structure is designed to be used with most of subgenerators
277  * and generator helpers. It is assumed to be initialized once at the
278  * beginning of generation and modified as little as possible over the rest of
279  * the process.
280  */
281 typedef struct BlasGenSettings {
282     /**
283      * Subproblem dimensions:
284      *
285      * work group dimensions are at index 0
286      * work item dimensions are at index 1
287      */
288     SubproblemDim subdims[2];
289     const PGranularity *pgran;      /**< Data parallelism granularity */
290     const CLBLASKernExtra *kextra;  /**< Kernel extra */
291     BlasGenFlags flags;             /**< Global generator flags */
292     KernelVarNames varNames;        /**< Kernel variables and argument names */
293     Tile tileA;                     /**< NOTE(review): undocumented; presumably the tile of matrix A */
294     Tile tileBX;                    /**< NOTE(review): undocumented; role to be confirmed */
295     Tile tileCY;                    /**< NOTE(review): undocumented; role to be confirmed */
296 } BlasGenSettings;
297 
298 /*@}*/
299 
300 /**
301  * @internal
302  * @brief Variable names for the inline version of a function updating result
303  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Passed to updateResultGen() as its last argument.
304  */
305 typedef struct UpresVarNames {
306     const char *result;     /**< Name of an output matrix */
307     /** Leading dimension of a matrix stored in the global memory */
308     const char *ld;
309     const char *startRow;   /**< Start row to update from */
310     const char *startCol;   /**< Start column to update from */
311     const char *nrRows;     /**< Number of rows */
312     const char *nrCols;     /**< Number of columns */
313     const char *cachedName; /**< Name of LDS cached values */
314 } UpresVarNames;
315 
316 /**
317  * @internal
318  * @brief Options for matrix tiles multiplication generator
319  * @ingroup BLAS_MAJOR_SUBGENS
     *
     * Passed to tileMulGen() and genMulTiles(). The \b postFetch callback has
     * the same signature as defaultTilePostFetch().
320  */
321 typedef struct TileMulOpts {
322     CLMemType memA;             /**< Type of memory matrix A is located in */
323     CLMemType memB;             /**< Type of memory matrix B is located in */
324     TileMulFlags flags;         /**< Flags on objects and computing specifics */
325     TileMulCore core;           /**< Multiply and add core */
326     int (*postFetch)(
327         struct KgenContext *ctx,
328         MatrixRole mrole,
329         void *arg);             /**< Tile post fetch callback */
330     void *postFetchPriv;        /**< Postfetch callback's private data */
331     struct FetchContext *fctx;  /**< Fetch context; NOTE(review): its use is not documented in this header */
332 } TileMulOpts;
333 
/*
 * Table of function names, one name buffer of FUNC_NAME_MAXLEN characters
 * per matrix role (MATRIX_ROLES_NUMBER entries).
 * NOTE(review): presumably the names of generated zeroing functions,
 * judging by the structure name -- confirm against the users.
 */
334 typedef struct ZeroFuncs {
335     char names[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN];
336 } ZeroFuncs;
337 
338 /**
339  * @internal
340  * @brief Private data for fetch postprocessing callback
341  * @ingroup TAILS_HANDLING
     *
     * NOTE(review): presumably the object behind the \b priv argument of
     * defaultTilePostFetch(); the member notes below are inferred from names
     * and types only -- confirm against the implementation.
342  */
343 typedef struct TilePostFetchPrivate {
344     BlasFunctionID funcID;          /* BLAS function ID */
345     const BlasGenSettings *gset;    /* generator settings (read only) */
346     const char *regName;            /* presumably a register (tile) variable name */
347     int fetchNumA;                  /* presumably a counter of fetches of A */
348     int wholeA;                     /* presumably non-zero if A is held entirely */
349 } TilePostFetchPrivate;
350 
/*
 * Get information on the private area for a matrix.
 *
 * @gset: generator settings
 * @funcID: BLAS function ID
 * @mrole: matrix role
 * @area: the only non-const pointer argument; judging by the signature,
 *        the location the resulting description is stored to
 *
 * NOTE(review): how the area is computed is not visible from this header.
 */
351 void
352 getPrivateAreaInfo(
353     const BlasGenSettings *gset,
354     BlasFunctionID funcID,
355     MatrixRole mrole,
356     PrivateArea *area);
357 
/*
 * Declare a private area.
 *
 * @ctx: generator context
 * @area: description of the area (read only)
 * @baseName: base name; presumably of the declared variable -- confirm
 * @storType: private storage type
 *
 * NOTE(review): returns nothing, so a source buffer overflow, if possible
 * here, cannot be reported to the caller -- confirm against the implementation.
 */
358 void
359 declarePrivateArea(
360     struct KgenContext *ctx,
361     const PrivateArea *area,
362     const char *baseName,
363     PrivateStorageType storType);
364 
365 /*
366  * Declare separately the real and imaginary part of
367  * a complex multiplier.
368  *
369  * @ctx: generator context
370  * @baseName: variable's base name matching an existing variable
371  *            with not separated parts
372  * @typeName: variable type name
373  *
374  * Naming rule
375  *      real part:      <baseName>R
376  *      imaginary part: <baseName>I
377  *
378  * On success returns 0, and -EOVERFLOW on source buffer
379  * overflow
380  */
381 int
382 declareComplexMultParts(
383     struct KgenContext *ctx,
384     const char *baseName,
385     const char *typeName);
386 
387 /**
388  * @internal
389  * @defgroup CHECK_DECOMP_CACL_GRAN  Checking decomposition and calculate
390  *                                   parallelism granularity
391  * @ingroup BLAS_GENERATORS
392  */
393 
394 /*@{*/
395 
396 /**
397  * @brief Sanity check for decomposition
398  *
399  * @param[in] subdims           Subproblem dimensions. 2 levels.
400  * @param[in] minSize           Minimum size for any of the dimension
401  *                              components
402  * @param[in] maxSize           Maximum size which can't be exceeded by
403  *                              any of the dimension components at the tile
404  *                              layer
405  * @param[in] maxRegs           Maximum registers it's allowed to use
406  * @param[in] dtype             BLAS data type
407  * @param[in] wholeA            Is matrix A stored in registers entirely or
408  *                              partially
409  *
410  * The function rejects only decompositions that are completely invalid or lead
411  * to consumption of too many registers or just have component values at the
412  * tile layer that are out of the range [\b minSize, \b maxSize].
413  * Completely invalid decompositions are those which don't allow to divide
414  * problem integrally among work items, e. g. zeroed components are wrong,
415  * the step components (x, y, bwidth) of the 0-th level not integrally
416  * divisible by respective size components (itemX, itemY, bwidth) of the 1-st
417  * level are wrong as well. The decomposition is also wrong if the size
418  * components are not integrally divisible by the step components and not equal
419  * to #SUBDIM_UNUSED.
420  *
421  * @return true if the decomposition is valid, or false otherwise
422  */
423 bool
424 decompSanityCheck(
425     const SubproblemDim *subdims,
426     unsigned int minSize,
427     unsigned int maxSize,
428     unsigned int maxRegs,
429     DataType dtype,
430     bool wholeA);
431 
432 /**
433  * @brief Calculate granularity in case when a work item is responsible
434  *        for its own part of solution not overlapping with those of other
435  *        items
436  *
437  * @param[out] pgran            Location to store calculated granularity
438  * @param[in] subdims           Subproblem dimensions
439  * @param[in] xdim              Dimension in the OpenCL work space X component
440  *                              of decomposition is mapped on
441  * @param[in] level             Function BLAS level. Reserved for future use.
442  *
443  * If value of \b xdim is -1, then the function assumes that OpenCL work
444  * space is single dimensional, and puts the product of granularity against
445  * X and Y component to 0-th element of \b wgSize field. If its value is
446  * 0 or 1, the function assumes that OpenCL work space is 2D and puts
447  * granularity against X component to \b xdim element of \b wgSize field
448  * of the granularity descriptor. Granularity against Y component is put to
449  * 1 - \b xdim element. Other values are invalid and force an abort in debug
450  * build. The function initializes the \b wgDim field properly.
451  *
452  * NOTE: Currently this function is supported only for level 3 and
453  *       must not be called for level 2
454  */
455 void
456 calcPgranDedicated(
457     PGranularity *pgran,
458     const SubproblemDim *subdims,
459     int xdim,
460     int level);
461 
462 /**
463  * @brief Calculate granularity in case when several items evaluate the same
464  *        part of solution together
465  *
466  * @param[out] pgran            Location to store calculated granularity
467  * @param[in] subdims           Subproblem dimensions
468  * @param[in] xdim              Dimension in the OpenCL work space X component
469  *                              of decomposition is mapped on
470  * @param[in] ydim              Dimension in the OpenCL work space Y component
471  *                              of decomposition is mapped on
472  * @param[in] level             Function BLAS level. Reserved for future use
473  *
474  * If \b xdim and \b ydim values are equal, then the function puts the product
475  * of granularity against X and Y component to \b xdim element of \b wgSize
476  * field. If not, it puts separated granularity for X and Y in \b xdim and
477  * \b ydim element respectively. Both the values must be non negative and less
478  * than 3 (since OpenCL workspace cannot have more than 3 dimensions).
479  * If one of these parameters is zero, then the other one must be zero as well.
480  * If one of these parameters is 2, then the other one must be 1. These
481  * restrictions are caused by the need to reflect \b bwidth in granularity
482  * in case of multidimensional decomposition. For 2D and 3D decompositions
483  * granularity for bwidth is calculated as well, and it is always mapped
484  * onto 0-th workspace dimension. If some of these parameters are wrong,
485  * it forces an abort in debug build. The function sets the \b wgDim field
486  * to maximum of xdim and ydim plus 1.
487  *
488  * NOTE: Currently this function is supported only for level 3 and
489  *       must not be called for level 2
490  */
491 void
492 calcPgranCooperative(
493     PGranularity *pgran,
494     const SubproblemDim *subdims,
495     int xdim,
496     int ydim,
497     int level);
498 
499 /*@}*/
500 
501 /**
502  * @internal
503  * @defgroup COMMON_MATH_OPERATIONS Constructing useful math expression
504  * @ingroup BLAS_GENERATORS
505  */
506 /*@{*/
507 
508 /**
509  * @brief Sprintf a complex MAD operation
510  *
511  * Operations:
512  *     - \f$ dst \leftarrow a * b + c \f$
513  *     - \f$ dst \leftarrow conj(a) * b + c \f$
514  *     - \f$ dst \leftarrow a * conj(b) + c \f$
515  *     - \f$ dst \leftarrow conj(a) * conj(b) + c \f$
516  *
517  *  @param[out] expr            String object to hold the target expression
518  *  @param[in] dst              Destination argument
519  *  @param[in] a                The first multiplier
520  *  @param[in] b                The second multiplier
521  *  @param[in] c                Added argument
522  *  @param[in] isDouble         If set, the arguments have double precision
523  *  @param[in] conjA            If set, the argument A should be conjugated
524  *  @param[in] conjB            If set, the argument B should be conjugated
525  *  @param[in] core             Multiplying core
526  *
527  *  The \b c argument can be NULL. In this case it is ignored, and the function
528  *  produces pure multiplication
529  */
530 void
531 sprintfComplexMulUpdate(
532     Kstring *expr,
533     const Kstring *dst,
534     const Kstring *a,
535     const Kstring *b,
536     const Kstring *c,
537     bool isDouble,
538     bool conjA,
539     bool conjB,
540     TileMulCore core);
541 
/**
 * @brief Variant of sprintfComplexMulUpdate() taking the same argument list
 *
 * NOTE(review): judging by the name only, it is meant for SYR2K with zero
 * beta; how the produced expression differs from the one of
 * sprintfComplexMulUpdate() is not visible from this header -- confirm
 * against the implementation. It is also not stated here whether \b c may
 * be NULL for this variant.
 */
542 void
543 sprintfComplexMulUpdate_syr2k_beta0(
544     Kstring *expr,
545     const Kstring *dst,
546     const Kstring *a,
547     const Kstring *b,
548     const Kstring *c,
549     bool isDouble,
550     bool conjA,
551     bool conjB,
552     TileMulCore core);
553 
554 /**
555  * @brief Sprintf expression of fast scalar mad
556  *
557  * @param[out] expr         Output expression
558  * @param[in]  first        First multiplier
559  * @param[in]  second       Second multiplier
560  * @param[in]  scale        Scale of the second argument, i. e. its divisor.
561  *                          Ignored if zero.
562  * @param[in]  third        Added argument. Ignored if NULL.
563  *
564  * It can use mad24. So, the expected result should not exceed 2^24
565  */
566 void
567 sprintfFastScalarMad(
568     Kstring *expr,
569     const Kstring *first,
570     const Kstring *second,
571     unsigned int scale,
572     const Kstring *third);
573 
574 /*@}*/
575 
576 /**
577  * @internal
578  * @defgroup BLAS_GEN_MISC_FUNCTIONS Miscellaneous functions
579  * @ingroup BLAS_GENERATORS
580  */
581 
582 /*@{*/
583 
584 /**
585  * @brief Default function prefix for the data type
586  *
587  * @param[in] dtype     One of the data types supported by the library
     *
     * @return The prefix as a single character. NOTE(review): presumably the
     *         conventional BLAS type letter; the actual mapping is not visible
     *         from this header.
588  */
589 char
590 dtypeToBlasPrefix(DataType dtype);
591 
592 /**
593  * @brief Convert kernel extra flags to tilemul flags
594  *
595  * @param[in] funcID        BLAS function ID
596  * @param[in] kflags        Kernel flags
     *
     * @return Tilemul flags matching the given kernel flags
597  */
598 TileMulFlags
599 kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags);
600 
601 /**
602  * @brief Get the vector length with which elements should be fetched from
603  *        (stored to) the global memory
604  *
605  * @param[in] gset          Generator settings
606  * @param[in] funcID        BLAS function ID (deprecated)
607  * @param[in] mrole         Role of the matrix to get vectorization for
     *
     * @return The vector length for the matrix with the given role
608  */
609 unsigned int
610 getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole);
611 
612 /**
613  * @brief Sprintf chunk (set of components) of an OpenCL vector type
614  *
615  * @param[out] chunk        Buffer to sprintf to
616  * @param[in] vecLen        Entire vector length
617  * @param[in] clen          Length of the chunk
618  * @param[in] vecOff        Starting component offset
     *
     * The buffer size is not passed, so the caller must provide a buffer that
     * is large enough for the produced string.
619  */
620 void
621 sprintfVecChunk(
622     char *chunk,
623     unsigned int vecLen,
624     unsigned int clen,
625     unsigned int vecOff);
626 
627 /**
628  * @brief Generate code containing scaling of leading dimensions on
629  *        vector size
630  *
631  * @param[out] ctx          Generator context
632  * @param[in] gset          Generator settings
633  *
634  * The function first checks whether the scaling is actually needed
635  * (presumably it is not when vector size is 1). If the kernel variable name
636  * for 'lda', 'ldb' or 'ldc' is NULL, code generation for that dimension is
637  * skipped. Calling this function has no effect if the @ref BGF_LD_IN_VECTORS
638  * generator flag is not set. If some of the leading dimensions are not unique,
639  * only one of the instances is scaled. Uniqueness is detected by comparing
640  * the respective pointers being a part of @ref KernelVarNames. For example,
641  * if the 'lda' and 'ldb' pointers are the same, only 'lda' is scaled.
642  */
643 void
644 genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset);
645 
646 /*@}*/
647 
648 /**
649  * @internal
650  * @brief Generate default post processing logic after tile fetch
651  *
652  * @param[out] ctx      Generator context
653  * @param[in] mrole     Matrix role
654  * @param[out] priv     Handler's private data
655  *
     * The signature matches the TileMulOpts::postFetch callback.
     *
     * @return NOTE(review): not documented here; presumably 0 on success and
     *         a negative error code otherwise, like the other generators in
     *         this header -- confirm.
     *
656  * @ingroup TAILS_HANDLING
657  */
658 int
659 defaultTilePostFetch(
660     struct KgenContext *ctx,
661     MatrixRole mrole,
662     void *priv);
663 
/*
 * Get information on the registers holding the result.
 *
 * @dtype: data type
 * @dims: subproblem dimensions (read only)
 * @vecLen: vector length
 * @nrRegs: non-const pointer; judging by the signature, location to store
 *          the number of registers
 * @typeName: judging by the signature, location to store a pointer to
 *            the type name
 *
 * NOTE(review): whether NULL is accepted for the output pointers is not
 * visible from this header.
 */
664 void
665 getResultGPRsInfo(
666     DataType dtype,
667     const SubproblemDim *dims,
668     unsigned int vecLen,
669     unsigned int *nrRegs,
670     const char **typeName);
671 
672 /**
673  * @internal
674  * @defgroup BLAS_MAJOR_SUBGENS Major subgenerators
675  * @ingroup BLAS_GENERATORS
676  */
677 /*@{*/
678 
679 /**
680  * @internal
681  * @brief Tiles fetching and multiplication inlined code generator
682  *
683  * @param[out] ctx          Generator context
684  * @param[in] gset          Generator settings
685  * @param[in] mulOpts       TileMul-specific generator settings
686  *
687  * This function generates code which fetches tiles a and b from global or local
688  * memory into private memory, multiplies them storing result into tile c in
689  * private memory and increments coordinate k. Caller is responsible for loop
690  * along K.\n
691  * All combinations of tiles a and b orientations are supported. Generated
692  * code fetches tiles by vectors whose size can be different for tiles a and b.
693  * Complex types and conjugated tiles are supported. Global cycling is supported
694  * for global memory fetching - this means that if tile overlaps matrix
695  * the tail of tile will be fetched from the beginning instead of accessing
696  * memory outside the matrix.\n
697  * Second level of subdimensions is used for tiles sizes.\n
698  * Generated code will fetch tiles a, b, multiply them and add result to tile c
699  * in private memory, then increment k. By default, k is incremented by
700  * second level bwidth but it is incremented by first level bwidth if
701  * @ref TILEMUL_BW_STRIDE flag is set. It is used if whole work group goes
702  * along K loop.\n
703  * Each tile can be fetched from global memory or from local memory.
704  * If tile is fetched from local memory then leading dimensions for local
705  * memory area are taken from first level subdimensions.\n
706  * Post-fetch callback generator function can be called after fetching tiles
707  * for zeroing tails or setting diagonal elements to one. This function is
708  * provided by caller.\n
709  * If second level bwidth is not equal to first level bwidth, and
710  * @ref TILEMUL_BW_STRIDE flag is not set then TileMul generates
711  * loop from zero to first level bwidth with second level bwidth step. The
712  * most common case is second level bwidth equal to first level bwidth where
713  * single iteration of multiplication is generated.\n
714  *
715  * If the caller aims at efficient fetching from the global memory and the
716  * tilemul logic is generated within a loop, prepareFetchCycle() should be
717  * called before generation of the loop.
718  *
719  * @return 0 on success
720  * @return -EOVERFLOW on source buffer overflowing
721  * @return -EINVAL if input arguments are invalid
722  */
723 int
724 tileMulGen(
725     struct KgenContext *ctx,
726     const BlasGenSettings *gset,
727     const TileMulOpts *mulOpts);
728 
729 /**
730  * @internal
731  * @brief Tiles pure multiplication code generator
732  *
733  * @param[out] ctx          Generator context
734  * @param[in] gset          Generator settings
735  * @param[in] mulOpts       TileMul-specific generator settings
736  *
737  * This function multiplies two tiles, a and b, storing result in tile c. No
738  * additional operations are made. It just performs tiles multiplication without
739  * fetching, post-fetch processing and incrementing coordinates, which can be
740  * done by the caller.
741  *
742  * @return 0 on success
743  * @return -EOVERFLOW on source buffer overflowing
744  */
745 int
746 genMulTiles(
747     struct KgenContext *ctx,
748     const BlasGenSettings *gset,
749     const TileMulOpts *mulOpts);
750 
751 /**
752  * @internal
753  * @brief Update result generator
754  *
755  * @param[out] ctx          Generator context
756  * @param[in] gset          Generator settings
     * @param[in] funcId        BLAS function ID
757  * @param[in] op            Update operation
758  * @param[in] flags         Update result flags
759  * @param[in] uvarNames     Variable names for the inline version
760  *
761  * It generates a function applying an operation to the temporary result
762  * stored in the private memory and updating the target result.
763  *\n
764  * The code can be generated either in the form of a callable function
765  * or in the inlined form.
766  *\n
767  * List of taken arguments differs depending on specified flags. In general,
768  * these functions are defined as: \n
769  * @code
770  * void
771  * funcName(
772  *     <input type> C,
773  *     <output type> *c,
774  *     <input type> alpha,
775  *     size_t startRow,
776  *     size_t startCol,
777  *     size_t ld
778  *     [,<input type> beta]
779  *     [,size_t nrRows]
780  *     [,size_t nrCols])
781  * @endcode
782  *
783  * @return 0 on success, -EOVERFLOW at source buffer overflowing.
784  */
785 int
786 updateResultGen(
787     struct KgenContext *ctx,
788     const BlasGenSettings *gset,
789     BlasFunctionID funcId,
790     UpdateResultOp op,
791     UpdateResultFlags flags,
792     const UpresVarNames *uvarNames);
793 
794 /**
795  * @internal
796  * @brief Produce code updating a single result element
797  *
798  * @param[out] ctx      Generator context
799  * @param[in] dst       Destination element expression
800  * @param[in] src       Source element expression
801  * @param[in] gset      Generator settings
802  * @param[in] op        Update operation
803  * @param[in] flags     Flags showing specifics of the code that needs to be
804  *                      generated
805  *
806  * @return 0 on success, -EOVERFLOW if the source buffer is exceeded.
807  */
808 int
809 genUpdateResultSingle(
810     struct KgenContext *ctx,
811     const char *dst,
812     const char *src,
813     const BlasGenSettings *gset,
814     UpdateResultOp op,
815     UpdateResultFlags flags);
816 
817 /*@}*/
818 
/*
 * Check for tail fetches for a matrix.
 *
 * @funcID: BLAS function ID
 * @dim: subproblem dimensions (read only)
 * @kextra: kernel extra (read only)
 * @mrole: matrix role
 * @distVect: NOTE(review): presumably tells that distinct vector lengths
 *            are used (compare BGF_DISTINCT_VECLEN) -- confirm
 * @lowerTails: NOTE(review): meaning is not visible from this header
 *
 * Returns a TailFetch value, i.e. FETCH_NO_TAILS or a combination of
 * the FETCH_TAIL_ROW and FETCH_TAIL_COL bits.
 */
819 TailFetch
820 checkForTailFetches(
821     BlasFunctionID funcID,
822     const SubproblemDim *dim,
823     const CLBLASKernExtra *kextra,
824     MatrixRole mrole,
825     bool distVect,
826     bool lowerTails);
827 
/*
 * Tell whether a tile tail needs to be zeroed.
 *
 * Takes the same arguments as checkForTailFetches() except 'lowerTails'.
 *
 * @funcID: BLAS function ID
 * @dim: subproblem dimensions (read only)
 * @kextra: kernel extra (read only)
 * @mrole: matrix role
 * @distVect: NOTE(review): presumably tells that distinct vector lengths
 *            are used (compare BGF_DISTINCT_VECLEN) -- confirm
 *
 * NOTE(review): judging by the name, returns true if zeroing is needed;
 * the exact criteria are not visible from this header.
 */
828 bool
829 isNeedZeroTileTail(
830     BlasFunctionID funcID,
831     const SubproblemDim *dim,
832     const CLBLASKernExtra *kextra,
833     MatrixRole mrole,
834     bool distVect);
835 
836 /**
837  * @internal
838  * @brief Generate tail coordinates adjustment if needed
839  *
840  * @param[out] ctx              Generator context
841  * @param[in] funcID            BLAS function ID
842  * @param[in] gset              Generator settings
843  * @param[out] error            Location to store error.
844  *                              Ignored if NULL.
845  *
846  * Adjust coordinates if work is distributed over matrix rows so that
847  * a tile would not exceed the matrix bound. Cyclic addressing is not
848  * applicable for that since skew over rows can be used for performance goals.
849  *
850  * If it's needed, issues an expression like
851  *
852  * if (coord.y + dy > M) {
853  *     coord.y -= dy - M % dy;
854  * }
855  *
856  * @return Status showing if the tails have been actually adjusted or not.
857  * If \b ctx is NULL the function doesn't try to generate code, but just
858  * returns the actual tail handling status
859  *
860  * @ingroup TAILS_HANDLING
861  */
862 TailStatus
863 checkGenAdjustTailCoords(
864     struct KgenContext *ctx,
865     BlasFunctionID funcID,
866     const BlasGenSettings *gset,
867     int *error);
868 
869 /**
870  * @internal
871  * @brief Generate restoring original coordinates if needed
872  *
873  * @param[out] ctx              Generator context
874  * @param[in] gset              Generator settings
875  * @param[in] status            Tails handling status
876  *
877  * Coordinates restoring is needed to be able to write back the result to
878  * a correct location.
879  *
880  * If it's needed, issues an expression like
881  *
882  * if (coord.y + dy == M) {
883  *     coord.y += dy - M % dy;
884  * }
885  *
      * @return NOTE(review): not documented here; presumably 0 on success and
      *         a negative error code otherwise, like the other generators in
      *         this header -- confirm.
      *
886  * @ingroup TAILS_HANDLING
887  */
888 int
889 checkGenRestoreTailCoords(
890     struct KgenContext *ctx,
891     const BlasGenSettings *gset,
892     TailStatus status);
893 
894 /**
895  * @internal
896  * @brief Convert tail handling status to the respective flags
897  *        of the update result generator
898  *
899  * @param[in] status            Status of the handling to convert to
900  *                              the update result flags
901  *
      * @return Update result flags matching the given status
      *
902  * @ingroup TAILS_HANDLING
903  */
904 UpdateResultFlags
905 tailStatusToUpresFlags(TailStatus status);
906 
907 #endif /* BLAS_KGEN_H_ */
908