1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 #ifndef BLAS_KGEN_LEGACY_H_
19 #define BLAS_KGEN_LEGACY_H_
20 
21 #include "../blas_kgen.h"
22 
/**
 * @internal
 * @brief Bit flags controlling the block multiplier generator
 * @ingroup BLAS_MAJOR_GENS
 *
 * Every flag other than BLKMUL_NO_FLAGS occupies its own bit.
 */
typedef enum BlkmulFlags {
    /** No flags */
    BLKMUL_NO_FLAGS       = 0,
    /** Transpose result */
    BLKMUL_TRANSPOSE      = 1 << 0,
    /** Data in image are packed */
    BLKMUL_IMAGE_PACKED   = 1 << 1,
    /**
     * Accumulate multiplication results to a private location
     * provided by the caller
     */
    BLKMUL_OUTPUT_PRIVATE = 1 << 2,
    /** Use skew over block rows */
    BLKMUL_SKEW_ROW       = 1 << 3,
    /** Use skew over block columns */
    BLKMUL_SKEW_COLUMN    = 1 << 4,
    /** Generate an inline version */
    BLKMUL_INLINE         = 1 << 5,
    /** Block B is transposed */
    BLKMUL_TRANSPOSED_B   = 1 << 6,
    /**
     * Never use the "&" operation in cyclic address evaluation;
     * always use "%" instead
     */
    BLKMUL_AVOID_AND      = 1 << 7
} BlkMulFlags;
44 
/**
 * @internal
 * @brief Multiply-and-add core selectable for the block multiplier
 * @ingroup BLAS_MAJOR_GENS
 */
typedef enum BlkmulCore {
    BLKMUL_SEPARATE_MULADD = 0, /**< Separate, hand-written multiplication and summation */
    BLKMUL_DOT = 1,             /**< Use the 'dot' function */
    BLKMUL_MAD = 2              /**< Use the 'mad' function */
} BlkmulCore;
58 
/**
 * @internal
 * @brief Names of the arguments used by the inline version of the
 *        block multiplier
 * @ingroup BLAS_MAJOR_GENS
 */
typedef struct BlkmulArgNames {
    /** Matrix A start coordinates */
    const char *coordA;
    /** Matrix B start coordinates */
    const char *coordB;
    /** Skew over rows */
    const char *skewRow;
    /** Skew over columns */
    const char *skewCol;
    /** Counter name in the loop over K */
    const char *k;
    /** Bound in the loop over K */
    const char *vectBoundK;
} BlkmulArgNames;
73 
74 /**
75  * @internal
76  * @brief Options for matrix block multiplication
77  *        generator
78  * @ingroup BLAS_MAJOR_GENS
79  */
80 typedef struct BlkMulOpts {
81     /** OpenCL memory object type storing matrix (whole or its blocks) A */
82     CLMemType aMobj;
83     /** OpenCL memory object type storing matrix (whole or its blocks) A */
84     CLMemType bMobj;
85     BlkMulFlags flags;      /**< Specific flags */
86     BlkmulCore core;        /**< Multiply and add core */
87     /** List of argument names for the inline version */
88     BlkmulArgNames argNames;
89 } BlkMulOpts;
90 
/*
 * NOTE(review): this routine is undocumented here. Judging by its name it
 * presumably emits the BLAS enum declarations into the generator context
 * 'ctx' -- confirm against the implementation.
 */
void declareBlasEnums(struct KgenContext *ctx);
93 
94 /**
95  * @internal
96  * @brief Matrix block multiplication generator
97  *
98  * @param[out] ctx          Generator context
99  * @param[in] subdims       Subproblem dimensions; the first level reflects
100  *                          dimensions of the large blocks processed with the
101  *                          whole work group, and the second level
102  *                          reflects sizes of immediately multiplied small
103  *                          blocks within the single work item
104  * @param[in] dtype         Data type the multiplying function will be
105  *                          generated for
106  * @param[in] opts          Block multiplication options
107  *
108  * Generated functions have the following definitions: \n
109  *\n
110  * For the buffer based version:
111  * @code
112  * void
113  * funcName(
114  *     <type> alpha,
115  *     LPtr A,
116  *     LPtr B,
 *     LPtr C
 *     [,int2 skewRow]
 *     [,int skewCol]);
120  * @endcode
121  *
122  * Function naming rule:
123  * (type prefix)gemmBlock[Transp]_<width>_<height>
124  *\n
 * It is assumed that A, B and C point to the start of the data to be
 * processed during this step.
127  *\n
128  * For the image based version: \n
129  * @code
130  * void
131  * funcName(
132  *     <type> alpha,
133  *     __read_only image2d_t A,
134  *     int2 coordA,
135  *     __read_only image2d_t B,
136  *     int2 coordB,
 *     LPtr C
 *     [,int2 skewRow]
 *     [,int skewCol]);
140  * @endcode
141  *
142  * Where coordA and coordB mean start image coordinates to fetch data from.
143  *\n
144  * For the image based version a mixed variant is possible when
145  * either A or B blocks are passed through the local memory.
146  *\n
 * The 'skewRow' and 'skewCol' arguments are optional: they are present only
 * if the 'BLKMUL_SKEW_ROW' and 'BLKMUL_SKEW_COLUMN' flags are specified,
 * respectively. The 'y' field of the row skew is for block A, and the
 * 'x' field is for block B.
151  *\n
 * The output result can be put directly into a private location provided by
 * the caller instead of the local one. This is achieved by specifying the
 * 'BLKMUL_OUTPUT_PRIVATE' flag.
155  *\n
 * The pointer to this location should have one of the following types,
 * depending on the type of the processed data: \n
158  * - float4 - for float
159  * - float2 - for complex float
160  * - double2 - for double and complex double
161  *\n\n
162  * Alpha is not taken in this case.
163  *\n
 * The multiplier can be generated either as a dedicated function or in an
 * inline form inserted into a kernel. \n In the inline case the block
 * multiplier effectively becomes a tile multiplier, and the caller must
 * provide the iteration over K.
168  *
 * @return 0 on success, -EOVERFLOW if the source buffer overflows
170  */
171 
172 /**
173  * @internal
174  * @defgroup BLAS_MAJOR_GENS BLAS specific generators
175  * @ingroup MAJOR_GENS
176  */
177 /*@{*/
178 int
179 blkMulGen(
180     struct KgenContext *ctx,
181     const SubproblemDim subdims[2],
182     DataType dtype,
183     const BlkMulOpts *opts);
184 
185 int
186 updateResultGenOld(
187     struct KgenContext *ctx,
188     const BlasGenSettings *gset,
189     UpdateResultOp op,
190     UpdateResultFlags flags,
191     const UpresVarNames *uvarNames);
192 
193 /*@}*/
194 
195 #endif /* BLAS_KGEN_LEGACY_H_ */
196