1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
/*
 * Common generators for functions that manipulate
 * data blocks placed in the global, local,
 * or private memory.
 */
23 
24 /*
25  * TODO: add the unroll option to 'rwMatrBlockGen'
26  *       and 'smulMatrBlockGen'
27  */
28 
29 #ifndef DBLOCK_KGEN_H_
30 #define DBLOCK_KGEN_H_
31 
32 #include <cltypes.h>
33 #include <kerngen.h>
34 
35 /**
36  * @internal
 * @defgroup MAJOR_GENS Major commonly used generators
38  */
39 /*@{*/
40 
/**
 * @internal
 * @brief Source/destination memory pairs supported for data block copying
 *
 * Enumerator values are spelled out explicitly; they are the same values
 * the compiler would assign implicitly (0, 1, 2, 3).
 */
typedef enum DBlockCopyDirection {
    DBLOCK_GLOBAL_TO_LOCAL = 0,     /**< Global memory -> local memory */
    DBLOCK_LOCAL_TO_GLOBAL = 1,     /**< Local memory -> global memory */
    DBLOCK_GLOBAL_TO_IMAGE = 2,     /**< Global memory -> image */
    DBLOCK_LOCAL_TO_IMAGE  = 3      /**< Local memory -> image */
} DBlockCopyDirection;
55 
/**
 * @internal
 * @brief Bit flags tuning how a data block is copied
 *
 * Every flag occupies its own bit.
 */
typedef enum DBlockCopyFlags {
    DBLOCK_COPY_TRANSPOSE     = 1 << 0, /**< Transpose 2D block */
    DBLOCK_COPY_PACKED_IMAGE  = 1 << 1, /**< Pack several rows into single
                                             image row */
    DBLOCK_COPY_CONJUGATE     = 1 << 2, /**< Conjugate complex elements */
    DBLOCK_COPY_NOT_VECTORIZE = 1 << 3  /**< Disable vectorized copying */
} DBlockCopyFlags;
67 
68 /**
69  * @internal
70  * @brief Generator to copy data blocks between different kinds
71  *        of memory
72  *
73  * @param[out] ctx              Generator context
74  * @param[in] dim               Subproblem dimension to generate a function for
75  * @param[in] pgran             Data parallelism granularity
76  * @param[in] dtype             Data type
77  * @param[in] dir               Copying direction
78  * @param[in] flags             Copying flags; when an image is used as destination
79  *                              block transposing is prohibited
80  *
81  * If 'dim' is set to NULL a generic version working with subproblem
82  * of any dimension is generated. In the case specific work group
83  * sizes are ignored, only work group dimension is used.
84  *
85  * 'x' field of the passed SuproblemDim structure should contain
86  *     the block width
87  * 'y' should contain the block height
88  *
89  * Copied blocks can be as well one as two dimensional. For any one
90  * dimensional block 'y' field of the dimension structure should be
91  * set to 1. If a block is two dimensional, and the local memory is \n
92  * the source or destination memory, the block's rows must be aligned
93  * to float4 boundary.
94  *
95  * Rows of the matrix block must be aligned to float4 boundary. \n
96  *
97  * Generated functions have the following definitions: \n
98  *\n
99  * Buffer-buffer copying function for optimal blocks: \n
100  * @code
101  * void
102  * funcName(
103  *     <Unified pointer type> dst,
104  *     <Unified pointer type> src,
105  *     size_t startRow,
106  *     size_t startCol,
107  *     size_t ld)
108  * @endcode
109  *
110  * The unified pointer types can be GPtr if the global memory is used or LPtr
111  * is the local memory is used respectively
112  * (See the "Data types in kernels" section). Function naming rule is follow: \n
113  * (type prefix)copyDBlock['Transp']['Conj']['Nvec'](src mem][dst mem]
114  * [block height][block width] \n
115  * The 'Nvec' suffix is added if vectorized copying is prohibited.\n
116  *\n
117  * Buffer-buffer copying function, generic version: \n
118  * @code
119  * void
120  * funcName(
121  *     <Unified pointer type> dst,
122  *     <Unified pointer type> src,
123  *     size_t startRow,
124  *     size_t startCol,
125  *     size_t nrRows,
126  *     size_t nrCols,
127  *     size_t dstLD,
128  *     size_t srcLD)
129  * @endcode
130  *
131  * Here "dstLD" is destination leading dimension, "srcLD" - source leading
132  * dimension. \n
133  * Naming rule is the same as for the function above except block sizes. \n
134  *\n
135  * Function copying optimal blocks from the global memory to an image: \n
136  * @code
137  * void
138  * funcName(
139  *     __write_only image2d_t dst,
140  *     size_t startX,
141  *     size_t startY,
142  *     GPtr src,
143  *     size_t startRow,
144  *     size_t startCol,
145  *     size_t ld)
146  * @endcode
147  * 'start' and 'startY' arguments is start X and Y coordinate in the image to
148  * write from. The generic version has the analogous definition, and takes two
149  * additional arguments 'nrRows' and 'nrCols' of the size_t type following just
150  * fter the 'startCol' argument. \n
151  *\n
152  * Function copying optimal blocks from the local memory to an image: \n
153  * @code
154  * void
155  * funcName(
156  *     __write_only image2d_t dst,
157  *     size_t startX,
158  *     size_t startY,
159  *     LPtr src)
160  * @endcode
161  * The generic version takes two additional arguments 'nrRows' and 'nrCols' of the
162  * size_t type following just after the 'src' argument.
163  *
164  * @return 0 on success; on error returns negated error code:
165  *
166  *      - -EINVAL: unsupported data type is passed, or
167  *               'DBLOCK_COPY_TRANSPOSE' is set when
168  *               an image is used as destination
169  *      - -ENOTSUP: unsupported copying direction is passed
170  *      - -EOVEFFLOW: code buffer overflowed
171  */
172 int
173 copyDataBlockGen(
174     struct KgenContext *ctx,
175     const SubproblemDim *dim,
176     const PGranularity *pgran,
177     DataType dtype,
178     DBlockCopyDirection dir,
179     DBlockCopyFlags flags);
180 
181 /*@}*/
182 
183 /*
184  * Zero data block in the local or global memory
185  *
186  * @ctx: generator context
187  * @dim: Subproblem dimension to generate the function for
188  * @pgran: data parallelism granularity
189  * @memPrefix: type of memory to generate the function for
190  *
191  * The 'memPrefix' field of the passed BlasKernExtra structure
192  * should contain the type of memory the buffer is stored in.
193  * It cane take one of the "__local", or the "__global" value.
194  *
195  * 'x' field of the passed SuproblemDim structure should contain
196  * the block width in float4 words. In the case the function takes only
197  * a buffer pointer. If the field is set to 'SUBDIM_UNUSED'
198  * the function is generated without any loop unrollings. In the case
199  * the function takes buffer length as the second argument.
200  *
201  * If 'unroll' is set, the 'bwidth' field of the structure should
202  * contain the maximum width of a block zeroed with loop unrolling.
203  * If 'unroll' is set but the 'bwidth' is set to 'SUBDIM_UNUSED',
204  * the generator don't apply any restriction to loop unrolling.
205  * The parameter is ignored if the 'x' field of the 'dim' is set to
206  * 'SUBDIM_UNUSED'.
207  *
208  * On success returns 0, on error returns negated error code:
209  *
210  *      -EINVAL: wrong memory prefix is passed
211  *      -EOVEFFLOW: code buffer overflowed
212  */
213 int
214 f4zeroBlockGen(
215     struct KgenContext *ctx,
216     const SubproblemDim *dim,
217     const PGranularity *pgran,
218     const char *memPrefix);
219 
220 #endif /* DBLOCK_KGEN_H_ */
221