1 /* ************************************************************************ 2 * Copyright 2013 Advanced Micro Devices, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 * ************************************************************************/ 16 17 18 /* 19 * Common generators for functions manipulating 20 * with data blocks placed in the global, local, 21 * or private memory. 22 */ 23 24 /* 25 * TODO: add the unroll option to 'rwMatrBlockGen' 26 * and 'smulMatrBlockGen' 27 */ 28 29 #ifndef DBLOCK_KGEN_H_ 30 #define DBLOCK_KGEN_H_ 31 32 #include <cltypes.h> 33 #include <kerngen.h> 34 35 /** 36 * @internal 37 * @defgroup MAJOR_GENS Major common used generators 38 */ 39 /*@{*/ 40 41 /** 42 * @internal 43 * @brief Data block copying directions 44 */ 45 typedef enum DBlockCopyDirection { 46 /** Copy from the global to the local memory */ 47 DBLOCK_GLOBAL_TO_LOCAL, 48 /** Copy from the local to the global memory */ 49 DBLOCK_LOCAL_TO_GLOBAL, 50 /** Copy from the global memory to an image */ 51 DBLOCK_GLOBAL_TO_IMAGE, 52 /** Copy from the local memory to an image */ 53 DBLOCK_LOCAL_TO_IMAGE 54 } DBlockCopyDirection; 55 56 /** 57 * @internal 58 * @brief Data block copying flags 59 */ 60 typedef enum DBlockCopyFlags { 61 DBLOCK_COPY_TRANSPOSE = 0x1, /**< Transpose 2D block */ 62 /** pack several rows into single image row */ 63 DBLOCK_COPY_PACKED_IMAGE = 0x2, 64 DBLOCK_COPY_CONJUGATE = 0x4, /**< Conjugate complex elements */ 65 DBLOCK_COPY_NOT_VECTORIZE = 0x8 /**< Disable vectorized copying */ 66 } DBlockCopyFlags; 67 68 /** 69 * @internal 70 * @brief Generator to copy data blocks between different kinds 71 * of memory 72 * 73 * @param[out] ctx Generator context 74 * @param[in] dim Subproblem dimension to generate a function for 75 * @param[in] pgran Data parallelism granularity 76 * @param[in] dtype Data type 77 * @param[in] dir Copying direction 78 * @param[in] flags Copying flags; when an image is used as destination 79 * block transposing is prohibited 80 * 81 * If 'dim' is set to NULL a generic version working with subproblem 82 * of any dimension is generated. In the case specific work group 83 * sizes are ignored, only work group dimension is used. 84 * 85 * 'x' field of the passed SuproblemDim structure should contain 86 * the block width 87 * 'y' should contain the block height 88 * 89 * Copied blocks can be as well one as two dimensional. For any one 90 * dimensional block 'y' field of the dimension structure should be 91 * set to 1. If a block is two dimensional, and the local memory is \n 92 * the source or destination memory, the block's rows must be aligned 93 * to float4 boundary. 94 * 95 * Rows of the matrix block must be aligned to float4 boundary. \n 96 * 97 * Generated functions have the following definitions: \n 98 *\n 99 * Buffer-buffer copying function for optimal blocks: \n 100 * @code 101 * void 102 * funcName( 103 * <Unified pointer type> dst, 104 * <Unified pointer type> src, 105 * size_t startRow, 106 * size_t startCol, 107 * size_t ld) 108 * @endcode 109 * 110 * The unified pointer types can be GPtr if the global memory is used or LPtr 111 * is the local memory is used respectively 112 * (See the "Data types in kernels" section). Function naming rule is follow: \n 113 * (type prefix)copyDBlock['Transp']['Conj']['Nvec'](src mem][dst mem] 114 * [block height][block width] \n 115 * The 'Nvec' suffix is added if vectorized copying is prohibited.\n 116 *\n 117 * Buffer-buffer copying function, generic version: \n 118 * @code 119 * void 120 * funcName( 121 * <Unified pointer type> dst, 122 * <Unified pointer type> src, 123 * size_t startRow, 124 * size_t startCol, 125 * size_t nrRows, 126 * size_t nrCols, 127 * size_t dstLD, 128 * size_t srcLD) 129 * @endcode 130 * 131 * Here "dstLD" is destination leading dimension, "srcLD" - source leading 132 * dimension. \n 133 * Naming rule is the same as for the function above except block sizes. \n 134 *\n 135 * Function copying optimal blocks from the global memory to an image: \n 136 * @code 137 * void 138 * funcName( 139 * __write_only image2d_t dst, 140 * size_t startX, 141 * size_t startY, 142 * GPtr src, 143 * size_t startRow, 144 * size_t startCol, 145 * size_t ld) 146 * @endcode 147 * 'start' and 'startY' arguments is start X and Y coordinate in the image to 148 * write from. The generic version has the analogous definition, and takes two 149 * additional arguments 'nrRows' and 'nrCols' of the size_t type following just 150 * fter the 'startCol' argument. \n 151 *\n 152 * Function copying optimal blocks from the local memory to an image: \n 153 * @code 154 * void 155 * funcName( 156 * __write_only image2d_t dst, 157 * size_t startX, 158 * size_t startY, 159 * LPtr src) 160 * @endcode 161 * The generic version takes two additional arguments 'nrRows' and 'nrCols' of the 162 * size_t type following just after the 'src' argument. 163 * 164 * @return 0 on success; on error returns negated error code: 165 * 166 * - -EINVAL: unsupported data type is passed, or 167 * 'DBLOCK_COPY_TRANSPOSE' is set when 168 * an image is used as destination 169 * - -ENOTSUP: unsupported copying direction is passed 170 * - -EOVEFFLOW: code buffer overflowed 171 */ 172 int 173 copyDataBlockGen( 174 struct KgenContext *ctx, 175 const SubproblemDim *dim, 176 const PGranularity *pgran, 177 DataType dtype, 178 DBlockCopyDirection dir, 179 DBlockCopyFlags flags); 180 181 /*@}*/ 182 183 /* 184 * Zero data block in the local or global memory 185 * 186 * @ctx: generator context 187 * @dim: Subproblem dimension to generate the function for 188 * @pgran: data parallelism granularity 189 * @memPrefix: type of memory to generate the function for 190 * 191 * The 'memPrefix' field of the passed BlasKernExtra structure 192 * should contain the type of memory the buffer is stored in. 193 * It cane take one of the "__local", or the "__global" value. 194 * 195 * 'x' field of the passed SuproblemDim structure should contain 196 * the block width in float4 words. In the case the function takes only 197 * a buffer pointer. If the field is set to 'SUBDIM_UNUSED' 198 * the function is generated without any loop unrollings. In the case 199 * the function takes buffer length as the second argument. 200 * 201 * If 'unroll' is set, the 'bwidth' field of the structure should 202 * contain the maximum width of a block zeroed with loop unrolling. 203 * If 'unroll' is set but the 'bwidth' is set to 'SUBDIM_UNUSED', 204 * the generator don't apply any restriction to loop unrolling. 205 * The parameter is ignored if the 'x' field of the 'dim' is set to 206 * 'SUBDIM_UNUSED'. 207 * 208 * On success returns 0, on error returns negated error code: 209 * 210 * -EINVAL: wrong memory prefix is passed 211 * -EOVEFFLOW: code buffer overflowed 212 */ 213 int 214 f4zeroBlockGen( 215 struct KgenContext *ctx, 216 const SubproblemDim *dim, 217 const PGranularity *pgran, 218 const char *memPrefix); 219 220 #endif /* DBLOCK_KGEN_H_ */ 221