1 2 // ================================================================================================= 3 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This 4 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 // width of 100 characters per line. 6 // 7 // Author(s): 8 // Cedric Nugteren <www.cedricnugteren.nl> 9 // 10 // This file contains the plain C interface to the CLBlast BLAS routines, the counter-part of the 11 // normal 'clblast.h' C++ header. 12 // 13 // ================================================================================================= 14 15 #ifndef CLBLAST_CLBLAST_C_H_ 16 #define CLBLAST_CLBLAST_C_H_ 17 18 // Includes the normal OpenCL C header 19 #if defined(__APPLE__) || defined(__MACOSX) 20 #include <OpenCL/opencl.h> 21 #else 22 #include <CL/opencl.h> 23 #endif 24 25 // Exports library functions under Windows when building a DLL. See also: 26 // https://msdn.microsoft.com/en-us/library/a90k134d.aspx 27 #if defined(_WIN32) && defined(CLBLAST_DLL) 28 #if defined(COMPILING_DLL) 29 #define PUBLIC_API __declspec(dllexport) 30 #else 31 #define PUBLIC_API __declspec(dllimport) 32 #endif 33 #else 34 #define PUBLIC_API 35 #endif 36 37 // The C interface 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 // ================================================================================================= 43 44 // Status codes. These codes can be returned by functions declared in this header file. The error 45 // codes match either the standard OpenCL error codes or the clBLAS error codes. 46 typedef enum CLBlastStatusCode_ { 47 48 // Status codes in common with the OpenCL standard 49 CLBlastSuccess = 0, // CL_SUCCESS 50 CLBlastOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE 51 CLBlastTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE 52 CLBlastOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES 53 CLBlastOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY 54 CLBlastOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error 55 CLBlastInvalidValue = -30, // CL_INVALID_VALUE 56 CLBlastInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE 57 CLBlastInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT 58 CLBlastInvalidBinary = -42, // CL_INVALID_BINARY 59 CLBlastInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS 60 CLBlastInvalidProgram = -44, // CL_INVALID_PROGRAM 61 CLBlastInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE 62 CLBlastInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME 63 CLBlastInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION 64 CLBlastInvalidKernel = -48, // CL_INVALID_KERNEL 65 CLBlastInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX 66 CLBlastInvalidArgValue = -50, // CL_INVALID_ARG_VALUE 67 CLBlastInvalidArgSize = -51, // CL_INVALID_ARG_SIZE 68 CLBlastInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS 69 CLBlastInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions 70 CLBlastInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total 71 CLBlastInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension 72 CLBlastInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET 73 CLBlastInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST 74 CLBlastInvalidEvent = -58, // CL_INVALID_EVENT 75 CLBlastInvalidOperation = -59, // CL_INVALID_OPERATION 76 CLBlastInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE 77 CLBlastInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE 78 79 // Status codes in common with the clBLAS library 80 CLBlastNotImplemented = -1024, // Routine or functionality not implemented yet 81 CLBlastInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer 82 CLBlastInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer 83 CLBlastInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer 84 CLBlastInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer 85 CLBlastInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer 86 CLBlastInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero 87 CLBlastInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension 88 CLBlastInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension 89 CLBlastInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension 90 CLBlastInvalidIncrementX = -1013, // Increment of vector X cannot be zero 91 CLBlastInvalidIncrementY = -1012, // Increment of vector Y cannot be zero 92 CLBlastInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small 93 CLBlastInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small 94 CLBlastInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small 95 CLBlastInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small 96 CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small 97 98 // Custom additional status codes for CLBlast 99 CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive 100 CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel 101 CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel 102 CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device 103 CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device 104 CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device 105 CLBlastInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer 106 CLBlastInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small 107 CLBlastDatabaseError = -2041, // Entry for the device was not found in the database 108 CLBlastUnknownError = -2040, // A catch-all error code representing an unspecified error 109 CLBlastUnexpectedError = -2039, // A catch-all error code representing an unexpected exception 110 } CLBlastStatusCode; 111 112 // Matrix layout and transpose types 113 typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, 114 CLBlastLayoutColMajor = 102 } CLBlastLayout; 115 typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, 116 CLBlastTransposeConjugate = 113 } CLBlastTranspose; 117 typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, 118 CLBlastTriangleLower = 122 } CLBlastTriangle; 119 typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, 120 CLBlastDiagonalUnit = 132 } CLBlastDiagonal; 121 typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; 122 123 // Precision enum (values in bits) 124 typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, 125 CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, 126 CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; 127 128 // ================================================================================================= 129 // BLAS level-1 (vector-vector) routines 130 // ================================================================================================= 131 132 // Generate givens plane rotation: SROTG/DROTG 133 CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, 134 cl_mem sb_buffer, const size_t sb_offset, 135 cl_mem sc_buffer, const size_t sc_offset, 136 cl_mem ss_buffer, const size_t ss_offset, 137 cl_command_queue* queue, cl_event* event); 138 CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, 139 cl_mem sb_buffer, const size_t sb_offset, 140 cl_mem sc_buffer, const size_t sc_offset, 141 cl_mem ss_buffer, const size_t ss_offset, 142 cl_command_queue* queue, cl_event* event); 143 144 // Generate modified givens plane rotation: SROTMG/DROTMG 145 CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, 146 cl_mem sd2_buffer, const size_t sd2_offset, 147 cl_mem sx1_buffer, const size_t sx1_offset, 148 const cl_mem sy1_buffer, const size_t sy1_offset, 149 cl_mem sparam_buffer, const size_t sparam_offset, 150 cl_command_queue* queue, cl_event* event); 151 CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, 152 cl_mem sd2_buffer, const size_t sd2_offset, 153 cl_mem sx1_buffer, const size_t sx1_offset, 154 const cl_mem sy1_buffer, const size_t sy1_offset, 155 cl_mem sparam_buffer, const size_t sparam_offset, 156 cl_command_queue* queue, cl_event* event); 157 158 // Apply givens plane rotation: SROT/DROT 159 CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n, 160 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 161 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 162 const float cos, 163 const float sin, 164 cl_command_queue* queue, cl_event* event); 165 CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n, 166 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 167 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 168 const double cos, 169 const double sin, 170 cl_command_queue* queue, cl_event* event); 171 172 // Apply modified givens plane rotation: SROTM/DROTM 173 CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n, 174 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 175 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 176 cl_mem sparam_buffer, const size_t sparam_offset, 177 cl_command_queue* queue, cl_event* event); 178 CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n, 179 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 180 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 181 cl_mem sparam_buffer, const size_t sparam_offset, 182 cl_command_queue* queue, cl_event* event); 183 184 // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP 185 CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n, 186 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 187 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 188 cl_command_queue* queue, cl_event* event); 189 CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n, 190 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 191 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 192 cl_command_queue* queue, cl_event* event); 193 CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n, 194 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 195 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 196 cl_command_queue* queue, cl_event* event); 197 CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n, 198 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 199 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 200 cl_command_queue* queue, cl_event* event); 201 CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n, 202 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 203 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 204 cl_command_queue* queue, cl_event* event); 205 206 // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL 207 CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n, 208 const float alpha, 209 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 210 cl_command_queue* queue, cl_event* event); 211 CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n, 212 const double alpha, 213 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 214 cl_command_queue* queue, cl_event* event); 215 CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n, 216 const cl_float2 alpha, 217 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 218 cl_command_queue* queue, cl_event* event); 219 CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n, 220 const cl_double2 alpha, 221 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 222 cl_command_queue* queue, cl_event* event); 223 CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n, 224 const cl_half alpha, 225 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 226 cl_command_queue* queue, cl_event* event); 227 228 // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY 229 CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n, 230 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 231 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 232 cl_command_queue* queue, cl_event* event); 233 CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n, 234 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 235 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 236 cl_command_queue* queue, cl_event* event); 237 CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n, 238 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 239 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 240 cl_command_queue* queue, cl_event* event); 241 CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n, 242 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 243 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 244 cl_command_queue* queue, cl_event* event); 245 CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n, 246 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 247 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 248 cl_command_queue* queue, cl_event* event); 249 250 // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY 251 CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n, 252 const float alpha, 253 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 254 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 255 cl_command_queue* queue, cl_event* event); 256 CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n, 257 const double alpha, 258 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 259 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 260 cl_command_queue* queue, cl_event* event); 261 CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n, 262 const cl_float2 alpha, 263 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 264 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 265 cl_command_queue* queue, cl_event* event); 266 CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const size_t n, 267 const cl_double2 alpha, 268 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 269 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 270 cl_command_queue* queue, cl_event* event); 271 CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n, 272 const cl_half alpha, 273 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 274 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 275 cl_command_queue* queue, cl_event* event); 276 277 // Dot product of two vectors: SDOT/DDOT/HDOT 278 CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n, 279 cl_mem dot_buffer, const size_t dot_offset, 280 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 281 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 282 cl_command_queue* queue, cl_event* event); 283 CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n, 284 cl_mem dot_buffer, const size_t dot_offset, 285 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 286 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 287 cl_command_queue* queue, cl_event* event); 288 CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n, 289 cl_mem dot_buffer, const size_t dot_offset, 290 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 291 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 292 cl_command_queue* queue, cl_event* event); 293 294 // Dot product of two complex vectors: CDOTU/ZDOTU 295 CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n, 296 cl_mem dot_buffer, const size_t dot_offset, 297 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 298 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 299 cl_command_queue* queue, cl_event* event); 300 CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n, 301 cl_mem dot_buffer, const size_t dot_offset, 302 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 303 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 304 cl_command_queue* queue, cl_event* event); 305 306 // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC 307 CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n, 308 cl_mem dot_buffer, const size_t dot_offset, 309 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 310 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 311 cl_command_queue* queue, cl_event* event); 312 CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n, 313 cl_mem dot_buffer, const size_t dot_offset, 314 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 315 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 316 cl_command_queue* queue, cl_event* event); 317 318 // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 319 CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n, 320 cl_mem nrm2_buffer, const size_t nrm2_offset, 321 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 322 cl_command_queue* queue, cl_event* event); 323 CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n, 324 cl_mem nrm2_buffer, const size_t nrm2_offset, 325 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 326 cl_command_queue* queue, cl_event* event); 327 CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n, 328 cl_mem nrm2_buffer, const size_t nrm2_offset, 329 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 330 cl_command_queue* queue, cl_event* event); 331 CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n, 332 cl_mem nrm2_buffer, const size_t nrm2_offset, 333 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 334 cl_command_queue* queue, cl_event* event); 335 CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n, 336 cl_mem nrm2_buffer, const size_t nrm2_offset, 337 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 338 cl_command_queue* queue, cl_event* event); 339 340 // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM 341 CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n, 342 cl_mem asum_buffer, const size_t asum_offset, 343 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 344 cl_command_queue* queue, cl_event* event); 345 CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n, 346 cl_mem asum_buffer, const size_t asum_offset, 347 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 348 cl_command_queue* queue, cl_event* event); 349 CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n, 350 cl_mem asum_buffer, const size_t asum_offset, 351 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 352 cl_command_queue* queue, cl_event* event); 353 CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n, 354 cl_mem asum_buffer, const size_t asum_offset, 355 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 356 cl_command_queue* queue, cl_event* event); 357 CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n, 358 cl_mem asum_buffer, const size_t asum_offset, 359 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 360 cl_command_queue* queue, cl_event* event); 361 362 // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM 363 CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n, 364 cl_mem sum_buffer, const size_t sum_offset, 365 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 366 cl_command_queue* queue, cl_event* event); 367 CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n, 368 cl_mem sum_buffer, const size_t sum_offset, 369 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 370 cl_command_queue* queue, cl_event* event); 371 CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n, 372 cl_mem sum_buffer, const size_t sum_offset, 373 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 374 cl_command_queue* queue, cl_event* event); 375 CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n, 376 cl_mem sum_buffer, const size_t sum_offset, 377 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 378 cl_command_queue* queue, cl_event* event); 379 CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n, 380 cl_mem sum_buffer, const size_t sum_offset, 381 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 382 cl_command_queue* queue, cl_event* event); 383 384 // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX 385 CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n, 386 cl_mem imax_buffer, const size_t imax_offset, 387 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 388 cl_command_queue* queue, cl_event* event); 389 CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n, 390 cl_mem imax_buffer, const size_t imax_offset, 391 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 392 cl_command_queue* queue, cl_event* event); 393 CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n, 394 cl_mem imax_buffer, const size_t imax_offset, 395 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 396 cl_command_queue* queue, cl_event* event); 397 CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n, 398 cl_mem imax_buffer, const size_t imax_offset, 399 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 400 cl_command_queue* queue, cl_event* event); 401 CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n, 402 cl_mem imax_buffer, const size_t imax_offset, 403 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 404 cl_command_queue* queue, cl_event* event); 405 406 // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN 407 CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n, 408 cl_mem imin_buffer, const size_t imin_offset, 409 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 410 cl_command_queue* queue, cl_event* event); 411 CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n, 412 cl_mem imin_buffer, const size_t imin_offset, 413 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 414 cl_command_queue* queue, cl_event* event); 415 CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n, 416 cl_mem imin_buffer, const size_t imin_offset, 417 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 418 cl_command_queue* queue, cl_event* event); 419 CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n, 420 cl_mem imin_buffer, const size_t imin_offset, 421 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 422 cl_command_queue* queue, cl_event* event); 423 CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n, 424 cl_mem imin_buffer, const size_t imin_offset, 425 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 426 cl_command_queue* queue, cl_event* event); 427 428 // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX 429 CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n, 430 cl_mem imax_buffer, const size_t imax_offset, 431 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 432 cl_command_queue* queue, cl_event* event); 433 CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n, 434 cl_mem imax_buffer, const size_t imax_offset, 435 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 436 cl_command_queue* queue, cl_event* event); 437 CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n, 438 cl_mem imax_buffer, const size_t imax_offset, 439 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 440 cl_command_queue* queue, cl_event* event); 441 CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n, 442 cl_mem imax_buffer, const size_t imax_offset, 443 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 444 cl_command_queue* queue, cl_event* event); 445 CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n, 446 cl_mem imax_buffer, const size_t imax_offset, 447 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 448 cl_command_queue* queue, cl_event* event); 449 450 // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN 451 CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n, 452 cl_mem imin_buffer, const size_t imin_offset, 453 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 454 cl_command_queue* queue, cl_event* event); 455 CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n, 456 cl_mem imin_buffer, const size_t imin_offset, 457 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 458 cl_command_queue* queue, cl_event* event); 459 CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n, 460 cl_mem imin_buffer, const size_t imin_offset, 461 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 462 cl_command_queue* queue, cl_event* event); 463 CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n, 464 cl_mem imin_buffer, const size_t imin_offset, 465 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 466 cl_command_queue* queue, cl_event* event); 467 CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n, 468 cl_mem imin_buffer, const size_t imin_offset, 469 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 470 cl_command_queue* queue, cl_event* event); 471 472 // ================================================================================================= 473 // BLAS level-2 (matrix-vector) routines 474 // ================================================================================================= 475 476 // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV 477 CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 478 const size_t m, const size_t n, 479 const float alpha, 480 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 481 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 482 const float beta, 483 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 484 cl_command_queue* queue, cl_event* event); 485 CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 486 const size_t m, const size_t n, 487 const double alpha, 488 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 489 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 490 const double beta, 491 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 492 cl_command_queue* queue, cl_event* event); 493 CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 494 const size_t m, const size_t n, 495 const cl_float2 alpha, 496 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 497 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 498 const cl_float2 beta, 499 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 500 cl_command_queue* queue, cl_event* event); 501 CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 502 const size_t m, const size_t n, 503 const cl_double2 alpha, 504 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 505 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 506 const cl_double2 beta, 507 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 508 cl_command_queue* queue, cl_event* event); 509 CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 510 const size_t m, const size_t n, 511 const cl_half alpha, 512 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 513 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 514 const cl_half beta, 515 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 516 cl_command_queue* queue, cl_event* event); 517 518 // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV 519 CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 520 const size_t m, const size_t n, const size_t kl, const size_t ku, 521 const float alpha, 522 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 523 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 524 const float beta, 525 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 526 cl_command_queue* queue, cl_event* event); 527 CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 528 const size_t m, const size_t n, const size_t kl, const size_t ku, 529 const double alpha, 530 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 531 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 532 const double beta, 533 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 534 cl_command_queue* queue, cl_event* event); 535 CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 536 const size_t m, const size_t n, const size_t kl, const size_t ku, 537 const cl_float2 alpha, 538 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 539 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 540 const cl_float2 beta, 541 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 542 cl_command_queue* queue, cl_event* event); 543 CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 544 const size_t m, const size_t n, const size_t kl, const size_t ku, 545 const cl_double2 alpha, 546 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 547 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 548 const cl_double2 beta, 549 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 550 cl_command_queue* queue, cl_event* event); 551 CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 552 const size_t m, const size_t n, const size_t kl, const size_t ku, 553 const cl_half alpha, 554 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 555 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 556 const cl_half beta, 557 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 558 cl_command_queue* queue, cl_event* event); 559 560 // Hermitian matrix-vector multiplication: CHEMV/ZHEMV 561 CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle, 562 const size_t n, 563 const cl_float2 alpha, 564 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 565 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 566 const cl_float2 beta, 567 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 568 cl_command_queue* queue, cl_event* event); 569 CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, 570 const size_t n, 571 const cl_double2 alpha, 572 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 573 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 574 const cl_double2 beta, 575 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 576 cl_command_queue* queue, cl_event* event); 577 578 // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV 579 CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 580 const size_t n, const size_t k, 581 const cl_float2 alpha, 582 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 583 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 584 const cl_float2 beta, 585 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 586 cl_command_queue* queue, cl_event* event); 587 CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 588 const size_t n, const size_t k, 589 const cl_double2 alpha, 590 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 591 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 592 const cl_double2 beta, 593 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 594 cl_command_queue* queue, cl_event* event); 595 596 // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV 597 CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 598 const size_t n, 599 const cl_float2 alpha, 600 const cl_mem ap_buffer, const size_t ap_offset, 601 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 602 const cl_float2 beta, 603 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 604 cl_command_queue* queue, cl_event* event); 605 CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 606 const size_t n, 607 const cl_double2 alpha, 608 const cl_mem ap_buffer, const size_t ap_offset, 609 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 610 const cl_double2 beta, 611 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 612 cl_command_queue* queue, cl_event* event); 613 614 // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV 615 CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, 616 const size_t n, 617 const float alpha, 618 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 619 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 620 const float beta, 621 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 622 cl_command_queue* queue, cl_event* event); 623 CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, 624 const size_t n, 625 const double alpha, 626 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 627 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 628 const double beta, 629 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 630 cl_command_queue* queue, cl_event* event); 631 CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, 632 const size_t n, 633 const cl_half alpha, 634 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 635 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 636 const cl_half beta, 637 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 638 cl_command_queue* queue, cl_event* event); 639 640 // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV 641 CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 642 const size_t n, const size_t k, 643 const float alpha, 644 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 645 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 646 const float beta, 647 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 648 cl_command_queue* queue, cl_event* event); 649 CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 650 const size_t n, const size_t k, 651 const double alpha, 652 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 653 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 654 const double beta, 655 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 656 cl_command_queue* queue, cl_event* event); 657 CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 658 const size_t n, const size_t k, 659 const cl_half alpha, 660 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 661 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 662 const cl_half beta, 663 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 664 cl_command_queue* queue, cl_event* event); 665 666 // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV 667 CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 668 const size_t n, 669 const float alpha, 670 const cl_mem ap_buffer, const size_t ap_offset, 671 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 672 const float beta, 673 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 674 cl_command_queue* queue, cl_event* event); 675 CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 676 const size_t n, 677 const double alpha, 678 const cl_mem ap_buffer, const size_t ap_offset, 679 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 680 const double beta, 681 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 682 cl_command_queue* queue, cl_event* event); 683 CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, 684 const size_t n, 685 const cl_half alpha, 686 const cl_mem ap_buffer, const size_t ap_offset, 687 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 688 const cl_half beta, 689 cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 690 cl_command_queue* queue, cl_event* event); 691 692 // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV 693 CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 694 const size_t n, 695 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 696 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 697 cl_command_queue* queue, cl_event* event); 698 CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 699 const size_t n, 700 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 701 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 702 cl_command_queue* queue, cl_event* event); 703 CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 704 const size_t n, 705 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 706 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 707 cl_command_queue* queue, cl_event* event); 708 CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 709 const size_t n, 710 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 711 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 712 cl_command_queue* queue, cl_event* event); 713 CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 714 const size_t n, 715 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 716 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 717 cl_command_queue* queue, cl_event* event); 718 719 // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV 720 CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 721 const size_t n, const size_t k, 722 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 723 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 724 cl_command_queue* queue, cl_event* event); 725 CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 726 const size_t n, const size_t k, 727 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 728 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 729 cl_command_queue* queue, cl_event* event); 730 CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 731 const size_t n, const size_t k, 732 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 733 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 734 cl_command_queue* queue, cl_event* event); 735 CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 736 const size_t n, const size_t k, 737 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 738 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 739 cl_command_queue* queue, cl_event* event); 740 CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 741 const size_t n, const size_t k, 742 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 743 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 744 cl_command_queue* queue, cl_event* event); 745 746 // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV 747 CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 748 const size_t n, 749 const cl_mem ap_buffer, const size_t ap_offset, 750 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 751 cl_command_queue* queue, cl_event* event); 752 CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 753 const size_t n, 754 const cl_mem ap_buffer, const size_t ap_offset, 755 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 756 cl_command_queue* queue, cl_event* event); 757 CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 758 const size_t n, 759 const cl_mem ap_buffer, const size_t ap_offset, 760 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 761 cl_command_queue* queue, cl_event* event); 762 CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 763 const size_t n, 764 const cl_mem ap_buffer, const size_t ap_offset, 765 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 766 cl_command_queue* queue, cl_event* event); 767 CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 768 const size_t n, 769 const cl_mem ap_buffer, const size_t ap_offset, 770 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 771 cl_command_queue* queue, cl_event* event); 772 773 // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV 774 CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 775 const size_t n, 776 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 777 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 778 cl_command_queue* queue, cl_event* event); 779 CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 780 const size_t n, 781 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 782 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 783 cl_command_queue* queue, cl_event* event); 784 CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 785 const size_t n, 786 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 787 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 788 cl_command_queue* queue, cl_event* event); 789 CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 790 const size_t n, 791 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 792 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 793 cl_command_queue* queue, cl_event* event); 794 795 // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV 796 CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 797 const size_t n, const size_t k, 798 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 799 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 800 cl_command_queue* queue, cl_event* event); 801 CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 802 const size_t n, const size_t k, 803 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 804 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 805 cl_command_queue* queue, cl_event* event); 806 CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 807 const size_t n, const size_t k, 808 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 809 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 810 cl_command_queue* queue, cl_event* event); 811 CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 812 const size_t n, const size_t k, 813 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 814 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 815 cl_command_queue* queue, cl_event* event); 816 817 // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV 818 CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 819 const size_t n, 820 const cl_mem ap_buffer, const size_t ap_offset, 821 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 822 cl_command_queue* queue, cl_event* event); 823 CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 824 const size_t n, 825 const cl_mem ap_buffer, const size_t ap_offset, 826 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 827 cl_command_queue* queue, cl_event* event); 828 CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 829 const size_t n, 830 const cl_mem ap_buffer, const size_t ap_offset, 831 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 832 cl_command_queue* queue, cl_event* event); 833 CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 834 const size_t n, 835 const cl_mem ap_buffer, const size_t ap_offset, 836 cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 837 cl_command_queue* queue, cl_event* event); 838 839 // General rank-1 matrix update: SGER/DGER/HGER 840 CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout, 841 const size_t m, const size_t n, 842 const float alpha, 843 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 844 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 845 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 846 cl_command_queue* queue, cl_event* event); 847 CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout, 848 const size_t m, const size_t n, 849 const double alpha, 850 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 851 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 852 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 853 cl_command_queue* queue, cl_event* event); 854 CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout, 855 const size_t m, const size_t n, 856 const cl_half alpha, 857 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 858 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 859 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 860 cl_command_queue* queue, cl_event* event); 861 862 // General rank-1 complex matrix update: CGERU/ZGERU 863 CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout, 864 const size_t m, const size_t n, 865 const cl_float2 alpha, 866 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 867 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 868 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 869 cl_command_queue* queue, cl_event* event); 870 CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout, 871 const size_t m, const size_t n, 872 const cl_double2 alpha, 873 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 874 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 875 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 876 cl_command_queue* queue, cl_event* event); 877 878 // General rank-1 complex conjugated matrix update: CGERC/ZGERC 879 CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout, 880 const size_t m, const size_t n, 881 const cl_float2 alpha, 882 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 883 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 884 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 885 cl_command_queue* queue, cl_event* event); 886 CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout, 887 const size_t m, const size_t n, 888 const cl_double2 alpha, 889 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 890 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 891 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 892 cl_command_queue* queue, cl_event* event); 893 894 // Hermitian rank-1 matrix update: CHER/ZHER 895 CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle, 896 const size_t n, 897 const float alpha, 898 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 899 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 900 cl_command_queue* queue, cl_event* event); 901 CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle, 902 const size_t n, 903 const double alpha, 904 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 905 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 906 cl_command_queue* queue, cl_event* event); 907 908 // Hermitian packed rank-1 matrix update: CHPR/ZHPR 909 CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle, 910 const size_t n, 911 const float alpha, 912 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 913 cl_mem ap_buffer, const size_t ap_offset, 914 cl_command_queue* queue, cl_event* event); 915 CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, 916 const size_t n, 917 const double alpha, 918 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 919 cl_mem ap_buffer, const size_t ap_offset, 920 cl_command_queue* queue, cl_event* event); 921 922 // Hermitian rank-2 matrix update: CHER2/ZHER2 923 CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle, 924 const size_t n, 925 const cl_float2 alpha, 926 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 927 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 928 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 929 cl_command_queue* queue, cl_event* event); 930 CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle, 931 const size_t n, 932 const cl_double2 alpha, 933 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 934 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 935 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 936 cl_command_queue* queue, cl_event* event); 937 938 // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 939 CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 940 const size_t n, 941 const cl_float2 alpha, 942 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 943 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 944 cl_mem ap_buffer, const size_t ap_offset, 945 cl_command_queue* queue, cl_event* event); 946 CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 947 const size_t n, 948 const cl_double2 alpha, 949 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 950 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 951 cl_mem ap_buffer, const size_t ap_offset, 952 cl_command_queue* queue, cl_event* event); 953 954 // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR 955 CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, 956 const size_t n, 957 const float alpha, 958 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 959 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 960 cl_command_queue* queue, cl_event* event); 961 CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, 962 const size_t n, 963 const double alpha, 964 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 965 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 966 cl_command_queue* queue, cl_event* event); 967 CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, 968 const size_t n, 969 const cl_half alpha, 970 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 971 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 972 cl_command_queue* queue, cl_event* event); 973 974 // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR 975 CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle, 976 const size_t n, 977 const float alpha, 978 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 979 cl_mem ap_buffer, const size_t ap_offset, 980 cl_command_queue* queue, cl_event* event); 981 CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle, 982 const size_t n, 983 const double alpha, 984 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 985 cl_mem ap_buffer, const size_t ap_offset, 986 cl_command_queue* queue, cl_event* event); 987 CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle, 988 const size_t n, 989 const cl_half alpha, 990 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 991 cl_mem ap_buffer, const size_t ap_offset, 992 cl_command_queue* queue, cl_event* event); 993 994 // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 995 CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 996 const size_t n, 997 const float alpha, 998 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 999 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1000 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1001 cl_command_queue* queue, cl_event* event); 1002 CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 1003 const size_t n, 1004 const double alpha, 1005 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 1006 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1007 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1008 cl_command_queue* queue, cl_event* event); 1009 CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 1010 const size_t n, 1011 const cl_half alpha, 1012 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 1013 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1014 cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1015 cl_command_queue* queue, cl_event* event); 1016 1017 // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 1018 CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 1019 const size_t n, 1020 const float alpha, 1021 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 1022 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1023 cl_mem ap_buffer, const size_t ap_offset, 1024 cl_command_queue* queue, cl_event* event); 1025 CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 1026 const size_t n, 1027 const double alpha, 1028 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 1029 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1030 cl_mem ap_buffer, const size_t ap_offset, 1031 cl_command_queue* queue, cl_event* event); 1032 CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, 1033 const size_t n, 1034 const cl_half alpha, 1035 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, 1036 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 1037 cl_mem ap_buffer, const size_t ap_offset, 1038 cl_command_queue* queue, cl_event* event); 1039 1040 // ================================================================================================= 1041 // BLAS level-3 (matrix-matrix) routines 1042 // ================================================================================================= 1043 1044 // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM 1045 CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1046 const size_t m, const size_t n, const size_t k, 1047 const float alpha, 1048 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1049 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1050 const float beta, 1051 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1052 cl_command_queue* queue, cl_event* event); 1053 CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1054 const size_t m, const size_t n, const size_t k, 1055 const double alpha, 1056 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1057 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1058 const double beta, 1059 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1060 cl_command_queue* queue, cl_event* event); 1061 CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1062 const size_t m, const size_t n, const size_t k, 1063 const cl_float2 alpha, 1064 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1065 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1066 const cl_float2 beta, 1067 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1068 cl_command_queue* queue, cl_event* event); 1069 CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1070 const size_t m, const size_t n, const size_t k, 1071 const cl_double2 alpha, 1072 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1073 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1074 const cl_double2 beta, 1075 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1076 cl_command_queue* queue, cl_event* event); 1077 CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1078 const size_t m, const size_t n, const size_t k, 1079 const cl_half alpha, 1080 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1081 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1082 const cl_half beta, 1083 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1084 cl_command_queue* queue, cl_event* event); 1085 1086 // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM 1087 CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1088 const size_t m, const size_t n, 1089 const float alpha, 1090 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1091 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1092 const float beta, 1093 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1094 cl_command_queue* queue, cl_event* event); 1095 CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1096 const size_t m, const size_t n, 1097 const double alpha, 1098 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1099 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1100 const double beta, 1101 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1102 cl_command_queue* queue, cl_event* event); 1103 CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1104 const size_t m, const size_t n, 1105 const cl_float2 alpha, 1106 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1107 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1108 const cl_float2 beta, 1109 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1110 cl_command_queue* queue, cl_event* event); 1111 CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1112 const size_t m, const size_t n, 1113 const cl_double2 alpha, 1114 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1115 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1116 const cl_double2 beta, 1117 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1118 cl_command_queue* queue, cl_event* event); 1119 CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1120 const size_t m, const size_t n, 1121 const cl_half alpha, 1122 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1123 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1124 const cl_half beta, 1125 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1126 cl_command_queue* queue, cl_event* event); 1127 1128 // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM 1129 CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1130 const size_t m, const size_t n, 1131 const cl_float2 alpha, 1132 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1133 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1134 const cl_float2 beta, 1135 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1136 cl_command_queue* queue, cl_event* event); 1137 CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, 1138 const size_t m, const size_t n, 1139 const cl_double2 alpha, 1140 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1141 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1142 const cl_double2 beta, 1143 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1144 cl_command_queue* queue, cl_event* event); 1145 1146 // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK 1147 CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1148 const size_t n, const size_t k, 1149 const float alpha, 1150 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1151 const float beta, 1152 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1153 cl_command_queue* queue, cl_event* event); 1154 CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1155 const size_t n, const size_t k, 1156 const double alpha, 1157 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1158 const double beta, 1159 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1160 cl_command_queue* queue, cl_event* event); 1161 CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1162 const size_t n, const size_t k, 1163 const cl_float2 alpha, 1164 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1165 const cl_float2 beta, 1166 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1167 cl_command_queue* queue, cl_event* event); 1168 CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1169 const size_t n, const size_t k, 1170 const cl_double2 alpha, 1171 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1172 const cl_double2 beta, 1173 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1174 cl_command_queue* queue, cl_event* event); 1175 CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1176 const size_t n, const size_t k, 1177 const cl_half alpha, 1178 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1179 const cl_half beta, 1180 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1181 cl_command_queue* queue, cl_event* event); 1182 1183 // Rank-K update of a hermitian matrix: CHERK/ZHERK 1184 CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1185 const size_t n, const size_t k, 1186 const float alpha, 1187 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1188 const float beta, 1189 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1190 cl_command_queue* queue, cl_event* event); 1191 CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, 1192 const size_t n, const size_t k, 1193 const double alpha, 1194 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1195 const double beta, 1196 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1197 cl_command_queue* queue, cl_event* event); 1198 1199 // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K 1200 CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1201 const size_t n, const size_t k, 1202 const float alpha, 1203 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1204 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1205 const float beta, 1206 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1207 cl_command_queue* queue, cl_event* event); 1208 CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1209 const size_t n, const size_t k, 1210 const double alpha, 1211 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1212 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1213 const double beta, 1214 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1215 cl_command_queue* queue, cl_event* event); 1216 CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1217 const size_t n, const size_t k, 1218 const cl_float2 alpha, 1219 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1220 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1221 const cl_float2 beta, 1222 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1223 cl_command_queue* queue, cl_event* event); 1224 CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1225 const size_t n, const size_t k, 1226 const cl_double2 alpha, 1227 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1228 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1229 const cl_double2 beta, 1230 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1231 cl_command_queue* queue, cl_event* event); 1232 CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1233 const size_t n, const size_t k, 1234 const cl_half alpha, 1235 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1236 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1237 const cl_half beta, 1238 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1239 cl_command_queue* queue, cl_event* event); 1240 1241 // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K 1242 CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1243 const size_t n, const size_t k, 1244 const cl_float2 alpha, 1245 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1246 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1247 const float beta, 1248 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1249 cl_command_queue* queue, cl_event* event); 1250 CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, 1251 const size_t n, const size_t k, 1252 const cl_double2 alpha, 1253 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1254 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1255 const double beta, 1256 cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 1257 cl_command_queue* queue, cl_event* event); 1258 1259 // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM 1260 CLBlastStatusCode PUBLIC_API CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1261 const size_t m, const size_t n, 1262 const float alpha, 1263 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1264 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1265 cl_command_queue* queue, cl_event* event); 1266 CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1267 const size_t m, const size_t n, 1268 const double alpha, 1269 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1270 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1271 cl_command_queue* queue, cl_event* event); 1272 CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1273 const size_t m, const size_t n, 1274 const cl_float2 alpha, 1275 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1276 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1277 cl_command_queue* queue, cl_event* event); 1278 CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1279 const size_t m, const size_t n, 1280 const cl_double2 alpha, 1281 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1282 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1283 cl_command_queue* queue, cl_event* event); 1284 CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1285 const size_t m, const size_t n, 1286 const cl_half alpha, 1287 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1288 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1289 cl_command_queue* queue, cl_event* event); 1290 1291 // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM 1292 CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1293 const size_t m, const size_t n, 1294 const float alpha, 1295 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1296 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1297 cl_command_queue* queue, cl_event* event); 1298 CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1299 const size_t m, const size_t n, 1300 const double alpha, 1301 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1302 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1303 cl_command_queue* queue, cl_event* event); 1304 CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1305 const size_t m, const size_t n, 1306 const cl_float2 alpha, 1307 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1308 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1309 cl_command_queue* queue, cl_event* event); 1310 CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, 1311 const size_t m, const size_t n, 1312 const cl_double2 alpha, 1313 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1314 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1315 cl_command_queue* queue, cl_event* event); 1316 1317 // ================================================================================================= 1318 // Extra non-BLAS routines (level-X) 1319 // ================================================================================================= 1320 1321 // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY 1322 CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 1323 const size_t m, const size_t n, 1324 const float alpha, 1325 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1326 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1327 cl_command_queue* queue, cl_event* event); 1328 CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 1329 const size_t m, const size_t n, 1330 const double alpha, 1331 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1332 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1333 cl_command_queue* queue, cl_event* event); 1334 CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 1335 const size_t m, const size_t n, 1336 const cl_float2 alpha, 1337 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1338 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1339 cl_command_queue* queue, cl_event* event); 1340 CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 1341 const size_t m, const size_t n, 1342 const cl_double2 alpha, 1343 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1344 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1345 cl_command_queue* queue, cl_event* event); 1346 CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 1347 const size_t m, const size_t n, 1348 const cl_half alpha, 1349 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 1350 cl_mem b_buffer, const size_t b_offset, const size_t b_ld, 1351 cl_command_queue* queue, cl_event* event); 1352 1353 // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL 1354 CLBlastStatusCode PUBLIC_API CLBlastSim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, 1355 const cl_mem im_buffer, const size_t im_offset, 1356 cl_mem col_buffer, const size_t col_offset, 1357 cl_command_queue* queue, cl_event* event); 1358 CLBlastStatusCode PUBLIC_API CLBlastDim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, 1359 const cl_mem im_buffer, const size_t im_offset, 1360 cl_mem col_buffer, const size_t col_offset, 1361 cl_command_queue* queue, cl_event* event); 1362 CLBlastStatusCode PUBLIC_API CLBlastCim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, 1363 const cl_mem im_buffer, const size_t im_offset, 1364 cl_mem col_buffer, const size_t col_offset, 1365 cl_command_queue* queue, cl_event* event); 1366 CLBlastStatusCode PUBLIC_API CLBlastZim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, 1367 const cl_mem im_buffer, const size_t im_offset, 1368 cl_mem col_buffer, const size_t col_offset, 1369 cl_command_queue* queue, cl_event* event); 1370 CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, 1371 const cl_mem im_buffer, const size_t im_offset, 1372 cl_mem col_buffer, const size_t col_offset, 1373 cl_command_queue* queue, cl_event* event); 1374 1375 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED 1376 CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, 1377 const float *alphas, 1378 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, 1379 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 1380 const size_t batch_count, 1381 cl_command_queue* queue, cl_event* event); 1382 CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, 1383 const double *alphas, 1384 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, 1385 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 1386 const size_t batch_count, 1387 cl_command_queue* queue, cl_event* event); 1388 CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, 1389 const cl_float2 *alphas, 1390 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, 1391 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 1392 const size_t batch_count, 1393 cl_command_queue* queue, cl_event* event); 1394 CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, 1395 const cl_double2 *alphas, 1396 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, 1397 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 1398 const size_t batch_count, 1399 cl_command_queue* queue, cl_event* event); 1400 CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, 1401 const cl_half *alphas, 1402 const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, 1403 cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 1404 const size_t batch_count, 1405 cl_command_queue* queue, cl_event* event); 1406 1407 // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED 1408 CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1409 const size_t m, const size_t n, const size_t k, 1410 const float *alphas, 1411 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, 1412 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, 1413 const float *betas, 1414 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, 1415 const size_t batch_count, 1416 cl_command_queue* queue, cl_event* event); 1417 CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1418 const size_t m, const size_t n, const size_t k, 1419 const double *alphas, 1420 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, 1421 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, 1422 const double *betas, 1423 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, 1424 const size_t batch_count, 1425 cl_command_queue* queue, cl_event* event); 1426 CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1427 const size_t m, const size_t n, const size_t k, 1428 const cl_float2 *alphas, 1429 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, 1430 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, 1431 const cl_float2 *betas, 1432 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, 1433 const size_t batch_count, 1434 cl_command_queue* queue, cl_event* event); 1435 CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1436 const size_t m, const size_t n, const size_t k, 1437 const cl_double2 *alphas, 1438 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, 1439 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, 1440 const cl_double2 *betas, 1441 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, 1442 const size_t batch_count, 1443 cl_command_queue* queue, cl_event* event); 1444 CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, 1445 const size_t m, const size_t n, const size_t k, 1446 const cl_half *alphas, 1447 const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, 1448 const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, 1449 const cl_half *betas, 1450 cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, 1451 const size_t batch_count, 1452 cl_command_queue* queue, cl_event* event); 1453 1454 // ================================================================================================= 1455 1456 // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on 1457 // for the same device. This cache can be cleared to free up system memory or in case of debugging. 1458 CLBlastStatusCode PUBLIC_API CLBlastClearCache(); 1459 1460 // The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. 1461 // Further CLBlast routine calls will then run at maximum speed. 1462 CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); 1463 1464 // ================================================================================================= 1465 1466 // Overrides tuning parameters for a specific device-precision-kernel combination. The next time 1467 // the target routine is called it will re-compile and use the new parameters from then on. 1468 CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name, 1469 const CLBlastPrecision precision, const size_t num_parameters, 1470 const char** parameters_names, const size_t* parameters_values); 1471 1472 // ================================================================================================= 1473 1474 #ifdef __cplusplus 1475 } // extern "C" 1476 #endif 1477 1478 // CLBLAST_CLBLAST_C_H_ 1479 #endif 1480