1 
2 // =================================================================================================
3 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 // width of 100 characters per line.
6 //
7 // Author(s):
8 //   Cedric Nugteren <www.cedricnugteren.nl>
9 //
10 // This file contains the plain C interface to the CLBlast BLAS routines, the counter-part of the
11 // normal 'clblast.h' C++ header.
12 //
13 // =================================================================================================
14 
15 #ifndef CLBLAST_CLBLAST_C_H_
16 #define CLBLAST_CLBLAST_C_H_
17 
18 // Includes the normal OpenCL C header
19 #if defined(__APPLE__) || defined(__MACOSX)
20   #include <OpenCL/opencl.h>
21 #else
22   #include <CL/opencl.h>
23 #endif
24 
25 // Exports library functions under Windows when building a DLL. See also:
26 // https://msdn.microsoft.com/en-us/library/a90k134d.aspx
27 #if defined(_WIN32) && defined(CLBLAST_DLL)
28   #if defined(COMPILING_DLL)
29     #define PUBLIC_API __declspec(dllexport)
30   #else
31     #define PUBLIC_API __declspec(dllimport)
32   #endif
33 #else
34   #define PUBLIC_API
35 #endif
36 
37 // The C interface
38 #ifdef __cplusplus
39 extern "C" {
40 #endif
41 
42 // =================================================================================================
43 
44 // Status codes. These codes can be returned by functions declared in this header file. The error
45 // codes match either the standard OpenCL error codes or the clBLAS error codes.
46 typedef enum CLBlastStatusCode_ {
47 
48   // Status codes in common with the OpenCL standard
49   CLBlastSuccess                   =   0, // CL_SUCCESS
50   CLBlastOpenCLCompilerNotAvailable=  -3, // CL_COMPILER_NOT_AVAILABLE
51   CLBlastTempBufferAllocFailure    =  -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
52   CLBlastOpenCLOutOfResources      =  -5, // CL_OUT_OF_RESOURCES
53   CLBlastOpenCLOutOfHostMemory     =  -6, // CL_OUT_OF_HOST_MEMORY
54   CLBlastOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
55   CLBlastInvalidValue              = -30, // CL_INVALID_VALUE
56   CLBlastInvalidCommandQueue       = -36, // CL_INVALID_COMMAND_QUEUE
57   CLBlastInvalidMemObject          = -38, // CL_INVALID_MEM_OBJECT
58   CLBlastInvalidBinary             = -42, // CL_INVALID_BINARY
59   CLBlastInvalidBuildOptions       = -43, // CL_INVALID_BUILD_OPTIONS
60   CLBlastInvalidProgram            = -44, // CL_INVALID_PROGRAM
61   CLBlastInvalidProgramExecutable  = -45, // CL_INVALID_PROGRAM_EXECUTABLE
62   CLBlastInvalidKernelName         = -46, // CL_INVALID_KERNEL_NAME
63   CLBlastInvalidKernelDefinition   = -47, // CL_INVALID_KERNEL_DEFINITION
64   CLBlastInvalidKernel             = -48, // CL_INVALID_KERNEL
65   CLBlastInvalidArgIndex           = -49, // CL_INVALID_ARG_INDEX
66   CLBlastInvalidArgValue           = -50, // CL_INVALID_ARG_VALUE
67   CLBlastInvalidArgSize            = -51, // CL_INVALID_ARG_SIZE
68   CLBlastInvalidKernelArgs         = -52, // CL_INVALID_KERNEL_ARGS
69   CLBlastInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
70   CLBlastInvalidLocalThreadsTotal  = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
71   CLBlastInvalidLocalThreadsDim    = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
72   CLBlastInvalidGlobalOffset       = -56, // CL_INVALID_GLOBAL_OFFSET
73   CLBlastInvalidEventWaitList      = -57, // CL_INVALID_EVENT_WAIT_LIST
74   CLBlastInvalidEvent              = -58, // CL_INVALID_EVENT
75   CLBlastInvalidOperation          = -59, // CL_INVALID_OPERATION
76   CLBlastInvalidBufferSize         = -61, // CL_INVALID_BUFFER_SIZE
77   CLBlastInvalidGlobalWorkSize     = -63, // CL_INVALID_GLOBAL_WORK_SIZE
78 
79   // Status codes in common with the clBLAS library
80   CLBlastNotImplemented            = -1024, // Routine or functionality not implemented yet
81   CLBlastInvalidMatrixA            = -1022, // Matrix A is not a valid OpenCL buffer
82   CLBlastInvalidMatrixB            = -1021, // Matrix B is not a valid OpenCL buffer
83   CLBlastInvalidMatrixC            = -1020, // Matrix C is not a valid OpenCL buffer
84   CLBlastInvalidVectorX            = -1019, // Vector X is not a valid OpenCL buffer
85   CLBlastInvalidVectorY            = -1018, // Vector Y is not a valid OpenCL buffer
86   CLBlastInvalidDimension          = -1017, // Dimensions M, N, and K have to be larger than zero
87   CLBlastInvalidLeadDimA           = -1016, // LD of A is smaller than the matrix's first dimension
88   CLBlastInvalidLeadDimB           = -1015, // LD of B is smaller than the matrix's first dimension
89   CLBlastInvalidLeadDimC           = -1014, // LD of C is smaller than the matrix's first dimension
90   CLBlastInvalidIncrementX         = -1013, // Increment of vector X cannot be zero
91   CLBlastInvalidIncrementY         = -1012, // Increment of vector Y cannot be zero
92   CLBlastInsufficientMemoryA       = -1011, // Matrix A's OpenCL buffer is too small
93   CLBlastInsufficientMemoryB       = -1010, // Matrix B's OpenCL buffer is too small
94   CLBlastInsufficientMemoryC       = -1009, // Matrix C's OpenCL buffer is too small
95   CLBlastInsufficientMemoryX       = -1008, // Vector X's OpenCL buffer is too small
96   CLBlastInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small
97 
98   // Custom additional status codes for CLBlast
99   CLBlastInvalidBatchCount         = -2049, // The batch count needs to be positive
100   CLBlastInvalidOverrideKernel     = -2048, // Trying to override parameters for an invalid kernel
101   CLBlastMissingOverrideParameter  = -2047, // Missing override parameter(s) for the target kernel
102   CLBlastInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
103   CLBlastNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
104   CLBlastNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
105   CLBlastInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
106   CLBlastInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
107   CLBlastDatabaseError             = -2041, // Entry for the device was not found in the database
108   CLBlastUnknownError              = -2040, // A catch-all error code representing an unspecified error
109   CLBlastUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
110 } CLBlastStatusCode;
111 
112 // Matrix layout and transpose types
113 typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
114                               CLBlastLayoutColMajor = 102 } CLBlastLayout;
115 typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
116                                  CLBlastTransposeConjugate = 113 } CLBlastTranspose;
117 typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
118                                 CLBlastTriangleLower = 122 } CLBlastTriangle;
119 typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
120                                 CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
121 typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
122 
123 // Precision enum (values in bits)
124 typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
125                                  CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
126                                  CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
127 
128 // =================================================================================================
129 // BLAS level-1 (vector-vector) routines
130 // =================================================================================================
131 
132 // Generate givens plane rotation: SROTG/DROTG
133 CLBlastStatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
134                                           cl_mem sb_buffer, const size_t sb_offset,
135                                           cl_mem sc_buffer, const size_t sc_offset,
136                                           cl_mem ss_buffer, const size_t ss_offset,
137                                           cl_command_queue* queue, cl_event* event);
138 CLBlastStatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
139                                           cl_mem sb_buffer, const size_t sb_offset,
140                                           cl_mem sc_buffer, const size_t sc_offset,
141                                           cl_mem ss_buffer, const size_t ss_offset,
142                                           cl_command_queue* queue, cl_event* event);
143 
144 // Generate modified givens plane rotation: SROTMG/DROTMG
145 CLBlastStatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
146                                            cl_mem sd2_buffer, const size_t sd2_offset,
147                                            cl_mem sx1_buffer, const size_t sx1_offset,
148                                            const cl_mem sy1_buffer, const size_t sy1_offset,
149                                            cl_mem sparam_buffer, const size_t sparam_offset,
150                                            cl_command_queue* queue, cl_event* event);
151 CLBlastStatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
152                                            cl_mem sd2_buffer, const size_t sd2_offset,
153                                            cl_mem sx1_buffer, const size_t sx1_offset,
154                                            const cl_mem sy1_buffer, const size_t sy1_offset,
155                                            cl_mem sparam_buffer, const size_t sparam_offset,
156                                            cl_command_queue* queue, cl_event* event);
157 
158 // Apply givens plane rotation: SROT/DROT
159 CLBlastStatusCode PUBLIC_API CLBlastSrot(const size_t n,
160                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
161                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
162                                          const float cos,
163                                          const float sin,
164                                          cl_command_queue* queue, cl_event* event);
165 CLBlastStatusCode PUBLIC_API CLBlastDrot(const size_t n,
166                                          cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
167                                          cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
168                                          const double cos,
169                                          const double sin,
170                                          cl_command_queue* queue, cl_event* event);
171 
172 // Apply modified givens plane rotation: SROTM/DROTM
173 CLBlastStatusCode PUBLIC_API CLBlastSrotm(const size_t n,
174                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
175                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
176                                           cl_mem sparam_buffer, const size_t sparam_offset,
177                                           cl_command_queue* queue, cl_event* event);
178 CLBlastStatusCode PUBLIC_API CLBlastDrotm(const size_t n,
179                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
180                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
181                                           cl_mem sparam_buffer, const size_t sparam_offset,
182                                           cl_command_queue* queue, cl_event* event);
183 
184 // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
185 CLBlastStatusCode PUBLIC_API CLBlastSswap(const size_t n,
186                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
187                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
188                                           cl_command_queue* queue, cl_event* event);
189 CLBlastStatusCode PUBLIC_API CLBlastDswap(const size_t n,
190                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
191                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
192                                           cl_command_queue* queue, cl_event* event);
193 CLBlastStatusCode PUBLIC_API CLBlastCswap(const size_t n,
194                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
195                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
196                                           cl_command_queue* queue, cl_event* event);
197 CLBlastStatusCode PUBLIC_API CLBlastZswap(const size_t n,
198                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
199                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
200                                           cl_command_queue* queue, cl_event* event);
201 CLBlastStatusCode PUBLIC_API CLBlastHswap(const size_t n,
202                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
203                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
204                                           cl_command_queue* queue, cl_event* event);
205 
206 // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
207 CLBlastStatusCode PUBLIC_API CLBlastSscal(const size_t n,
208                                           const float alpha,
209                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
210                                           cl_command_queue* queue, cl_event* event);
211 CLBlastStatusCode PUBLIC_API CLBlastDscal(const size_t n,
212                                           const double alpha,
213                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
214                                           cl_command_queue* queue, cl_event* event);
215 CLBlastStatusCode PUBLIC_API CLBlastCscal(const size_t n,
216                                           const cl_float2 alpha,
217                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
218                                           cl_command_queue* queue, cl_event* event);
219 CLBlastStatusCode PUBLIC_API CLBlastZscal(const size_t n,
220                                           const cl_double2 alpha,
221                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
222                                           cl_command_queue* queue, cl_event* event);
223 CLBlastStatusCode PUBLIC_API CLBlastHscal(const size_t n,
224                                           const cl_half alpha,
225                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
226                                           cl_command_queue* queue, cl_event* event);
227 
228 // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
229 CLBlastStatusCode PUBLIC_API CLBlastScopy(const size_t n,
230                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
231                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
232                                           cl_command_queue* queue, cl_event* event);
233 CLBlastStatusCode PUBLIC_API CLBlastDcopy(const size_t n,
234                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
235                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
236                                           cl_command_queue* queue, cl_event* event);
237 CLBlastStatusCode PUBLIC_API CLBlastCcopy(const size_t n,
238                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
239                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
240                                           cl_command_queue* queue, cl_event* event);
241 CLBlastStatusCode PUBLIC_API CLBlastZcopy(const size_t n,
242                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
243                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
244                                           cl_command_queue* queue, cl_event* event);
245 CLBlastStatusCode PUBLIC_API CLBlastHcopy(const size_t n,
246                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
247                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
248                                           cl_command_queue* queue, cl_event* event);
249 
250 // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
251 CLBlastStatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
252                                           const float alpha,
253                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
254                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
255                                           cl_command_queue* queue, cl_event* event);
256 CLBlastStatusCode PUBLIC_API CLBlastDaxpy(const size_t n,
257                                           const double alpha,
258                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
259                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
260                                           cl_command_queue* queue, cl_event* event);
261 CLBlastStatusCode PUBLIC_API CLBlastCaxpy(const size_t n,
262                                           const cl_float2 alpha,
263                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
264                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
265                                           cl_command_queue* queue, cl_event* event);
266 CLBlastStatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
267                                           const cl_double2 alpha,
268                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
269                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
270                                           cl_command_queue* queue, cl_event* event);
271 CLBlastStatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
272                                           const cl_half alpha,
273                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
274                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
275                                           cl_command_queue* queue, cl_event* event);
276 
277 // Dot product of two vectors: SDOT/DDOT/HDOT
278 CLBlastStatusCode PUBLIC_API CLBlastSdot(const size_t n,
279                                          cl_mem dot_buffer, const size_t dot_offset,
280                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
281                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
282                                          cl_command_queue* queue, cl_event* event);
283 CLBlastStatusCode PUBLIC_API CLBlastDdot(const size_t n,
284                                          cl_mem dot_buffer, const size_t dot_offset,
285                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
286                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
287                                          cl_command_queue* queue, cl_event* event);
288 CLBlastStatusCode PUBLIC_API CLBlastHdot(const size_t n,
289                                          cl_mem dot_buffer, const size_t dot_offset,
290                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
291                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
292                                          cl_command_queue* queue, cl_event* event);
293 
294 // Dot product of two complex vectors: CDOTU/ZDOTU
295 CLBlastStatusCode PUBLIC_API CLBlastCdotu(const size_t n,
296                                           cl_mem dot_buffer, const size_t dot_offset,
297                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
298                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
299                                           cl_command_queue* queue, cl_event* event);
300 CLBlastStatusCode PUBLIC_API CLBlastZdotu(const size_t n,
301                                           cl_mem dot_buffer, const size_t dot_offset,
302                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
303                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
304                                           cl_command_queue* queue, cl_event* event);
305 
306 // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
307 CLBlastStatusCode PUBLIC_API CLBlastCdotc(const size_t n,
308                                           cl_mem dot_buffer, const size_t dot_offset,
309                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
310                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
311                                           cl_command_queue* queue, cl_event* event);
312 CLBlastStatusCode PUBLIC_API CLBlastZdotc(const size_t n,
313                                           cl_mem dot_buffer, const size_t dot_offset,
314                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
315                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
316                                           cl_command_queue* queue, cl_event* event);
317 
318 // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
319 CLBlastStatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
320                                           cl_mem nrm2_buffer, const size_t nrm2_offset,
321                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
322                                           cl_command_queue* queue, cl_event* event);
323 CLBlastStatusCode PUBLIC_API CLBlastDnrm2(const size_t n,
324                                           cl_mem nrm2_buffer, const size_t nrm2_offset,
325                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
326                                           cl_command_queue* queue, cl_event* event);
327 CLBlastStatusCode PUBLIC_API CLBlastScnrm2(const size_t n,
328                                           cl_mem nrm2_buffer, const size_t nrm2_offset,
329                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
330                                           cl_command_queue* queue, cl_event* event);
331 CLBlastStatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
332                                           cl_mem nrm2_buffer, const size_t nrm2_offset,
333                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
334                                           cl_command_queue* queue, cl_event* event);
335 CLBlastStatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
336                                           cl_mem nrm2_buffer, const size_t nrm2_offset,
337                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
338                                           cl_command_queue* queue, cl_event* event);
339 
340 // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
341 CLBlastStatusCode PUBLIC_API CLBlastSasum(const size_t n,
342                                           cl_mem asum_buffer, const size_t asum_offset,
343                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
344                                           cl_command_queue* queue, cl_event* event);
345 CLBlastStatusCode PUBLIC_API CLBlastDasum(const size_t n,
346                                           cl_mem asum_buffer, const size_t asum_offset,
347                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
348                                           cl_command_queue* queue, cl_event* event);
349 CLBlastStatusCode PUBLIC_API CLBlastScasum(const size_t n,
350                                           cl_mem asum_buffer, const size_t asum_offset,
351                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
352                                           cl_command_queue* queue, cl_event* event);
353 CLBlastStatusCode PUBLIC_API CLBlastDzasum(const size_t n,
354                                           cl_mem asum_buffer, const size_t asum_offset,
355                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
356                                           cl_command_queue* queue, cl_event* event);
357 CLBlastStatusCode PUBLIC_API CLBlastHasum(const size_t n,
358                                           cl_mem asum_buffer, const size_t asum_offset,
359                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
360                                           cl_command_queue* queue, cl_event* event);
361 
362 // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
363 CLBlastStatusCode PUBLIC_API CLBlastSsum(const size_t n,
364                                          cl_mem sum_buffer, const size_t sum_offset,
365                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
366                                          cl_command_queue* queue, cl_event* event);
367 CLBlastStatusCode PUBLIC_API CLBlastDsum(const size_t n,
368                                          cl_mem sum_buffer, const size_t sum_offset,
369                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
370                                          cl_command_queue* queue, cl_event* event);
371 CLBlastStatusCode PUBLIC_API CLBlastScsum(const size_t n,
372                                          cl_mem sum_buffer, const size_t sum_offset,
373                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
374                                          cl_command_queue* queue, cl_event* event);
375 CLBlastStatusCode PUBLIC_API CLBlastDzsum(const size_t n,
376                                          cl_mem sum_buffer, const size_t sum_offset,
377                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
378                                          cl_command_queue* queue, cl_event* event);
379 CLBlastStatusCode PUBLIC_API CLBlastHsum(const size_t n,
380                                          cl_mem sum_buffer, const size_t sum_offset,
381                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
382                                          cl_command_queue* queue, cl_event* event);
383 
384 // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
385 CLBlastStatusCode PUBLIC_API CLBlastiSamax(const size_t n,
386                                           cl_mem imax_buffer, const size_t imax_offset,
387                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
388                                           cl_command_queue* queue, cl_event* event);
389 CLBlastStatusCode PUBLIC_API CLBlastiDamax(const size_t n,
390                                           cl_mem imax_buffer, const size_t imax_offset,
391                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
392                                           cl_command_queue* queue, cl_event* event);
393 CLBlastStatusCode PUBLIC_API CLBlastiCamax(const size_t n,
394                                           cl_mem imax_buffer, const size_t imax_offset,
395                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
396                                           cl_command_queue* queue, cl_event* event);
397 CLBlastStatusCode PUBLIC_API CLBlastiZamax(const size_t n,
398                                           cl_mem imax_buffer, const size_t imax_offset,
399                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
400                                           cl_command_queue* queue, cl_event* event);
401 CLBlastStatusCode PUBLIC_API CLBlastiHamax(const size_t n,
402                                           cl_mem imax_buffer, const size_t imax_offset,
403                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
404                                           cl_command_queue* queue, cl_event* event);
405 
406 // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
407 CLBlastStatusCode PUBLIC_API CLBlastiSamin(const size_t n,
408                                           cl_mem imin_buffer, const size_t imin_offset,
409                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
410                                           cl_command_queue* queue, cl_event* event);
411 CLBlastStatusCode PUBLIC_API CLBlastiDamin(const size_t n,
412                                           cl_mem imin_buffer, const size_t imin_offset,
413                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
414                                           cl_command_queue* queue, cl_event* event);
415 CLBlastStatusCode PUBLIC_API CLBlastiCamin(const size_t n,
416                                           cl_mem imin_buffer, const size_t imin_offset,
417                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
418                                           cl_command_queue* queue, cl_event* event);
419 CLBlastStatusCode PUBLIC_API CLBlastiZamin(const size_t n,
420                                           cl_mem imin_buffer, const size_t imin_offset,
421                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
422                                           cl_command_queue* queue, cl_event* event);
423 CLBlastStatusCode PUBLIC_API CLBlastiHamin(const size_t n,
424                                           cl_mem imin_buffer, const size_t imin_offset,
425                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
426                                           cl_command_queue* queue, cl_event* event);
427 
428 // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
429 CLBlastStatusCode PUBLIC_API CLBlastiSmax(const size_t n,
430                                          cl_mem imax_buffer, const size_t imax_offset,
431                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
432                                          cl_command_queue* queue, cl_event* event);
433 CLBlastStatusCode PUBLIC_API CLBlastiDmax(const size_t n,
434                                          cl_mem imax_buffer, const size_t imax_offset,
435                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
436                                          cl_command_queue* queue, cl_event* event);
437 CLBlastStatusCode PUBLIC_API CLBlastiCmax(const size_t n,
438                                          cl_mem imax_buffer, const size_t imax_offset,
439                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
440                                          cl_command_queue* queue, cl_event* event);
441 CLBlastStatusCode PUBLIC_API CLBlastiZmax(const size_t n,
442                                          cl_mem imax_buffer, const size_t imax_offset,
443                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
444                                          cl_command_queue* queue, cl_event* event);
445 CLBlastStatusCode PUBLIC_API CLBlastiHmax(const size_t n,
446                                          cl_mem imax_buffer, const size_t imax_offset,
447                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
448                                          cl_command_queue* queue, cl_event* event);
449 
450 // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
451 CLBlastStatusCode PUBLIC_API CLBlastiSmin(const size_t n,
452                                          cl_mem imin_buffer, const size_t imin_offset,
453                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
454                                          cl_command_queue* queue, cl_event* event);
455 CLBlastStatusCode PUBLIC_API CLBlastiDmin(const size_t n,
456                                          cl_mem imin_buffer, const size_t imin_offset,
457                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
458                                          cl_command_queue* queue, cl_event* event);
459 CLBlastStatusCode PUBLIC_API CLBlastiCmin(const size_t n,
460                                          cl_mem imin_buffer, const size_t imin_offset,
461                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
462                                          cl_command_queue* queue, cl_event* event);
463 CLBlastStatusCode PUBLIC_API CLBlastiZmin(const size_t n,
464                                          cl_mem imin_buffer, const size_t imin_offset,
465                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
466                                          cl_command_queue* queue, cl_event* event);
467 CLBlastStatusCode PUBLIC_API CLBlastiHmin(const size_t n,
468                                          cl_mem imin_buffer, const size_t imin_offset,
469                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
470                                          cl_command_queue* queue, cl_event* event);
471 
472 // =================================================================================================
473 // BLAS level-2 (matrix-vector) routines
474 // =================================================================================================
475 
476 // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
477 CLBlastStatusCode PUBLIC_API CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
478                                           const size_t m, const size_t n,
479                                           const float alpha,
480                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
481                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
482                                           const float beta,
483                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
484                                           cl_command_queue* queue, cl_event* event);
485 CLBlastStatusCode PUBLIC_API CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
486                                           const size_t m, const size_t n,
487                                           const double alpha,
488                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
489                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
490                                           const double beta,
491                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
492                                           cl_command_queue* queue, cl_event* event);
493 CLBlastStatusCode PUBLIC_API CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
494                                           const size_t m, const size_t n,
495                                           const cl_float2 alpha,
496                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
497                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
498                                           const cl_float2 beta,
499                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
500                                           cl_command_queue* queue, cl_event* event);
501 CLBlastStatusCode PUBLIC_API CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
502                                           const size_t m, const size_t n,
503                                           const cl_double2 alpha,
504                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
505                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
506                                           const cl_double2 beta,
507                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
508                                           cl_command_queue* queue, cl_event* event);
509 CLBlastStatusCode PUBLIC_API CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
510                                           const size_t m, const size_t n,
511                                           const cl_half alpha,
512                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
513                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
514                                           const cl_half beta,
515                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
516                                           cl_command_queue* queue, cl_event* event);
517 
518 // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
519 CLBlastStatusCode PUBLIC_API CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
520                                           const size_t m, const size_t n, const size_t kl, const size_t ku,
521                                           const float alpha,
522                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
523                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
524                                           const float beta,
525                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
526                                           cl_command_queue* queue, cl_event* event);
527 CLBlastStatusCode PUBLIC_API CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
528                                           const size_t m, const size_t n, const size_t kl, const size_t ku,
529                                           const double alpha,
530                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
531                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
532                                           const double beta,
533                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
534                                           cl_command_queue* queue, cl_event* event);
535 CLBlastStatusCode PUBLIC_API CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
536                                           const size_t m, const size_t n, const size_t kl, const size_t ku,
537                                           const cl_float2 alpha,
538                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
539                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
540                                           const cl_float2 beta,
541                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
542                                           cl_command_queue* queue, cl_event* event);
543 CLBlastStatusCode PUBLIC_API CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
544                                           const size_t m, const size_t n, const size_t kl, const size_t ku,
545                                           const cl_double2 alpha,
546                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
547                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
548                                           const cl_double2 beta,
549                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
550                                           cl_command_queue* queue, cl_event* event);
551 CLBlastStatusCode PUBLIC_API CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
552                                           const size_t m, const size_t n, const size_t kl, const size_t ku,
553                                           const cl_half alpha,
554                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
555                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
556                                           const cl_half beta,
557                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
558                                           cl_command_queue* queue, cl_event* event);
559 
560 // Hermitian matrix-vector multiplication: CHEMV/ZHEMV
561 CLBlastStatusCode PUBLIC_API CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
562                                           const size_t n,
563                                           const cl_float2 alpha,
564                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
565                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
566                                           const cl_float2 beta,
567                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
568                                           cl_command_queue* queue, cl_event* event);
569 CLBlastStatusCode PUBLIC_API CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
570                                           const size_t n,
571                                           const cl_double2 alpha,
572                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
573                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
574                                           const cl_double2 beta,
575                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
576                                           cl_command_queue* queue, cl_event* event);
577 
578 // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
579 CLBlastStatusCode PUBLIC_API CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
580                                           const size_t n, const size_t k,
581                                           const cl_float2 alpha,
582                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
583                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
584                                           const cl_float2 beta,
585                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
586                                           cl_command_queue* queue, cl_event* event);
587 CLBlastStatusCode PUBLIC_API CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
588                                           const size_t n, const size_t k,
589                                           const cl_double2 alpha,
590                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
591                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
592                                           const cl_double2 beta,
593                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
594                                           cl_command_queue* queue, cl_event* event);
595 
596 // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
597 CLBlastStatusCode PUBLIC_API CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
598                                           const size_t n,
599                                           const cl_float2 alpha,
600                                           const cl_mem ap_buffer, const size_t ap_offset,
601                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
602                                           const cl_float2 beta,
603                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
604                                           cl_command_queue* queue, cl_event* event);
605 CLBlastStatusCode PUBLIC_API CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
606                                           const size_t n,
607                                           const cl_double2 alpha,
608                                           const cl_mem ap_buffer, const size_t ap_offset,
609                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
610                                           const cl_double2 beta,
611                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
612                                           cl_command_queue* queue, cl_event* event);
613 
614 // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
615 CLBlastStatusCode PUBLIC_API CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
616                                           const size_t n,
617                                           const float alpha,
618                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
619                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
620                                           const float beta,
621                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
622                                           cl_command_queue* queue, cl_event* event);
623 CLBlastStatusCode PUBLIC_API CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
624                                           const size_t n,
625                                           const double alpha,
626                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
627                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
628                                           const double beta,
629                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
630                                           cl_command_queue* queue, cl_event* event);
631 CLBlastStatusCode PUBLIC_API CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
632                                           const size_t n,
633                                           const cl_half alpha,
634                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
635                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
636                                           const cl_half beta,
637                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
638                                           cl_command_queue* queue, cl_event* event);
639 
640 // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
641 CLBlastStatusCode PUBLIC_API CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
642                                           const size_t n, const size_t k,
643                                           const float alpha,
644                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
645                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
646                                           const float beta,
647                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
648                                           cl_command_queue* queue, cl_event* event);
649 CLBlastStatusCode PUBLIC_API CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
650                                           const size_t n, const size_t k,
651                                           const double alpha,
652                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
653                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
654                                           const double beta,
655                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
656                                           cl_command_queue* queue, cl_event* event);
657 CLBlastStatusCode PUBLIC_API CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
658                                           const size_t n, const size_t k,
659                                           const cl_half alpha,
660                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
661                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
662                                           const cl_half beta,
663                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
664                                           cl_command_queue* queue, cl_event* event);
665 
666 // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
667 CLBlastStatusCode PUBLIC_API CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
668                                           const size_t n,
669                                           const float alpha,
670                                           const cl_mem ap_buffer, const size_t ap_offset,
671                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
672                                           const float beta,
673                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
674                                           cl_command_queue* queue, cl_event* event);
675 CLBlastStatusCode PUBLIC_API CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
676                                           const size_t n,
677                                           const double alpha,
678                                           const cl_mem ap_buffer, const size_t ap_offset,
679                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
680                                           const double beta,
681                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
682                                           cl_command_queue* queue, cl_event* event);
683 CLBlastStatusCode PUBLIC_API CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
684                                           const size_t n,
685                                           const cl_half alpha,
686                                           const cl_mem ap_buffer, const size_t ap_offset,
687                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
688                                           const cl_half beta,
689                                           cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
690                                           cl_command_queue* queue, cl_event* event);
691 
692 // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
693 CLBlastStatusCode PUBLIC_API CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
694                                           const size_t n,
695                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
696                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
697                                           cl_command_queue* queue, cl_event* event);
698 CLBlastStatusCode PUBLIC_API CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
699                                           const size_t n,
700                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
701                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
702                                           cl_command_queue* queue, cl_event* event);
703 CLBlastStatusCode PUBLIC_API CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
704                                           const size_t n,
705                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
706                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
707                                           cl_command_queue* queue, cl_event* event);
708 CLBlastStatusCode PUBLIC_API CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
709                                           const size_t n,
710                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
711                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
712                                           cl_command_queue* queue, cl_event* event);
713 CLBlastStatusCode PUBLIC_API CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
714                                           const size_t n,
715                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
716                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
717                                           cl_command_queue* queue, cl_event* event);
718 
719 // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
720 CLBlastStatusCode PUBLIC_API CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
721                                           const size_t n, const size_t k,
722                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
723                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
724                                           cl_command_queue* queue, cl_event* event);
725 CLBlastStatusCode PUBLIC_API CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
726                                           const size_t n, const size_t k,
727                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
728                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
729                                           cl_command_queue* queue, cl_event* event);
730 CLBlastStatusCode PUBLIC_API CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
731                                           const size_t n, const size_t k,
732                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
733                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
734                                           cl_command_queue* queue, cl_event* event);
735 CLBlastStatusCode PUBLIC_API CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
736                                           const size_t n, const size_t k,
737                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
738                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
739                                           cl_command_queue* queue, cl_event* event);
740 CLBlastStatusCode PUBLIC_API CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
741                                           const size_t n, const size_t k,
742                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
743                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
744                                           cl_command_queue* queue, cl_event* event);
745 
746 // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
747 CLBlastStatusCode PUBLIC_API CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
748                                           const size_t n,
749                                           const cl_mem ap_buffer, const size_t ap_offset,
750                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
751                                           cl_command_queue* queue, cl_event* event);
752 CLBlastStatusCode PUBLIC_API CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
753                                           const size_t n,
754                                           const cl_mem ap_buffer, const size_t ap_offset,
755                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
756                                           cl_command_queue* queue, cl_event* event);
757 CLBlastStatusCode PUBLIC_API CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
758                                           const size_t n,
759                                           const cl_mem ap_buffer, const size_t ap_offset,
760                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
761                                           cl_command_queue* queue, cl_event* event);
762 CLBlastStatusCode PUBLIC_API CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
763                                           const size_t n,
764                                           const cl_mem ap_buffer, const size_t ap_offset,
765                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
766                                           cl_command_queue* queue, cl_event* event);
767 CLBlastStatusCode PUBLIC_API CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
768                                           const size_t n,
769                                           const cl_mem ap_buffer, const size_t ap_offset,
770                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
771                                           cl_command_queue* queue, cl_event* event);
772 
773 // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
774 CLBlastStatusCode PUBLIC_API CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
775                                           const size_t n,
776                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
777                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
778                                           cl_command_queue* queue, cl_event* event);
779 CLBlastStatusCode PUBLIC_API CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
780                                           const size_t n,
781                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
782                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
783                                           cl_command_queue* queue, cl_event* event);
784 CLBlastStatusCode PUBLIC_API CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
785                                           const size_t n,
786                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
787                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
788                                           cl_command_queue* queue, cl_event* event);
789 CLBlastStatusCode PUBLIC_API CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
790                                           const size_t n,
791                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
792                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
793                                           cl_command_queue* queue, cl_event* event);
794 
795 // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
796 CLBlastStatusCode PUBLIC_API CLBlastStbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
797                                           const size_t n, const size_t k,
798                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
799                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
800                                           cl_command_queue* queue, cl_event* event);
801 CLBlastStatusCode PUBLIC_API CLBlastDtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
802                                           const size_t n, const size_t k,
803                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
804                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
805                                           cl_command_queue* queue, cl_event* event);
806 CLBlastStatusCode PUBLIC_API CLBlastCtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
807                                           const size_t n, const size_t k,
808                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
809                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
810                                           cl_command_queue* queue, cl_event* event);
811 CLBlastStatusCode PUBLIC_API CLBlastZtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
812                                           const size_t n, const size_t k,
813                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
814                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
815                                           cl_command_queue* queue, cl_event* event);
816 
817 // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
818 CLBlastStatusCode PUBLIC_API CLBlastStpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
819                                           const size_t n,
820                                           const cl_mem ap_buffer, const size_t ap_offset,
821                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
822                                           cl_command_queue* queue, cl_event* event);
823 CLBlastStatusCode PUBLIC_API CLBlastDtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
824                                           const size_t n,
825                                           const cl_mem ap_buffer, const size_t ap_offset,
826                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
827                                           cl_command_queue* queue, cl_event* event);
828 CLBlastStatusCode PUBLIC_API CLBlastCtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
829                                           const size_t n,
830                                           const cl_mem ap_buffer, const size_t ap_offset,
831                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
832                                           cl_command_queue* queue, cl_event* event);
833 CLBlastStatusCode PUBLIC_API CLBlastZtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
834                                           const size_t n,
835                                           const cl_mem ap_buffer, const size_t ap_offset,
836                                           cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
837                                           cl_command_queue* queue, cl_event* event);
838 
839 // General rank-1 matrix update: SGER/DGER/HGER
840 CLBlastStatusCode PUBLIC_API CLBlastSger(const CLBlastLayout layout,
841                                          const size_t m, const size_t n,
842                                          const float alpha,
843                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
844                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
845                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
846                                          cl_command_queue* queue, cl_event* event);
847 CLBlastStatusCode PUBLIC_API CLBlastDger(const CLBlastLayout layout,
848                                          const size_t m, const size_t n,
849                                          const double alpha,
850                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
851                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
852                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
853                                          cl_command_queue* queue, cl_event* event);
854 CLBlastStatusCode PUBLIC_API CLBlastHger(const CLBlastLayout layout,
855                                          const size_t m, const size_t n,
856                                          const cl_half alpha,
857                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
858                                          const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
859                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
860                                          cl_command_queue* queue, cl_event* event);
861 
862 // General rank-1 complex matrix update: CGERU/ZGERU
863 CLBlastStatusCode PUBLIC_API CLBlastCgeru(const CLBlastLayout layout,
864                                           const size_t m, const size_t n,
865                                           const cl_float2 alpha,
866                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
867                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
868                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
869                                           cl_command_queue* queue, cl_event* event);
870 CLBlastStatusCode PUBLIC_API CLBlastZgeru(const CLBlastLayout layout,
871                                           const size_t m, const size_t n,
872                                           const cl_double2 alpha,
873                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
874                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
875                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
876                                           cl_command_queue* queue, cl_event* event);
877 
878 // General rank-1 complex conjugated matrix update: CGERC/ZGERC
879 CLBlastStatusCode PUBLIC_API CLBlastCgerc(const CLBlastLayout layout,
880                                           const size_t m, const size_t n,
881                                           const cl_float2 alpha,
882                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
883                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
884                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
885                                           cl_command_queue* queue, cl_event* event);
886 CLBlastStatusCode PUBLIC_API CLBlastZgerc(const CLBlastLayout layout,
887                                           const size_t m, const size_t n,
888                                           const cl_double2 alpha,
889                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
890                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
891                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
892                                           cl_command_queue* queue, cl_event* event);
893 
894 // Hermitian rank-1 matrix update: CHER/ZHER
895 CLBlastStatusCode PUBLIC_API CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
896                                          const size_t n,
897                                          const float alpha,
898                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
899                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
900                                          cl_command_queue* queue, cl_event* event);
901 CLBlastStatusCode PUBLIC_API CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
902                                          const size_t n,
903                                          const double alpha,
904                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
905                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
906                                          cl_command_queue* queue, cl_event* event);
907 
908 // Hermitian packed rank-1 matrix update: CHPR/ZHPR
909 CLBlastStatusCode PUBLIC_API CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
910                                          const size_t n,
911                                          const float alpha,
912                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
913                                          cl_mem ap_buffer, const size_t ap_offset,
914                                          cl_command_queue* queue, cl_event* event);
915 CLBlastStatusCode PUBLIC_API CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
916                                          const size_t n,
917                                          const double alpha,
918                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
919                                          cl_mem ap_buffer, const size_t ap_offset,
920                                          cl_command_queue* queue, cl_event* event);
921 
922 // Hermitian rank-2 matrix update: CHER2/ZHER2
923 CLBlastStatusCode PUBLIC_API CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
924                                           const size_t n,
925                                           const cl_float2 alpha,
926                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
927                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
928                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
929                                           cl_command_queue* queue, cl_event* event);
930 CLBlastStatusCode PUBLIC_API CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
931                                           const size_t n,
932                                           const cl_double2 alpha,
933                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
934                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
935                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
936                                           cl_command_queue* queue, cl_event* event);
937 
938 // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
939 CLBlastStatusCode PUBLIC_API CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
940                                           const size_t n,
941                                           const cl_float2 alpha,
942                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
943                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
944                                           cl_mem ap_buffer, const size_t ap_offset,
945                                           cl_command_queue* queue, cl_event* event);
946 CLBlastStatusCode PUBLIC_API CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
947                                           const size_t n,
948                                           const cl_double2 alpha,
949                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
950                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
951                                           cl_mem ap_buffer, const size_t ap_offset,
952                                           cl_command_queue* queue, cl_event* event);
953 
954 // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
955 CLBlastStatusCode PUBLIC_API CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
956                                          const size_t n,
957                                          const float alpha,
958                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
959                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
960                                          cl_command_queue* queue, cl_event* event);
961 CLBlastStatusCode PUBLIC_API CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
962                                          const size_t n,
963                                          const double alpha,
964                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
965                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
966                                          cl_command_queue* queue, cl_event* event);
967 CLBlastStatusCode PUBLIC_API CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
968                                          const size_t n,
969                                          const cl_half alpha,
970                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
971                                          cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
972                                          cl_command_queue* queue, cl_event* event);
973 
974 // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
975 CLBlastStatusCode PUBLIC_API CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
976                                          const size_t n,
977                                          const float alpha,
978                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
979                                          cl_mem ap_buffer, const size_t ap_offset,
980                                          cl_command_queue* queue, cl_event* event);
981 CLBlastStatusCode PUBLIC_API CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
982                                          const size_t n,
983                                          const double alpha,
984                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
985                                          cl_mem ap_buffer, const size_t ap_offset,
986                                          cl_command_queue* queue, cl_event* event);
987 CLBlastStatusCode PUBLIC_API CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
988                                          const size_t n,
989                                          const cl_half alpha,
990                                          const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
991                                          cl_mem ap_buffer, const size_t ap_offset,
992                                          cl_command_queue* queue, cl_event* event);
993 
994 // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
995 CLBlastStatusCode PUBLIC_API CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
996                                           const size_t n,
997                                           const float alpha,
998                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
999                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1000                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1001                                           cl_command_queue* queue, cl_event* event);
1002 CLBlastStatusCode PUBLIC_API CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
1003                                           const size_t n,
1004                                           const double alpha,
1005                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
1006                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1007                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1008                                           cl_command_queue* queue, cl_event* event);
1009 CLBlastStatusCode PUBLIC_API CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
1010                                           const size_t n,
1011                                           const cl_half alpha,
1012                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
1013                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1014                                           cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1015                                           cl_command_queue* queue, cl_event* event);
1016 
1017 // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
1018 CLBlastStatusCode PUBLIC_API CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
1019                                           const size_t n,
1020                                           const float alpha,
1021                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
1022                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1023                                           cl_mem ap_buffer, const size_t ap_offset,
1024                                           cl_command_queue* queue, cl_event* event);
1025 CLBlastStatusCode PUBLIC_API CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
1026                                           const size_t n,
1027                                           const double alpha,
1028                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
1029                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1030                                           cl_mem ap_buffer, const size_t ap_offset,
1031                                           cl_command_queue* queue, cl_event* event);
1032 CLBlastStatusCode PUBLIC_API CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
1033                                           const size_t n,
1034                                           const cl_half alpha,
1035                                           const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
1036                                           const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
1037                                           cl_mem ap_buffer, const size_t ap_offset,
1038                                           cl_command_queue* queue, cl_event* event);
1039 
1040 // =================================================================================================
1041 // BLAS level-3 (matrix-matrix) routines
1042 // =================================================================================================
1043 
1044 // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
1045 CLBlastStatusCode PUBLIC_API CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1046                                           const size_t m, const size_t n, const size_t k,
1047                                           const float alpha,
1048                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1049                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1050                                           const float beta,
1051                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1052                                           cl_command_queue* queue, cl_event* event);
1053 CLBlastStatusCode PUBLIC_API CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1054                                           const size_t m, const size_t n, const size_t k,
1055                                           const double alpha,
1056                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1057                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1058                                           const double beta,
1059                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1060                                           cl_command_queue* queue, cl_event* event);
1061 CLBlastStatusCode PUBLIC_API CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1062                                           const size_t m, const size_t n, const size_t k,
1063                                           const cl_float2 alpha,
1064                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1065                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1066                                           const cl_float2 beta,
1067                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1068                                           cl_command_queue* queue, cl_event* event);
1069 CLBlastStatusCode PUBLIC_API CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1070                                           const size_t m, const size_t n, const size_t k,
1071                                           const cl_double2 alpha,
1072                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1073                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1074                                           const cl_double2 beta,
1075                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1076                                           cl_command_queue* queue, cl_event* event);
1077 CLBlastStatusCode PUBLIC_API CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1078                                           const size_t m, const size_t n, const size_t k,
1079                                           const cl_half alpha,
1080                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1081                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1082                                           const cl_half beta,
1083                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1084                                           cl_command_queue* queue, cl_event* event);
1085 
1086 // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
1087 CLBlastStatusCode PUBLIC_API CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1088                                           const size_t m, const size_t n,
1089                                           const float alpha,
1090                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1091                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1092                                           const float beta,
1093                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1094                                           cl_command_queue* queue, cl_event* event);
1095 CLBlastStatusCode PUBLIC_API CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1096                                           const size_t m, const size_t n,
1097                                           const double alpha,
1098                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1099                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1100                                           const double beta,
1101                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1102                                           cl_command_queue* queue, cl_event* event);
1103 CLBlastStatusCode PUBLIC_API CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1104                                           const size_t m, const size_t n,
1105                                           const cl_float2 alpha,
1106                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1107                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1108                                           const cl_float2 beta,
1109                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1110                                           cl_command_queue* queue, cl_event* event);
1111 CLBlastStatusCode PUBLIC_API CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1112                                           const size_t m, const size_t n,
1113                                           const cl_double2 alpha,
1114                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1115                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1116                                           const cl_double2 beta,
1117                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1118                                           cl_command_queue* queue, cl_event* event);
1119 CLBlastStatusCode PUBLIC_API CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1120                                           const size_t m, const size_t n,
1121                                           const cl_half alpha,
1122                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1123                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1124                                           const cl_half beta,
1125                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1126                                           cl_command_queue* queue, cl_event* event);
1127 
1128 // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
1129 CLBlastStatusCode PUBLIC_API CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1130                                           const size_t m, const size_t n,
1131                                           const cl_float2 alpha,
1132                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1133                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1134                                           const cl_float2 beta,
1135                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1136                                           cl_command_queue* queue, cl_event* event);
1137 CLBlastStatusCode PUBLIC_API CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
1138                                           const size_t m, const size_t n,
1139                                           const cl_double2 alpha,
1140                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1141                                           const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1142                                           const cl_double2 beta,
1143                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1144                                           cl_command_queue* queue, cl_event* event);
1145 
1146 // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
1147 CLBlastStatusCode PUBLIC_API CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1148                                           const size_t n, const size_t k,
1149                                           const float alpha,
1150                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1151                                           const float beta,
1152                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1153                                           cl_command_queue* queue, cl_event* event);
1154 CLBlastStatusCode PUBLIC_API CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1155                                           const size_t n, const size_t k,
1156                                           const double alpha,
1157                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1158                                           const double beta,
1159                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1160                                           cl_command_queue* queue, cl_event* event);
1161 CLBlastStatusCode PUBLIC_API CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1162                                           const size_t n, const size_t k,
1163                                           const cl_float2 alpha,
1164                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1165                                           const cl_float2 beta,
1166                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1167                                           cl_command_queue* queue, cl_event* event);
1168 CLBlastStatusCode PUBLIC_API CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1169                                           const size_t n, const size_t k,
1170                                           const cl_double2 alpha,
1171                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1172                                           const cl_double2 beta,
1173                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1174                                           cl_command_queue* queue, cl_event* event);
1175 CLBlastStatusCode PUBLIC_API CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1176                                           const size_t n, const size_t k,
1177                                           const cl_half alpha,
1178                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1179                                           const cl_half beta,
1180                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1181                                           cl_command_queue* queue, cl_event* event);
1182 
1183 // Rank-K update of a hermitian matrix: CHERK/ZHERK
1184 CLBlastStatusCode PUBLIC_API CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1185                                           const size_t n, const size_t k,
1186                                           const float alpha,
1187                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1188                                           const float beta,
1189                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1190                                           cl_command_queue* queue, cl_event* event);
1191 CLBlastStatusCode PUBLIC_API CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
1192                                           const size_t n, const size_t k,
1193                                           const double alpha,
1194                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1195                                           const double beta,
1196                                           cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1197                                           cl_command_queue* queue, cl_event* event);
1198 
1199 // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
1200 CLBlastStatusCode PUBLIC_API CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1201                                            const size_t n, const size_t k,
1202                                            const float alpha,
1203                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1204                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1205                                            const float beta,
1206                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1207                                            cl_command_queue* queue, cl_event* event);
1208 CLBlastStatusCode PUBLIC_API CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1209                                            const size_t n, const size_t k,
1210                                            const double alpha,
1211                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1212                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1213                                            const double beta,
1214                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1215                                            cl_command_queue* queue, cl_event* event);
1216 CLBlastStatusCode PUBLIC_API CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1217                                            const size_t n, const size_t k,
1218                                            const cl_float2 alpha,
1219                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1220                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1221                                            const cl_float2 beta,
1222                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1223                                            cl_command_queue* queue, cl_event* event);
1224 CLBlastStatusCode PUBLIC_API CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1225                                            const size_t n, const size_t k,
1226                                            const cl_double2 alpha,
1227                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1228                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1229                                            const cl_double2 beta,
1230                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1231                                            cl_command_queue* queue, cl_event* event);
1232 CLBlastStatusCode PUBLIC_API CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1233                                            const size_t n, const size_t k,
1234                                            const cl_half alpha,
1235                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1236                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1237                                            const cl_half beta,
1238                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1239                                            cl_command_queue* queue, cl_event* event);
1240 
1241 // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
1242 CLBlastStatusCode PUBLIC_API CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1243                                            const size_t n, const size_t k,
1244                                            const cl_float2 alpha,
1245                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1246                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1247                                            const float beta,
1248                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1249                                            cl_command_queue* queue, cl_event* event);
1250 CLBlastStatusCode PUBLIC_API CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
1251                                            const size_t n, const size_t k,
1252                                            const cl_double2 alpha,
1253                                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1254                                            const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1255                                            const double beta,
1256                                            cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
1257                                            cl_command_queue* queue, cl_event* event);
1258 
1259 // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
1260 CLBlastStatusCode PUBLIC_API CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1261                                           const size_t m, const size_t n,
1262                                           const float alpha,
1263                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1264                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1265                                           cl_command_queue* queue, cl_event* event);
1266 CLBlastStatusCode PUBLIC_API CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1267                                           const size_t m, const size_t n,
1268                                           const double alpha,
1269                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1270                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1271                                           cl_command_queue* queue, cl_event* event);
1272 CLBlastStatusCode PUBLIC_API CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1273                                           const size_t m, const size_t n,
1274                                           const cl_float2 alpha,
1275                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1276                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1277                                           cl_command_queue* queue, cl_event* event);
1278 CLBlastStatusCode PUBLIC_API CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1279                                           const size_t m, const size_t n,
1280                                           const cl_double2 alpha,
1281                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1282                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1283                                           cl_command_queue* queue, cl_event* event);
1284 CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1285                                           const size_t m, const size_t n,
1286                                           const cl_half alpha,
1287                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1288                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1289                                           cl_command_queue* queue, cl_event* event);
1290 
1291 // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
1292 CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1293                                           const size_t m, const size_t n,
1294                                           const float alpha,
1295                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1296                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1297                                           cl_command_queue* queue, cl_event* event);
1298 CLBlastStatusCode PUBLIC_API CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1299                                           const size_t m, const size_t n,
1300                                           const double alpha,
1301                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1302                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1303                                           cl_command_queue* queue, cl_event* event);
1304 CLBlastStatusCode PUBLIC_API CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1305                                           const size_t m, const size_t n,
1306                                           const cl_float2 alpha,
1307                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1308                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1309                                           cl_command_queue* queue, cl_event* event);
1310 CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
1311                                           const size_t m, const size_t n,
1312                                           const cl_double2 alpha,
1313                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1314                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1315                                           cl_command_queue* queue, cl_event* event);
1316 
1317 // =================================================================================================
1318 // Extra non-BLAS routines (level-X)
1319 // =================================================================================================
1320 
1321 // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
1322 CLBlastStatusCode PUBLIC_API CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
1323                                               const size_t m, const size_t n,
1324                                               const float alpha,
1325                                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1326                                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1327                                               cl_command_queue* queue, cl_event* event);
1328 CLBlastStatusCode PUBLIC_API CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
1329                                               const size_t m, const size_t n,
1330                                               const double alpha,
1331                                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1332                                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1333                                               cl_command_queue* queue, cl_event* event);
1334 CLBlastStatusCode PUBLIC_API CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
1335                                               const size_t m, const size_t n,
1336                                               const cl_float2 alpha,
1337                                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1338                                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1339                                               cl_command_queue* queue, cl_event* event);
1340 CLBlastStatusCode PUBLIC_API CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
1341                                               const size_t m, const size_t n,
1342                                               const cl_double2 alpha,
1343                                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1344                                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1345                                               cl_command_queue* queue, cl_event* event);
1346 CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
1347                                               const size_t m, const size_t n,
1348                                               const cl_half alpha,
1349                                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
1350                                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
1351                                               cl_command_queue* queue, cl_event* event);
1352 
1353 // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
1354 CLBlastStatusCode PUBLIC_API CLBlastSim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
1355                                             const cl_mem im_buffer, const size_t im_offset,
1356                                             cl_mem col_buffer, const size_t col_offset,
1357                                             cl_command_queue* queue, cl_event* event);
1358 CLBlastStatusCode PUBLIC_API CLBlastDim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
1359                                             const cl_mem im_buffer, const size_t im_offset,
1360                                             cl_mem col_buffer, const size_t col_offset,
1361                                             cl_command_queue* queue, cl_event* event);
1362 CLBlastStatusCode PUBLIC_API CLBlastCim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
1363                                             const cl_mem im_buffer, const size_t im_offset,
1364                                             cl_mem col_buffer, const size_t col_offset,
1365                                             cl_command_queue* queue, cl_event* event);
1366 CLBlastStatusCode PUBLIC_API CLBlastZim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
1367                                             const cl_mem im_buffer, const size_t im_offset,
1368                                             cl_mem col_buffer, const size_t col_offset,
1369                                             cl_command_queue* queue, cl_event* event);
1370 CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
1371                                             const cl_mem im_buffer, const size_t im_offset,
1372                                             cl_mem col_buffer, const size_t col_offset,
1373                                             cl_command_queue* queue, cl_event* event);
1374 
1375 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
1376 CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
1377                                                  const float *alphas,
1378                                                  const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
1379                                                  cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
1380                                                  const size_t batch_count,
1381                                                  cl_command_queue* queue, cl_event* event);
1382 CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
1383                                                  const double *alphas,
1384                                                  const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
1385                                                  cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
1386                                                  const size_t batch_count,
1387                                                  cl_command_queue* queue, cl_event* event);
1388 CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
1389                                                  const cl_float2 *alphas,
1390                                                  const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
1391                                                  cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
1392                                                  const size_t batch_count,
1393                                                  cl_command_queue* queue, cl_event* event);
1394 CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
1395                                                  const cl_double2 *alphas,
1396                                                  const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
1397                                                  cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
1398                                                  const size_t batch_count,
1399                                                  cl_command_queue* queue, cl_event* event);
1400 CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
1401                                                  const cl_half *alphas,
1402                                                  const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
1403                                                  cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
1404                                                  const size_t batch_count,
1405                                                  cl_command_queue* queue, cl_event* event);
1406 
1407 // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
1408 CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1409                                                  const size_t m, const size_t n, const size_t k,
1410                                                  const float *alphas,
1411                                                  const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
1412                                                  const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
1413                                                  const float *betas,
1414                                                  cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
1415                                                  const size_t batch_count,
1416                                                  cl_command_queue* queue, cl_event* event);
1417 CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1418                                                  const size_t m, const size_t n, const size_t k,
1419                                                  const double *alphas,
1420                                                  const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
1421                                                  const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
1422                                                  const double *betas,
1423                                                  cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
1424                                                  const size_t batch_count,
1425                                                  cl_command_queue* queue, cl_event* event);
1426 CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1427                                                  const size_t m, const size_t n, const size_t k,
1428                                                  const cl_float2 *alphas,
1429                                                  const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
1430                                                  const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
1431                                                  const cl_float2 *betas,
1432                                                  cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
1433                                                  const size_t batch_count,
1434                                                  cl_command_queue* queue, cl_event* event);
1435 CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1436                                                  const size_t m, const size_t n, const size_t k,
1437                                                  const cl_double2 *alphas,
1438                                                  const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
1439                                                  const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
1440                                                  const cl_double2 *betas,
1441                                                  cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
1442                                                  const size_t batch_count,
1443                                                  cl_command_queue* queue, cl_event* event);
1444 CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
1445                                                  const size_t m, const size_t n, const size_t k,
1446                                                  const cl_half *alphas,
1447                                                  const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
1448                                                  const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
1449                                                  const cl_half *betas,
1450                                                  cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
1451                                                  const size_t batch_count,
1452                                                  cl_command_queue* queue, cl_event* event);
1453 
1454 // =================================================================================================
1455 
1456 // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
1457 // for the same device. This cache can be cleared to free up system memory or in case of debugging.
1458 CLBlastStatusCode PUBLIC_API CLBlastClearCache();
1459 
1460 // The cache can also be pre-initialized for a specific device with all possible CLBLast kernels.
1461 // Further CLBlast routine calls will then run at maximum speed.
1462 CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);
1463 
1464 // =================================================================================================
1465 
1466 // Overrides tuning parameters for a specific device-precision-kernel combination. The next time
1467 // the target routine is called it will re-compile and use the new parameters from then on.
1468 CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
1469                                                        const CLBlastPrecision precision, const size_t num_parameters,
1470                                                        const char** parameters_names, const size_t* parameters_values);
1471 
1472 // =================================================================================================
1473 
1474 #ifdef __cplusplus
1475 } // extern "C"
1476 #endif
1477 
1478 // CLBLAST_CLBLAST_C_H_
1479 #endif
1480