/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/


#include <string.h>
#include <stdlib.h>

#include <clBLAS.h>
#include <clkern.h>
#include <cltypes.h>
#include <stdio.h>
#include <ctype.h>

#include "clblas-internal.h"

#if defined(DUMP_CLBLAS_KERNELS) && !defined(KEEP_CLBLAS_KERNEL_SOURCES)
#define KEEP_CLBLAS_KERNEL_SOURCES
#endif

int clblasInitialized = 0;
CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER];
struct KernelCache *clblasKernelCache = NULL;

enum {
    BUILD_LOG_SIZE = 65536
};

static __inline void
storeErrorCode(cl_int *error, cl_int code)
{
    if (error != NULL) {
        *error = code;
    }
}

#ifndef PRINT_BUILD_ERRORS
    #define PRINT_BUILD_ERRORS
#endif

#ifdef PRINT_BUILD_ERRORS

static char
*allocBuildLog(void)
{
    char *log;

    log = malloc(BUILD_LOG_SIZE);
    if (log) {
        log[0] = '\0';
    }

    return log;
}

static void
freeBuildLog(char *buildLog)
{
    free(buildLog);
}

static void
printBuildError(
    cl_int error,
    cl_device_id device,
    SolverKgen kgen,
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const CLBLASKernExtra *kextra,
    const char *source,
    const char *buildLog)
{
    char name[128];
    char dimStr[1024];
    char pgranStr[1024];
    char *p;
    MemoryPattern *mempat = NULL;
    unsigned int i, j;
    const char *s;

    name[0] = '\0';
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);

    // lookup memory pattern
    s = NULL;
    for (i = 0; i < BLAS_FUNCTIONS_NUMBER; i++) {
        for (j = 0; j < clblasSolvers[i].nrPatterns; j++) {
            mempat = &clblasSolvers[i].memPatterns[j];
            if (kgen == mempat->sops->genKernel) {
                s = kernelTypeString(kextra->kernType);
                break;
            }
        }
        if (s != NULL) {
            break;
        }
    }

    // sprintf subproblem dimensions
    p = dimStr;
    for (i = 0; i < mempat->nrLevels; i++) {
        p = sprintfGranulation(p, dims, i);
        strcat(p, "; ");
        p += strlen(p);
    }

    // sprintf data parallelism granularity
    sprintf(pgranStr, "pgran->wgDim = %d, pgran->wgSize[0] = %u, "
            "pgran->wgSize[1] = %u, pgran->wfSize = %u",
            pgran->wgDim, pgran->wgSize[0], pgran->wgSize[1],
            pgran->wfSize);

    fprintf(stderr, "\n========================================================\n\n");
    fprintf(stderr, "AN INTERNAL KERNEL BUILD ERROR OCCURRED!\n");
    fprintf(stderr, "device name = %s\n", name);
    fprintf(stderr, "error = %d\n", error);
    fprintf(stderr, "memory pattern = %s, %s kernel generator\n", mempat->name, s);
    fprintf(stderr, "Subproblem dimensions: %s\n", dimStr);
    fprintf(stderr, "Parallelism granularity: %s\n", pgranStr);
    fprintf(stderr, "Kernel extra flags: %u\n", kextra->flags);
    fprintf(stderr, "Source:\n\n%s\n\n", source);
    fprintf(stderr, "--------------------------------------------------------\n\n");
    if (buildLog) {
        fprintf(stderr, "Build log:\n\n%s\n", buildLog);
    }
    else {
        fprintf(stderr, "Build log is unavailable\n");
    }
    fprintf(stderr, "========================================================\n\n");
}

#else               /* PRINT_BUILD_ERRORS */

static __inline char*
allocBuildLog(void)
{
    /* stub, do nothing */
    return NULL;
}

#define freeBuildLog(log)                       /* stub, do nothing */
#define printBuildError(error, device, kgen, \
    dims, pgran, kextra, source, buildLog)      /* stub, do nothing */

#endif              /* !PRINT_BUILD_ERRORS */

static void
extraDtor(struct Kernel *kernel)
{
    if (kernel->extra != NULL) {
        free(kernel->extra);
        kernel->extra = NULL;
    }
}

static char
*sprintfDim(
    char *buf,
    size_t dim,
    const char *dimName,
    int level,
    bool first)
{
    if (!first) {
        strcat(buf, ", ");
        buf += strlen(buf);
    }
    if (dim == SUBDIM_UNUSED) {
        sprintf(buf, "dims[%d].%s = SUBDIM_UNUSED", level, dimName);
    }
    else {
        /* cast to unsigned long so the argument matches the %lu conversion
           on platforms where size_t is wider than unsigned long */
        sprintf(buf, "dims[%d].%s = %lu", level, dimName, (unsigned long)dim);
    }

    buf += strlen(buf);

    return buf;
}

const char VISIBILITY_HIDDEN
*kernelTypeString(CLBlasKernelType ktype)
{
    switch (ktype) {
    case CLBLAS_COMPUTING_KERNEL:
        return "computing";
    case CLBLAS_PREP_A_KERNEL:
        return "preparative for matrix A";
    case CLBLAS_PREP_B_KERNEL:
        return "preparative for matrix B";
    default:
        return NULL;
    }
}

/*
 * Assign a scalar the matrix is multiplied by to a kernel argument
 */
void VISIBILITY_HIDDEN
assignScalarKarg(KernelArg *arg, const void *value, DataType dtype)
{
    arg->typeSize = dtypeSize(dtype);
    memcpy(arg->arg.data, value, arg->typeSize);
}

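/*
 * Derive the global NDRange for a kernel launch from the work-group level
 * subproblem dimensions: the number of groups along each axis is the problem
 * size divided by the per-group work, rounded up, and each count is then
 * scaled by the respective work-group size so the result can be passed
 * directly to clEnqueueNDRangeKernel().
 *
 * Illustrative sketch (the values below are hypothetical, not taken from the
 * library): with M = 100, N = 60, wgDim->itemY = 32, wgDim->itemX = 16 and a
 * two-dimensional granularity of wgSize = {8, 8}, there are ceil(100/32) = 4
 * groups along Y and ceil(60/16) = 4 groups along X, so
 * globalThreads = {4 * 8, 4 * 8} = {32, 32}.
 */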
void VISIBILITY_HIDDEN
calcGlobalThreads(
    size_t globalThreads[2],
    const SubproblemDim *wgDim,
    const PGranularity *pgran,
    size_t M,
    size_t N)
{
    globalThreads[1] = 1;

    if ((wgDim->itemX != SUBDIM_UNUSED) &&
        (wgDim->itemY != SUBDIM_UNUSED)) {

        size_t groupWorkX, groupWorkY;
        size_t nrGroupsX, nrGroupsY;
        int nrDims;

        groupWorkX = wgDim->itemX;
        groupWorkY = wgDim->itemY;

        nrGroupsX = N / groupWorkX;
        if (N % groupWorkX) {
            nrGroupsX++;
        }

        nrGroupsY = M / groupWorkY;
        if (M % groupWorkY) {
            nrGroupsY++;
        }

        nrDims = (pgran == NULL) ? 1 : pgran->wgDim;
        if (nrDims == 1) {
            globalThreads[0] = nrGroupsX * nrGroupsY;
        }
        else {
            globalThreads[0] = nrGroupsY;
            globalThreads[1] = nrGroupsX;
        }
    }
    else {
        size_t totalWork, groupWork;

        if (wgDim->itemX != SUBDIM_UNUSED) {
            totalWork = N;
            groupWork = wgDim->itemX;
        }
        else {
            totalWork = M;
            groupWork = wgDim->itemY;
        }

        globalThreads[0] = totalWork / groupWork;
        if (totalWork % groupWork) {
            globalThreads[0]++;
        }
    }

    if (pgran != NULL) {
        globalThreads[0] *= pgran->wgSize[0];
        globalThreads[1] *= pgran->wgSize[1];
    }
}

cl_int VISIBILITY_HIDDEN
getKernelContext(cl_kernel kernel, cl_context *context)
{
    cl_int err;
    cl_context ctx;

    err = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT,
        sizeof(cl_context), &ctx, NULL);
    if (err != CL_SUCCESS)
        return err;
    if (context != NULL)
        *context = ctx;
    return err;
}

cl_int VISIBILITY_HIDDEN
getQueueContext(cl_command_queue queue, cl_context *context)
{
    cl_int err;
    cl_context ctx;

    err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
        sizeof(cl_context), &ctx, NULL);
    if (err != CL_SUCCESS)
        return err;
    if (context != NULL)
        *context = ctx;
    return err;
}

cl_int VISIBILITY_HIDDEN
getQueueDevice(cl_command_queue queue, cl_device_id *device)
{
    cl_int err;
    cl_device_id dev;

    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE,
        sizeof(cl_device_id), &dev, NULL);
    if (err != CL_SUCCESS)
        return err;
    if (device != NULL)
        *device = dev;
    return err;
}

cl_int VISIBILITY_HIDDEN
getQueueProperties(
    cl_command_queue queue,
    cl_command_queue_properties *props)
{
    cl_int err;
    cl_command_queue_properties p;

    err = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
        sizeof(cl_command_queue_properties), &p, NULL);
    if (err != CL_SUCCESS)
        return err;
    if (props != NULL)
        *props = p;
    return err;
}

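/*
 * Recreate a Kernel object from a previously saved program binary, e.g. when a
 * pre-built kernel is fetched from a cache. The kernel keeps its own copy of
 * the CLBLASKernExtra data and is marked as having no source.
 *
 * Hypothetical usage sketch (variable names are illustrative only):
 *
 *     const unsigned char *bin = cachedImage;
 *     Kernel *k = loadKernel(&bin, cachedImageSize, &key, &kextra, &err);
 *     if (k == NULL) {
 *         // err holds the status reported while creating the program
 *     }
 */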
Kernel VISIBILITY_HIDDEN
*loadKernel(const unsigned char **buffer,
            size_t sizeBuffer,
            KernelKey *key,
            const CLBLASKernExtra *extra,
            cl_int *error)
{
    cl_int status = CL_SUCCESS;
    Kernel *kernel;

    kernel = allocKernel();
    if (kernel == NULL) {
        return NULL;
    }

    kernel->program = createClProgramWithBinary(key->context,
                                                key->device,
                                                (unsigned char*)*buffer,
                                                sizeBuffer,
                                                &status);
    if (status == CL_SUCCESS) {
        kernel->extraSize = sizeof(CLBLASKernExtra);
        kernel->extra = calloc(1, kernel->extraSize);
        *(CLBLASKernExtra*)(kernel->extra) = *extra;
        kernel->dtor = extraDtor;
        kernel->noSource = 1;
    }
    else {
        putKernel(NULL, kernel);
        storeErrorCode(error, status);
        kernel = NULL;
    }

    return kernel;
}

#if !defined(DUMP_CLBLAS_KERNELS)

/*
 * Drop the program's source so that caching consumes as little memory
 * as possible
 */
static cl_int
dropProgramSource(cl_program *program, cl_context ctx, cl_device_id devID)
{
    size_t size;
    unsigned char *bin;
    cl_program p = *program;
    cl_int err;

    size = getProgramBinarySize(p);
    bin = getProgramBinary(p);

    /*
     * Don't release the original program until a new one is created,
     * so that its own reference to the context is retained even if the
     * user has released the context
     */
    p = createClProgramWithBinary(ctx, devID, bin, size, &err);
    if (err == CL_SUCCESS) {
        clReleaseProgram(*program);
        *program = p;
    }

    free(bin);

    return err;
}

#endif /* !DUMP_CLBLAS_KERNELS */

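/*
 * Generate, build and wrap an OpenCL kernel. When a generator is provided it
 * is invoked twice: the first call, with a NULL buffer, only reports how many
 * bytes of source it needs; the second call emits the source into the
 * allocated buffer, which is then compiled with buildClProgram(). When no
 * generator is given, the already built 'program' argument is wrapped instead.
 *
 * Hypothetical usage sketch (names are illustrative only):
 *
 *     cl_int err;
 *     Kernel *k = makeKernel(device, context, mempat->sops->genKernel, NULL,
 *                            subdims, &pgran, &kextra, buildOpts, &err);
 */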
Kernel
*makeKernel(
    cl_device_id device,
    cl_context context,
    SolverKgen kernelGenerator,
    cl_program program,
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const CLBLASKernExtra *extra,
    const char *buildOpts,
    cl_int *error)
{
    cl_int err;
    char *source = NULL;
    ssize_t size;
    Kernel *kernel;
    char *log;

    #ifdef DEBUG_2
    printf("Make kernel called\n");
    printf("x : %d, y : %d, itemX: %d, itemY: %d\n", dims->x, dims->y, dims->itemX, dims->itemY);
    printf("PG : wgSize[0] : %d, wgSize[1] : %d, wfSize: %d\n", pgran->wgSize[0], pgran->wgSize[1], pgran->wfSize);
    #endif

    kernel = allocKernel();
    if (kernel == NULL) {
        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
        return NULL;
    }

    if (kernelGenerator)
    {
        /* first call queries the required source buffer size */
        size = kernelGenerator(NULL, 0, dims, pgran, (void*)extra);
        if (size < 0) {
            storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
            return NULL;
        }
        source = calloc(1, size);
        if (source == NULL) {
            storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
            return NULL;
        }
        /* second call generates the kernel source into the buffer */
        if (kernelGenerator(source, size, dims, pgran, (void*)extra) != size) {
            free(source);
            storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
            return NULL;
        }

        log = allocBuildLog();

        //#define DEBUG_2
        #ifdef DEBUG_2
        printf("Build Options used %s \n", buildOpts);
        printf("Source kernel used %s \n", source);
        #endif
        #undef DEBUG_2

        kernel->program = buildClProgram(source, buildOpts, context, device,
                                         log, BUILD_LOG_SIZE, &err);
        if (err != CL_SUCCESS) {
            printBuildError(err, device, kernelGenerator, dims,
                            pgran, extra, source, log);
            freeBuildLog(log);
            putKernel(NULL, kernel);
            free(source);
            storeErrorCode(error, err);
            return NULL;
        }
        else
        {
            // #define DEBUG_2
            #ifdef DEBUG_2
            printf("Kernel compilation succeeded\n");
            #endif
            #undef DEBUG_2
        }

        freeBuildLog(log);
        free(source);

#if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
        if (err == CL_SUCCESS) {
            err = dropProgramSource(&kernel->program, context, device);
            kernel->noSource = 1;
        }
#endif  /* !KEEP_CLBLAS_KERNEL_SOURCES */

        if (err != CL_SUCCESS) {
            putKernel(NULL, kernel);
            storeErrorCode(error, err);
            return NULL;
        }
    }
    else
    {
        kernel->program = program;
    }

    kernel->extraSize = sizeof(CLBLASKernExtra);
    kernel->extra = calloc(1, kernel->extraSize);
    *(CLBLASKernExtra*)(kernel->extra) = *extra;
    kernel->dtor = extraDtor;

    storeErrorCode(error, CL_SUCCESS);

    return kernel;
}

void
setupBuildOpts(
    char opts[BUILD_OPTS_MAXLEN],
    cl_device_id devID,
    MemoryPattern *mempat)
{
    TargetDevice target;

    target.id = devID;
    identifyDevice(&target);
    opts[0] = '\0';

#if !defined NDEBUG
    // Nvidia runtime does not appear to support the -g flag, at least in their OpenCL v1.1 runtime
    if( target.ident.vendor != VENDOR_NVIDIA )
        addBuildOpt( opts, BUILD_OPTS_MAXLEN, "-g" );
#endif  /* NDEBUG */

    if (target.ident.vendor == VENDOR_NVIDIA &&
        !strcmp(mempat->name, "2-staged cached global memory based "
                              "block trsm")) {

        addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
    }
}

void addBuildOpt(
    char *opts,
    size_t len,
    const char *option)
{
    size_t l = strlen(opts);

    if (l > 0 && !isspace(opts[l-1]) && l+1 < len) {
        opts[l] = ' ';
        opts[l+1] = '\0';
        l++;
    }

    strncat(opts, option, len - l - 1);
}

char VISIBILITY_HIDDEN
*sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
{
    buf = sprintfDim(buf, dim[level].itemY, "itemY", level, true);
    buf = sprintfDim(buf, dim[level].itemX, "itemX", level, false);
    buf = sprintfDim(buf, dim[level].y, "y", level, false);
    buf = sprintfDim(buf, dim[level].x, "x", level, false);
    buf = sprintfDim(buf, dim[level].bwidth, "bwidth", level, false);
    strcat(buf, "; ");
    buf += strlen(buf);

    return buf;
}

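/*
 * Validate a matrix argument before launching a kernel: the leading dimension
 * must be at least M or N, depending on which dimension is contiguous for the
 * given 'order' and 'transA', and the cl_mem object must be large enough to
 * hold offA plus the matrix itself. The required size is
 * ((lines - 1) * lda + lineLength) * dtypeSize(dtype), i.e. the unused tail of
 * the last line is not counted; for packed matrices (lda == 0) it is
 * (M * (N + 1) / 2) * dtypeSize(dtype).
 *
 * Illustrative example (hypothetical values): a column-major, non-transposed
 * float matrix with M = 100, N = 50, lda = 128 and offA = 0 needs
 * ((50 - 1) * 128 + 100) * 4 = 25488 bytes in the buffer.
 */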
clblasStatus VISIBILITY_HIDDEN
checkMatrixSizes(
    DataType dtype,
    clblasOrder order,
    clblasTranspose transA,
    size_t M,
    size_t N,
    cl_mem A,
    size_t offA,
    size_t lda,         // lda is passed as zero for packed matrices
    ErrorCodeSet err )
{
    size_t memSize, matrSize, tsize, memUsed;
    size_t unusedTail = 0;
    bool tra;

    if ((M == 0) || (N == 0)) {
        return clblasInvalidDim;
    }

    tsize = dtypeSize(dtype);
    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
          (order == clblasColumnMajor && transA == clblasNoTrans);

    if( lda > 0 )              // For non-packed matrices
    {
        if (tra) {
            if (lda < M) {
                switch( err )
                {
                case A_MAT_ERRSET:
                    return clblasInvalidLeadDimA;
                case B_MAT_ERRSET:
                    return clblasInvalidLeadDimB;
                case C_MAT_ERRSET:
                    return clblasInvalidLeadDimC;
                default:
                    return clblasNotImplemented;
                }
            }
            matrSize = ((N - 1) * lda + M) * tsize;
            unusedTail = ( lda - N ) * tsize;
        }
        else {
            if (lda < N) {
                switch( err )
                {
                case A_MAT_ERRSET:
                    return clblasInvalidLeadDimA;
                case B_MAT_ERRSET:
                    return clblasInvalidLeadDimB;
                case C_MAT_ERRSET:
                    return clblasInvalidLeadDimC;
                default:
                    return clblasNotImplemented;
                }
            }
            matrSize = ((M - 1) * lda + N) * tsize;
            unusedTail = ( lda - M ) * tsize;
        }
    }
    else {                     // For the case of packed matrices
        matrSize = ((M * (N+1)) / 2) * tsize;
    }

    offA *= tsize;

    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
                                CL_SUCCESS) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        default:
            return clblasNotImplemented;
        }
    }

    // Calculates the memory required. Note that 'matrSize' already takes into account the fact that
    // there might be an unused tail, i.e. the elements between lda and M in the last column if
    // column major is used or between lda and N in the last row if row major is used.
    memUsed = offA + matrSize;
    if (( memUsed > memSize ) || (offA + matrSize < offA)) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInsufficientMemMatA;
        case B_MAT_ERRSET:
            return clblasInsufficientMemMatB;
        case C_MAT_ERRSET:
            return clblasInsufficientMemMatC;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}

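/*
 * Validation for banded matrices stored in the conventional BLAS band format:
 * each stored line holds K = KL + KU + 1 band elements, so lda must be at
 * least K and the buffer must hold offA + ((lines - 1) * lda + K) elements,
 * where 'lines' is N or M depending on 'order' and 'transA'.
 *
 * Illustrative example (hypothetical values): a float band matrix with
 * KL = 2 and KU = 3 has K = 6; with N = 10 lines and lda = 8 it needs
 * ((10 - 1) * 8 + 6) * 4 = 312 bytes beyond the offset.
 */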
clblasStatus VISIBILITY_HIDDEN
checkBandedMatrixSizes(
    DataType dtype,
    clblasOrder order,
    clblasTranspose transA,
    size_t M,
    size_t N,
    size_t KL,
    size_t KU,
    cl_mem A,
    size_t offA,
    size_t lda,
    ErrorCodeSet err )
{
    size_t memSize, matrSize, tsize, K, memUsed;
    size_t unusedTail = 0;
    bool tra;

    if ((M == 0) || (N == 0)) {
        return clblasInvalidDim;
    }

    tsize = dtypeSize(dtype);
    K = KL + KU + 1;
    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
          (order == clblasColumnMajor && transA == clblasNoTrans);

    if (lda < K) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidLeadDimA;
        case B_MAT_ERRSET:
            return clblasInvalidLeadDimB;
        case C_MAT_ERRSET:
            return clblasInvalidLeadDimC;
        default:
            return clblasNotImplemented;
        }
    }

    if (tra) {
        matrSize = ((N - 1) * lda + K) * tsize;
        unusedTail = ( lda - N ) * tsize;
    }
    else {
        matrSize = ((M - 1) * lda + K) * tsize;
        unusedTail = ( lda - M ) * tsize;
    }

    offA *= tsize;

    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
                                CL_SUCCESS) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        default:
            return clblasNotImplemented;
        }
    }

    // Calculates the memory required. Note that 'matrSize' already takes into account the fact that
    // there might be an unused tail, i.e. the elements between lda and M in the last column if
    // column major is used or between lda and N in the last row if row major is used.
    memUsed = offA + matrSize;
    if (memUsed > memSize) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInsufficientMemMatA;
        case B_MAT_ERRSET:
            return clblasInsufficientMemMatB;
        case C_MAT_ERRSET:
            return clblasInsufficientMemMatC;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}

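/*
 * Validation for vector arguments: incx must be non-zero and the buffer must
 * hold offx + ((N - 1) * |incx| + 1) elements of the given type.
 *
 * Illustrative example (hypothetical values): a double vector with N = 1000
 * and incx = 2 needs (999 * 2 + 1) * 8 = 15992 bytes beyond the offset.
 */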
clblasStatus VISIBILITY_HIDDEN
checkVectorSizes(
    DataType dtype,
    size_t N,
    cl_mem x,
    size_t offx,
    int incx,
    ErrorCodeSet err )
{
    size_t memSize, sizev;
    size_t tsize;

    if (N == 0) {
        return clblasInvalidDim;
    }

    if (incx == 0) {
        switch( err )
        {
        case X_VEC_ERRSET:
            return clblasInvalidIncX;
        case Y_VEC_ERRSET:
            return clblasInvalidIncY;
        default:
            return clblasNotImplemented;
        }
    }

    if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
                                CL_SUCCESS) {
        switch( err )
        {
        case X_VEC_ERRSET:
            return clblasInvalidVecX;
        case Y_VEC_ERRSET:
            return clblasInvalidVecY;
        default:
            return clblasNotImplemented;
        }
    }

    tsize = dtypeSize(dtype);
    sizev = ((N - 1) * abs(incx) + 1) * tsize;
    offx *= tsize;

    if ((offx + sizev > memSize) || (offx + sizev < offx)) {
        switch( err )
        {
        case X_VEC_ERRSET:
            return clblasInsufficientMemVecX;
        case Y_VEC_ERRSET:
            return clblasInsufficientMemVecY;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}

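/*
 * Verify that each supplied cl_mem refers to a buffer object rather than an
 * image or other non-buffer object; the ErrorCodeSet values select which
 * BLAS-style error code is reported for the corresponding argument.
 */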
clblasStatus
checkMemObjects(
    cl_mem A,
    cl_mem B,
    cl_mem C,
    bool checkC,
    ErrorCodeSet errA,
    ErrorCodeSet errB,
    ErrorCodeSet errC )
{
    cl_mem_object_type mobjType = 0;

    if (!clGetMemObjectInfo(A, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) &&
        (mobjType != CL_MEM_OBJECT_BUFFER)) {
        switch( errA )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        case X_VEC_ERRSET:
            return clblasInvalidVecX;
        case Y_VEC_ERRSET:
            return clblasInvalidVecY;
        default:
            return clblasNotImplemented;
        }
    }

    mobjType = 0;
    if (!clGetMemObjectInfo(B, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) &&
        (mobjType != CL_MEM_OBJECT_BUFFER)) {
        switch( errB )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        case X_VEC_ERRSET:
            return clblasInvalidVecX;
        case Y_VEC_ERRSET:
            return clblasInvalidVecY;
        default:
            return clblasNotImplemented;
        }
    }

    mobjType = 0;
    if (checkC && !clGetMemObjectInfo(C, CL_MEM_TYPE, sizeof(mobjType),
                                      &mobjType, NULL) &&
        (mobjType != CL_MEM_OBJECT_BUFFER)) {
        switch( errC )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        case X_VEC_ERRSET:
            return clblasInvalidVecX;
        case Y_VEC_ERRSET:
            return clblasInvalidVecY;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}