1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 #include <string.h>
19 #include <stdlib.h>
20
21 #include <clBLAS.h>
22 #include <clkern.h>
23 #include <cltypes.h>
24 #include <stdio.h>
25 #include <ctype.h>
26
27 #include "clblas-internal.h"
28
29 #if defined(DUMP_CLBLAS_KERNELS) && !defined(KEEP_CLBLAS_KERNEL_SOURCES)
30 #define KEEP_CLBLAS_KERNEL_SOURCES
31 #endif
32
/* Nonzero once the library has been initialized (see clblasSetup()). */
int clblasInitialized = 0;
/* Solver/memory-pattern registry, one slot per supported BLAS function. */
CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER];
/* Global cache of already built kernels; NULL until created. */
struct KernelCache *clblasKernelCache = NULL;

/* Capacity, in bytes, of the buffer that receives OpenCL build logs. */
enum {
    BUILD_LOG_SIZE = 65536
};
40
41 static __inline void
storeErrorCode(cl_int * error,cl_int code)42 storeErrorCode(cl_int *error, cl_int code)
43 {
44 if (error != NULL) {
45 *error = code;
46 }
47 }
48
49 #ifndef PRINT_BUILD_ERRORS
50 #define PRINT_BUILD_ERRORS
51 #endif
52
53 #ifdef PRINT_BUILD_ERRORS
54
55 static char
allocBuildLog(void)56 *allocBuildLog(void)
57 {
58 char *log;
59
60 log = malloc(BUILD_LOG_SIZE);
61 if (log) {
62 log[0] = '\0';
63 }
64
65 return log;
66 }
67
/*
 * Release a build-log buffer obtained from allocBuildLog().
 * Passing NULL is safe: free(NULL) is a no-op.
 */
static void
freeBuildLog(char *buildLog)
{
    free(buildLog);
}
73
74 static void
printBuildError(cl_int error,cl_device_id device,SolverKgen kgen,const SubproblemDim * dims,const PGranularity * pgran,const CLBLASKernExtra * kextra,const char * source,const char * buildLog)75 printBuildError(
76 cl_int error,
77 cl_device_id device,
78 SolverKgen kgen,
79 const SubproblemDim *dims,
80 const PGranularity *pgran,
81 const CLBLASKernExtra *kextra,
82 const char *source,
83 const char *buildLog)
84 {
85 char name[128];
86 char dimStr[1024];
87 char pgranStr[1024];
88 char *p;
89 MemoryPattern *mempat = NULL;
90 unsigned int i, j;
91 const char *s;
92
93 name[0] = '\0';
94 clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
95
96 // lookup memory pattern
97 s = NULL;
98 for (i = 0; i < BLAS_FUNCTIONS_NUMBER; i++) {
99 for (j = 0; j < clblasSolvers[i].nrPatterns; j++) {
100 mempat = &clblasSolvers[i].memPatterns[j];
101 if (kgen == mempat->sops->genKernel) {
102 s = kernelTypeString(kextra->kernType);
103 break;
104 }
105 }
106 if (s != NULL) {
107 break;
108 }
109 }
110
111 // sprintf Subproblem dimensions
112 p = dimStr;
113 for (i = 0; i < mempat->nrLevels; i++) {
114 p = sprintfGranulation(p, dims, i);
115 strcat(p, "; ");
116 p += strlen(p);
117 }
118
119 // sprintf data parallelism granularity
120 sprintf(pgranStr, "pgran->wgDim = %d, pgran->wgSize[0] = %u, "
121 "pgran->wgSize[1] = %u, pgran->wfSize = %u",
122 pgran->wgDim, pgran->wgSize[0], pgran->wgSize[1],
123 pgran->wfSize);
124
125 fprintf(stderr, "\n========================================================\n\n");
126 fprintf(stderr, "AN INTERNAL KERNEL BUILD ERROR OCCURRED!\n");
127 fprintf(stderr, "device name = %s\n", name);
128 fprintf(stderr, "error = %d\n", error);
129 fprintf(stderr, "memory pattern = %s, %s kernel generator\n", mempat->name, s);
130 fprintf(stderr, "Subproblem dimensions: %s\n", dimStr);
131 fprintf(stderr, "Parallelism granularity: %s\n", pgranStr);
132 fprintf(stderr, "Kernel extra flags: %u\n", kextra->flags);
133 fprintf(stderr, "Source:\n\n%s\n\n", source);
134 fprintf(stderr, "--------------------------------------------------------\n\n");
135 if (buildLog) {
136 fprintf(stderr, "Build log:\n\n%s\n", buildLog);
137 }
138 else {
139 fprintf(stderr, "Build log is unavailable\n");
140 }
141 fprintf(stderr, "========================================================\n\n");
142 }
143
144 #else /* PRINT_BUILD_ERRORS */
145
static __inline char*
allocBuildLog(void)
{
    /* stub: build logs are disabled in this configuration.
     * NOTE(review): this branch is currently dead code, since
     * PRINT_BUILD_ERRORS is force-defined above. */
    return NULL;
}
152
153 #define freeBuildLog(log) /* stub, do nothing */
154 #define printBuildError(error, device, kgen, \
155 dims, pgran, kextra, source, buildLog) /* stub, do nothing */
156
157 #endif /* !PRINT_BUILD_ERRORS */
158
159 static void
extraDtor(struct Kernel * kernel)160 extraDtor(struct Kernel *kernel)
161 {
162 if (kernel->extra != NULL) {
163 free(kernel->extra);
164 kernel->extra = NULL;
165 }
166 }
167
168 static char
sprintfDim(char * buf,size_t dim,const char * dimName,int level,bool first)169 *sprintfDim(
170 char *buf,
171 size_t dim,
172 const char *dimName,
173 int level,
174 bool first)
175 {
176 if (!first) {
177 strcat(buf, ", ");
178 buf += strlen(buf);
179 }
180 if (dim == SUBDIM_UNUSED) {
181 sprintf(buf, "dims[%d].%s = SUBDIM_UNUSED", level, dimName);
182 }
183 else {
184 sprintf(buf, "dims[%d].%s = %lu", level, dimName, dim);
185 }
186
187 buf += strlen(buf);
188
189 return buf;
190 }
191
192 const char VISIBILITY_HIDDEN
kernelTypeString(CLBlasKernelType ktype)193 *kernelTypeString(CLBlasKernelType ktype)
194 {
195 switch (ktype) {
196 case CLBLAS_COMPUTING_KERNEL:
197 return "computing";
198 case CLBLAS_PREP_A_KERNEL:
199 return "preparative for matrix A";
200 case CLBLAS_PREP_B_KERNEL:
201 return "preparative for matrix B";
202 default:
203 return NULL;
204 }
205 }
206
207 /*
208 * Assign a scalar multiplied on a matrix a kernel argument
209 */
void VISIBILITY_HIDDEN
assignScalarKarg(KernelArg *arg, const void *value, DataType dtype)
{
    // element size follows the BLAS data type; copy the raw scalar bytes
    // into the argument's inline storage
    arg->typeSize = dtypeSize(dtype);
    memcpy(arg->arg.data, value, arg->typeSize);
}
216
217 void VISIBILITY_HIDDEN
calcGlobalThreads(size_t globalThreads[2],const SubproblemDim * wgDim,const PGranularity * pgran,size_t M,size_t N)218 calcGlobalThreads(
219 size_t globalThreads[2],
220 const SubproblemDim *wgDim,
221 const PGranularity *pgran,
222 size_t M,
223 size_t N)
224 {
225 globalThreads[1] = 1;
226
227 if ((wgDim->itemX != SUBDIM_UNUSED) &&
228 (wgDim->itemY != SUBDIM_UNUSED)) {
229
230 size_t groupWorkX, groupWorkY;
231 size_t nrGroupsX, nrGroupsY;
232 int nrDims;
233
234 groupWorkX = wgDim->itemX;
235 groupWorkY = wgDim->itemY;
236
237 nrGroupsX = N / groupWorkX;
238 if (N % groupWorkX) {
239 nrGroupsX++;
240 }
241
242 nrGroupsY = M / groupWorkY;
243 if (M % groupWorkY) {
244 nrGroupsY++;
245 }
246
247 nrDims = (pgran == NULL) ? 1 : pgran->wgDim;
248 if (nrDims == 1) {
249 globalThreads[0] = nrGroupsX * nrGroupsY;
250 }
251 else {
252 globalThreads[0] = nrGroupsY;
253 globalThreads[1] = nrGroupsX;
254 }
255 }
256 else {
257 size_t totalWork, groupWork;
258
259 if (wgDim->itemX != SUBDIM_UNUSED) {
260 totalWork = N;
261 groupWork = wgDim->itemX;
262 }
263 else {
264 totalWork = M;
265 groupWork = wgDim->itemY;
266 }
267
268 globalThreads[0] = totalWork / groupWork;
269 if (totalWork % groupWork) {
270 globalThreads[0]++;
271 }
272 }
273
274 if (pgran != NULL) {
275 globalThreads[0] *= pgran->wgSize[0];
276 globalThreads[1] *= pgran->wgSize[1];
277 }
278 }
279
280 cl_int VISIBILITY_HIDDEN
getKernelContext(cl_kernel kernel,cl_context * context)281 getKernelContext(cl_kernel kernel, cl_context *context)
282 {
283 cl_int err;
284 cl_context ctx;
285
286 err = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT,
287 sizeof(cl_context), &ctx, NULL);
288 if (err != CL_SUCCESS)
289 return err;
290 if (context != NULL)
291 *context = ctx;
292 return err;
293 }
294
295 cl_int VISIBILITY_HIDDEN
getQueueContext(cl_command_queue queue,cl_context * context)296 getQueueContext(cl_command_queue queue, cl_context *context)
297 {
298 cl_int err;
299 cl_context ctx;
300
301 err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
302 sizeof(cl_context), &ctx, NULL);
303 if (err != CL_SUCCESS)
304 return err;
305 if (context != NULL)
306 *context = ctx;
307 return err;
308 }
309
310 cl_int VISIBILITY_HIDDEN
getQueueDevice(cl_command_queue queue,cl_device_id * device)311 getQueueDevice(cl_command_queue queue, cl_device_id *device)
312 {
313 cl_int err;
314 cl_device_id dev;
315
316 err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE,
317 sizeof(cl_device_id), &dev, NULL);
318 if (err != CL_SUCCESS)
319 return err;
320 if (device != NULL)
321 *device = dev;
322 return err;
323 }
324
325 cl_int VISIBILITY_HIDDEN
getQueueProperties(cl_command_queue queue,cl_command_queue_properties * props)326 getQueueProperties(
327 cl_command_queue queue,
328 cl_command_queue_properties *props)
329 {
330 cl_int err;
331 cl_command_queue_properties p;
332
333 err = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
334 sizeof(cl_command_queue_properties), &p, NULL);
335 if (err != CL_SUCCESS)
336 return err;
337 if (props != NULL)
338 *props = p;
339 return err;
340 }
341
342 Kernel VISIBILITY_HIDDEN
loadKernel(const unsigned char ** buffer,size_t sizeBuffer,KernelKey * key,const CLBLASKernExtra * extra,cl_int * error)343 *loadKernel( const unsigned char** buffer,
344 size_t sizeBuffer,
345 KernelKey *key,
346 const CLBLASKernExtra *extra,
347 cl_int *error)
348
349 {
350 cl_int status = CL_SUCCESS;
351 Kernel* kernel;
352
353 kernel = allocKernel();
354 if (kernel == NULL) {
355 return NULL;
356 }
357
358 kernel->program = createClProgramWithBinary(key->context,
359 key->device,
360 (unsigned char*)*buffer,
361 sizeBuffer,
362 &status);
363 if (status == CL_SUCCESS) {
364 kernel->extraSize = sizeof(CLBLASKernExtra);
365 kernel->extra = calloc(1, kernel->extraSize);
366 *(CLBLASKernExtra*)(kernel->extra) = *extra;
367 kernel->dtor = extraDtor;
368 kernel->noSource = 1;
369 }
370 else {
371 putKernel(NULL, kernel);
372 storeErrorCode(error, status);
373 kernel = NULL;
374 }
375
376 return kernel;
377 }
378
379 #if !defined(DUMP_CLBLAS_KERNELS)
380
381 /*
382 * Drop the program's source so as to consume memory as few as possible
383 * at caching
384 */
385 static cl_int
dropProgramSource(cl_program * program,cl_context ctx,cl_device_id devID)386 dropProgramSource(cl_program *program, cl_context ctx, cl_device_id devID)
387 {
388 size_t size;
389 unsigned char *bin;
390 cl_program p = *program;
391 cl_int err;
392
393 size = getProgramBinarySize(p);
394 bin = getProgramBinary(p);
395
396 /*
397 * Don't release the original program until a new one is created
398 * in order to retain its own reference to the context if it is
399 * released by user
400 */
401 p = createClProgramWithBinary(ctx, devID, bin, size, &err);
402 if (err == CL_SUCCESS) {
403 clReleaseProgram(*program);
404 *program = p;
405 }
406
407 free(bin);
408
409 return err;
410 }
411
412 #endif /* !DUMP_CLBLAS_KERNELS */
413
414 Kernel
makeKernel(cl_device_id device,cl_context context,SolverKgen kernelGenerator,cl_program program,const SubproblemDim * dims,const PGranularity * pgran,const CLBLASKernExtra * extra,const char * buildOpts,cl_int * error)415 *makeKernel(
416 cl_device_id device,
417 cl_context context,
418 SolverKgen kernelGenerator,
419 cl_program program,
420 const SubproblemDim *dims,
421 const PGranularity *pgran,
422 const CLBLASKernExtra *extra,
423 const char *buildOpts,
424 cl_int *error)
425 {
426 cl_int err;
427 char *source;
428 ssize_t size;
429 Kernel *kernel;
430 char *log;
431
432 #ifdef DEBUG_2
433 printf("Make kernel called\n");
434 printf("x : %d, y : %d, itemX: %d, itemY: %d\n", dims->x, dims->y, dims->itemX, dims->itemY);
435 printf("PG : wgSize[0] : %d, wgSize[1] : %d, wfSize: %d\n", pgran->wgSize[0], pgran->wgSize[1], pgran->wfSize);
436 #endif
437
438 kernel = allocKernel();
439
440 if (kernel == NULL) {
441 free(source);
442 storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
443 return NULL;
444 }
445
446
447 if (kernelGenerator)
448 {
449 size = kernelGenerator(NULL, 0, dims, pgran, (void*)extra);
450 if (size < 0) {
451 storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
452 return NULL;
453 }
454 source = calloc(1, size);
455 if (source == NULL) {
456 storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
457 return NULL;
458 }
459 if (kernelGenerator(source, size, dims, pgran, (void*)extra) != size) {
460 free(source);
461 storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
462 return NULL;
463 }
464
465
466
467 log = allocBuildLog();
468
469 //#define DEBUG_2
470 #ifdef DEBUG_2
471 printf("Build Options used %s \n", buildOpts);
472 printf("Source kernel used %s \n", source);
473 #endif
474 #undef DEBUG_2
475
476 kernel->program = buildClProgram(source, buildOpts, context, device,
477 log, BUILD_LOG_SIZE, &err);
478 if (err != CL_SUCCESS) {
479 printBuildError(err, device, kernelGenerator, dims,
480 pgran, extra, source, log);
481 freeBuildLog(log);
482 putKernel(NULL, kernel);
483 free(source);
484 storeErrorCode(error, err);
485 return NULL;
486 }
487 else
488 {
489 // #define DEBUG_2
490 #ifdef DEBUG_2
491 printf("Kernel compilation succeeded\n");
492 #endif
493 #undef DEBUG_2
494 }
495
496 freeBuildLog(log);
497 free(source);
498
499 #if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
500 if (err == CL_SUCCESS) {
501 err = dropProgramSource(&kernel->program, context, device);
502 kernel->noSource = 1;
503 }
504 #endif /* !DUMP_CLBLAS_KERNELS */
505
506 if (err != CL_SUCCESS) {
507 putKernel(NULL, kernel);
508 storeErrorCode(error, err);
509 return NULL;
510 }
511 }
512 else
513 {
514 kernel->program = program;
515 }
516
517 kernel->extraSize = sizeof(CLBLASKernExtra);
518 kernel->extra = calloc(1, kernel->extraSize);
519 *(CLBLASKernExtra*)(kernel->extra) = *extra;
520 kernel->dtor = extraDtor;
521
522 storeErrorCode(error, CL_SUCCESS);
523
524 return kernel;
525
526 }
527
/*
 * Fill 'opts' with the compiler options appropriate for the target
 * device and memory pattern. The buffer is reset to an empty string
 * before options are appended via addBuildOpt().
 */
void
setupBuildOpts(
    char opts[BUILD_OPTS_MAXLEN],
    cl_device_id devID,
    MemoryPattern *mempat)
{
    TargetDevice target;

    target.id = devID;
    identifyDevice(&target);
    opts[0] = '\0';

#if !defined NDEBUG
    // Nvidia runtime does not appear to support the -g flag, at least in their OpenCL v1.1 runtime
    if( target.ident.vendor != VENDOR_NVIDIA )
        addBuildOpt( opts, BUILD_OPTS_MAXLEN, "-g" );
#endif /* NDEBUG */

    // NOTE(review): presumably a workaround for a miscompilation of this
    // particular TRSM pattern by the Nvidia compiler -- confirm whether
    // the -cl-opt-disable workaround is still required
    if (target.ident.vendor == VENDOR_NVIDIA &&
        !strcmp(mempat->name, "2-staged cached global memory based "
                              "block trsm")) {

        addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
    }
}
553
addBuildOpt(char * opts,size_t len,const char * option)554 void addBuildOpt(
555 char * opts,
556 size_t len,
557 const char * option)
558 {
559 size_t l = strlen(opts);
560
561 if (l > 0 && !isspace(opts[l-1]) && l+1 < len) {
562 opts[l] = ' ';
563 opts[l+1] = '\0';
564 l++;
565 }
566
567 strncat(opts, option, len - l - 1);
568 }
569
570
571 char VISIBILITY_HIDDEN
sprintfGranulation(char * buf,const SubproblemDim * dim,int level)572 *sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
573 {
574 buf = sprintfDim(buf, dim[level].itemY, "itemY", level, true);
575 buf = sprintfDim(buf, dim[level].itemX, "itemX", level, false);
576 buf = sprintfDim(buf, dim[level].y, "y", level, false);
577 buf = sprintfDim(buf, dim[level].x, "x", level, false);
578 buf = sprintfDim(buf, dim[level].bwidth, "bwidth", level, false);
579 strcat(buf, "; ");
580 buf += strlen(buf);
581
582 return buf;
583 }
584
/*
 * Validate the geometry of a matrix against the size of its cl_mem
 * buffer: checks the leading dimension, queries the buffer size and
 * verifies that offA + matrix payload fits. 'err' selects which of the
 * A/B/C error-code families is reported on failure.
 */
clblasStatus VISIBILITY_HIDDEN
checkMatrixSizes(
    DataType dtype,
    clblasOrder order,
    clblasTranspose transA,
    size_t M,
    size_t N,
    cl_mem A,
    size_t offA,
    size_t lda, // lda is passed as zero for packed matrices
    ErrorCodeSet err )
{
    size_t memSize, matrSize, tsize, memUsed;
    // NOTE(review): unusedTail is computed below but never read; it only
    // records the per-line slack implied by lda
    size_t unusedTail = 0;
    bool tra;

    if ((M == 0) || (N == 0)) {
        return clblasInvalidDim;
    }

    tsize = dtypeSize(dtype);
    // true when the storage's leading dimension spans M (i.e. a stored
    // line holds M elements)
    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
          (order == clblasColumnMajor && transA == clblasNoTrans);

    if( lda > 0 ) // For Non-packed matrices
    {
        if (tra) {
            if (lda < M) {
                switch( err )
                {
                case A_MAT_ERRSET:
                    return clblasInvalidLeadDimA;
                case B_MAT_ERRSET:
                    return clblasInvalidLeadDimB;
                case C_MAT_ERRSET:
                    return clblasInvalidLeadDimC;
                default:
                    return clblasNotImplemented;
                }
            }
            // last line needs only M elements, not a full lda stride
            matrSize = ((N - 1) * lda + M) * tsize;
            unusedTail = ( lda - N ) * tsize;
        }
        else {
            if (lda < N) {
                switch( err )
                {
                case A_MAT_ERRSET:
                    return clblasInvalidLeadDimA;
                case B_MAT_ERRSET:
                    return clblasInvalidLeadDimB;
                case C_MAT_ERRSET:
                    return clblasInvalidLeadDimC;
                default:
                    return clblasNotImplemented;
                }
            }
            matrSize = ((M - 1) * lda + N) * tsize;
            unusedTail = ( lda - M ) * tsize;
        }
    }
    else { // For the case of packed matrices
        // NOTE(review): uses M*(N+1)/2; for symmetric/triangular packed
        // storage callers pass M == N, making this the usual N*(N+1)/2 --
        // confirm no caller passes M != N here
        matrSize = ((M * (N+1)) / 2) * tsize;
    }

    offA *= tsize;

    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
        CL_SUCCESS) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        default:
            return clblasNotImplemented;
        }
    }

    // Calculates the memory required. Note that 'matrSize' already takes into account the fact that
    // there might be an unused tail, i.e. the elements between lda and M in the last column if
    // column major is used or between lda and N in the last row if row major is used.
    // The second comparison catches size_t wrap-around of offA + matrSize.
    memUsed = offA + matrSize;
    if (( memUsed > memSize ) || (offA + matrSize < offA)) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInsufficientMemMatA;
        case B_MAT_ERRSET:
            return clblasInsufficientMemMatB;
        case C_MAT_ERRSET:
            return clblasInsufficientMemMatC;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}
687
688
689 clblasStatus VISIBILITY_HIDDEN
checkBandedMatrixSizes(DataType dtype,clblasOrder order,clblasTranspose transA,size_t M,size_t N,size_t KL,size_t KU,cl_mem A,size_t offA,size_t lda,ErrorCodeSet err)690 checkBandedMatrixSizes(
691 DataType dtype,
692 clblasOrder order,
693 clblasTranspose transA,
694 size_t M,
695 size_t N,
696 size_t KL,
697 size_t KU,
698 cl_mem A,
699 size_t offA,
700 size_t lda,
701 ErrorCodeSet err )
702 {
703 size_t memSize, matrSize, tsize, K, memUsed;
704 size_t unusedTail = 0;
705 bool tra;
706
707 if ((M == 0) || (N == 0)) {
708 return clblasInvalidDim;
709 }
710
711 tsize = dtypeSize(dtype);
712 K = KL + KU + 1;
713 tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
714 (order == clblasColumnMajor && transA == clblasNoTrans);
715
716 if (lda < K) {
717 switch( err )
718 {
719 case A_MAT_ERRSET:
720 return clblasInvalidLeadDimA;
721 case B_MAT_ERRSET:
722 return clblasInvalidLeadDimB;
723 case C_MAT_ERRSET:
724 return clblasInvalidLeadDimC;
725 default:
726 return clblasNotImplemented;
727 }
728 }
729
730 if (tra) {
731 matrSize = ((N - 1) * lda + K) * tsize;
732 unusedTail = ( lda - N ) * tsize;
733 }
734 else {
735 matrSize = ((M - 1) * lda + K) * tsize;
736 unusedTail = ( lda - M ) * tsize;
737 }
738
739 offA *= tsize;
740
741 if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
742 CL_SUCCESS) {
743 switch( err )
744 {
745 case A_MAT_ERRSET:
746 return clblasInvalidMatA;
747 case B_MAT_ERRSET:
748 return clblasInvalidMatB;
749 case C_MAT_ERRSET:
750 return clblasInvalidMatC;
751 default:
752 return clblasNotImplemented;
753 }
754 }
755
756 // Calculates the memory required. Note that 'matrSize' already takes into account the fact that
757 // there might be an unused tail, i.e. the elements between lda and M in the last column if
758 // column major is used or between lda and N in the last row if row major is used.
759 memUsed = offA + matrSize;
760 if (memUsed > memSize) {
761 switch( err )
762 {
763 case A_MAT_ERRSET:
764 return clblasInsufficientMemMatA;
765 case B_MAT_ERRSET:
766 return clblasInsufficientMemMatB;
767 case C_MAT_ERRSET:
768 return clblasInsufficientMemMatC;
769 default:
770 return clblasNotImplemented;
771 }
772 }
773
774 return clblasSuccess;
775 }
776
777 clblasStatus VISIBILITY_HIDDEN
checkVectorSizes(DataType dtype,size_t N,cl_mem x,size_t offx,int incx,ErrorCodeSet err)778 checkVectorSizes(
779 DataType dtype,
780 size_t N,
781 cl_mem x,
782 size_t offx,
783 int incx,
784 ErrorCodeSet err )
785 {
786 size_t memSize, sizev;
787 size_t tsize;
788
789 if (N == 0) {
790 return clblasInvalidDim;
791 }
792
793 if (incx == 0) {
794 switch( err )
795 {
796 case X_VEC_ERRSET:
797 return clblasInvalidIncX;
798 case Y_VEC_ERRSET:
799 return clblasInvalidIncY;
800 default:
801 return clblasNotImplemented;
802 }
803 }
804
805 if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
806 CL_SUCCESS) {
807 switch( err )
808 {
809 case X_VEC_ERRSET:
810 return clblasInvalidVecX;
811 case Y_VEC_ERRSET:
812 return clblasInvalidVecY;
813 default:
814 return clblasNotImplemented;
815 }
816 }
817
818 tsize = dtypeSize(dtype);
819 sizev = ((N - 1) * abs(incx) + 1) * tsize;
820 offx *= tsize;
821
822 if ((offx + sizev > memSize) || (offx + sizev < offx)) {
823 switch( err )
824 {
825 case X_VEC_ERRSET:
826 return clblasInsufficientMemVecX;
827 case Y_VEC_ERRSET:
828 return clblasInsufficientMemVecY;
829 default:
830 return clblasNotImplemented;
831 }
832 }
833
834 return clblasSuccess;
835 }
836
837 clblasStatus
checkMemObjects(cl_mem A,cl_mem B,cl_mem C,bool checkC,ErrorCodeSet errA,ErrorCodeSet errB,ErrorCodeSet errC)838 checkMemObjects(
839 cl_mem A,
840 cl_mem B,
841 cl_mem C,
842 bool checkC,
843 ErrorCodeSet errA,
844 ErrorCodeSet errB,
845 ErrorCodeSet errC )
846 {
847 cl_mem_object_type mobjType = 0;
848
849 if (!clGetMemObjectInfo(A, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) &&
850 (mobjType != CL_MEM_OBJECT_BUFFER)) {
851 switch( errA )
852 {
853 case A_MAT_ERRSET:
854 return clblasInvalidMatA;
855 case B_MAT_ERRSET:
856 return clblasInvalidMatB;
857 case C_MAT_ERRSET:
858 return clblasInvalidMatC;
859 case X_VEC_ERRSET:
860 return clblasInvalidVecX;
861 case Y_VEC_ERRSET:
862 return clblasInvalidVecY;
863 default:
864 return clblasNotImplemented;
865 }
866 }
867
868 mobjType = 0;
869 if (!clGetMemObjectInfo(B, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) &&
870 (mobjType != CL_MEM_OBJECT_BUFFER)) {
871 switch( errB )
872 {
873 case A_MAT_ERRSET:
874 return clblasInvalidMatA;
875 case B_MAT_ERRSET:
876 return clblasInvalidMatB;
877 case C_MAT_ERRSET:
878 return clblasInvalidMatC;
879 case X_VEC_ERRSET:
880 return clblasInvalidVecX;
881 case Y_VEC_ERRSET:
882 return clblasInvalidVecY;
883 default:
884 return clblasNotImplemented;
885 }
886 }
887
888 mobjType = 0;
889 if (checkC && !clGetMemObjectInfo(C, CL_MEM_TYPE, sizeof(mobjType),
890 &mobjType, NULL) &&
891 (mobjType != CL_MEM_OBJECT_BUFFER)) {
892 switch( errC )
893 {
894 case A_MAT_ERRSET:
895 return clblasInvalidMatA;
896 case B_MAT_ERRSET:
897 return clblasInvalidMatB;
898 case C_MAT_ERRSET:
899 return clblasInvalidMatC;
900 case X_VEC_ERRSET:
901 return clblasInvalidVecX;
902 case Y_VEC_ERRSET:
903 return clblasInvalidVecY;
904 default:
905 return clblasNotImplemented;
906 }
907 }
908
909 return clblasSuccess;
910 }
911