/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/


/*
 * gemv generator
 */

#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <clblas_stddef.h>
#include <clBLAS.h>
#include <blas_mempat.h>
#include <clkern.h>
#include <clblas-internal.h>

#include "blas_kgen.h"
#include "xxmv_common.h"

typedef struct {
    size_t staggered;
} MAY_ALIAS extraData_t;

static const char *gemvDecl =
    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
    "void __kernel\n"
    "%cgemv(\n"
    "    uint %c,\n"
    "    uint %c,\n"
    "    const %s alpha,\n"
    "    const __global %s *restrict A,\n"
    "    const __global %s *restrict X,\n"
    "%s"
    "    __global %s *Y,\n"
    "    uint lda"
    "%s"    // offset A, X and Y
    "%s"
    "%s)\n";

static CLBLASMpatExtra mpatExtra;

static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra);

static void
assignKargs(KernelArg *args, const void *params, const void *extra);

static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra);

static SolverFlags
solverFlags(void);

static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs);

static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra);

static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check);

static int
subgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs);

static SolverOps gemvSops = {
    generator,
    assignKargs,
    isFitToLDS,
    NULL,
    NULL,
    calcNrThreads,
    NULL,
    solverFlags,
    fixupArgs,
    subgGetDefaultDecomp, // getDefaultDecomposition
    subgCheckCalcDecomp,  // get decomposition list
    NULL,
    NULL
};

static void
declareGemvKernel(
    struct KgenContext *ctx,
    DataType dtype,
    const PGranularity *pgran,
    KernelExtraFlags kflags)
{
    char sizeNames[2] = {'M', 'N'};
    bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0);
    bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0);
    bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0);
    const char *incxDecl = incxOne ? "" : ",\n    const int incx";
    const char *incyDecl = incyOne ? "" : ",\n    const int incy";
    char offDecl[128];
    char betaDecl[128];
    char tmp[512];
    char fpref;
    bool tra = ((kflags & KEXTRA_TRANS_A) != 0);
    const char *typeName;

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    offDecl[0] = '\0';
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        strcpy(offDecl, ",\n    const uint offA");
    }
    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
        strcat(offDecl, ",\n    const uint offX");
    }
    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
        strcat(offDecl, ",\n    const uint offY");
    }

    if (beta0) {
        betaDecl[0] = '\0';
    }
    else {
        sprintf(betaDecl, "    const %s beta,\n", typeName);
    }
    sprintf(tmp, gemvDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            sizeNames[tra], sizeNames[1 - tra],
            typeName, typeName, typeName, betaDecl, typeName,
            offDecl, incxDecl, incyDecl);

    kgenDeclareFunction(ctx, tmp);
}

static void
setFetchHandler(
    TileMulOpts *mulOpts,
    const BlasGenSettings *gset,
    int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv),
    TilePostFetchPrivate *priv)
{
    int i, nrPrivs;
    const char *regName = NULL;

    nrPrivs = 1;
    for (i = 0; i < nrPrivs; i++) {
        priv[i].fetchNumA = 0;
        priv[i].wholeA = 1;
        priv[i].funcID = CLBLAS_GEMV;
        priv[i].gset = gset;
        priv[i].regName = regName;
        mulOpts->postFetch = handler;
        mulOpts->postFetchPriv = priv;
    }
}

// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    // yes, KEXTRA_TAILS_K: it is set when N % bwidth != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // first, generate the needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");
    /* 1D work space. The matrix is divided among the work items;
     * each calculates its own part of vector y */

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);
    cLocal = wgSize / bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
                 gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
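
    /*
     * Illustration of the indexing just emitted, assuming the default
     * decomposition (wgSize = 64, bStep = 8, subdims[1].y = 4,
     * subdims[1].bwidth = 4): each group covers bStep * 4 = 32 rows of Y;
     * work item "lid" starts at coordA = (gid * 8 + lid % 8) * 4, while
     * the cLocal = 64 / 8 = 8 items sharing those rows start at different
     * offsets k0 = (lid / 8) * 4 along the dot product and stride by
     * cLocal * bwidth = 32 in the k loop below.
     */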

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx, "if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; // should not be used for a vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB |
                     TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }
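
    /*
     * Illustrative arithmetic: with the default 64x1 work group and
     * subdims[1].bwidth = 4, the row-major path above widens the level-0
     * bandwidth to 64 * 4 = 256 elements for the TILEMUL_BW_STRIDE
     * stepping, presumably so that the whole group advances along k
     * in one stride.
     */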

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)",
            cLocal * subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",
                staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        // do not leak the generator context on error paths
        destroyKgenContext(ctx);
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailN) {
        /* Handle the tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K | TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            destroyKgenContext(ctx);
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx, NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}

static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
    KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
    cl_int inc;
    int i;

    initSizeKarg(&args[0], blasArgs->M);
    initSizeKarg(&args[1], blasArgs->N);
    assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
    INIT_KARG(&args[3], blasArgs->A);
    INIT_KARG(&args[4], blasArgs->B);
    i = 5;
    if (!(kflags & KEXTRA_BETA_ZERO)) {
        assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype);
    }
    INIT_KARG(&args[i], blasArgs->C);
    i++;
    initSizeKarg(&args[i++], blasArgs->lda.matrix);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offA);
    }
    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offBX);
    }
    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offCY);
    }
    if (!(kflags & KEXTRA_INCX_ONE)) {
        inc = blasArgs->ldb.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
    if (!(kflags & KEXTRA_INCY_ONE)) {
        inc = blasArgs->ldc.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
}
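
/*
 * For reference (a sketch, not exhaustive): with non-zero beta, zero
 * offsets, and unit increments, the layout set above matches the
 * declaration emitted by declareGemvKernel():
 *
 *   args[0] = M, args[1] = N, args[2] = alpha, args[3] = A, args[4] = X,
 *   args[5] = beta, args[6] = Y, args[7] = lda
 *
 * The optional offsets and increments, when present, follow lda in
 * declaration order.
 */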

static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;

    const size_t nChans = 8; // !!!DEVICE DEPENDENT!!!
    const size_t wideChans = 64; // !!!DEVICE DEPENDENT!!!
    const size_t sizeType[] = {1, 2, 2, 4};

    size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype];
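    /*
     * Illustrative arithmetic (the channel figures above are assumed
     * device-dependent defaults): for single precision, sizeType is 1,
     * so sizeBlock = 64 * 8 / 1 = 512 elements per sweep over the
     * assumed 8 wide memory channels.
     */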
    size_t off = kargs->K % sizeBlock;
    extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv;
    if (off == 0 && !isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        /*
         * FIXME: staggered access is not enabled for now since, for some
         *        reason, it leads to a slowdown at small sizes
         */
        extraData->staggered = 0; // wideChans / sizeType[kargs->dtype];
    }
    else {
        extraData->staggered = 0;
    }

    (void)subdims;

    off = (kargs->offsetM) ? kargs->offsetM : kargs->offsetN;
    if (off) {
        if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
            kargs->offA += off;
        }
        else {
            kargs->offA += off * kargs->lda.matrix;
        }
        if (kargs->ldc.vector < 0) {
            // K stores the original height of the matrix A
            kargs->offCY += (kargs->K - off) * abs(kargs->ldc.vector);
        }
        else {
            kargs->offCY += off * kargs->ldc.vector;
        }
    }

    kargs->offsetM = kargs->offsetN = 0;
}

static int
subgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    (void)subdimsNum;
    DUMMY_ARG_USAGE(pArgs);

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    subdims[1].bwidth = 4;
    subdims[1].itemX = subdims[1].x = 1;
    subdims[1].itemY = subdims[1].y = 4;

    subdims[0].bwidth = 8 * subdims[1].bwidth;
    subdims[0].itemX = subdims[0].x = 1;
    subdims[0].itemY = subdims[0].y = 8 * subdims[1].y;

    return 0;
}
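
/*
 * Sanity check of the defaults above: subdims[0] = {bwidth 32, y 32},
 * subdims[1] = {bwidth 4, y 4}, so the implied work group size is
 * (32 / 4) * (32 / 4) = 8 * 8 = 64, matching pgran->wgSize[0] and the
 * assertion in generator().
 */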

static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    cl_ulong size;

    (void)kernelArgs;

    /*
     * One needs y1 * wgSize elements of local memory, but y1 is not
     * calculated yet. The expression below reliably produces a larger
     * value; it overestimates by a factor of dim[1].bwidth.
     */
    size = dim[0].y * dim[0].bwidth * dtypeSize(dtype);

    return (size <= ldsSize);
}
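
/*
 * Worked example with the default decomposition and single precision,
 * assuming a device with 32 KB of LDS: size = 32 * 32 * 4 = 4096 bytes,
 * well within the limit even with the bwidth-times overestimate.
 */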

static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra)
{
    size_t yLen;     /* length of the "Y" vector */
    const CLBlasKargs *kargs = args;
    unsigned int subgr = pgran->wgSize[0] /
                         (subdims[0].bwidth / subdims[1].bwidth);

    (void)extra;

    yLen = kargs->transA == clblasNoTrans ? kargs->M : kargs->N;

    if (yLen == 0) {
        // launch one group to avoid a CL_INVALID_WORK_GROUP_SIZE error
        yLen = 1;
    }

    // each work item handles y1 lines
    threads[0] = divRoundUp(yLen, subdims[1].y) * subgr;
    threads[0] = roundUp(threads[0], pgran->wgSize[0]);
    threads[1] = 0;
}
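
/*
 * Worked example under the default decomposition: subgr = 64 / (32 / 4) = 8.
 * For a non-transposed call with M = 1000: threads[0] =
 * divRoundUp(1000, 4) * 8 = 2000, rounded up to a multiple of the 64-wide
 * work group gives 2048 work items in total.
 */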

static SolverFlags
solverFlags(void)
{
    return (SF_WSPACE_1D);
}

static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    unsigned int divider1 = dtypeSize(dtype) / sizeof(cl_float);
    unsigned int divider0 = 2 - !isComplexType(dtype);

    // invalid arguments (EINVAL)
    if( (subdimsNum < 2) ||
        (NULL == pgran) ||
        (NULL == subdims) ){

        return false;
    }

    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){

        return false;
    }

    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){

        return false;
    }

    // the group block must consist of an integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){

        return false;
    }

    // check that bwidth fits the common vector sizes
    if( isComplexType(dtype) ){

        if( 2 * subdims[1].bwidth > 32 ){

            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 / divider1 ||
        subdims[1].x > 1 ||
        subdims[1].y > 16 / divider1 ){

        return false;
    }

    if( subdims[0].bwidth > 256 / divider0 ||
        subdims[0].x > 1 ||
        subdims[0].y > 256 / divider0 ){

        return false;
    }

    if (64 != (subdims[0].y / subdims[1].y) *
        (subdims[0].bwidth / subdims[1].bwidth)) {
        return false;
    }

    // the passed PGranularity should be checked
    if( PGRAN_CHECK == check ){
        if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){
            return false;
        }
    }
    // the PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[1] = 1;
        pgran->wgSize[0] = 64;

        //subdims[0].bwidth = (pgran->wgSize[0] * subdims[1].bwidth) /
        //    (subdims[0].y / subdims[1].y);
    }

    return true;
}

//-----------------------------------------------------------------------------

void
initGemvPattern(MemoryPattern *mempat)
{
    mempat->name = "Cached global memory based block gemv";
    mempat->nrLevels = 2;
    mempat->cuLevel = 0;
    mempat->thLevel = 1;
    mempat->sops = &gemvSops;

    mpatExtra.aMset = CLMEM_LEVEL_L1;
    mpatExtra.bMset = CLMEM_LEVEL_L1;
    mpatExtra.mobjA = CLMEM_BUFFER;
    mpatExtra.mobjB = CLMEM_BUFFER;
    mempat->extra = &mpatExtra;
}