1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * gemv generator
20 */
21
22 #include <string.h>
23 #include <stdio.h>
24 #include <assert.h>
25 #include <math.h>
26 #include <clblas_stddef.h>
27 #include <clBLAS.h>
28 #include <blas_mempat.h>
29 #include <clkern.h>
30 #include <clblas-internal.h>
31
32 #include "blas_kgen.h"
33 #include "xxmv_common.h"
34
/*
 * Solver-private data stored in CLBLASKernExtra::solverPriv.
 * 'staggered' is the stagger (skew) step in elements applied to the
 * k coordinate inside the generated kernel, or 0 when staggered
 * access is disabled (see fixupArgs()).
 */
typedef struct {
    size_t staggered;
} MAY_ALIAS extraData_t;
38
/*
 * printf-style template of the generated kernel declaration.
 * Conversions, in order: the two required work-group sizes, the BLAS
 * type prefix (s/d/c/z), the two size argument names (M/N, swapped
 * for the transposed case so that the first one is always the length
 * of Y), the element type for alpha/A/X, an optional beta declaration,
 * the element type for Y, and finally the optional offset and
 * increment declarations.
 */
static const char *gemvDecl =
    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
    "void __kernel\n"
    "%cgemv(\n"
    " uint %c,\n"
    " uint %c,\n"
    " const %s alpha,\n"
    " const __global %s *restrict A,\n"
    " const __global %s *restrict X,\n"
    "%s"
    " __global %s *Y,\n"
    " uint lda"
    "%s" // offset A, X and Y
    "%s"
    "%s)\n";
54
static CLBLASMpatExtra mpatExtra;

/* Forward declarations of the solver operations implemented below. */

static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

static void
assignKargs(KernelArg *args, const void *params, const void *extra);

static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra);

static SolverFlags
solverFlags(void);

static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs);

static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra);

static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check);

static int
subgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void * pArgs);

/* Solver operations table for this pattern; unused hooks are NULL. */
static SolverOps gemvSops = {
    generator,
    assignKargs,
    isFitToLDS,
    NULL,
    NULL,
    calcNrThreads,
    NULL,
    solverFlags,
    fixupArgs,
    subgGetDefaultDecomp,//getDefaultDecomposition
    subgCheckCalcDecomp, //get Decomp. list
    NULL,
    NULL
};
119
/*
 * Emit the kernel signature declaration into the generator context.
 *
 * The signature is assembled from the gemvDecl template. The M and N
 * argument names are swapped when A is transposed (sizeNames[tra]) so
 * that the first size argument always names the length of Y inside
 * the kernel; beta, offset, and increment arguments are declared only
 * when the corresponding KEXTRA flags require them.
 */
static void
declareGemvKernel(
    struct KgenContext *ctx,
    DataType dtype,
    const PGranularity *pgran,
    KernelExtraFlags kflags)
{
    char sizeNames[2] = {'M', 'N'};
    bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0);
    bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0);
    bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0);
    const char *incxDecl = incxOne ? "" : ",\n const int incx";
    const char *incyDecl = incyOne ? "" : ",\n const int incy";
    char offDecl[128];
    char betaDecl[128];
    char tmp[512];
    char fpref;
    bool tra = ((kflags & KEXTRA_TRANS_A) != 0);
    const char *typeName;

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    /* optional offset arguments, declared in A, X, Y order */
    offDecl[0] = '\0';
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        strcpy(offDecl, ",\n const uint offA");
    }
    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
        strcat(offDecl, ",\n const uint offX");
    }
    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
        strcat(offDecl, ",\n const uint offY");
    }

    /* beta is passed only when it is not known to be zero */
    if (beta0) {
        betaDecl[0] = '\0';
    }
    else {
        sprintf(betaDecl, " const %s beta,\n", typeName);
    }
    /*
     * NOTE(review): gemvDecl formats the work-group sizes with %lu;
     * this assumes the pgran->wgSize elements promote to unsigned
     * long — confirm against the PGranularity declaration.
     */
    sprintf(tmp, gemvDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            sizeNames[tra], sizeNames[1 - tra],
            typeName, typeName, typeName, betaDecl, typeName,
            offDecl, incxDecl, incyDecl);

    kgenDeclareFunction(ctx, tmp);
}
167
168 static void
setFetchHandler(TileMulOpts * mulOpts,const BlasGenSettings * gset,int handler (struct KgenContext * ctx,MatrixRole mrole,void * priv),TilePostFetchPrivate * priv)169 setFetchHandler(
170 TileMulOpts *mulOpts,
171 const BlasGenSettings *gset,
172 int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv),
173 TilePostFetchPrivate *priv)
174 {
175 int i, nrPrivs;
176 const char *regName = NULL;
177
178 nrPrivs = 1;
179 for (i = 0; i < nrPrivs; i++) {
180 priv[i].fetchNumA = 0;
181 priv[i].wholeA = 1;
182 priv[i].funcID = CLBLAS_GEMV;
183 priv[i].gset = gset;
184 priv[i].regName = regName;
185 mulOpts->postFetch = handler;
186 mulOpts->postFetchPriv = priv;
187 }
188 }
189
190 // global memory based kernel generator
191 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)192 generator(
193 char *buf,
194 size_t buflen,
195 const struct SubproblemDim *subdims,
196 const struct PGranularity *pgran,
197 void *extra)
198 {
199 struct KgenContext *ctx;
200 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
201 KernelExtraFlags kflags = kextra->flags;
202 size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
203 //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
204 bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
205 bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
206 char tmp[4096];
207 DataType dtype = kextra->dtype;
208 bool doubleBased = isDoubleBasedType(dtype);
209 BlasGenSettings gset;
210 TileMulOpts mulOpts;
211 KernelVarNames *vnames = &gset.varNames;
212 ssize_t ret;
213 TilePostFetchPrivate pfPriv;
214 unsigned int vecLen = kextra->vecLen;
215 const char *outTypeName;
216 const char *gid = "get_group_id(0)";
217 const char *lid = "get_local_id(0)";
218 const char *typeName;
219 size_t wgSize;
220 //unsigned int nStep = 32;
221 unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
222 unsigned int cLocal;
223 bool isComplex = isComplexType(dtype);
224 unsigned int nPlans;
225
226 typeName = dtypeBuiltinType(dtype);
227 memset(&gset, 0, sizeof(gset));
228 memset(&mulOpts, 0, sizeof(mulOpts));
229 ctx = createKgenContext(buf, buflen, true);
230 if (ctx == NULL) {
231 return -ENOMEM;
232 }
233
234 // at first, generate needed declarations
235 kgenDeclareUptrs(ctx, doubleBased);
236
237 // now, generate the kernel
238 declareGemvKernel(ctx, dtype, pgran, kflags);
239 ret = kgenBeginFuncBody(ctx);
240 kgenAddStmt(ctx, "// M always denotes length of Y "
241 "and N denotes length of X in the kernel\n");
242 /* 1D work space. Matrix is divided among wi, each calculates it's own
243 * part of vector y */
244
245 wgSize = (subdims[0].y / subdims[1].y) *
246 (subdims[0].bwidth / subdims[1].bwidth);
247 assert(pgran->wgSize[0] == wgSize);
248 assert(subdims[0].x == 1);
249 assert(subdims[1].x == 1);
250 cLocal = wgSize/bStep;
251
252 memcpy(gset.subdims, subdims, sizeof(gset.subdims));
253 gset.subdims[0].itemX = gset.subdims[0].x = 1;
254 gset.subdims[1].itemX = gset.subdims[1].x = 1;
255 gset.subdims[0].bwidth = gset.subdims[1].bwidth;
256
257 gset.pgran = pgran;
258 gset.kextra = kextra;
259 gset.flags = BGF_UPTRS;
260
261 initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
262 if (isComplex) {
263 gset.tileCY.vecLen = 1;
264 }
265 declareTileStorages(ctx, &gset);
266 genZeroTile(ctx, &gset.tileCY);
267 getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
268 nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;
269
270 sprintf(tmp, "__local %s localRes[%u][%u];\n",
271 outTypeName, pgran->wgSize[0], nPlans);
272 kgenAddStmt(ctx, tmp);
273 sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
274 gid, bStep, lid, bStep, subdims[1].y);
275 kgenAddStmt(ctx, tmp);
276 sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
277 lid, bStep, subdims[1].bwidth);
278 kgenAddStmt(ctx, tmp);
279
280 kgenAddBlankLine(ctx);
281
282 kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");
283
284 genIncPointers(ctx, kflags);
285 sprintf(tmp,
286 "const GPtr Ag = {(__global %s*)A};\n"
287 "const GPtr Xg = {(__global %s*)X};\n",
288 typeName, typeName);
289 kgenAddStmt(ctx, tmp);
290
291 kgenAddBlankLine(ctx);
292
293 if (tailN) {
294 sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
295 kgenAddStmt(ctx, tmp);
296 kgenAddStmt(ctx, "N -= Ntail;\n");
297 kgenAddBlankLine(ctx);
298 }
299
300 mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
301 if (tailM) {
302 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
303 }
304
305 vnames->A = "Ag";
306 vnames->B = "Xg";
307 vnames->coordA = "coordA";
308 vnames->coordB = ""; //should not be used for vector
309 vnames->k = "k";
310 vnames->lda = "lda";
311 vnames->sizeK = "N";
312 vnames->sizeM = "M";
313
314 mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
315 if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
316 mulOpts.flags |= TILEMUL_CONJA;
317 }
318 if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
319 mulOpts.flags |= TILEMUL_TRA;
320 }
321 if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
322 mulOpts.core = TILEMUL_MAD;
323 }
324 else {
325 mulOpts.core = TILEMUL_MULADD;
326 }
327 mulOpts.memA = CLMEM_GLOBAL_MEMORY;
328 mulOpts.memB = CLMEM_GLOBAL_MEMORY;
329
330 if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
331 gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
332 mulOpts.flags |= TILEMUL_BW_STRIDE;
333 }
334
335 sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
336 kgenBeginBranch(ctx, tmp);
337
338 if (staggered) {
339 vnames->k = "k1";
340 sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
341 kgenAddStmt(ctx, tmp);
342 }
343
344 genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
345 mulOpts.flags, kflags);
346
347 ret = tileMulGen(ctx, &gset, &mulOpts);
348 if (ret != 0) {
349 return ret;
350 }
351 vnames->k = "k";
352 kgenEndBranch(ctx, NULL); /* k loop */
353
354 if (tailN) {
355 /* Handle tail along vector X */
356 kgenAddStmt(ctx, "N += Ntail;\n");
357 kgenBeginBranch(ctx, "if (k < N)");
358
359 mulOpts.flags |= TILEMUL_SKEW_B;
360 genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
361 mulOpts.flags, kflags);
362 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
363 setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
364 ret = tileMulGen(ctx, &gset, &mulOpts);
365 if (ret != 0) {
366 return ret;
367 }
368 kgenEndBranch(ctx, NULL);
369 }
370
371 if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
372 gset.subdims[0].bwidth = subdims[1].bwidth;
373 mulOpts.flags &= ~TILEMUL_BW_STRIDE;
374 }
375
376 kgenEndBranch(ctx,NULL);
377
378 genStoreLocalResult(ctx, &gset.tileCY, lid);
379
380 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
381 kgenAddBlankLine(ctx);
382
383 sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
384 kgenBeginBranch(ctx, tmp);
385
386 genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);
387
388 /* write back the results */
389 /* y := alpha*A*x + beta*y */
390 setResultPos(ctx, kflags, vnames->coordA);
391
392 updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);
393
394 kgenEndBranch(ctx, NULL);
395
396 kgenEndFuncBody(ctx);
397 ret = kgenAddBlankLine(ctx);
398
399 if (!ret) {
400 ret = (ssize_t)kgenSourceSize(ctx) + 1;
401 }
402
403 destroyKgenContext(ctx);
404 return (ret < 0) ? -EOVERFLOW : ret;
405 }
406
/*
 * Fill the kernel argument array in the exact order declared by
 * gemvDecl: M, N, alpha, A, X, [beta], Y, lda, then the optional
 * offsets (offA, offX, offY) and increments (incx, incy), each
 * present only when the corresponding KEXTRA flag demands it.
 * The index 'i' tracks the position of the next optional argument.
 */
static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
    KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
    cl_int inc;
    int i;

    initSizeKarg(&args[0], blasArgs->M);
    initSizeKarg(&args[1], blasArgs->N);
    assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
    INIT_KARG(&args[3], blasArgs->A);
    INIT_KARG(&args[4], blasArgs->B);   /* vector X (its inc is ldb.vector) */
    i = 5;
    if (!(kflags & KEXTRA_BETA_ZERO)) {
        assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype);
    }
    INIT_KARG(&args[i], blasArgs->C);   /* vector Y (its inc is ldc.vector) */
    i++;
    initSizeKarg(&args[i++], blasArgs->lda.matrix);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offA);
    }
    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offBX);
    }
    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offCY);
    }
    if (!(kflags & KEXTRA_INCX_ONE)) {
        inc = blasArgs->ldb.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
    if (!(kflags & KEXTRA_INCY_ONE)) {
        inc = blasArgs->ldc.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
}
447
/*
 * Pre-launch argument fixup: decide the staggered-access step and fold
 * the sub-problem origin (offsetM/offsetN) into the buffer offsets
 * offA and offCY so the kernel itself never sees sub-problem origins.
 */
static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;

    const size_t nChans = 8; // !!!DEVICE DEPENDED!!!
    const size_t wideChans = 64; // !!!DEVICE DEPENDED!!!
    const size_t sizeType[] = {1,2,2,4};

    size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype];
    size_t off = kargs->K % sizeBlock;
    extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv;
    /* NOTE: both branches currently set 'staggered' to 0 — the feature
     * is deliberately disabled (see FIXME below); the if/else split is
     * kept so re-enabling it only needs the commented value restored. */
    if (off == 0 && !isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        /*
         * FIXME: staggered access is not enabled now since for some reason
         * it leads to slowdown at small sizes
         */
        extraData->staggered = 0; // wideChans / sizeType[kargs->dtype];
    }
    else {
        extraData->staggered = 0;
    }

    (void)subdims;

    /* fold the sub-problem origin into the A and Y offsets */
    off = (kargs->offsetM) ? kargs->offsetM : kargs->offsetN;
    if (off) {
        if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
            kargs->offA += off;
        }
        else {
            kargs->offA += off * kargs->lda.matrix;
        }
        if (kargs->ldc.vector < 0) {
            // K store the original height of the matrix A
            kargs->offCY += (kargs->K - off) * abs(kargs->ldc.vector);
        }
        else {
            kargs->offCY += off * kargs->ldc.vector;
        }
    }

    kargs->offsetM = kargs->offsetN = 0;

}
494
495 static int
subgGetDefaultDecomp(PGranularity * pgran,SubproblemDim * subdims,unsigned int subdimsNum,void * pArgs)496 subgGetDefaultDecomp(
497 PGranularity *pgran,
498 SubproblemDim *subdims,
499 unsigned int subdimsNum,
500 void * pArgs)
501 {
502 (void)subdimsNum;
503 DUMMY_ARG_USAGE(pArgs);
504
505 pgran->wgDim = 1;
506 pgran->wgSize[0] = 64;
507 pgran->wgSize[1] = 1;
508
509 subdims[1].bwidth = 4;
510 subdims[1].itemX = subdims[1].x = 1;
511 subdims[1].itemY = subdims[1].y = 4;
512
513 subdims[0].bwidth = 8 * subdims[1].bwidth;
514 subdims[0].itemX = subdims[0].x = 1;
515 subdims[0].itemY = subdims[0].y = 8 * subdims[1].y;
516
517 return 0;
518 }
519
520 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)521 isFitToLDS(
522 SubproblemDim *dim,
523 DataType dtype,
524 cl_ulong ldsSize,
525 const void *kernelArgs)
526 {
527 (void)kernelArgs;
528
529 if (1) {
530 cl_ulong size;
531
532 /*
533 * One needs y1 * wgSize size of local memory in elements, but
534 * y1 is not calculated yet. The expression below produces
535 * reliable a larger value. It is larger in dims[1].bwidth times.
536 */
537 size = dim[0].y * dim[0].bwidth * dtypeSize(dtype);
538
539 return (size <= ldsSize);
540 }
541 return true;
542 }
543
544 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * extra)545 calcNrThreads(
546 size_t threads[2],
547 const SubproblemDim *subdims,
548 const PGranularity *pgran,
549 const void *args,
550 const void *extra)
551 {
552 size_t yLen; /* Length of "Y" vector */
553 const CLBlasKargs *kargs = args;
554 unsigned int subgr = pgran->wgSize[0] / (subdims[0].bwidth / subdims[1].bwidth);
555
556 (void)subdims;
557 (void)extra;
558
559 yLen = kargs->transA == clblasNoTrans ? kargs->M : kargs->N;
560
561 if (yLen == 0) {
562 yLen = 1;
563 //launch one group to avoid CL_INVALID_WORK_GROUP_SIZE error
564 }
565
566 //each work item handles y1 lines
567 threads[0] = divRoundUp(yLen, subdims[1].y) * subgr;
568 threads[0] = roundUp(threads[0], pgran->wgSize[0]);
569 threads[1] = 0;
570 }
571
572 static SolverFlags
solverFlags(void)573 solverFlags(void)
574 {
575 return (SF_WSPACE_1D);
576 }
577
/*
 * Validate (and, when check != PGRAN_CHECK, calculate) the work-group
 * granularity for a candidate subgroup decomposition. Returns false
 * for decompositions this generator cannot handle; the numeric limits
 * below reflect the tuned search space for this kernel.
 */
static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    // scale limits down for wider element types (doubles/complex)
    unsigned int divider1 = dtypeSize(dtype)/sizeof(cl_float);
    unsigned int divider0 = 2-!isComplexType(dtype);
    //EINVAL
    if( (subdimsNum<2)||
        (NULL==pgran)||
        (NULL==subdims) ){

        return false;
    }

    // all dimensions must be nonzero
    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){

        return false;
    }

    // the item tile must coincide with the subgroup tile
    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){

        return false;
    }

    // the group block must consist of integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){

        return false;
    }

    //check fitting of bw to common vector sizes
    if( isComplexType(dtype) ){

        if( 2*subdims[1].bwidth > 32 ){

            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 / divider1 ||
        subdims[1].x > 1 ||
        subdims[1].y > 16 / divider1 ){

        return false;
    }

    if( subdims[0].bwidth > 256 / divider0 ||
        subdims[0].x > 1 ||
        subdims[0].y > 256 / divider0 ){

        return false;
    }

    // the decomposition must employ exactly 64 work items
    if (64 != (subdims[0].y / subdims[1].y) *
             (subdims[0].bwidth / subdims[1].bwidth)) {
        return false;
    }

    // passed PGranularity should be checked
    if( PGRAN_CHECK == check ){
        if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){
            return false;
        }
    }
    // PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[1] = 1;
        pgran->wgSize[0] = 64;

        //subdims[0].bwidth = (pgran->wgSize[0] * subdims[1].bwidth) /
        //    (subdims[0].y / subdims[1].y);
    }
    /*Debug out for Tune*/

    return true;
}
668
669 //-----------------------------------------------------------------------------
670
671 void
initGemvPattern(MemoryPattern * mempat)672 initGemvPattern(MemoryPattern *mempat)
673 {
674 mempat->name = "Cached global memory based block gemv";
675 mempat->nrLevels = 2;
676 mempat->cuLevel = 0;
677 mempat->thLevel = 1;
678 mempat->sops = &gemvSops;
679
680 mpatExtra.aMset = CLMEM_LEVEL_L1;
681 mpatExtra.bMset = CLMEM_LEVEL_L1;
682 mpatExtra.mobjA = CLMEM_BUFFER;
683 mpatExtra.mobjB = CLMEM_BUFFER;
684 mempat->extra = &mpatExtra;
685 }
686