1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #ifdef __APPLE__
18 #include <OpenCL/cl.h>
19 #else
20 #include <CL/cl.h>
21 #endif
22 #include <string.h>
23 #include <stdlib.h>
24 #include <assert.h>
25 #include <math.h>
26 #include <stdio.h>
27 #include <getopt.h>
28 #include <kerngen.h>
29 #include <blas_kgen.h>
30 #include <clblas_stddef.h>
31 
32 #define JUST_MULTIPLICATION 0
33 
34 #if JUST_MULTIPLICATION
35 enum {
36     ITEM_WORK_M = 1,
37     ITEM_WORK_N = 1,
38     ITEM_BLOCKS_K = 1,
39 };
40 #else
41 enum {
42     ITEM_WORK_M = 4,
43     ITEM_WORK_N = 4,
44     ITEM_BLOCKS_K = 3,
45     RAND_BOUND = 10
46 };
47 #endif
48 
49 const char *kernelName = "tilemul_test";
50 
51 
52 // float types based unified pointer
53 typedef union FPtr {
54   void *v;
55   cl_float *f;
56   cl_double *d;
57   cl_float2 *f2;
58   cl_double2 *d2;
59 } FPtr;
60 
61 // float type based unified data type
62 typedef union FType {
63     unsigned char u[sizeof(cl_double)];
64     cl_float f;
65     cl_float2 f2;
66     cl_double d;
67     cl_double2 d2;
68 } FType;
69 
70 static void
printUsage(const char * programName,int exitCode)71 printUsage(const char *programName, int exitCode)
72 {
73     printf( "USAGE: %s [options] <M N K>\n"
74             "  --help, -h                Print this help message.\n"
75             "  --device, -d <device>     OpenCL device used. <device> can "
76             "be \"gpu\" or \"cpu\". Default is \"gpu\".\n"
77             "  --type, -t <type>         Type can be s, d, c or z. Default "
78             "is s.\n"
79             "  --fetch, -f <vector size> Size of used fetch vectors, in used "
80             "types. Default is 1.\n"
81             "  --local, -l <matrix>      If matrix is local or global. Matrix "
82             "can be A or B. By default, both are global.\n"
83             "  --verbose, -v             Turn on verbose mode.\n"
84             "  --a, -a <order>\n"
85             "  --b, -b <order>\n         Set order for tiles a and b fetching. "
86             "Order can be are \"r\" for row major and \"c\" for "
87             "column major. Default values are \"r\" for A and \"c\" for B.\n"
88             "  --skew, -s <skew_value>   Set skews for tiles along M, N, and K "
89             "directions. skew_value can be \"a\" for tile A skew along M, \"b\""
90             " for tile B skew along N and \"k\" for both tiles skew along K. "
91             "There is no skews by default.\n"
92             "  -g, --globalcycling <global_cycling_value>\n"
93             "                            Set global cycling for tiles along M, "
94             "N and K directions. global_cycling_value can be \"a\" for tile A "
95             "global cycling along M, \"b\" for tile B global cycling along N "
96             "and \"k\" for both tiles global cycling along K. There is no "
97             "global cycling enabled by default.\n"
98             "  --iter, -i <num>          Number of iterations.\n"
99             "  --core, -c <mulcore>      Multiplier core. <mulcore> can "
100             "be \"muladd\", \"mad\" or \"dot\". Default is \"mad\".\n"
101             "  --old, -o                 Use old tilemul generator interface "
102             "with one generator function call for both fetching and "
103             "multiplication. Separate generators functions are used by "
104             "default.\n"
105             "  M N K                     Size of block.\n",
106            programName);
107     exit(exitCode);
108 }
109 
110 void
genFillTileWithNAN(struct KgenContext * ctx,const Tile * tile)111 genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile)
112 {
113     char tmp[1024];
114     Kstring elem;
115     unsigned int incRows, incCols;
116     unsigned int i, j, v;
117 
118     if (!tile->trans) {
119         incRows = 1;
120         v = incCols = umin(tile->vecLen, tile->nrCols);
121     }
122     else {
123         v = incRows = umin(tile->vecLen, tile->nrRows);
124         incCols = 1;
125     }
126 
127     for (i = 0; i < tile->nrRows; i += incRows) {
128         for (j = 0; j < tile->nrCols; j += incCols) {
129             sprintfTileElement(&elem, tile, i, j, v);
130             sprintf(tmp, "%s = NAN;\n", elem.buf);
131             kgenAddStmt(ctx, tmp);
132         }
133     }
134 
135     kgenAddBlankLine(ctx);
136 }
137 
138 void
addTestPrefix(struct KgenContext * ctx,bool isDouble)139 addTestPrefix(struct KgenContext *ctx, bool isDouble)
140 {
141     kgenDeclareUptrs(ctx, isDouble);
142 }
143 
checkRet(int ret,const char * genName)144 static void checkRet(int ret, const char *genName)
145 {
146     if (ret != 0) {
147         printf("%s generator failed: %s\n", genName, strerror(-ret));
148         exit(EXIT_FAILURE);
149     }
150 }
151 
152 void
genTest(struct KgenContext * ctx,BlasGenSettings * gset,TileMulOpts * mulOpts,bool separateFetch)153 genTest(
154     struct KgenContext *ctx,
155     BlasGenSettings *gset,
156     TileMulOpts *mulOpts,
157     bool separateFetch)
158 {
159     char s[1024];
160     Kstring kstr;
161     char *tName, tVect[64], *ptrName;
162     KernelVarNames *vnames = &gset->varNames;
163     DataType dtype = gset->kextra->dtype;
164     const SubproblemDim *subdims = gset->subdims;
165     unsigned int vecLen = gset->kextra->vecLen;
166     size_t m, n, k;
167     unsigned int i, j;
168     bool tra, trb, localA, localB, vecCoords;
169     int ret;
170     TileMulFlags flags = mulOpts->flags;
171     FetchOpts fetchOpts;
172 
173     m = gset->subdims[1].y;
174     n = gset->subdims[1].x;
175     k = gset->subdims[1].bwidth;
176 
177     tra = ((flags & TILEMUL_TRA) != 0);
178     trb = ((flags & TILEMUL_TRB) != 0);
179     localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
180     localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
181 
182     vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0);
183 
184     tVect[0] = '\0';
185 
186     if (vecCoords && vecLen != 1) {
187         sprintf(tVect, "%u", vecLen);
188     }
189 
190     switch (dtype) {
191     case TYPE_FLOAT:
192         tName = "float";
193         ptrName = "f";
194         break;
195     case TYPE_DOUBLE:
196         tName = "double";
197         ptrName = "d";
198         break;
199     case TYPE_COMPLEX_FLOAT:
200         tName = "float2";
201         ptrName = "f2v";
202         break;
203     case TYPE_COMPLEX_DOUBLE:
204         tName = "double2";
205         ptrName = "d2v";
206         break;
207     default:
208         return;
209     }
210 
211     if (vecCoords) {
212         //Do not use GPtrs in fetching
213         vnames->A = "A";
214         vnames->B = "B";
215     }
216     else {
217         vnames->A = localA ? "LAptr" : "((GPtr)A)";
218         vnames->B = localB ? "LBptr" : "((GPtr)B)";
219     }
220     if (!localA) {
221         vnames->lda = "lda";
222 
223     }
224     if (!localB) {
225         vnames->ldb = "ldb";
226     }
227     vnames->sizeM = "M";
228     vnames->sizeN = "N";
229     vnames->sizeK = "K";
230     vnames->skewA = "skewA";
231     vnames->skewB = "skewB";
232     vnames->skewK = "skewK";
233     vnames->coordA = "workItemM";
234     vnames->coordB = "workItemN";
235     vnames->k = "k";
236 
237     kgenAddBlankLine(ctx);
238     sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n",
239             ITEM_WORK_M, ITEM_WORK_N);
240     kgenAddStmt(ctx, s);
241     kgenAddStmt(ctx, "__kernel void\n");
242     sprintf(s, "%s(\n", kernelName);
243     kgenAddStmt(ctx, s);
244     sprintf(s,"    %s alpha,\n", tName);
245     kgenAddStmt(ctx, s);
246     sprintf(s,"    __global %s%s *A,\n", tName, tVect);
247     kgenAddStmt(ctx, s);
248     sprintf(s,"    __global %s%s *B,\n", tName, tVect);
249     kgenAddStmt(ctx, s);
250     kgenAddStmt(ctx, "    uint M,\n"
251                      "    uint N,\n"
252                      "    uint K,\n");
253     sprintf(s,
254             "    __global %s *C,\n"
255             "    const uint iter)\n", tName);
256     kgenAddStmt(ctx, s);
257     kgenBeginFuncBody(ctx);
258     sprintf(s, "uint workItemM = %lu * get_global_id(0);\n"
259                "uint workItemN = %lu * get_global_id(1);\n",
260             m, n);
261     kgenAddStmt(ctx, s);
262     if ((flags & TILEMUL_SKEW_A) != 0) {
263         kgenAddStmt(ctx, "uint skewA = 0u;\n");
264     }
265     if ((flags & TILEMUL_SKEW_B) != 0) {
266         kgenAddStmt(ctx, "uint skewB = 0u;\n");
267     }
268     if ((flags & TILEMUL_SKEW_K) != 0) {
269         kgenAddStmt(ctx, "uint skewK = 0u;\n");
270     }
271 
272     if (localA) {
273         sprintf(s, "__local %s LA[%lu];\n",
274                 tName, subdims[0].bwidth * subdims[0].y);
275         kgenAddStmt(ctx, s);
276     }
277     else { //global A
278         sprintf(s, "uint lda = %s;\n", tra ? "M" : "K");
279         kgenAddStmt(ctx, s);
280     }
281     if (localB) {
282         sprintf(s, "__local %s LB[%lu];\n",
283                 tName, subdims[0].bwidth * subdims[0].x);
284         kgenAddStmt(ctx, s);
285     }
286     else { //global B
287         sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N");
288         kgenAddStmt(ctx, s);
289     }
290 
291     initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY);
292     declareTileStorages(ctx, gset);
293 
294     if (vecCoords) {
295         size_t ha, hb;
296         char *str;
297 
298         ha = tra ? k : m;
299         hb = trb ? n : k;
300 
301         if (ha > 1) {
302             str = s;
303             str += sprintf(str, "uint%lu ca = {0", ha);
304             for (i = 1; i < ha; i++) {
305                 str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen);
306             }
307             str += sprintf(str, "};\n");
308             kgenAddStmt(ctx, s);
309         }
310         else {
311             kgenAddStmt(ctx, "uint ca = 0;\n");
312         }
313         vnames->vectCoordA = "ca";
314 
315         if (hb > 1) {
316             str = s;
317             str += sprintf(str, "uint%lu cb = {0", hb);
318             for (i = 1; i < hb; i++) {
319                 str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen);
320             }
321             str += sprintf(str, "};\n");
322             kgenAddStmt(ctx, s);
323         }
324         else {
325             kgenAddStmt(ctx, "uint cb = 0;\n");
326         }
327         vnames->vectCoordB = "cb";
328 
329 //        uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3};
330 //        uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3};
331     }
332 
333     kgenAddBlankLine(ctx);
334 
335     sprintf(s, "for (int it = 0; it < iter; it++)");
336     kgenBeginBranch(ctx, s);
337 
338     if (!(localA && localB)) {
339         kgenAddStmt(ctx, "uint k = 0;\n");
340     }
341 
342     genZeroTile(ctx, &gset->tileCY);
343 
344     if (vecCoords) {
345         char *coordsA[2] = {"workItemM", "k"};
346         char *coordsB[2] = {"k", "workItemN"};
347         sprintf(s, "A += %s * (lda / %u) + %s / %u;\n",
348                 coordsA[tra], vecLen, coordsA[1 - tra], vecLen);
349         kgenAddStmt(ctx, s);
350         sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n",
351                 coordsB[trb], vecLen, coordsB[1 - trb], vecLen);
352         kgenAddStmt(ctx, s);
353     }
354 
355     sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth);
356     kgenBeginBranch(ctx, s);
357 
358     /* Copy data to local memory. We know that the size of matrix is the same
359      * that the size of one block and use that.
360      */
361     if (localA) {
362         sprintf(s,
363                 "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n"
364                 "wait_group_events(1, &evA);\n"
365                 "barrier(CLK_LOCAL_MEM_FENCE);\n",
366                 subdims[0].y * subdims[0].bwidth);
367         kgenAddStmt(ctx, s);
368         kgenAddStmt(ctx, "LPtr LAptr;\n");
369         if (tra) {
370             sprintf(s,
371                     "LAptr.%s = LA + workItemM;\n", ptrName);
372         }
373         else {
374             sprintf(s,
375                     "LAptr.%s = LA + workItemM * %lu;\n",
376                     ptrName, subdims[0].bwidth);
377         }
378         kgenAddStmt(ctx, s);
379     }
380     if (localB) {
381         sprintf(s,
382                 "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n"
383                 "wait_group_events(1, &evB);\n"
384                 "barrier(CLK_LOCAL_MEM_FENCE);\n",
385                 subdims[0].x * subdims[0].bwidth);
386         kgenAddStmt(ctx, s);
387         kgenAddStmt(ctx, "LPtr LBptr;\n");
388         if (trb) {
389             sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n",
390                     ptrName, subdims[0].bwidth);
391         }
392         else {
393             sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName);
394         }
395         kgenAddStmt(ctx, s);
396     }
397 
398     if (!separateFetch) {
399         ret = tileMulGen(ctx, gset, mulOpts);
400         checkRet(ret, "Multiplier");
401     }
402     else {
403         Tile *tileA = &gset->tileA;
404         Tile *tileB = &gset->tileBX;
405 
406         memset(&fetchOpts, 0, sizeof(fetchOpts));
407         if (localA) {
408             fetchOpts.memA = CLMEM_LOCAL_MEMORY;
409         }
410         if (localB) {
411             fetchOpts.memB = CLMEM_LOCAL_MEMORY;
412         }
413 
414         genFillTileWithNAN(ctx, tileA);
415         genFillTileWithNAN(ctx, tileB);
416 
417         if (subdims[0].bwidth != subdims[1].bwidth) {
418             sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
419                     subdims[0].bwidth, k);
420             kgenBeginBranch(ctx, s);
421         }
422 
423 #if JUST_MULTIPLICATION
424         for (i = 0; i < tileA->nrRows; i++) {
425             for(j = 0; j < tileA->nrCols; j++) {
426                 sprintfTileElement(&kstr, tileA, i, j, 1);
427                 sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j);
428                 kgenAddStmt(ctx, s);
429             }
430         }
431 
432         for (i = 0; i < tileB->nrRows; i++) {
433             for(j = 0; j < tileB->nrCols; j++) {
434                 sprintfTileElement(&kstr, tileB, i, j, 1);
435                 sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j);
436                 kgenAddStmt(ctx, s);
437             }
438         }
439 #else
440         fetchOpts.mrole = MATRIX_B;
441         fetchOpts.lineOffset = 0;
442         fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows;
443         ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
444         checkRet(ret, "Fetching tile b");
445 
446         fetchOpts.mrole = MATRIX_A;
447         fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows;
448         kgenAddBlankLine(ctx);
449         fetchOpts.lineOffset = 0;
450         ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
451         checkRet(ret, "Fetching tile a");
452 #endif
453         ret = genMulTiles(ctx, gset, mulOpts);
454         checkRet(ret, "Multiplier");
455 #if ! JUST_MULTIPLICATION
456         sprintf(s, "k += %lu;\n", k);
457         kgenAddStmt(ctx, s);
458 #endif
459         if (subdims[0].bwidth != subdims[1].bwidth) {
460             kgenEndBranch(ctx, NULL);
461         }
462     }
463     kgenEndBranch(ctx, NULL); // K loop
464     kgenEndBranch(ctx, NULL); // iterations loop
465 
466     kgenAddBlankLine(ctx);
467 
468     for (i = 0; i < m; i++) {
469         for (j = 0; j < n; j++) {
470             sprintfTileElement(&kstr, &gset->tileCY, i, j, 1);
471                 sprintf(s,
472                         "((GPtr)C).%s"
473                     "[(%d + workItemM) * N  + %d + workItemN] = %s;\n",
474                     ptrName, i, j, kstr.buf);
475                 kgenAddStmt(ctx, s);
476             }
477                 }
478 
479     kgenEndFuncBody(ctx);
480 }
481 
482 cl_int
run(const char * ker,cl_uint M,cl_uint N,cl_uint K,FType alpha,BlasGenSettings * gset,TileMulFlags flags,cl_device_type deviceType,bool verbose,unsigned int iterNum)483 run (
484         const char *ker,
485         cl_uint M,
486         cl_uint N,
487         cl_uint K,
488         FType alpha,
489         BlasGenSettings *gset,
490         TileMulFlags flags,
491         cl_device_type deviceType,
492         bool verbose,
493         unsigned int iterNum)
494 {
495     cl_int err;
496     cl_platform_id platform;
497     cl_context ctx;
498     cl_device_id device;
499     cl_command_queue queue;
500     cl_event evt;
501     DataType dtype = gset->kextra->dtype;
502 
503     cl_mem bufA, bufB, bufC;
504     FPtr A, B, C, C_naive;
505     bool isComplex = isComplexType(dtype);
506     bool isDouble = isDoubleBasedType(dtype);
507     cl_uint nwords = (isComplex) ? 2 : 1;
508     unsigned int tsize = dtypeSize(dtype);
509     cl_kernel kernel;
510     size_t i, j, k;
511     size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
512     size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
513     char log[100000];
514     size_t logSize;
515     cl_long sTime, fTime;
516     cl_program program = NULL;
517 
518     clGetPlatformIDs(1, &platform, NULL);
519 
520     clGetDeviceIDs(platform, deviceType, 1, &device, NULL);
521 
522     ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
523     if (err != CL_SUCCESS) {
524         return err;
525     }
526 
527     queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
528     if (err != CL_SUCCESS) {
529         return err;
530     }
531 
532     /* Prepare OpenCL kernel and its arguments */
533 
534     program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL);
535 
536     err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
537     clGetProgramBuildInfo (program,
538             device,
539             CL_PROGRAM_BUILD_LOG,
540             sizeof(log),
541             log,
542             &logSize);
543     printf("%s", log);
544     if (err != CL_SUCCESS){
545         clReleaseProgram(program);
546         return err;
547     }
548 
549     kernel = clCreateKernel(program, kernelName, &err);
550     if (err != CL_SUCCESS){
551         clReleaseProgram(program);
552         return err;
553     }
554     /* Memory allocation */
555 
556     A.v = malloc(M * K * tsize);
557     B.v = malloc(K * N * tsize);
558     C.v = malloc(M * N * tsize);
559     C_naive.v = malloc(M * N * tsize);
560 
561 #if JUST_MULTIPLICATION
562     srand(0);
563     if (isDouble) {
564         for(i = 0; i < M * K * nwords; i++){
565             A.d[i] = i;
566         }
567         for(i = 0; i < N * K * nwords; i++){
568             B.d[i] = i + 7;
569         }
570         for(i = 0; i < M * N * nwords; i++){
571             C.d[i] = 0.0;
572             C_naive.d[i] = 0.0;
573         }
574     }
575     else {
576         for(i = 0; i < M * K * nwords; i++){
577             A.f[i] = i;
578         }
579         for(i = 0; i < N * K * nwords; i++){
580             B.f[i] = i + 7;
581         }
582         for(i = 0; i < M * N * nwords; i++){
583             C.f[i] = 0.0;
584             C_naive.f[i] = 0.0;
585         }
586     }
587 
588 #else
589     srand(0);
590     if (isDouble) {
591         for(i = 0; i < M * K * nwords; i++){
592             A.d[i] = (double)(rand() % RAND_BOUND);
593         }
594         for(i = 0; i < N * K * nwords; i++){
595             B.d[i] = (double)(rand() % RAND_BOUND);
596         }
597         for(i = 0; i < M * N * nwords; i++){
598             C.d[i] = 0.0;
599             C_naive.d[i] = 0.0;
600         }
601     }
602     else {
603         for(i = 0; i < M * K * nwords; i++){
604             A.f[i] = (float)(rand() % RAND_BOUND);
605         }
606         for(i = 0; i < N * K * nwords; i++){
607             B.f[i] = (float)(rand() % RAND_BOUND);
608         }
609         for(i = 0; i < M * N * nwords; i++){
610             C.f[i] = 0.0;
611             C_naive.f[i] = 0.0;
612         }
613     }
614 #endif
615 
616     bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
617             K * M * tsize, A.v, &err);
618     if (err != CL_SUCCESS) {
619         clReleaseKernel(kernel);
620         return err;
621     }
622 
623     bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
624             K * N * tsize, B.v, &err);
625 
626     if (err != CL_SUCCESS) {
627         clReleaseMemObject(bufA);
628         clReleaseKernel(kernel);
629         return err;
630     }
631 
632     bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
633         M * N * tsize, C.v, &err);
634 
635     if (err != CL_SUCCESS) {
636         clReleaseMemObject(bufB);
637         clReleaseMemObject(bufA);
638         clReleaseKernel(kernel);
639         return err;
640     }
641 
642     /* Argument setting and kernel execution */
643     err = clSetKernelArg(kernel, 0, tsize, alpha.u);
644     err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
645     err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
646     err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
647     err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
648     err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
649     err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
650     err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);
651 
652     if (err != CL_SUCCESS) {
653         clReleaseMemObject(bufC);
654         clReleaseMemObject(bufB);
655         clReleaseMemObject(bufA);
656         clReleaseKernel(kernel);
657         return err;
658     }
659 
660     err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
661         globalWorkSize, localWorkSize, 0,
662         NULL, &evt);
663 
664     if (err != CL_SUCCESS) {
665         clReleaseMemObject(bufC);
666         clReleaseMemObject(bufB);
667         clReleaseMemObject(bufA);
668         clReleaseKernel(kernel);
669         return err;
670     }
671 
672     err = clFinish(queue);
673     err = clEnqueueReadBuffer (queue,
674         bufC,
675         CL_TRUE,
676         0,
677         M * N * tsize,
678         C.v,
679         0,
680         NULL,
681         NULL);
682 
683     /* Naive CPU multiplication */
684     if (isDouble) {
685         for (i = 0; i < M; i++) {
686             for (j = 0; j < N; j++) {
687                 if (isComplex) {
688                     cl_double2 val;
689                     for (k = 0; k < K; k++) {
690                         cl_double2 bkj = flags & TILEMUL_TRB ?
691                                 B.d2[j * K + k] : B.d2[k * N + j];
692                         cl_double2 aik = flags & TILEMUL_TRA ?
693                                 A.d2[k * M + i] : A.d2[i * K + k];
694                         val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
695                         val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
696                         C_naive.d2[i * N + j].s[0] += val.s[0];
697                         C_naive.d2[i * N + j].s[1] += val.s[1];
698                     }
699                     val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
700                             C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
701                     val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
702                             C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
703                     C_naive.d2[i * N + j] = val;
704                 }
705                 else {
706                     for (k = 0; k < K; k++) {
707                         double bkj = flags & TILEMUL_TRB ?
708                                 B.d[j * K + k] : B.d[k * N + j];
709                         double aik = flags & TILEMUL_TRA ?
710                                 A.d[k * M + i] : A.d[i * K + k];
711                         C_naive.d[i * N + j] += aik * bkj;
712                     }
713                     C_naive.d[i * N + j] *= alpha.d;
714                 }
715             }
716         }
717 
718         for (i = 0; i < M * N; i++) {
719             if (C.d[i] != C_naive.d[i]) {
720                 printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N,
721                         C.d[i], C_naive.d[i]);
722                 break;
723             }
724         }
725         if (i == M * N) {
726             printf("Match\n");
727         }
728     }
729     else {
730         for (i = 0; i < M; i++) {
731             for (j = 0; j < N; j++) {
732                 if (isComplex) {
733                     cl_float2 val;
734                     for (k = 0; k < K; k++) {
735                         cl_float2 bkj = flags & TILEMUL_TRB ?
736                                 B.f2[j * K + k] : B.f2[k * N + j];
737                         cl_float2 aik = flags & TILEMUL_TRA ?
738                                 A.f2[k * M + i] : A.f2[i * K + k];
739                         val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
740                         val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
741                         C_naive.f2[i * N + j].s[0] += val.s[0];
742                         C_naive.f2[i * N + j].s[1] += val.s[1];
743                     }
744                     val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
745                             C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
746                     val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
747                             C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
748                     C_naive.f2[i * N + j] = val;
749                 }
750                 else {
751                     for (k = 0; k < K; k++) {
752                         float bkj = flags & TILEMUL_TRB ?
753                                 B.f[j * K + k] : B.f[k * N + j];
754                         float aik = flags & TILEMUL_TRA ?
755                                 A.f[k * M + i] : A.f[i * K + k];
756                         C_naive.f[i * N + j] += aik * bkj;
757                     }
758                     C_naive.f[i * N + j] *= alpha.f;
759                 }
760             }
761         }
762 
763         for (i = 0; i < M * N; i++) {
764             if (C.f[i] != C_naive.f[i]) {
765                 printf("Differ at (%lu, %lu): %lf != %lf\n",
766                         i / N, i % N, C.f[i], C_naive.f[i]);
767                 break;
768             }
769         }
770         if (i == M * N) {
771             printf("Match\n");
772         }
773     }
774 
775     /* End of naive CPU multiplication */
776     if (verbose) {
777         if (!isDouble) {
778             printf("Matrix A:\n");
779             for (i = 0; i < M; i++) {
780                 for (k = 0; k < K; k++) {
781                     if (isComplex) {
782                         cl_float2 aik = flags & TILEMUL_TRA ?
783                                 A.f2[k * M + i] : A.f2[i * K + k];
784                         printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
785                     }
786                     else {
787                         float aik = flags & TILEMUL_TRA ?
788                                 A.f[k * M + i] : A.f[i * K + k];
789                         printf("%4.1f ", aik);
790                     }
791                 }
792                 printf("\n");
793             }
794 
795             printf("Matrix B:\n");
796             for (k = 0; k < K; k++) {
797                 for (j = 0; j < N; j++) {
798                     if (isComplex) {
799                         cl_float2 bkj = flags & TILEMUL_TRB ?
800                                 B.f2[j * K + k] : B.f2[k * N + j];
801                         printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
802                     }
803                     else {
804                         float bkj = flags & TILEMUL_TRB ?
805                                 B.f[j * K + k] : B.f[k * N + j];
806                         printf("%4.1f ", bkj);
807                     }
808                 }
809                 printf("\n");
810             }
811 
812             printf("CPU calculated matrix:\n");
813             for (i = 0; i < M; i++) {
814                 for (j = 0; j < N; j++) {
815                     if (isComplex) {
816                         printf("(%4.1f, %4.1f) ",
817                                 C_naive.f2[i * N + j].s[0],
818                                 C_naive.f2[i * N + j].s[1]);
819                     }
820                     else {
821                         printf("%4.1f ", C_naive.f[i * N + j]);
822                     }
823                 }
824                 printf("\n");
825             }
826 
827             printf("GPU calculated matrix:\n");
828             for (i = 0; i < M; i++) {
829                 for (j = 0; j < N; j++) {
830                     if (isComplex) {
831                         printf("(%4.1f, %4.1f) ",
832                                 C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
833                     }
834                     else {
835                         printf("%4.1f ", C.f[i * N + j]);
836                     }
837                 }
838                 printf("\n");
839             }
840         }
841     }
842 
843     clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
844             &sTime, NULL);
845     clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
846             &fTime, NULL);
847 
848     printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
849             (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum));
850 
851     clReleaseMemObject(bufC);
852     clReleaseMemObject(bufB);
853     clReleaseMemObject(bufA);
854     clReleaseKernel(kernel);
855     return CL_SUCCESS;
856 }
857 
main(int argc,char * argv[])858 int main(int argc, char *argv[])
859 {
860     char out[1024*1024];
861     CLBLASKernExtra kextra;
862     BlasGenSettings gset;
863     TileMulOpts mulOpts;
864     int i;
865     cl_uint blockM = 4, blockN = 4, blockK = 8;
866     struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1);
867     FType alpha;
868     cl_int err;
869     unsigned int iterNum = 1;
870     const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov";
871     const struct option longOptions[] = {
872             {"help", no_argument, NULL, 'h'},
873             {"device", required_argument, NULL, 'd'},
874             {"fetch", required_argument, NULL, 'f'},
875             {"local", required_argument, NULL, 'l'},
876             {"type", required_argument, NULL, 't'},
877             {"a", required_argument, NULL, 'a'},
878             {"b", required_argument, NULL, 'b'},
879             {"skew", required_argument, NULL, 's'},
880             {"globalcycling", required_argument, NULL, 'g'},
881             {"iter", required_argument, NULL, 'i'},
882             {"core", required_argument, NULL, 'c'},
883             {"old", no_argument, NULL, 'o'},
884             {"verbose", no_argument, NULL, 'v'},
885             {NULL, 0, NULL, 0}
886     };
887     int nextOption;
888     cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
889     bool verbose = false;
890     SubproblemDim *subdims = gset.subdims;
891     bool separateFetch = false;
892 
893     memset(&gset, 0, sizeof(gset));
894     memset(&mulOpts, 0, sizeof(mulOpts));
895     memset(&kextra, 0, sizeof(kextra));
896     gset.kextra = &kextra;
897     gset.flags |= BGF_WHOLE_A;
898     mulOpts.core = TILEMUL_MAD;
899     mulOpts.flags = TILEMUL_FORCE_VECTORIZATION;
900     kextra.vecLen = 1;
901     kextra.dtype = TYPE_FLOAT;
902 
903     alpha.f = 1;
904 
905     // parse command line
906     do {
907         nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL);
908         switch (nextOption) {
909         case 'h':
910             printUsage(argv[0], EXIT_SUCCESS);
911             break;
912         case 'd':
913             if (!strcmp("cpu", optarg)) {
914                 deviceType = CL_DEVICE_TYPE_CPU;
915             }
916             else if (!strcmp("gpu", optarg)) {
917                 deviceType = CL_DEVICE_TYPE_GPU;
918             }
919             else {
920                 printf("Unknown device type %s. Supported values are \"cpu\" "
921                         "and \"gpu\".\n", optarg);
922                 exit(EXIT_FAILURE);
923             }
924             break;
925         case 'f':
926             kextra.vecLen = atoi(optarg);
927             break;
928         case 'l':
929             if (!strcmp(optarg, "A")) {
930                 mulOpts.memA = CLMEM_LOCAL_MEMORY;
931             }
932             else if (!strcmp(optarg, "B")) {
933                 mulOpts.memB = CLMEM_LOCAL_MEMORY;
934             }
935             else {
936                 printf("Wrong matrix specified: %s. Supported values are "
937                         "A, B.\n", optarg);
938                 exit(EXIT_FAILURE);
939             }
940             break;
941         case 't':
942             if (!strcmp(optarg, "s")) {
943                 kextra.dtype = TYPE_FLOAT;
944                 alpha.f = 1;
945             }
946             else if (!strcmp(optarg, "d")) {
947                 kextra.dtype = TYPE_DOUBLE;
948                 alpha.d = 1;
949             }
950             else if (!strcmp(optarg, "c")) {
951                 kextra.dtype = TYPE_COMPLEX_FLOAT;
952                 alpha.f2.s[0] = 1;
953                 alpha.f2.s[1] = 0;
954             }
955             else if (!strcmp(optarg, "z")) {
956                 kextra.dtype = TYPE_COMPLEX_DOUBLE;
957                 alpha.d2.s[0] = 1;
958                 alpha.d2.s[1] = 0;
959             }
960             else {
961                 printf("Wrong type specified: %s. Supported values are "
962                         "s, d, c, z.\n", optarg);
963                 exit(EXIT_FAILURE);
964             }
965             break;
966         case 'a':
967             if (!strcmp(optarg, "r")) {
968                 mulOpts.flags &= ~TILEMUL_TRA;
969             }
970             else if (!strcmp(optarg, "c")) {
971                 mulOpts.flags |= TILEMUL_TRA;
972             }
973             else {
974                 printf("Wrong tile a parameter specified: %s. Supported values "
975                         "are \"r\", \"c\".\n", optarg);
976                 exit(EXIT_FAILURE);
977             }
978             break;
979         case 'b':
980             if (!strcmp(optarg, "r")) {
981                 mulOpts.flags &= ~TILEMUL_TRB;
982             }
983             else if (!strcmp(optarg, "c")) {
984                 mulOpts.flags |= TILEMUL_TRB;
985             }
986             else {
987                 printf("Wrong tile b order specified: %s. Supported values "
988                         "are \"r\", \"c\".\n", optarg);
989                 exit(EXIT_FAILURE);
990             }
991             break;
992         case 's':
993             if (!strcmp(optarg, "a")) {
994                 mulOpts.flags |= TILEMUL_SKEW_A;
995             }
996             else if (!strcmp(optarg, "b")) {
997                 mulOpts.flags |= TILEMUL_SKEW_B;
998             }
999             else if (!strcmp(optarg, "k")) {
1000                 mulOpts.flags |= TILEMUL_SKEW_K;
1001             }
1002             else {
1003                 printf("Wrong skew parameter specified: %s. Supported values "
1004                         "are \"a\", \"b\", \"k\"\n", optarg);
1005                 exit(EXIT_FAILURE);
1006             }
1007             break;
1008         case 'g':
1009             if (!strcmp(optarg, "a")) {
1010                 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
1011             }
1012             else if (!strcmp(optarg, "b")) {
1013                 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
1014             }
1015             else if (!strcmp(optarg, "k")) {
1016                 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
1017             }
1018             else {
1019                 printf("Wrong global cycling parameter specified: %s. "
1020                         "Supported values are \"a\", \"b\", \"k\"\n", optarg);
1021                 exit(EXIT_FAILURE);
1022             }
1023             break;
1024         case 'i':
1025             iterNum = atoi(optarg);
1026             break;
1027         case 'c':
1028             if (!strcmp("muladd", optarg)) {
1029                 mulOpts.core = TILEMUL_MULADD;
1030             }
1031             else if (!strcmp("mad", optarg)) {
1032                 mulOpts.core = TILEMUL_MAD;
1033             }
1034             else if (!strcmp("dot", optarg)) {
1035                 mulOpts.core = TILEMUL_DOT;
1036             }
1037             else {
1038                 printf("Unknown multiplier core %s. Supported values"
1039                         " are \"muladd\", \"mad\" and \"dot\".\n", optarg);
1040                 exit(EXIT_FAILURE);
1041             }
1042             break;
1043         case 'o':
1044             separateFetch = false;
1045             break;
1046         case 'v':
1047             verbose = true;
1048             break;
1049         case -1:
1050             break;
1051         default:
1052             printUsage(argv[0], EXIT_FAILURE);
1053             break;
1054         }
1055     } while (nextOption != -1);
1056 
1057     if (optind + 2 >= argc) {
1058         printf("Error: Not all sizes are specified\n");
1059         printUsage(argv[0], EXIT_FAILURE);
1060     }
1061     blockM = atoi(argv[optind]);
1062     blockN = atoi(argv[optind + 1]);
1063     blockK = atoi(argv[optind + 2]);
1064 
1065     if ((mulOpts.memA == CLMEM_LOCAL_MEMORY ||
1066             mulOpts.memB == CLMEM_LOCAL_MEMORY) &&
1067             ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) {
1068         printf("One of matrixes is in local memory, "
1069                 "disabling global cycling\n");
1070         mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC;
1071     }
1072 
1073     if (mulOpts.flags & TILEMUL_TRA) {
1074         kextra.flags |= KEXTRA_TRANS_A;
1075     }
1076     if (mulOpts.flags & TILEMUL_TRB) {
1077         kextra.flags |= KEXTRA_TRANS_B;
1078     }
1079 
1080     subdims[0].y = blockM * ITEM_WORK_M;
1081     subdims[0].x = blockN * ITEM_WORK_N;
1082     subdims[0].bwidth = blockK * ITEM_BLOCKS_K;
1083     subdims[1].y = blockM;
1084     subdims[1].x = blockN;
1085     subdims[1].bwidth = blockK;
1086 
1087     memset(out, 0, sizeof(out));
1088 
1089     i = isDoubleBasedType(kextra.dtype);
1090     kgenDeclareUptrs(ctx, i);
1091     genTest(ctx, &gset, &mulOpts, separateFetch);
1092     destroyKgenContext(ctx);
1093 
1094     printf("Kernel code: \n\"%s\"\n", out);
1095     err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha,
1096               &gset, mulOpts.flags, deviceType, verbose, iterNum);
1097     if (err != CL_SUCCESS) {
1098         printf("Test run failed, error %d\n", err);
1099         return EXIT_FAILURE;
1100     }
1101 	return EXIT_SUCCESS;
1102 }
1103