1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17 #ifdef __APPLE__
18 #include <OpenCL/cl.h>
19 #else
20 #include <CL/cl.h>
21 #endif
22 #include <string.h>
23 #include <stdlib.h>
24 #include <assert.h>
25 #include <math.h>
26 #include <stdio.h>
27 #include <getopt.h>
28 #include <kerngen.h>
29 #include <blas_kgen.h>
30 #include <clblas_stddef.h>
31
32 #define JUST_MULTIPLICATION 0
33
34 #if JUST_MULTIPLICATION
35 enum {
36 ITEM_WORK_M = 1,
37 ITEM_WORK_N = 1,
38 ITEM_BLOCKS_K = 1,
39 };
40 #else
41 enum {
42 ITEM_WORK_M = 4,
43 ITEM_WORK_N = 4,
44 ITEM_BLOCKS_K = 3,
45 RAND_BOUND = 10
46 };
47 #endif
48
49 const char *kernelName = "tilemul_test";
50
51
52 // float types based unified pointer
53 typedef union FPtr {
54 void *v;
55 cl_float *f;
56 cl_double *d;
57 cl_float2 *f2;
58 cl_double2 *d2;
59 } FPtr;
60
61 // float type based unified data type
62 typedef union FType {
63 unsigned char u[sizeof(cl_double)];
64 cl_float f;
65 cl_float2 f2;
66 cl_double d;
67 cl_double2 d2;
68 } FType;
69
70 static void
printUsage(const char * programName,int exitCode)71 printUsage(const char *programName, int exitCode)
72 {
73 printf( "USAGE: %s [options] <M N K>\n"
74 " --help, -h Print this help message.\n"
75 " --device, -d <device> OpenCL device used. <device> can "
76 "be \"gpu\" or \"cpu\". Default is \"gpu\".\n"
77 " --type, -t <type> Type can be s, d, c or z. Default "
78 "is s.\n"
79 " --fetch, -f <vector size> Size of used fetch vectors, in used "
80 "types. Default is 1.\n"
81 " --local, -l <matrix> If matrix is local or global. Matrix "
82 "can be A or B. By default, both are global.\n"
83 " --verbose, -v Turn on verbose mode.\n"
84 " --a, -a <order>\n"
85 " --b, -b <order>\n Set order for tiles a and b fetching. "
86 "Order can be are \"r\" for row major and \"c\" for "
87 "column major. Default values are \"r\" for A and \"c\" for B.\n"
88 " --skew, -s <skew_value> Set skews for tiles along M, N, and K "
89 "directions. skew_value can be \"a\" for tile A skew along M, \"b\""
90 " for tile B skew along N and \"k\" for both tiles skew along K. "
91 "There is no skews by default.\n"
92 " -g, --globalcycling <global_cycling_value>\n"
93 " Set global cycling for tiles along M, "
94 "N and K directions. global_cycling_value can be \"a\" for tile A "
95 "global cycling along M, \"b\" for tile B global cycling along N "
96 "and \"k\" for both tiles global cycling along K. There is no "
97 "global cycling enabled by default.\n"
98 " --iter, -i <num> Number of iterations.\n"
99 " --core, -c <mulcore> Multiplier core. <mulcore> can "
100 "be \"muladd\", \"mad\" or \"dot\". Default is \"mad\".\n"
101 " --old, -o Use old tilemul generator interface "
102 "with one generator function call for both fetching and "
103 "multiplication. Separate generators functions are used by "
104 "default.\n"
105 " M N K Size of block.\n",
106 programName);
107 exit(exitCode);
108 }
109
110 void
genFillTileWithNAN(struct KgenContext * ctx,const Tile * tile)111 genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile)
112 {
113 char tmp[1024];
114 Kstring elem;
115 unsigned int incRows, incCols;
116 unsigned int i, j, v;
117
118 if (!tile->trans) {
119 incRows = 1;
120 v = incCols = umin(tile->vecLen, tile->nrCols);
121 }
122 else {
123 v = incRows = umin(tile->vecLen, tile->nrRows);
124 incCols = 1;
125 }
126
127 for (i = 0; i < tile->nrRows; i += incRows) {
128 for (j = 0; j < tile->nrCols; j += incCols) {
129 sprintfTileElement(&elem, tile, i, j, v);
130 sprintf(tmp, "%s = NAN;\n", elem.buf);
131 kgenAddStmt(ctx, tmp);
132 }
133 }
134
135 kgenAddBlankLine(ctx);
136 }
137
138 void
addTestPrefix(struct KgenContext * ctx,bool isDouble)139 addTestPrefix(struct KgenContext *ctx, bool isDouble)
140 {
141 kgenDeclareUptrs(ctx, isDouble);
142 }
143
checkRet(int ret,const char * genName)144 static void checkRet(int ret, const char *genName)
145 {
146 if (ret != 0) {
147 printf("%s generator failed: %s\n", genName, strerror(-ret));
148 exit(EXIT_FAILURE);
149 }
150 }
151
152 void
genTest(struct KgenContext * ctx,BlasGenSettings * gset,TileMulOpts * mulOpts,bool separateFetch)153 genTest(
154 struct KgenContext *ctx,
155 BlasGenSettings *gset,
156 TileMulOpts *mulOpts,
157 bool separateFetch)
158 {
159 char s[1024];
160 Kstring kstr;
161 char *tName, tVect[64], *ptrName;
162 KernelVarNames *vnames = &gset->varNames;
163 DataType dtype = gset->kextra->dtype;
164 const SubproblemDim *subdims = gset->subdims;
165 unsigned int vecLen = gset->kextra->vecLen;
166 size_t m, n, k;
167 unsigned int i, j;
168 bool tra, trb, localA, localB, vecCoords;
169 int ret;
170 TileMulFlags flags = mulOpts->flags;
171 FetchOpts fetchOpts;
172
173 m = gset->subdims[1].y;
174 n = gset->subdims[1].x;
175 k = gset->subdims[1].bwidth;
176
177 tra = ((flags & TILEMUL_TRA) != 0);
178 trb = ((flags & TILEMUL_TRB) != 0);
179 localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
180 localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
181
182 vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0);
183
184 tVect[0] = '\0';
185
186 if (vecCoords && vecLen != 1) {
187 sprintf(tVect, "%u", vecLen);
188 }
189
190 switch (dtype) {
191 case TYPE_FLOAT:
192 tName = "float";
193 ptrName = "f";
194 break;
195 case TYPE_DOUBLE:
196 tName = "double";
197 ptrName = "d";
198 break;
199 case TYPE_COMPLEX_FLOAT:
200 tName = "float2";
201 ptrName = "f2v";
202 break;
203 case TYPE_COMPLEX_DOUBLE:
204 tName = "double2";
205 ptrName = "d2v";
206 break;
207 default:
208 return;
209 }
210
211 if (vecCoords) {
212 //Do not use GPtrs in fetching
213 vnames->A = "A";
214 vnames->B = "B";
215 }
216 else {
217 vnames->A = localA ? "LAptr" : "((GPtr)A)";
218 vnames->B = localB ? "LBptr" : "((GPtr)B)";
219 }
220 if (!localA) {
221 vnames->lda = "lda";
222
223 }
224 if (!localB) {
225 vnames->ldb = "ldb";
226 }
227 vnames->sizeM = "M";
228 vnames->sizeN = "N";
229 vnames->sizeK = "K";
230 vnames->skewA = "skewA";
231 vnames->skewB = "skewB";
232 vnames->skewK = "skewK";
233 vnames->coordA = "workItemM";
234 vnames->coordB = "workItemN";
235 vnames->k = "k";
236
237 kgenAddBlankLine(ctx);
238 sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n",
239 ITEM_WORK_M, ITEM_WORK_N);
240 kgenAddStmt(ctx, s);
241 kgenAddStmt(ctx, "__kernel void\n");
242 sprintf(s, "%s(\n", kernelName);
243 kgenAddStmt(ctx, s);
244 sprintf(s," %s alpha,\n", tName);
245 kgenAddStmt(ctx, s);
246 sprintf(s," __global %s%s *A,\n", tName, tVect);
247 kgenAddStmt(ctx, s);
248 sprintf(s," __global %s%s *B,\n", tName, tVect);
249 kgenAddStmt(ctx, s);
250 kgenAddStmt(ctx, " uint M,\n"
251 " uint N,\n"
252 " uint K,\n");
253 sprintf(s,
254 " __global %s *C,\n"
255 " const uint iter)\n", tName);
256 kgenAddStmt(ctx, s);
257 kgenBeginFuncBody(ctx);
258 sprintf(s, "uint workItemM = %lu * get_global_id(0);\n"
259 "uint workItemN = %lu * get_global_id(1);\n",
260 m, n);
261 kgenAddStmt(ctx, s);
262 if ((flags & TILEMUL_SKEW_A) != 0) {
263 kgenAddStmt(ctx, "uint skewA = 0u;\n");
264 }
265 if ((flags & TILEMUL_SKEW_B) != 0) {
266 kgenAddStmt(ctx, "uint skewB = 0u;\n");
267 }
268 if ((flags & TILEMUL_SKEW_K) != 0) {
269 kgenAddStmt(ctx, "uint skewK = 0u;\n");
270 }
271
272 if (localA) {
273 sprintf(s, "__local %s LA[%lu];\n",
274 tName, subdims[0].bwidth * subdims[0].y);
275 kgenAddStmt(ctx, s);
276 }
277 else { //global A
278 sprintf(s, "uint lda = %s;\n", tra ? "M" : "K");
279 kgenAddStmt(ctx, s);
280 }
281 if (localB) {
282 sprintf(s, "__local %s LB[%lu];\n",
283 tName, subdims[0].bwidth * subdims[0].x);
284 kgenAddStmt(ctx, s);
285 }
286 else { //global B
287 sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N");
288 kgenAddStmt(ctx, s);
289 }
290
291 initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY);
292 declareTileStorages(ctx, gset);
293
294 if (vecCoords) {
295 size_t ha, hb;
296 char *str;
297
298 ha = tra ? k : m;
299 hb = trb ? n : k;
300
301 if (ha > 1) {
302 str = s;
303 str += sprintf(str, "uint%lu ca = {0", ha);
304 for (i = 1; i < ha; i++) {
305 str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen);
306 }
307 str += sprintf(str, "};\n");
308 kgenAddStmt(ctx, s);
309 }
310 else {
311 kgenAddStmt(ctx, "uint ca = 0;\n");
312 }
313 vnames->vectCoordA = "ca";
314
315 if (hb > 1) {
316 str = s;
317 str += sprintf(str, "uint%lu cb = {0", hb);
318 for (i = 1; i < hb; i++) {
319 str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen);
320 }
321 str += sprintf(str, "};\n");
322 kgenAddStmt(ctx, s);
323 }
324 else {
325 kgenAddStmt(ctx, "uint cb = 0;\n");
326 }
327 vnames->vectCoordB = "cb";
328
329 // uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3};
330 // uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3};
331 }
332
333 kgenAddBlankLine(ctx);
334
335 sprintf(s, "for (int it = 0; it < iter; it++)");
336 kgenBeginBranch(ctx, s);
337
338 if (!(localA && localB)) {
339 kgenAddStmt(ctx, "uint k = 0;\n");
340 }
341
342 genZeroTile(ctx, &gset->tileCY);
343
344 if (vecCoords) {
345 char *coordsA[2] = {"workItemM", "k"};
346 char *coordsB[2] = {"k", "workItemN"};
347 sprintf(s, "A += %s * (lda / %u) + %s / %u;\n",
348 coordsA[tra], vecLen, coordsA[1 - tra], vecLen);
349 kgenAddStmt(ctx, s);
350 sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n",
351 coordsB[trb], vecLen, coordsB[1 - trb], vecLen);
352 kgenAddStmt(ctx, s);
353 }
354
355 sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth);
356 kgenBeginBranch(ctx, s);
357
358 /* Copy data to local memory. We know that the size of matrix is the same
359 * that the size of one block and use that.
360 */
361 if (localA) {
362 sprintf(s,
363 "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n"
364 "wait_group_events(1, &evA);\n"
365 "barrier(CLK_LOCAL_MEM_FENCE);\n",
366 subdims[0].y * subdims[0].bwidth);
367 kgenAddStmt(ctx, s);
368 kgenAddStmt(ctx, "LPtr LAptr;\n");
369 if (tra) {
370 sprintf(s,
371 "LAptr.%s = LA + workItemM;\n", ptrName);
372 }
373 else {
374 sprintf(s,
375 "LAptr.%s = LA + workItemM * %lu;\n",
376 ptrName, subdims[0].bwidth);
377 }
378 kgenAddStmt(ctx, s);
379 }
380 if (localB) {
381 sprintf(s,
382 "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n"
383 "wait_group_events(1, &evB);\n"
384 "barrier(CLK_LOCAL_MEM_FENCE);\n",
385 subdims[0].x * subdims[0].bwidth);
386 kgenAddStmt(ctx, s);
387 kgenAddStmt(ctx, "LPtr LBptr;\n");
388 if (trb) {
389 sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n",
390 ptrName, subdims[0].bwidth);
391 }
392 else {
393 sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName);
394 }
395 kgenAddStmt(ctx, s);
396 }
397
398 if (!separateFetch) {
399 ret = tileMulGen(ctx, gset, mulOpts);
400 checkRet(ret, "Multiplier");
401 }
402 else {
403 Tile *tileA = &gset->tileA;
404 Tile *tileB = &gset->tileBX;
405
406 memset(&fetchOpts, 0, sizeof(fetchOpts));
407 if (localA) {
408 fetchOpts.memA = CLMEM_LOCAL_MEMORY;
409 }
410 if (localB) {
411 fetchOpts.memB = CLMEM_LOCAL_MEMORY;
412 }
413
414 genFillTileWithNAN(ctx, tileA);
415 genFillTileWithNAN(ctx, tileB);
416
417 if (subdims[0].bwidth != subdims[1].bwidth) {
418 sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
419 subdims[0].bwidth, k);
420 kgenBeginBranch(ctx, s);
421 }
422
423 #if JUST_MULTIPLICATION
424 for (i = 0; i < tileA->nrRows; i++) {
425 for(j = 0; j < tileA->nrCols; j++) {
426 sprintfTileElement(&kstr, tileA, i, j, 1);
427 sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j);
428 kgenAddStmt(ctx, s);
429 }
430 }
431
432 for (i = 0; i < tileB->nrRows; i++) {
433 for(j = 0; j < tileB->nrCols; j++) {
434 sprintfTileElement(&kstr, tileB, i, j, 1);
435 sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j);
436 kgenAddStmt(ctx, s);
437 }
438 }
439 #else
440 fetchOpts.mrole = MATRIX_B;
441 fetchOpts.lineOffset = 0;
442 fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows;
443 ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
444 checkRet(ret, "Fetching tile b");
445
446 fetchOpts.mrole = MATRIX_A;
447 fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows;
448 kgenAddBlankLine(ctx);
449 fetchOpts.lineOffset = 0;
450 ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
451 checkRet(ret, "Fetching tile a");
452 #endif
453 ret = genMulTiles(ctx, gset, mulOpts);
454 checkRet(ret, "Multiplier");
455 #if ! JUST_MULTIPLICATION
456 sprintf(s, "k += %lu;\n", k);
457 kgenAddStmt(ctx, s);
458 #endif
459 if (subdims[0].bwidth != subdims[1].bwidth) {
460 kgenEndBranch(ctx, NULL);
461 }
462 }
463 kgenEndBranch(ctx, NULL); // K loop
464 kgenEndBranch(ctx, NULL); // iterations loop
465
466 kgenAddBlankLine(ctx);
467
468 for (i = 0; i < m; i++) {
469 for (j = 0; j < n; j++) {
470 sprintfTileElement(&kstr, &gset->tileCY, i, j, 1);
471 sprintf(s,
472 "((GPtr)C).%s"
473 "[(%d + workItemM) * N + %d + workItemN] = %s;\n",
474 ptrName, i, j, kstr.buf);
475 kgenAddStmt(ctx, s);
476 }
477 }
478
479 kgenEndFuncBody(ctx);
480 }
481
482 cl_int
run(const char * ker,cl_uint M,cl_uint N,cl_uint K,FType alpha,BlasGenSettings * gset,TileMulFlags flags,cl_device_type deviceType,bool verbose,unsigned int iterNum)483 run (
484 const char *ker,
485 cl_uint M,
486 cl_uint N,
487 cl_uint K,
488 FType alpha,
489 BlasGenSettings *gset,
490 TileMulFlags flags,
491 cl_device_type deviceType,
492 bool verbose,
493 unsigned int iterNum)
494 {
495 cl_int err;
496 cl_platform_id platform;
497 cl_context ctx;
498 cl_device_id device;
499 cl_command_queue queue;
500 cl_event evt;
501 DataType dtype = gset->kextra->dtype;
502
503 cl_mem bufA, bufB, bufC;
504 FPtr A, B, C, C_naive;
505 bool isComplex = isComplexType(dtype);
506 bool isDouble = isDoubleBasedType(dtype);
507 cl_uint nwords = (isComplex) ? 2 : 1;
508 unsigned int tsize = dtypeSize(dtype);
509 cl_kernel kernel;
510 size_t i, j, k;
511 size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
512 size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
513 char log[100000];
514 size_t logSize;
515 cl_long sTime, fTime;
516 cl_program program = NULL;
517
518 clGetPlatformIDs(1, &platform, NULL);
519
520 clGetDeviceIDs(platform, deviceType, 1, &device, NULL);
521
522 ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
523 if (err != CL_SUCCESS) {
524 return err;
525 }
526
527 queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
528 if (err != CL_SUCCESS) {
529 return err;
530 }
531
532 /* Prepare OpenCL kernel and its arguments */
533
534 program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL);
535
536 err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
537 clGetProgramBuildInfo (program,
538 device,
539 CL_PROGRAM_BUILD_LOG,
540 sizeof(log),
541 log,
542 &logSize);
543 printf("%s", log);
544 if (err != CL_SUCCESS){
545 clReleaseProgram(program);
546 return err;
547 }
548
549 kernel = clCreateKernel(program, kernelName, &err);
550 if (err != CL_SUCCESS){
551 clReleaseProgram(program);
552 return err;
553 }
554 /* Memory allocation */
555
556 A.v = malloc(M * K * tsize);
557 B.v = malloc(K * N * tsize);
558 C.v = malloc(M * N * tsize);
559 C_naive.v = malloc(M * N * tsize);
560
561 #if JUST_MULTIPLICATION
562 srand(0);
563 if (isDouble) {
564 for(i = 0; i < M * K * nwords; i++){
565 A.d[i] = i;
566 }
567 for(i = 0; i < N * K * nwords; i++){
568 B.d[i] = i + 7;
569 }
570 for(i = 0; i < M * N * nwords; i++){
571 C.d[i] = 0.0;
572 C_naive.d[i] = 0.0;
573 }
574 }
575 else {
576 for(i = 0; i < M * K * nwords; i++){
577 A.f[i] = i;
578 }
579 for(i = 0; i < N * K * nwords; i++){
580 B.f[i] = i + 7;
581 }
582 for(i = 0; i < M * N * nwords; i++){
583 C.f[i] = 0.0;
584 C_naive.f[i] = 0.0;
585 }
586 }
587
588 #else
589 srand(0);
590 if (isDouble) {
591 for(i = 0; i < M * K * nwords; i++){
592 A.d[i] = (double)(rand() % RAND_BOUND);
593 }
594 for(i = 0; i < N * K * nwords; i++){
595 B.d[i] = (double)(rand() % RAND_BOUND);
596 }
597 for(i = 0; i < M * N * nwords; i++){
598 C.d[i] = 0.0;
599 C_naive.d[i] = 0.0;
600 }
601 }
602 else {
603 for(i = 0; i < M * K * nwords; i++){
604 A.f[i] = (float)(rand() % RAND_BOUND);
605 }
606 for(i = 0; i < N * K * nwords; i++){
607 B.f[i] = (float)(rand() % RAND_BOUND);
608 }
609 for(i = 0; i < M * N * nwords; i++){
610 C.f[i] = 0.0;
611 C_naive.f[i] = 0.0;
612 }
613 }
614 #endif
615
616 bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
617 K * M * tsize, A.v, &err);
618 if (err != CL_SUCCESS) {
619 clReleaseKernel(kernel);
620 return err;
621 }
622
623 bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
624 K * N * tsize, B.v, &err);
625
626 if (err != CL_SUCCESS) {
627 clReleaseMemObject(bufA);
628 clReleaseKernel(kernel);
629 return err;
630 }
631
632 bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
633 M * N * tsize, C.v, &err);
634
635 if (err != CL_SUCCESS) {
636 clReleaseMemObject(bufB);
637 clReleaseMemObject(bufA);
638 clReleaseKernel(kernel);
639 return err;
640 }
641
642 /* Argument setting and kernel execution */
643 err = clSetKernelArg(kernel, 0, tsize, alpha.u);
644 err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
645 err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
646 err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
647 err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
648 err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
649 err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
650 err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);
651
652 if (err != CL_SUCCESS) {
653 clReleaseMemObject(bufC);
654 clReleaseMemObject(bufB);
655 clReleaseMemObject(bufA);
656 clReleaseKernel(kernel);
657 return err;
658 }
659
660 err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
661 globalWorkSize, localWorkSize, 0,
662 NULL, &evt);
663
664 if (err != CL_SUCCESS) {
665 clReleaseMemObject(bufC);
666 clReleaseMemObject(bufB);
667 clReleaseMemObject(bufA);
668 clReleaseKernel(kernel);
669 return err;
670 }
671
672 err = clFinish(queue);
673 err = clEnqueueReadBuffer (queue,
674 bufC,
675 CL_TRUE,
676 0,
677 M * N * tsize,
678 C.v,
679 0,
680 NULL,
681 NULL);
682
683 /* Naive CPU multiplication */
684 if (isDouble) {
685 for (i = 0; i < M; i++) {
686 for (j = 0; j < N; j++) {
687 if (isComplex) {
688 cl_double2 val;
689 for (k = 0; k < K; k++) {
690 cl_double2 bkj = flags & TILEMUL_TRB ?
691 B.d2[j * K + k] : B.d2[k * N + j];
692 cl_double2 aik = flags & TILEMUL_TRA ?
693 A.d2[k * M + i] : A.d2[i * K + k];
694 val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
695 val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
696 C_naive.d2[i * N + j].s[0] += val.s[0];
697 C_naive.d2[i * N + j].s[1] += val.s[1];
698 }
699 val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
700 C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
701 val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
702 C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
703 C_naive.d2[i * N + j] = val;
704 }
705 else {
706 for (k = 0; k < K; k++) {
707 double bkj = flags & TILEMUL_TRB ?
708 B.d[j * K + k] : B.d[k * N + j];
709 double aik = flags & TILEMUL_TRA ?
710 A.d[k * M + i] : A.d[i * K + k];
711 C_naive.d[i * N + j] += aik * bkj;
712 }
713 C_naive.d[i * N + j] *= alpha.d;
714 }
715 }
716 }
717
718 for (i = 0; i < M * N; i++) {
719 if (C.d[i] != C_naive.d[i]) {
720 printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N,
721 C.d[i], C_naive.d[i]);
722 break;
723 }
724 }
725 if (i == M * N) {
726 printf("Match\n");
727 }
728 }
729 else {
730 for (i = 0; i < M; i++) {
731 for (j = 0; j < N; j++) {
732 if (isComplex) {
733 cl_float2 val;
734 for (k = 0; k < K; k++) {
735 cl_float2 bkj = flags & TILEMUL_TRB ?
736 B.f2[j * K + k] : B.f2[k * N + j];
737 cl_float2 aik = flags & TILEMUL_TRA ?
738 A.f2[k * M + i] : A.f2[i * K + k];
739 val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
740 val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
741 C_naive.f2[i * N + j].s[0] += val.s[0];
742 C_naive.f2[i * N + j].s[1] += val.s[1];
743 }
744 val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
745 C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
746 val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
747 C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
748 C_naive.f2[i * N + j] = val;
749 }
750 else {
751 for (k = 0; k < K; k++) {
752 float bkj = flags & TILEMUL_TRB ?
753 B.f[j * K + k] : B.f[k * N + j];
754 float aik = flags & TILEMUL_TRA ?
755 A.f[k * M + i] : A.f[i * K + k];
756 C_naive.f[i * N + j] += aik * bkj;
757 }
758 C_naive.f[i * N + j] *= alpha.f;
759 }
760 }
761 }
762
763 for (i = 0; i < M * N; i++) {
764 if (C.f[i] != C_naive.f[i]) {
765 printf("Differ at (%lu, %lu): %lf != %lf\n",
766 i / N, i % N, C.f[i], C_naive.f[i]);
767 break;
768 }
769 }
770 if (i == M * N) {
771 printf("Match\n");
772 }
773 }
774
775 /* End of naive CPU multiplication */
776 if (verbose) {
777 if (!isDouble) {
778 printf("Matrix A:\n");
779 for (i = 0; i < M; i++) {
780 for (k = 0; k < K; k++) {
781 if (isComplex) {
782 cl_float2 aik = flags & TILEMUL_TRA ?
783 A.f2[k * M + i] : A.f2[i * K + k];
784 printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
785 }
786 else {
787 float aik = flags & TILEMUL_TRA ?
788 A.f[k * M + i] : A.f[i * K + k];
789 printf("%4.1f ", aik);
790 }
791 }
792 printf("\n");
793 }
794
795 printf("Matrix B:\n");
796 for (k = 0; k < K; k++) {
797 for (j = 0; j < N; j++) {
798 if (isComplex) {
799 cl_float2 bkj = flags & TILEMUL_TRB ?
800 B.f2[j * K + k] : B.f2[k * N + j];
801 printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
802 }
803 else {
804 float bkj = flags & TILEMUL_TRB ?
805 B.f[j * K + k] : B.f[k * N + j];
806 printf("%4.1f ", bkj);
807 }
808 }
809 printf("\n");
810 }
811
812 printf("CPU calculated matrix:\n");
813 for (i = 0; i < M; i++) {
814 for (j = 0; j < N; j++) {
815 if (isComplex) {
816 printf("(%4.1f, %4.1f) ",
817 C_naive.f2[i * N + j].s[0],
818 C_naive.f2[i * N + j].s[1]);
819 }
820 else {
821 printf("%4.1f ", C_naive.f[i * N + j]);
822 }
823 }
824 printf("\n");
825 }
826
827 printf("GPU calculated matrix:\n");
828 for (i = 0; i < M; i++) {
829 for (j = 0; j < N; j++) {
830 if (isComplex) {
831 printf("(%4.1f, %4.1f) ",
832 C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
833 }
834 else {
835 printf("%4.1f ", C.f[i * N + j]);
836 }
837 }
838 printf("\n");
839 }
840 }
841 }
842
843 clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
844 &sTime, NULL);
845 clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
846 &fTime, NULL);
847
848 printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
849 (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum));
850
851 clReleaseMemObject(bufC);
852 clReleaseMemObject(bufB);
853 clReleaseMemObject(bufA);
854 clReleaseKernel(kernel);
855 return CL_SUCCESS;
856 }
857
main(int argc,char * argv[])858 int main(int argc, char *argv[])
859 {
860 char out[1024*1024];
861 CLBLASKernExtra kextra;
862 BlasGenSettings gset;
863 TileMulOpts mulOpts;
864 int i;
865 cl_uint blockM = 4, blockN = 4, blockK = 8;
866 struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1);
867 FType alpha;
868 cl_int err;
869 unsigned int iterNum = 1;
870 const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov";
871 const struct option longOptions[] = {
872 {"help", no_argument, NULL, 'h'},
873 {"device", required_argument, NULL, 'd'},
874 {"fetch", required_argument, NULL, 'f'},
875 {"local", required_argument, NULL, 'l'},
876 {"type", required_argument, NULL, 't'},
877 {"a", required_argument, NULL, 'a'},
878 {"b", required_argument, NULL, 'b'},
879 {"skew", required_argument, NULL, 's'},
880 {"globalcycling", required_argument, NULL, 'g'},
881 {"iter", required_argument, NULL, 'i'},
882 {"core", required_argument, NULL, 'c'},
883 {"old", no_argument, NULL, 'o'},
884 {"verbose", no_argument, NULL, 'v'},
885 {NULL, 0, NULL, 0}
886 };
887 int nextOption;
888 cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
889 bool verbose = false;
890 SubproblemDim *subdims = gset.subdims;
891 bool separateFetch = false;
892
893 memset(&gset, 0, sizeof(gset));
894 memset(&mulOpts, 0, sizeof(mulOpts));
895 memset(&kextra, 0, sizeof(kextra));
896 gset.kextra = &kextra;
897 gset.flags |= BGF_WHOLE_A;
898 mulOpts.core = TILEMUL_MAD;
899 mulOpts.flags = TILEMUL_FORCE_VECTORIZATION;
900 kextra.vecLen = 1;
901 kextra.dtype = TYPE_FLOAT;
902
903 alpha.f = 1;
904
905 // parse command line
906 do {
907 nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL);
908 switch (nextOption) {
909 case 'h':
910 printUsage(argv[0], EXIT_SUCCESS);
911 break;
912 case 'd':
913 if (!strcmp("cpu", optarg)) {
914 deviceType = CL_DEVICE_TYPE_CPU;
915 }
916 else if (!strcmp("gpu", optarg)) {
917 deviceType = CL_DEVICE_TYPE_GPU;
918 }
919 else {
920 printf("Unknown device type %s. Supported values are \"cpu\" "
921 "and \"gpu\".\n", optarg);
922 exit(EXIT_FAILURE);
923 }
924 break;
925 case 'f':
926 kextra.vecLen = atoi(optarg);
927 break;
928 case 'l':
929 if (!strcmp(optarg, "A")) {
930 mulOpts.memA = CLMEM_LOCAL_MEMORY;
931 }
932 else if (!strcmp(optarg, "B")) {
933 mulOpts.memB = CLMEM_LOCAL_MEMORY;
934 }
935 else {
936 printf("Wrong matrix specified: %s. Supported values are "
937 "A, B.\n", optarg);
938 exit(EXIT_FAILURE);
939 }
940 break;
941 case 't':
942 if (!strcmp(optarg, "s")) {
943 kextra.dtype = TYPE_FLOAT;
944 alpha.f = 1;
945 }
946 else if (!strcmp(optarg, "d")) {
947 kextra.dtype = TYPE_DOUBLE;
948 alpha.d = 1;
949 }
950 else if (!strcmp(optarg, "c")) {
951 kextra.dtype = TYPE_COMPLEX_FLOAT;
952 alpha.f2.s[0] = 1;
953 alpha.f2.s[1] = 0;
954 }
955 else if (!strcmp(optarg, "z")) {
956 kextra.dtype = TYPE_COMPLEX_DOUBLE;
957 alpha.d2.s[0] = 1;
958 alpha.d2.s[1] = 0;
959 }
960 else {
961 printf("Wrong type specified: %s. Supported values are "
962 "s, d, c, z.\n", optarg);
963 exit(EXIT_FAILURE);
964 }
965 break;
966 case 'a':
967 if (!strcmp(optarg, "r")) {
968 mulOpts.flags &= ~TILEMUL_TRA;
969 }
970 else if (!strcmp(optarg, "c")) {
971 mulOpts.flags |= TILEMUL_TRA;
972 }
973 else {
974 printf("Wrong tile a parameter specified: %s. Supported values "
975 "are \"r\", \"c\".\n", optarg);
976 exit(EXIT_FAILURE);
977 }
978 break;
979 case 'b':
980 if (!strcmp(optarg, "r")) {
981 mulOpts.flags &= ~TILEMUL_TRB;
982 }
983 else if (!strcmp(optarg, "c")) {
984 mulOpts.flags |= TILEMUL_TRB;
985 }
986 else {
987 printf("Wrong tile b order specified: %s. Supported values "
988 "are \"r\", \"c\".\n", optarg);
989 exit(EXIT_FAILURE);
990 }
991 break;
992 case 's':
993 if (!strcmp(optarg, "a")) {
994 mulOpts.flags |= TILEMUL_SKEW_A;
995 }
996 else if (!strcmp(optarg, "b")) {
997 mulOpts.flags |= TILEMUL_SKEW_B;
998 }
999 else if (!strcmp(optarg, "k")) {
1000 mulOpts.flags |= TILEMUL_SKEW_K;
1001 }
1002 else {
1003 printf("Wrong skew parameter specified: %s. Supported values "
1004 "are \"a\", \"b\", \"k\"\n", optarg);
1005 exit(EXIT_FAILURE);
1006 }
1007 break;
1008 case 'g':
1009 if (!strcmp(optarg, "a")) {
1010 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
1011 }
1012 else if (!strcmp(optarg, "b")) {
1013 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
1014 }
1015 else if (!strcmp(optarg, "k")) {
1016 mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
1017 }
1018 else {
1019 printf("Wrong global cycling parameter specified: %s. "
1020 "Supported values are \"a\", \"b\", \"k\"\n", optarg);
1021 exit(EXIT_FAILURE);
1022 }
1023 break;
1024 case 'i':
1025 iterNum = atoi(optarg);
1026 break;
1027 case 'c':
1028 if (!strcmp("muladd", optarg)) {
1029 mulOpts.core = TILEMUL_MULADD;
1030 }
1031 else if (!strcmp("mad", optarg)) {
1032 mulOpts.core = TILEMUL_MAD;
1033 }
1034 else if (!strcmp("dot", optarg)) {
1035 mulOpts.core = TILEMUL_DOT;
1036 }
1037 else {
1038 printf("Unknown multiplier core %s. Supported values"
1039 " are \"muladd\", \"mad\" and \"dot\".\n", optarg);
1040 exit(EXIT_FAILURE);
1041 }
1042 break;
1043 case 'o':
1044 separateFetch = false;
1045 break;
1046 case 'v':
1047 verbose = true;
1048 break;
1049 case -1:
1050 break;
1051 default:
1052 printUsage(argv[0], EXIT_FAILURE);
1053 break;
1054 }
1055 } while (nextOption != -1);
1056
1057 if (optind + 2 >= argc) {
1058 printf("Error: Not all sizes are specified\n");
1059 printUsage(argv[0], EXIT_FAILURE);
1060 }
1061 blockM = atoi(argv[optind]);
1062 blockN = atoi(argv[optind + 1]);
1063 blockK = atoi(argv[optind + 2]);
1064
1065 if ((mulOpts.memA == CLMEM_LOCAL_MEMORY ||
1066 mulOpts.memB == CLMEM_LOCAL_MEMORY) &&
1067 ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) {
1068 printf("One of matrixes is in local memory, "
1069 "disabling global cycling\n");
1070 mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC;
1071 }
1072
1073 if (mulOpts.flags & TILEMUL_TRA) {
1074 kextra.flags |= KEXTRA_TRANS_A;
1075 }
1076 if (mulOpts.flags & TILEMUL_TRB) {
1077 kextra.flags |= KEXTRA_TRANS_B;
1078 }
1079
1080 subdims[0].y = blockM * ITEM_WORK_M;
1081 subdims[0].x = blockN * ITEM_WORK_N;
1082 subdims[0].bwidth = blockK * ITEM_BLOCKS_K;
1083 subdims[1].y = blockM;
1084 subdims[1].x = blockN;
1085 subdims[1].bwidth = blockK;
1086
1087 memset(out, 0, sizeof(out));
1088
1089 i = isDoubleBasedType(kextra.dtype);
1090 kgenDeclareUptrs(ctx, i);
1091 genTest(ctx, &gset, &mulOpts, separateFetch);
1092 destroyKgenContext(ctx);
1093
1094 printf("Kernel code: \n\"%s\"\n", out);
1095 err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha,
1096 &gset, mulOpts.flags, deviceType, verbose, iterNum);
1097 if (err != CL_SUCCESS) {
1098 printf("Test run failed, error %d\n", err);
1099 return EXIT_FAILURE;
1100 }
1101 return EXIT_SUCCESS;
1102 }
1103