/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/


/*
 * trmm image based generator
 */
21
#include <string.h>
#include <stdio.h>
#include <math.h>
#include <ctype.h>

#include <clBLAS.h>
#include <matrix_dims.h>
#include <blas_mempat.h>
#include <clkern.h>
#include <clblas-internal.h>
#include <dis_warning.h>

#include "blas_kgen_legacy.h"
#include "../gen_helper.h"
#include "gen_helper_legacy.h"
#include "trxm_common_legacy.h"
38
39 static CLBLASMpatExtra mpatExtra;
40
41 static ssize_t
42 generator(
43 char *buf,
44 size_t buflen,
45 const struct SubproblemDim *subdims,
46 const struct PGranularity *pgran,
47 void *extra);
48
49 static ssize_t
50 preparator(
51 char *buf,
52 size_t buflen,
53 const struct SubproblemDim *subdims,
54 const struct PGranularity *pgran,
55 void *extra);
56
57 static ssize_t
genWrapper(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)58 genWrapper(
59 char *buf,
60 size_t buflen,
61 const struct SubproblemDim *subdims,
62 const struct PGranularity *pgran,
63 void *extra)
64 {
65 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
66 if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
67 return generator(buf, buflen, subdims, pgran, extra);
68 }
69 else {
70 return preparator(buf, buflen, subdims, pgran, extra);
71 }
72 }
73
74 static void
75 assignKargs(KernelArg *args, const void *params, const void *extra);
76
77 static bool
78 isFitToLDS(
79 SubproblemDim *dim,
80 DataType dtype,
81 cl_ulong ldsSize,
82 const void *kernelArgs);
83
84 static SolverFlags
85 solverFlags(void);
86
87 static void
88 calcNrThreads(
89 size_t threads[2],
90 const SubproblemDim *subdims,
91 const PGranularity *pgran,
92 const void *args,
93 const void *extra);
94
95 static int getPerf(
96 unsigned int kflags,
97 const void *args);
98
99 static SolverOps imgSops = {
100 genWrapper,
101 assignKargs,
102 isFitToLDS,
103 getPerf,
104 NULL,
105 calcNrThreads,
106 NULL,
107 solverFlags,
108 NULL, //fixupKargs
109 NULL, //getDefaultDecomp
110 NULL, //getDecompList
111 NULL,
112 NULL
113 };
114
115 static void
imgToCopyBufFuncs(CopyBufFuncs * bufFuncs,const CopyImgFuncs * imgFuncs,KernelExtraFlags kflags)116 imgToCopyBufFuncs(
117 CopyBufFuncs *bufFuncs,
118 const CopyImgFuncs *imgFuncs,
119 KernelExtraFlags kflags)
120 {
121 memcpy(bufFuncs->write, imgFuncs->localToImage, FUNC_NAME_MAXLEN);
122 if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
123 memcpy(bufFuncs->read[MATRIX_A],
124 imgFuncs->globalToLocalTransposed[MATRIX_A], FUNC_NAME_MAXLEN);
125 memcpy(bufFuncs->readGeneric[MATRIX_A],
126 imgFuncs->globalToLocalTransposedGeneric[MATRIX_A],
127 FUNC_NAME_MAXLEN);
128 }
129 else {
130 memcpy(bufFuncs->read[MATRIX_A],
131 imgFuncs->globalToLocal[MATRIX_A], FUNC_NAME_MAXLEN);
132 memcpy(bufFuncs->readGeneric[MATRIX_A],
133 imgFuncs->globalToLocalGeneric[MATRIX_A],
134 FUNC_NAME_MAXLEN);
135 }
136 }
137
138 static void
genPrepKernelA(struct KgenContext * ctx,const SubproblemDim * subdims,KernelExtraFlags kflags,DataType dtype,CopyImgFuncs * copyImgFuncs,const PGranularity * pgran)139 genPrepKernelA(
140 struct KgenContext *ctx,
141 const SubproblemDim *subdims,
142 KernelExtraFlags kflags,
143 DataType dtype,
144 CopyImgFuncs *copyImgFuncs,
145 const PGranularity *pgran)
146 {
147 char tmp[4096];
148 bool isBranch = false;
149 size_t localBufSize;
150 unsigned int tsize, vecLen;
151 const char *typeName;
152 CopyBufFuncs copyBufFuncs;
153 char fpref;
154
155 fpref = dtypeToBlasPrefix(dtype);
156 typeName = dtypeBuiltinType(dtype);
157 tsize = dtypeSize(dtype);
158 vecLen = sizeof(cl_float4) / tsize;
159 localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
160 localBufSize *= vecLen;
161 imgToCopyBufFuncs(©BufFuncs, copyImgFuncs, kflags);
162
163 sprintf(tmp, "void __kernel\n"
164 "%cprepareImageA(\n"
165 " uint M,\n"
166 " __global %s *A,\n"
167 " uint lda,\n"
168 " __write_only image2d_t imgA,\n"
169 " uint startM,\n"
170 " uint origM,\n"
171 " uint offA)\n",
172 fpref, typeName);
173 kgenDeclareFunction(ctx, tmp);
174 kgenBeginFuncBody(ctx);
175
176 kgenDeclareGroupID(ctx, "gid", pgran);
177 kgenDeclareLocalID(ctx, "lid", pgran);
178 sprintf(tmp, "const uint bpr = (origM + %lu) / %lu;\n"
179 "uint currM = startM + (gid / bpr) * %lu;\n"
180 "uint k0 = (gid %% bpr) * %lu;\n"
181 "uint x, y;\n"
182 "__local %s tempA[%lu];\n"
183 "bool processed = false;\n\n",
184 subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].y,
185 subdims[1].bwidth, typeName, localBufSize);
186 kgenAddStmt(ctx, tmp);
187
188 kgenAddStmt(ctx, "A += offA;\n");
189 if (!(isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A) ||
190 isMatrixConj(kflags, MATRIX_A))) {
191
192 if (isMatrixUpper(kflags)) {
193 sprintf(tmp, "if (k0 >= currM + %lu)", subdims[1].y);
194 }
195 else {
196 sprintf(tmp, "if (k0 + %lu <= currM)", subdims[1].bwidth);
197 }
198 kgenBeginBranch(ctx, tmp);
199 sprintf(tmp, "if ((currM + %lu <= M + startM) && "
200 "(k0 + %lu <= origM) && %d) {\n"
201 // write directly to an image from the global memory
202 " %s(imgA, k0 / %u, currM - startM, (GPtr)A, "
203 "currM, k0, lda);\n"
204 " processed = true;\n"
205 "}\n",
206 subdims[1].y, subdims[1].bwidth,
207 (kflags & KEXTRA_NO_COPY_VEC_A) == 0,
208 copyImgFuncs->globalToImage[MATRIX_A], vecLen);
209
210 kgenAddStmt(ctx, tmp);
211 kgenEndBranch(ctx, NULL);
212
213 kgenBeginBranch(ctx, "if (!processed)");
214 isBranch = true;
215 }
216
217 // now, zeroing blocks entirely located in the "other" triangle
218 if (isMatrixUpper(kflags)) {
219 sprintf(tmp, "if (k0 + %lu <= currM) {\n"
220 " %s((__local float4*)tempA);\n"
221 "}\n",
222 subdims[1].bwidth, copyImgFuncs->zeroBlock[MATRIX_A]);
223 }
224 else {
225 sprintf(tmp, "if (k0 >= currM + %lu) {\n"
226 " %s((__local float4*)tempA);\n"
227 "}\n",
228 subdims[1].y, copyImgFuncs->zeroBlock[MATRIX_A]);
229 }
230 kgenAddStmt(ctx, tmp);
231
232 // useful block path, reading data from the global memory to the local one
233 kgenBeginBranch(ctx, "else");
234 kgenAddStmt(ctx, "M += startM;\n");
235 genPrepareTrxmBlockA(ctx, subdims, dtype, ©BufFuncs,
236 (ZeroFuncs*)copyImgFuncs->zeroBlock,
237 kflags, "origM");
238 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
239 kgenAddStmt(ctx, "M -= startM;\n");
240 genTriangMatrBlock(ctx, subdims, dtype, kflags);
241 kgenEndBranch(ctx, NULL);
242
243 // and write to the image
244 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
245 sprintf(tmp, "%s(imgA, k0 / %u, currM - startM, (LPtr)tempA);\n",
246 copyImgFuncs->localToImage[MATRIX_A], vecLen);
247 kgenAddStmt(ctx, tmp);
248 if (isBranch) {
249 kgenEndBranch(ctx, NULL);
250 }
251
252 kgenEndFuncBody(ctx);
253 }
254
255 static void
genPrepKernelB(struct KgenContext * ctx,const SubproblemDim * subdims,DataType dtype,CopyImgFuncs * copyImgFuncs,const PGranularity * pgran,KernelExtraFlags kflags)256 genPrepKernelB(
257 struct KgenContext *ctx,
258 const SubproblemDim *subdims,
259 DataType dtype,
260 CopyImgFuncs *copyImgFuncs,
261 const PGranularity *pgran,
262 KernelExtraFlags kflags)
263 {
264 char tmp[4096];
265 size_t localBufSize;
266 unsigned int tsize, vecLen;
267 const char *typeName;
268 char fpref;
269
270 const char *funcHead =
271 "bool trb, aligned;\n"
272 "const uint bpr = (origM + %lu) / %lu;\n"
273 "const uint n = startN + (gid / bpr) * %lu;\n"
274 "const uint k = (gid %% bpr) * %lu;\n"
275 "uint x, y;\n"
276 "__local %s temp[%lu];\n"
277 "\n"
278 "B += offB;\n"
279 "trb = (order == clblasRowMajor) ^ (side == clblasRight);\n"
280 "N += startN;\n";
281
282 const char *funcBody =
283 "//copy matrix B block\n"
284 "y = n + %u <= N ? %u : N - n;\n"
285 "x = k + %u <= origM ? %u : origM - k;\n"
286 "aligned = (x == %u) && (y == %u) && %d;\n"
287 "if (aligned && !trb) {\n"
288 " %s(imgB, k / %u, n - startN, (GPtr)B, n, k, ldb);\n"
289 "}\n"
290 "else {\n"
291 " if (n >= N) {\n"
292 // just zero, this is padding related part
293 " %s((__local float4*)temp);\n"
294 " }\n"
295 " else if (!aligned) {\n"
296 " // zero local memory\n"
297 " %s((__local float4*)temp);\n"
298 " barrier(CLK_LOCAL_MEM_FENCE);\n"
299 " if (trb) {\n"
300 " // generic transposed global to local\n"
301 " %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
302 " }\n"
303 " else {\n"
304 " // generic global to local\n"
305 " %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
306 " }\n"
307 " }\n"
308 " else {\n"
309 " if (trb) {//transposed, aligned\n"
310 " // optimized transposed global to local\n"
311 " %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
312 " }\n"
313 " }\n"
314 " barrier(CLK_LOCAL_MEM_FENCE);\n"
315 " %s(imgB, k / %u, n - startN, (LPtr)temp);\n"
316 "}\n"
317 "\n";
318
319 fpref = dtypeToBlasPrefix(dtype);
320 typeName = dtypeBuiltinType(dtype);
321 tsize = dtypeSize(dtype);
322 vecLen = sizeof(cl_float4) / tsize;
323 localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
324 localBufSize *= vecLen;
325
326 sprintf(tmp, "void __kernel\n"
327 "%cprepareImageB(\n"
328 " clblasOrder order,\n"
329 " clblasSide side,\n"
330 " uint N,\n"
331 " __global %s *B,\n"
332 " uint ldb,\n"
333 " __write_only image2d_t imgB,\n"
334 " uint startN,\n"
335 " uint origM,\n"
336 " uint offB)\n",
337 fpref, typeName);
338 kgenDeclareFunction(ctx, tmp);
339 kgenBeginFuncBody(ctx);
340
341 kgenDeclareGroupID(ctx, "gid", pgran);
342 sprintf(tmp, funcHead,
343 subdims[1].bwidth - 1, subdims[1].bwidth,
344 subdims[1].x, subdims[1].bwidth,
345 typeName, localBufSize);
346 kgenAddStmt(ctx, tmp);
347
348 sprintf(tmp, funcBody,
349 subdims[1].x, subdims[1].x, // y = n + dy <= N ?...
350 subdims[1].bwidth,
351 subdims[1].bwidth, // x = k + bw <= M ?...
352 subdims[1].bwidth,
353 subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
354 (kflags & KEXTRA_NO_COPY_VEC_B) == 0,
355 copyImgFuncs->globalToImage[MATRIX_B],
356 vecLen,
357 copyImgFuncs->zeroBlock[MATRIX_B],
358 copyImgFuncs->zeroBlock[MATRIX_B],
359 copyImgFuncs->globalToLocalTransposedGeneric[MATRIX_B],
360 subdims[1].bwidth,
361 copyImgFuncs->globalToLocalGeneric[MATRIX_B],
362 subdims[1].bwidth,
363 copyImgFuncs->globalToLocalTransposed[MATRIX_B],
364 copyImgFuncs->localToImage[MATRIX_B],
365 vecLen);
366 kgenAddStmt(ctx, tmp);
367
368 kgenEndFuncBody(ctx);
369 }
370
371 static void
declareMainKernel(struct KgenContext * ctx,DataType dtype,KernelExtraFlags kflags,const PGranularity * pgran)372 declareMainKernel(
373 struct KgenContext *ctx,
374 DataType dtype,
375 KernelExtraFlags kflags,
376 const PGranularity *pgran)
377 {
378 char tmp[4048];
379 char fpref;
380 const char *typeName;
381 char coordNames[2] = {'M', 'N'};
382 int side = ((kflags & KEXTRA_SIDE_RIGHT) != 0);
383
384 fpref = dtypeToBlasPrefix(dtype);
385 typeName = dtypeBuiltinType(dtype);
386 sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n"
387 "void __kernel\n"
388 "%ctrmmImg(\n"
389 " uint %c,\n"
390 " uint %c,\n"
391 " const %s alpha,\n"
392 " const __read_only image2d_t A,\n"
393 " const __read_only image2d_t B,\n"
394 " __global %s *C,\n"
395 " uint ldb,\n"
396 " const uint start%c,\n"
397 " const uint start%c,\n"
398 " const uint origM,\n"
399 " const uint offB)\n",
400 pgran->wgSize[0], pgran->wgSize[1], fpref, coordNames[side],
401 coordNames[1 - side], typeName, typeName, coordNames[side],
402 coordNames[1 - side]);
403
404 kgenDeclareFunction(ctx, tmp);
405 }
406
407 // Preparation function for images based kernel generator
408 static ssize_t
preparator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)409 preparator(
410 char *buf,
411 size_t buflen,
412 const struct SubproblemDim *subdims,
413 const struct PGranularity *pgran,
414 void *extra)
415 {
416 struct KgenContext *ctx;
417 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
418 CopyImgFuncs copyImgFuncs;
419 BlasGenSettings gset;
420 ssize_t ret;
421 bool b;
422
423 memset(©ImgFuncs, 0, sizeof(copyImgFuncs));
424 memset(&gset, 0, sizeof(gset));
425
426 ctx = createKgenContext(buf, buflen, true);
427 if (ctx == NULL) {
428 return -ENOMEM;
429 }
430
431 b = isDoubleBasedType(kextra->dtype);
432 kgenDeclareUptrs(ctx, b);
433 if (kextra->kernType == CLBLAS_PREP_B_KERNEL) {
434 declareBlasEnums(ctx);
435 }
436
437 memcpy(gset.subdims, subdims, sizeof(gset.subdims));
438 gset.kextra = kextra;
439 gset.pgran = pgran;
440
441 // generate necessary memory to image copying functions
442 generateImageCopyFuncs(©ImgFuncs, ctx, CLBLAS_TRMM, &gset);
443 kgenAddBlankLine(ctx);
444
445 if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
446 genPrepKernelA(ctx, subdims, kextra->flags, kextra->dtype,
447 ©ImgFuncs, pgran);
448 }
449 else {
450 genPrepKernelB(ctx, subdims, kextra->dtype, ©ImgFuncs, pgran,
451 kextra->flags);
452 }
453
454 ret = kgenAddBlankLine(ctx);
455 if (!ret) {
456 ret = (ssize_t)kgenSourceSize(ctx) + 1;
457 }
458 destroyKgenContext(ctx);
459
460 return (ret < 0) ? -EOVERFLOW : ret;
461 }
462
463 static void
initKernelVarNames(KernelVarNames * kvars,KernelExtraFlags kflags)464 initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
465 {
466 kvars->A = "imgA";
467 kvars->B = "imgB";
468 if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
469 kvars->coordA = "coordA.x";
470 }
471 else {
472 kvars->coordA = "coordA.y";
473 }
474 if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) {
475 kvars->coordB = "coordB.x";
476 }
477 else {
478 kvars->coordB = "coordB.y";
479 }
480 kvars->sizeM = "M";
481 kvars->sizeN = "N";
482 kvars->sizeK = "K";
483 }
484
485 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)486 generator(
487 char *buf,
488 size_t buflen,
489 const struct SubproblemDim *subdims,
490 const struct PGranularity *pgran,
491 void *extra)
492 {
493 struct KgenContext *ctx;
494 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
495 char tmp[4096], tmp1[4096];
496 char *p;
497 // is the iteration over N, N at the top level
498 const char *typeName;
499 DataType dtype = kextra->dtype;
500 ssize_t ret;
501 BlasGenSettings gset;
502 BlkMulOpts mulOpts;
503 unsigned int tsize;
504 unsigned int vecLen, outVecLen;
505 bool b;
506 const char *outTypeName;
507 unsigned int i;
508 unsigned int nrRegs, regPitch;
509 KernelExtraFlags kflags = kextra->flags;
510 int tra, trb;
511 char coordNames[2] = {'M', 'N'};
512 char vect[2] = {'y', 'x'};
513
514 const char *coordConstants =
515 "const uint workItemM = startM + get_global_id(0) * %lu;\n"
516 "const uint workItemN = startN + get_global_id(1) * %lu;\n"
517 "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
518 "uint vectK = (origM + %u) / %u;\n";
519
520 /*
521 * template for image based trmm preparation part
522 * for two dimensional work space
523 */
524 const char *localVariables =
525 "uint k0;\n"
526 "int2 coordA = (int2)(0, workItemM - startM);\n"
527 "int2 coordB = (int2)(0, workItemN - startN);\n"
528 "%s c[%u];\n\n";
529
530 memset(&gset, 0, sizeof(gset));
531 memcpy(gset.subdims, subdims, sizeof(gset.subdims));
532 gset.kextra = kextra;
533 gset.pgran = pgran;
534 initKernelVarNames(&gset.varNames, kflags);
535
536 tsize = dtypeSize(dtype);
537 vecLen = sizeof(cl_float4) / dtypeSize(dtype);
538 if (isComplexType(dtype)) {
539 regPitch = (unsigned int)subdims[1].x;
540 }
541 else {
542 regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
543 sizeof(cl_float4) / tsize;
544 }
545
546 ctx = createKgenContext(buf, buflen, true);
547 if (ctx == NULL) {
548 return -ENOMEM;
549 }
550
551 // at first, generate needed declarations and auxiliary functions
552 b = isDoubleBasedType(dtype);
553 kgenDeclareUptrs(ctx, b);
554
555 typeName = dtypeBuiltinType(dtype);
556
557 // now, generate the kernel
558 declareMainKernel(ctx, dtype, kflags, pgran);
559 ret = kgenBeginFuncBody(ctx);
560
561 // constants
562 sprintf(tmp, coordConstants,
563 subdims[1].y, subdims[1].x, subdims[1].y,
564 vecLen - 1, vecLen);
565 kgenAddStmt(ctx, tmp);
566
567 /*
568 * Calculate local buffer pitches, and then declare local
569 * variables
570 */
571 getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
572
573 sprintf(tmp, localVariables, outTypeName, nrRegs);
574 kgenAddStmt(ctx, tmp);
575
576 // check if offset exceeds matrix
577 kgenAddStmt(ctx, "if ((workItemM >= startM + M) ||"
578 "(workItemN >= startN + N)) {\n"
579 " return;\n"
580 "}\n");
581
582 // zero C block
583 sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
584 " c[k0] = 0;\n"
585 "}\n\n",
586 nrRegs);
587 kgenAddStmt(ctx, tmp);
588
589 // loop over K
590 if (isMatrixUpper(kflags)) {
591 sprintf(tmp, "coordA.x = vectK - %lu;\n"
592 "coordB.x = coordA.x;\n",
593 subdims[1].bwidth / vecLen);
594 kgenAddStmt(ctx, tmp);
595 sprintf(tmp, "for (k0 = ((workItemM/%lu)*%lu)/%u; "
596 "k0 < vectK; k0 += %lu)",
597 subdims[0].bwidth, subdims[0].bwidth, vecLen,
598 subdims[1].bwidth / vecLen);
599 }
600 else {
601 size_t dk;
602
603 dk = (subdims[1].y > subdims[1].bwidth) ? subdims[1].y :
604 subdims[1].bwidth;
605 dk = dk / vecLen + 1;
606 sprintf(tmp, "for (k0 = 0; "
607 "k0 < min((workItemM+%u)/%u + %lu, vectK); "
608 "k0 += %lu)",
609 vecLen - 1, vecLen, dk, subdims[1].bwidth / vecLen);
610 }
611 kgenBeginBranch(ctx, tmp);
612
613 mulOpts.aMobj = CLMEM_IMAGE;
614 mulOpts.bMobj = CLMEM_IMAGE;
615 mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE |
616 BLKMUL_AVOID_AND;
617 if (isComplexType(dtype)) {
618 mulOpts.core = BLKMUL_SEPARATE_MULADD;
619 }
620 else {
621 mulOpts.core = BLKMUL_MAD;
622 }
623 mulOpts.argNames.coordA = "coordA";
624 mulOpts.argNames.coordB = "coordB";
625 mulOpts.argNames.skewCol = "skewCol";
626 mulOpts.argNames.skewRow = "skewRow";
627 mulOpts.argNames.k = "k0";
628 mulOpts.argNames.vectBoundK = "vectK";
629 ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
630 if (ret) {
631 destroyKgenContext(ctx);
632 return -EOVERFLOW;
633 }
634
635 // update image coordinates
636 if (isMatrixUpper(kflags)) {
637 // In this case loop is inverted to avoid 'random' skews
638 sprintf(tmp, "\ncoordA.x -= %lu;\n"
639 "coordB.x -= %lu;\n",
640 subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
641 }
642 else {
643 sprintf(tmp, "\ncoordA.x += %lu;\n"
644 "coordB.x += %lu;\n",
645 subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
646 }
647 kgenAddStmt(ctx, tmp);
648
649 kgenEndBranch(ctx, NULL);
650 // reorder the given solution
651 outVecLen = isComplexType(dtype) ? 1 : vecLen;
652 p = tmp1;
653 for (i = 0; i < regPitch / outVecLen; i++) {
654 unsigned int k = (unsigned int)(subdims[1].y - 1)
655 * regPitch / outVecLen + i;
656
657 sprintf(p, "\n"
658 " tmp = c[%u];\n"
659 " for (j = %lu; j >= 0; j--) {\n"
660 " c[(j+1) * %u + %u] = c[j * %u + %u];\n"
661 " }\n"
662 " c[%u] = tmp;\n",
663 k, subdims[1].y - 2, regPitch / outVecLen,
664 i, regPitch / outVecLen, i, i);
665 p += strlen(p);
666 }
667 sprintf(tmp, "\n"
668 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
669 " int j;\n"
670 " %s tmp;\n"
671 "%s"
672 "}\n"
673 "\n",
674 outTypeName, tmp1);
675 kgenAddStmt(ctx, tmp);
676
677 // write back the tile evaluated
678 tra = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_A);
679 trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_B);
680 sprintf(tmp, "coordA.%c = workItemM - startM;\n"
681 "coordB.%c = workItemN - startN;\n\n",
682 vect[tra], vect[trb]);
683 kgenAddStmt(ctx, tmp);
684 kgenBeginBranch(ctx, NULL);
685 trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_C);
686 sprintf(tmp, "__global %s *B = C + offB + start%c * ldb + start%c;\n\n",
687 typeName, coordNames[trb], coordNames[1 - trb]);
688
689 kgenAddStmt(ctx, tmp);
690 generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, NULL, NULL);
691 kgenEndBranch(ctx, NULL);
692 kgenEndFuncBody(ctx);
693 ret = kgenAddBlankLine(ctx);
694
695 if (!ret) {
696 ret = (ssize_t)kgenSourceSize(ctx) + 1;
697 }
698
699 destroyKgenContext(ctx);
700
701 return (ret < 0) ? -EOVERFLOW : ret;
702 }
703
704
705
706 static void
assignKargs(KernelArg * args,const void * params,const void * extra)707 assignKargs(KernelArg *args, const void *params, const void *extra)
708 {
709 const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
710 int side = (blasArgs->side == clblasRight);
711 size_t sizes[2] = {blasArgs->M, blasArgs->N};
712 size_t offs[2] = {blasArgs->offsetM, blasArgs->offsetN};
713
714 (void)extra;
715
716 switch (blasArgs->kernType) {
717 case CLBLAS_COMPUTING_KERNEL:
718 initSizeKarg(&args[0], blasArgs->M);
719 initSizeKarg(&args[1], blasArgs->N);
720 assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
721 INIT_KARG(&args[3], blasArgs->scimage[0]);
722 INIT_KARG(&args[4], blasArgs->scimage[1]);
723 initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
724 initSizeKarg(&args[6], blasArgs->ldb.matrix);
725 initSizeKarg(&args[7], blasArgs->offsetM);
726 initSizeKarg(&args[8], blasArgs->offsetN);
727 initSizeKarg(&args[9], blasArgs->K);
728 initSizeKarg(&args[10], blasArgs->offBX);
729 break;
730 case CLBLAS_PREP_A_KERNEL:
731 initSizeKarg(&args[0], sizes[side]);
732 initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
733 initSizeKarg(&args[2], blasArgs->lda.matrix);
734 INIT_KARG(&args[3], blasArgs->scimage[0]);
735 initSizeKarg(&args[4], offs[side]);
736 initSizeKarg(&args[5], blasArgs->K);
737 initSizeKarg(&args[6], blasArgs->offA);
738 break;
739 case CLBLAS_PREP_B_KERNEL:
740 INIT_KARG(&args[0], blasArgs->order);
741 INIT_KARG(&args[1], blasArgs->side);
742 initSizeKarg(&args[2], sizes[1 - side]);
743 initMemobjKarg(&args[3], blasArgs->B, NULL, 0, 0);
744 initSizeKarg(&args[4], blasArgs->ldb.matrix);
745 INIT_KARG(&args[5], blasArgs->scimage[1]);
746 initSizeKarg(&args[6], offs[1 - side]);
747 initSizeKarg(&args[7], blasArgs->K);
748 initSizeKarg(&args[8], blasArgs->offBX);
749 break;
750 default:
751 //this should not happen
752 break;
753 }
754 }
755
756 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)757 isFitToLDS(
758 SubproblemDim *dim,
759 DataType dtype,
760 cl_ulong ldsSize,
761 const void *kernelArgs)
762 {
763 cl_ulong size;
764 const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
765 size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side);
766 return (size * dtypeSize(dtype) <= ldsSize);
767 }
768
769 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * extra)770 calcNrThreads(
771 size_t threads[2],
772 const SubproblemDim *subdims,
773 const PGranularity *pgran,
774 const void *args,
775 const void *extra)
776 {
777 const CLBlasKargs *kargs = args;
778 size_t m, n, k;
779 (void)extra;
780
781 //form inner subdims with respect of multiplication side
782 if (kargs->side == clblasRight) {
783 m = kargs->N;
784 n = kargs->M;
785 //original N was stored in K
786 k = kargs->K;
787 }
788 else {
789 m = kargs->M;
790 n = kargs->N;
791 //original M was stored in K
792 k = kargs->K;
793 }
794
795 if (kargs->kernType != CLBLAS_COMPUTING_KERNEL) {
796 size_t whole, part;
797 size_t nrGroups;
798
799 // each thread gets one block
800 if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
801 whole = m;
802 part = subdims[0].itemY;
803 }
804 else {
805 whole = n;
806 part = subdims[0].itemX;
807 }
808
809 nrGroups = whole / part + (whole % part != 0);
810 nrGroups *= (k / subdims[0].bwidth +
811 (k % subdims[0].bwidth != 0));
812 threads[0] = pgran->wgSize[0] * nrGroups;
813 threads[1] = pgran->wgSize[1];
814 }
815 else {
816 calcGlobalThreads(threads, &subdims[0], pgran, m, n);
817 }
818 }
819
820 static SolverFlags
solverFlags(void)821 solverFlags(void)
822 {
823 return (SF_WSPACE_2D);
824 }
825
826 void
initTrmmImgPattern(MemoryPattern * mempat)827 initTrmmImgPattern(MemoryPattern *mempat)
828 {
829 mempat->name = "Image based block trmm";
830 mempat->nrLevels = 2;
831 mempat->cuLevel = 0;
832 mempat->thLevel = 1;
833 mempat->sops = &imgSops;
834
835 mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
836 mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
837 mpatExtra.mobjA = CLMEM_IMAGE;
838 mpatExtra.mobjB = CLMEM_IMAGE;
839 mempat->extra = &mpatExtra;
840 }
841
842 static int
getPerf(unsigned int kflags,const void * args)843 getPerf( unsigned int kflags,
844 const void *args)
845 {
846 DUMMY_ARG_USAGE(kflags);
847 DUMMY_ARG_USAGE(args);
848
849 return PPERF_POOR;
850 }
851