1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * Image based trsm generator
20 */
21
22 #include <string.h>
23 #include <stdio.h>
24 #include <assert.h>
25
26 #include <clBLAS.h>
27 #include <blas_mempat.h>
28 #include <clkern.h>
29 #include <clblas-internal.h>
30
31 #include <matrix_dims.h>
32
33 #include "blas_kgen_legacy.h"
34 #include "gen_helper_legacy.h"
35 #include "trsm_kgen_legacy.h"
36 #include "../gen_helper.h"
37 #include "../trsm_kgen.h"
38 #include <dis_warning.h>
39
/*
 * Declaration template of the image based trsm computing kernel.
 * Format arguments: work group size x, work group size y, BLAS type
 * prefix character, the two size argument names (M/N order depends on
 * the side), the type of alpha and the element type of matrix B.
 */
static const char *trsmImDecl =
    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
    "void __kernel\n"
    "%ctrsmIm(\n"
    "    uint %c,\n"
    "    uint %c,\n"
    "    %s alpha,\n"
    "    __read_only image2d_t A,\n"
    "    __global %s *B,\n"
    "    uint ldb,\n"
    "    uint startRow,\n"
    "    uint finishRow,\n"
    "    uint offB)\n";
53
54 /*
55 * template for memory object based trsm preparation part
56 * for one dimensional work space
57 */
/*
 * template for memory object based trsm preparation part
 * for one dimensional work space
 *
 * Format arguments: element type and size of the local tempC area,
 * private result type and register count, skew modulus, an optional
 * "groups per panel" variable declaration, the group size divisor,
 * and the panel width used to derive the group's starting column.
 */
static const char *trsmImPrep1D =
    "uint m0, k0;\n"
    "__local %s tempC[%lu];\n"
    "%s c[%u];\n"
    "const int lid = get_local_id(0);\n"
    "const int skew = lid %% %lu;\n"
    "%s"                                        // groups per Panel variable
    "uint blockN;\n"
    "uint x, y, imx, imy;\n"
    "uint2 coordA, coordB;\n"
    "\n"
    "const uint currN = get_global_id(0) / %u * %lu;\n" // group ID
    "\n";
71
/*
 * Templates reading one rectangular block of matrix B into local
 * memory.  The non-Opt variants first evaluate the real block extent
 * (y, x): full blocks take the fast copy path, partial blocks are
 * zeroed and then filled with a generic copy.  The *Trans variants
 * read B in the transposed way.  Format arguments are supplied by
 * genPrepareRectBlock().
 */
static const char *readRectBlock =
    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // just read with an optimized function
    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"        // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
    "}\n\n";

static const char *readRectBlockOpt =
    // just read with an optimized function
    "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";

static const char *readRectBlockTrans =
    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // read and transpose with an optimized function
    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"        // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    // read and transpose with slow function
    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
    "}\n\n";

static const char *readRectBlockTransOpt =
    // read and transpose with an optimized function
    "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
106
/* Top level generator dispatching between computing and prepare kernels */
static ssize_t
wrapper(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

/* Generator of the image based trsm computing kernel */
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

/* Generator of the kernel copying A blocks into the scratch image */
static ssize_t
prepGenerator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

/* Fill kernel argument descriptors from the BLAS arguments */
static void
assignKargs(KernelArg *args, const void *params, const void *extra);

/* Check that the two needed local blocks fit into the LDS */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs);

/* Evaluate the global work size for both kernels */
static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const void *args,
    const void *extra);


/* Report how matrix blocks are packed into the scratch image */
static void
imgPackMode(
    const void *extra,
    const SubproblemDim *dims,
    int dataID,
    unsigned int *packRate,
    clblasOrder *packOrder);

static SolverFlags
solverFlags(void);
160
/*
 * Solver operations table registered for this memory pattern;
 * NULL entries are optional hooks this solver does not implement.
 */
static SolverOps solverOps = {
    wrapper,
    assignKargs,
    isFitToLDS,
    NULL,
    NULL,
    calcNrThreads,
    imgPackMode,
    solverFlags,
    NULL, //fixupArgs
    NULL, //getDefaultDecomp
    NULL, //getDecompList
    NULL,
    NULL
};

// memory pattern extra info; filled by initTrsmImgPattern()
static CLBLASMpatExtra mpatExtra;
178
179 /* Prepare A kernel begin */
180
/*
 * Declaration template of the A preparation kernel.
 * Format arguments: BLAS type prefix character, name of the size
 * argument (M or N depending on the side), and the element type of A.
 */
static const char *trsmPrepDecl =
    "void __kernel\n"
    "%ctrsmPrepare(\n"
    "    uint %c,\n"
    "    __global %s *A,\n"
    "    uint lda,\n"
    "    __write_only image2d_t imA,\n"
    "    uint startRow,\n"
    "    uint offA)\n";
190
191 /*
192 * template for memory object based trsm preparation part
193 * for one dimensional work space
194 */
/*
 * template for memory object based trsm preparation part
 * for one dimensional work space
 *
 * Format arguments: element type and size for the two local areas
 * (tempA and tempC), and the work group size used to derive the
 * group ID from the global ID.
 */
static const char *trsmPrep1D =
    "__local %s tempA[%lu];\n"
    "__local %s tempC[%lu];\n"
    "int lid, gid;\n"
    "uint currM, k0;\n"
    "uint x, y, imx, imy;\n"
    "\n"
    "lid = get_local_id(0);\n"
    "gid = get_global_id(0) / %u;\n"    // group ID
    "A += offA;\n"
    "\n";
206
/*
 * Templates reading one square block of matrix A into local memory.
 * Same structure as the readRectBlock* family: extent check with a
 * fast path for full blocks and zero-then-generic-copy for partial
 * ones; *Trans variants read transposed, *Opt variants assume a full
 * block.  Format arguments are supplied by genPrepareSquareBlock().
 */
static const char *readSquareBlock =
    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // just read with an optimized function
    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"        // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
    "}\n\n";

static const char *readSquareBlockOpt =
    // just read with an optimized function
    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";

static const char *readSquareBlockTrans =
    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // read and transpose with an optimized function
    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"        // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    // read and transpose with slow function
    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
    "}\n\n";

static const char *readSquareBlockTransOpt =
    // read and transpose with an optimized function
    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
241
242
243 static bool
useTransposedMul(const SubproblemDim * dims,DataType dtype,bool trb)244 useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb)
245 {
246 unsigned int vecLen;
247
248 vecLen = sizeof(cl_float4) / dtypeSize(dtype);
249
250 return (!(trb || isComplexType(dtype) || (dims[1].x % vecLen)));
251 }
252
253 static size_t
calcPitchB(const SubproblemDim * dim,DataType dtype,bool transpMul)254 calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul)
255 {
256 size_t ret;
257 size_t tsize;
258
259 tsize = dtypeSize(dtype);
260 ret = (transpMul) ? dim->x : dim->bwidth;
261 ret = fl4RowWidth(ret, tsize) * sizeof(cl_float4) / tsize;
262
263 return ret;
264 }
265
/*
 * Generate code reading one square block of A into the local area
 * named 'temp<c>'.  With 'opt' set the block is known to be full and
 * only the optimized copy call is emitted; otherwise the generated
 * code checks the extent, zeroes the area and falls back to a generic
 * copy for partial blocks.  'tra' selects the transposing templates.
 */
static void
genPrepareSquareBlock(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    DataType dtype,
    const CopyBufFuncs *copyFuncs,
    const ZeroFuncs *zeroFuncs,
    bool tra,       // read A in the transposed way
    char c,         // suffix of the destination local array name
    bool opt)       // whole blocks guaranteed, use the fast path only
{
    char tmp[1024];
    size_t pitch;
    const char *readBlock;

    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
    if (opt) {
        readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
    }
    else {
        readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
        // the argument order must match the readSquareBlock* templates
        sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
                dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
                zeroFuncs->names[MATRIX_A], c,
                copyFuncs->readGeneric[MATRIX_A], c, pitch);
    }
    kgenAddStmt(ctx, tmp);
}
295
296 static void
genPrepZeroBlockC(struct KgenContext * ctx,const ZeroFuncs * zeroFuncs)297 genPrepZeroBlockC(
298 struct KgenContext *ctx,
299 const ZeroFuncs *zeroFuncs)
300 {
301 char tmp[1024];
302 sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_A]);
303 kgenAddStmt(ctx, tmp);
304 }
305
306 static void
genWriteBlock(struct KgenContext * ctx,const SubproblemDim * dim,const CopyBufFuncs * copyFuncs)307 genWriteBlock(
308 struct KgenContext *ctx,
309 const SubproblemDim *dim,
310 const CopyBufFuncs *copyFuncs)
311 {
312 char tmp[1024];
313
314 sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n",
315 copyFuncs->write, dim[0].y, dim[0].y, dim[0].y);
316 kgenAddStmt(ctx, tmp);
317 }
318
/*
 * Emit the device function getBufferPos() mapping a linear block
 * number 'n' (counted from startRow within the stored triangle of A)
 * to block coordinates (*y, *x).  The emitted code inverts the
 * triangular-number row-offset formulas: for the upper variant rows
 * shorten as y grows, for the lower variant they lengthen, hence the
 * two different closed-form solutions via sqrt().
 */
static void
getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer
{
    kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, "
                             "uint width, uint *y, "
                             "uint *x)\n");
    kgenBeginFuncBody(ctx);
    if (isU) {
        //n from beginning
        kgenAddStmt(ctx, "n += (2 * width - startRow + 1) * (startRow) / 2;\n");
        // solve the quadratic for the row containing block n
        kgenAddStmt(ctx, "*y = trunc((2 * width + 1) - "
                              "sqrt((2 * width + 1) *"
                              "(2 * width + 1) - 8 * n)) / 2;\n");
        kgenAddStmt(ctx, "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n");
    }
    else {
        //n from beginning
        kgenAddStmt(ctx, "n += startRow * (startRow + 1) / 2;\n");
        // inverse of the triangular number y*(y+1)/2
        kgenAddStmt(ctx, "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n");
        kgenAddStmt(ctx, "*x = n - (*y) * (*y + 1) / 2;\n");
    }
    kgenEndFuncBody(ctx);

    kgenAddBlankLine(ctx);
}
344
/*
 * Emit statements computing the image coordinates (imx, imy) of the
 * block whose linear number is stored in 'blockName'.  Blocks are laid
 * out in 'bpr' blocks per row (or 'bpc' per column when the transposed
 * multiplication path is used); imx is scaled to float4 units.
 */
static void
genGetImagePos(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    DataType dtype,
    const char *blockName,
    bool tra) //n -> x,y image
{
    char tmp[1024];
    const char *parName;
    const char *op[2] = {"/", "%"};

    // transposed layout swaps the roles of division and modulo
    parName = (tra) ? "bpc" : "bpr";

    sprintf(tmp, "imy = %s %s %s * %lu;\n"
                 "imx = (%s %s %s) * %lu;\n",
            blockName, op[tra], parName, subdims[0].y,
            blockName, op[1 - tra], parName,
            subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4));
    kgenAddStmt(ctx, tmp);
}
366
367 // global memory to image converter
368 static ssize_t
prepGenerator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)369 prepGenerator(
370 char *buf,
371 size_t buflen,
372 const struct SubproblemDim *subdims,
373 const struct PGranularity *pgran,
374 void *extra)
375 {
376 struct KgenContext *ctx;
377 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
378 char tmp[1024];
379 const char *typeName;
380 CopyBufFuncs copyFuncs;
381 ZeroFuncs zeroFuncs;
382 char fpref;
383 DataType dtype = kextra->dtype;
384 KernelExtraFlags kflags = kextra->flags;
385 ssize_t ret;
386 size_t pitchAB;
387 bool b;
388 bool tra, trb, isU, transpMul;
389 BlasGenSettings gset;
390
391 if (pgran->wgDim != 1) {
392 return -EINVAL;
393 }
394
395 ctx = createKgenContext(buf, buflen, true);
396 if (ctx == NULL) {
397 return -ENOMEM;
398 }
399
400 tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
401 trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
402 isU = isMatrixUpper(kflags);
403
404 // at first, generate needed declarations and auxiliary functions
405
406 b = isDoubleBasedType(dtype);
407 kgenDeclareUptrs(ctx, b);
408
409 if (isComplexType(dtype)) {
410 genComplexMathOperators(ctx, dtype);
411 }
412
413 memset(&gset, 0, sizeof(gset));
414 memcpy(gset.subdims, subdims, sizeof(gset.subdims));
415 gset.kextra = kextra;
416 gset.pgran = pgran;
417
418 generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset,
419 BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE);
420 generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
421 ZF_MATRIX_A);
422
423 //matrix inversion function
424 genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU);
425
426 //coordinates calculation
427 getBufferPos(ctx, isU);
428
429 typeName = dtypeBuiltinType(dtype);
430 fpref = dtypeToBlasPrefix(dtype);
431
432 // now, generate the kernel
433
434 sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName,
435 typeName, typeName, typeName);
436
437 kgenDeclareFunction(ctx, tmp);
438 ret = kgenBeginFuncBody(ctx);
439
440 transpMul = useTransposedMul(subdims, dtype, trb);
441 if (!transpMul) {
442 sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n",
443 subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype)));
444 }
445 else {
446 sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n",
447 subdims[0].y);
448 }
449 kgenAddStmt(ctx, tmp);
450
451 /*
452 * Calculate local buffer pitches, and then insert the
453 * preparative code
454 */
455 pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
456 sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
457 typeName, pitchAB * subdims[0].y, pgran->wgSize[0]);
458 ret = kgenAddStmt(ctx, tmp);
459
460 sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n",
461 subdims[0].y, subdims[0].y - 1, subdims[0].y);
462 kgenAddStmt(ctx, tmp);
463 sprintf(tmp, "currM *= %lu;\n"
464 "k0 *= %lu;\n", subdims[0].y, subdims[0].y);
465 kgenAddStmt(ctx, tmp);
466
467 genGetImagePos(ctx, subdims, dtype, "gid", transpMul);
468
469 kgenBeginBranch(ctx, "if (currM == k0)");
470 genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs,
471 tra, 'A', !(kextra->flags & KEXTRA_TAILS_M));
472 genPrepZeroBlockC(ctx, &zeroFuncs);
473 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
474
475 if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
476 sprintf(tmp, "if (lid < %lu) {\n"
477 " tempA[lid * %lu + lid] = %s;\n"
478 "}\n",
479 subdims[0].bwidth, pitchAB, strOne(dtype));
480 kgenAddStmt(ctx, tmp);
481 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
482 kgenAddBlankLine(ctx);
483 }
484
485 sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
486 kgenBeginBranch(ctx, tmp);
487 sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? "
488 "M - currM : %lu);\n",
489 subdims[0].y, subdims[0].y);
490 kgenAddStmt(ctx, tmp);
491 kgenEndBranch(ctx, NULL);
492 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
493 kgenEndBranch(ctx, NULL);
494
495 kgenBeginBranch(ctx, "else");
496 genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, tra,
497 'C', !(kextra->flags & KEXTRA_TAILS_M));
498 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
499 kgenEndBranch(ctx, NULL);
500
501 genWriteBlock(ctx, subdims, ©Funcs);
502 kgenEndFuncBody(ctx);
503 ret = kgenAddBlankLine(ctx);
504
505 if (!ret) {
506 ret = (ssize_t)kgenSourceSize(ctx) + 1;
507 }
508
509 destroyKgenContext(ctx);
510
511 return (ret < 0) ? -EOVERFLOW : ret;
512 }
513
514 static void
genZeroResult(struct KgenContext * ctx,DataType dtype,const SubproblemDim * dims)515 genZeroResult(
516 struct KgenContext *ctx,
517 DataType dtype,
518 const SubproblemDim *dims)
519 {
520 unsigned int n;
521 char tmp[1024];
522 unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
523
524 getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
525
526 sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
527 " c[x] = 0;\n"
528 "}\n\n", n);
529
530 kgenAddStmt(ctx, tmp);
531 }
532
/*
 * Generate code reading one rectangular block of B into the local
 * area 'temp<c>'.  With 'opt' set the block is known to be full and
 * only the optimized copy call is emitted; otherwise the generated
 * code checks the extent, zeroes the area and uses a generic copy for
 * partial blocks.  'trb' selects the transposing templates.
 */
static void
genPrepareRectBlock(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    DataType dtype,
    const CopyBufFuncs *copyFuncs,
    const ZeroFuncs *zeroFuncs,
    bool trb,       // read B in the transposed way
    char c,         // suffix of the destination local array name
    bool opt)       // whole blocks guaranteed, use the fast path only
{
    char tmp[1024];
    size_t pitch;
    const char *readBlock;
    // block extents indexed by 'trb' to swap y/x for the transposed read
    size_t bsizes[2] = {dim->bwidth, dim->x};

    /*
     * NOTE: in case of accessing to B in the non transposed way
     * block multiplication is done with transposed block B
     */
    pitch = calcPitchB(dim, dtype, !trb);
    if (opt) {
        readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
    }
    else {
        readBlock = (trb) ? readRectBlockTrans : readRectBlock;
        // the argument order must match the readRectBlock* templates
        sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb],
                bsizes[1 - trb], bsizes[trb], bsizes[1 - trb],
                copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c,
                copyFuncs->readGeneric[MATRIX_B], c, pitch);
    }
    kgenAddStmt(ctx, tmp);
}
567
/*
 * Emit the device function getNBlock(): the inverse of
 * getBufferPos(), mapping block coordinates (y, x) back to the linear
 * block number *n within the stored triangle of A, counted from
 * startRow.  Uses the same triangular-number row-offset formulas.
 */
static void
getNblock(struct KgenContext *ctx, bool isU) //x, y -> n
{
    kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, "
                             "uint width, uint *n)\n");
    kgenBeginFuncBody(ctx);
    if (isU) {
        kgenAddStmt(ctx, "*n = ((2 * width - y + 1) * y - "
                         "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n");
    }
    else {
        kgenAddStmt(ctx, "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n");
    }
    kgenEndFuncBody(ctx);
    kgenAddBlankLine(ctx);
}
584
/*
 * Emit the code locating the current block of A in the image
 * (getNBlock + genGetImagePos) and invoking the block multiplication
 * function accumulating into the private result registers.
 */
static void
genMultiplication(
    struct KgenContext *ctx,
    const SubproblemDim *dims,
    DataType dtype,
    const char *blkmulName,
    BlkMulFlags mulFlags)
{
    char tmp[1024];
    size_t u;
    unsigned int l1Pans;

    // number of level-1 panels per level-0 block along x
    l1Pans = (unsigned int)(dims[0].x / dims[1].x);
    if (mulFlags & BLKMUL_TRANSPOSED_B) {
        u = 1;
    }
    else {
        // stride between rows of the local block of B
        u = matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft);
    }

    // find image position and invoke the multiplier
    sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, "
                 "(M + %lu) / %lu, &blockN);\n",
            dims[0].y, dims[0].y, dims[0].y, dims[0].y - 1, dims[0].y);
    kgenAddStmt(ctx, tmp);
    genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0);
    sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n"
                 "   (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
                 "   c, skew);\n",
            blkmulName, l1Pans, dims[1].y, l1Pans, dims[1].x, u);
    kgenAddStmt(ctx, tmp);
}
617
/*
 * Emit the device function reorderResult() which rotates the private
 * result registers 'skew' times to undo the row skew applied by the
 * transposed block multiplication.  One rotation step cyclically
 * shifts each column group of registers down by one row.
 */
static void
genReorderSolution(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    const char *outTypeName,
    unsigned int colRegs)       // registers per result row
{
    char tmp[1024], tmp1[1024];
    char *p;
    unsigned i;

    sprintf(tmp, "void\n"
                 "reorderResult(%s *c, int skew)",
            outTypeName);
    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    sprintf(tmp, "%s tmp;\n"
                 "int i, j;\n",
            outTypeName);
    kgenAddStmt(ctx, tmp);

    // build one rotation step, column register by column register
    p = tmp1;
    for (i = 0; i < colRegs; i++) {
        // index of the last-row register of this column group
        unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, colRegs, i, colRegs, i, i);
        p += strlen(p);
    }

    // wrap the step into a loop executed 'skew' times
    sprintf(tmp, "\n"
                 "for (i = 0; i < skew; i++) {\n"
                 "%s"
                 "}\n"
                 "\n",
            tmp1);
    kgenAddStmt(ctx, tmp);

    kgenEndFuncBody(ctx);
    kgenAddBlankLine(ctx);
}
665
666 static void
initKernelVarNames(KernelVarNames * kvars,KernelExtraFlags kflags)667 initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
668 {
669 kvars->A = "imgA";
670 kvars->B = "B";
671
672 if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
673 kvars->coordA = "coordA.x";
674 }
675 else {
676 kvars->coordA = "coordA.y";
677 }
678 if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
679 kvars->coordB = "coordB.x";
680 }
681 else {
682 kvars->coordB = "coordB.y";
683 }
684
685 kvars->sizeM = "M";
686 kvars->sizeN = "N";
687 kvars->sizeK = "origM";
688 }
689
690 // image based kernel generator
691 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)692 generator(
693 char *buf,
694 size_t buflen,
695 const struct SubproblemDim *subdims,
696 const struct PGranularity *pgran,
697 void *extra)
698 {
699 struct KgenContext *ctx;
700 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
701 CLBLASKernExtra kextraTmp = *kextra;
702 char tmp[1024], tmp1[1024];
703 char blkmul[FUNC_NAME_MAXLEN];
704 char updateResFn[FUNC_NAME_MAXLEN];
705 char updateResGenericFn[FUNC_NAME_MAXLEN];
706 char updateResFnRev[FUNC_NAME_MAXLEN];
707 char updateResGenericFnRev[FUNC_NAME_MAXLEN];
708 char copyPLFn[FUNC_NAME_MAXLEN];
709 char *s1 = "";
710 const char *typeName;
711 CopyBufFuncs copyFuncs;
712 ZeroFuncs zeroFuncs;
713 char fpref;
714 DataType dtype = kextra->dtype;
715 ssize_t ret;
716 BlasGenSettings gset;
717 BlkMulOpts mulOpts;
718 BlkMulFlags mulFlags;
719 size_t pitchAB;
720 size_t u;
721 bool b;
722 bool isU;
723 bool areTails;
724 const char *outTypeName;
725 unsigned int nrRegs, colRegs;
726 KernelExtraFlags kflags = kextra->flags;
727 size_t tsize;
728 unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
729 UpdateResultFlags upFlags;
730 int tra, trb;
731 unsigned int l1Pans;
732 char vect[2] = {'y', 'x'};
733
734 if (pgran->wgDim != 1) {
735 return -EINVAL;
736 }
737
738 ctx = createKgenContext(buf, buflen, true);
739 if (ctx == NULL) {
740 return -ENOMEM;
741 }
742
743 tsize = dtypeSize(dtype);
744 areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
745 isU = isMatrixUpper(kflags);
746
747 tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
748 trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
749 l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
750
751 /*
752 * Force generation of the transposed version of the block
753 * reading function with following multiplication with transposed
754 * block B to decrease LDS bank conflicts without column skew using.
755 * Reverse temporarily the flag of the column-major order for that
756 */
757 if (useTransposedMul(subdims, dtype, trb)) {
758 if (kflags & KEXTRA_COLUMN_MAJOR) {
759 kflags &= ~KEXTRA_COLUMN_MAJOR;
760 }
761 else {
762 kflags |= KEXTRA_COLUMN_MAJOR;
763 }
764 mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B;
765 u = subdims[1].y;
766 }
767 else {
768 mulFlags = BLKMUL_SKEW_COLUMN;
769 u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype));
770 }
771
772 ctx = createKgenContext(buf, buflen, true);
773 if (ctx == NULL) {
774 return -ENOMEM;
775 }
776
777 // at first, generate needed declarations and auxiliary functions
778
779 b = isDoubleBasedType(dtype);
780 kgenDeclareUptrs(ctx, b);
781
782 kextraTmp.flags = kflags;
783 memset(&gset, 0, sizeof(gset));
784 memcpy(gset.subdims, subdims, sizeof(gset.subdims));
785 gset.kextra = &kextraTmp;
786 gset.pgran = pgran;
787 initKernelVarNames(&gset.varNames, kextra->flags);
788
789 if (isComplexType(dtype)) {
790 genComplexMathOperators(ctx, dtype);
791 }
792
793 generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B);
794 /*
795 * Temporary kernel extra has been needed to produce inverted block B read.
796 * Restore the original one, and restore kflags as well
797 */
798 gset.kextra = kextra;
799 kflags = kextra->flags;
800
801 // functions updating result
802 // for the final result
803 generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
804 updateResGenericFn);
805 // for intermediate result after blocks modification
806 upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
807 upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
808 genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
809 updateResGenericFnRev);
810 // for heaping before multiplying on inverted block
811 upFlags = UPRES_USE_LDS;
812 if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
813 upFlags |= UPRES_COLUMN_MAJOR;
814 }
815 updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL);
816 kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
817 kgenAddBlankLine(ctx);
818
819 generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
820 ZF_MATRIX_B | ZF_MATRIX_C);
821
822 // block multiplication function
823 mulOpts.aMobj = CLMEM_IMAGE;
824 mulOpts.bMobj = CLMEM_BUFFER;
825 mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags;
826 if (isComplexType(dtype)) {
827 mulOpts.core = BLKMUL_SEPARATE_MULADD;
828 }
829 else {
830 mulOpts.core = BLKMUL_MAD;
831 }
832 ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
833 if (ret) {
834 destroyKgenContext(ctx);
835
836 return -EOVERFLOW;
837 }
838
839 kgenAddBlankLine(ctx);
840 kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
841
842 typeName = dtypeBuiltinType(dtype);
843 fpref = dtypeToBlasPrefix(dtype);
844
845 // block number calculation
846 getNblock(ctx, isU);
847
848 getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
849 if (isComplexType(dtype)) {
850 colRegs = (unsigned int)subdims[1].x;
851 }
852 else {
853 colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize);
854 }
855
856 if (mulFlags & BLKMUL_SKEW_ROW) {
857 genReorderSolution(ctx, subdims, outTypeName, colRegs);
858 }
859
860 // now, generate the kernel
861
862 if (kflags & KEXTRA_SIDE_RIGHT) {
863 sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
864 fpref, 'N', 'M', typeName, typeName, typeName, typeName);
865 }
866 else {
867 sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
868 fpref, 'M', 'N', typeName, typeName, typeName, typeName);
869 }
870
871 kgenDeclareFunction(ctx, tmp);
872 ret = kgenBeginFuncBody(ctx);
873
874 if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
875 sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n",
876 subdims[0].y / (sizeof(cl_float4) / tsize));
877 }
878 else {
879 sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n",
880 subdims[0].y);
881 }
882 kgenAddStmt(ctx, tmp);
883
884 /*
885 * Calculate local buffer pitches, and then insert the
886 * preparative code
887 */
888 pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
889
890 sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x,
891 outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX);
892 kgenAddStmt(ctx, tmp);
893 kgenAddBlankLine(ctx);
894
895 kgenAddStmt(ctx, "B += offB;\n");
896 sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
897 "coordB.%c = 0;\n\n",
898 vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
899 kgenAddStmt(ctx, tmp);
900
901 /*
902 * B matrix is divided on panels, each work group
903 * multiply such a panel on the whole matrix A.
904 */
905
906 // top level loop over M
907 if (isU) {
908 sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y,
909 subdims[0].y); //last block start
910 sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)",
911 tmp1, subdims[0].y, subdims[0].y);
912 ret = kgenBeginBranch(ctx, tmp);
913 }
914 else {
915 sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)",
916 subdims[0].y);
917 ret = kgenBeginBranch(ctx, tmp);
918 }
919
920 sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n"
921 "coordA.%c = 0;\n\n",
922 vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
923 kgenAddStmt(ctx, tmp);
924
925 genZeroResult(ctx, dtype, subdims);
926
927 // loop over K
928 if (isU) {
929 sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)",
930 subdims[0].bwidth, subdims[0].bwidth);
931 }
932 else {
933 sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
934 subdims[0].bwidth);
935 }
936 ret = kgenBeginBranch(ctx, tmp);
937
938 genPrepareRectBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs,
939 trb, 'C', !areTails);
940
941 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
942
943 // multiplication in the adjusting loop
944 genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
945
946 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
947 kgenEndBranch(ctx, NULL); // loop over K
948 kgenAddBlankLine(ctx);
949
950 if (mulFlags & BLKMUL_SKEW_ROW) {
951 kgenAddStmt(ctx, "reorderResult(c, skew);\n");
952 }
953 kgenAddStmt(ctx, "k0 = m0;\n");
954
955 genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
956 updateResGenericFnRev, true);
957
958 genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
959 kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
960 genZeroResult(ctx, dtype, subdims);
961
962 // multiplication on the inverted block
963 genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
964 if (mulFlags & BLKMUL_SKEW_ROW) {
965 kgenAddStmt(ctx, "reorderResult(c, skew);\n");
966 }
967
968 // write back the tile evaluated
969 upFlags = UPRES_EXCEED_PROBLEM_CONDITION;
970 if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) {
971 upFlags |= UPRES_COLUMN_MAJOR;
972 }
973 genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
974 updateResGenericFn, NULL);
975
976 kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
977
978 // end external loops over panels of matrix A
979 kgenEndBranch(ctx, NULL);
980 kgenEndFuncBody(ctx);
981 ret = kgenAddBlankLine(ctx);
982
983 if (!ret) {
984 ret = (ssize_t)kgenSourceSize(ctx) + 1;
985 }
986
987 destroyKgenContext(ctx);
988
989 return (ret < 0) ? -EOVERFLOW : ret;
990 }
991
992 static ssize_t
wrapper(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)993 wrapper(
994 char *buf,
995 size_t buflen,
996 const struct SubproblemDim *subdims,
997 const struct PGranularity *pgran,
998 void *extra)
999 {
1000 CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
1001 if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
1002 return generator(buf, buflen, subdims, pgran, extra);
1003 }
1004 else {
1005 return prepGenerator(buf, buflen, subdims, pgran, extra);
1006 }
1007 }
1008
/*
 * Fill the kernel argument descriptors.  The argument order must
 * match the declaration templates: trsmImDecl for the computing
 * kernel (size, size, alpha, image A, B, ldb, startRow, finishRow,
 * offB) and trsmPrepDecl for the preparation kernel (size, A, lda,
 * image, startRow, offA).  For the right-side case the roles of M
 * and N are swapped.
 */
static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;

    (void)extra;

    if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) {
        // args 0, 1: the two problem sizes, order depends on the side
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[0], blasArgs->K);
            initSizeKarg(&args[1], blasArgs->N);
        }
        else {
            initSizeKarg(&args[0], blasArgs->M);
            initSizeKarg(&args[1], blasArgs->K);
        }
        assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
        initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
        initSizeKarg(&args[5], blasArgs->ldb.matrix);
        // args 6, 7: startRow and finishRow of the processed panel
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[6], blasArgs->offsetM);
            initSizeKarg(&args[7], blasArgs->M + blasArgs->offsetM);
        }
        else {
            initSizeKarg(&args[6], blasArgs->offsetN);
            initSizeKarg(&args[7], blasArgs->N + blasArgs->offsetN);
        }
        initSizeKarg(&args[8], blasArgs->offBX);
    }
    else {
        // preparation kernel: size of the triangular matrix
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[0], blasArgs->M);
        }
        else {
            initSizeKarg(&args[0], blasArgs->N);
        }
        initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
        initSizeKarg(&args[2], blasArgs->lda.matrix);
        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
        // arg 4: startRow of the stored triangle
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[4], blasArgs->offsetM);
        }
        else {
            initSizeKarg(&args[4], blasArgs->offsetN);
        }
        initSizeKarg(&args[5], blasArgs->offA);
    }
}
1058
1059 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)1060 isFitToLDS(
1061 SubproblemDim *dim,
1062 DataType dtype,
1063 cl_ulong ldsSize,
1064 const void *kernelArgs)
1065 {
1066 cl_ulong sizeA, sizeB, size;
1067 const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
1068
1069 /*
1070 * For prepare kernel two square local blocks required.
1071 * For main kernel two rectangular blocks required.
1072 * Maximum of these two values checked.
1073 */
1074
1075 sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
1076 sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
1077 size = (sizeA > sizeB) ? sizeA : sizeB;
1078
1079 return (2 * size * dtypeSize(dtype) <= ldsSize);
1080 }
1081
/*
 * Evaluate the global work size.  For the preparation kernel one work
 * group per stored block of the triangle is launched; the number of
 * blocks between startBlock and finishBlock is the difference of two
 * triangular numbers (upper and lower variants differ in how row
 * lengths change).  For the computing kernel the generic panel-based
 * calculation is used.
 */
static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const void *args,
    const void *extra)
{
    SubproblemDim globDim, offDim;
    const CLBlasKargs *kargs = (const CLBlasKargs*)args;
    size_t width, startBlock, finishBlock;
    // effective "upper" flag after accounting for transpose and side
    bool isU = (kargs->uplo == clblasUpper) ^
               (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight);

    (void)extra;

    // width of the triangle in blocks, rounded up
    width = kargs->K;
    width = (width + dims[0].bwidth - 1) / dims[0].bwidth;
    kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false);
    kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true);

    startBlock = offDim.y / dims[0].bwidth;
    finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth;

    if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
        if (isU) {
            threads[0] = ((2 * width - startBlock - finishBlock + 1) *
                          (finishBlock - startBlock) / 2) * pgran->wgSize[0];
        }
        else {
            threads[0] = ((1 + finishBlock + startBlock) *
                          (finishBlock - startBlock) / 2) * pgran->wgSize[0];
        }
        threads[1] = 0;
    }
    else {
        calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x);
    }
}
1121
1122 static void
imgPackMode(const void * extra,const SubproblemDim * dims,int dataID,unsigned int * packRate,clblasOrder * packOrder)1123 imgPackMode(
1124 const void *extra,
1125 const SubproblemDim *dims,
1126 int dataID,
1127 unsigned int *packRate,
1128 clblasOrder *packOrder)
1129 {
1130 bool trb;
1131 const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
1132
1133 (void)dataID;
1134
1135 trb = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
1136 if (trb || isComplexType(kextra->dtype)) {
1137 *packOrder = clblasRowMajor;
1138 *packRate = (unsigned int)dims[0].y;
1139 }
1140 else {
1141 *packOrder = clblasColumnMajor;
1142 *packRate = (unsigned int)dims[0].y;
1143 }
1144 }
1145
1146 static SolverFlags
solverFlags(void)1147 solverFlags(void)
1148 {
1149 return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
1150 }
1151
1152 void
initTrsmImgPattern(MemoryPattern * mempat)1153 initTrsmImgPattern(MemoryPattern *mempat)
1154 {
1155 mempat->name = "Image based block trsm";
1156 mempat->nrLevels = 2;
1157 mempat->cuLevel = 0;
1158 mempat->thLevel = 1;
1159 mempat->sops = &solverOps;
1160 mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
1161 mpatExtra.bMset = CLMEM_LEVEL_LDS;
1162 mpatExtra.mobjA = CLMEM_IMAGE;
1163 mpatExtra.mobjB = CLMEM_BUFFER;
1164 mempat->extra = &mpatExtra;
1165 }
1166