/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/


/*
 * Image based trsm generator
 */

#include <string.h>
#include <stdio.h>
#include <assert.h>

#include <clBLAS.h>
#include <blas_mempat.h>
#include <clkern.h>
#include <clblas-internal.h>

#include <matrix_dims.h>

#include "blas_kgen_legacy.h"
#include "gen_helper_legacy.h"
#include "trsm_kgen_legacy.h"
#include "../gen_helper.h"
#include "../trsm_kgen.h"
#include <dis_warning.h>

static const char *trsmImDecl =
    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
    "void __kernel\n"
    "%ctrsmIm(\n"
    "    uint %c,\n"
    "    uint %c,\n"
    "    %s alpha,\n"
    "    __read_only image2d_t A,\n"
    "    __global %s *B,\n"
    "    uint ldb,\n"
    "    uint startRow,\n"
    "    uint finishRow,\n"
    "    uint offB)\n";

/*
 * Template for the memory-object-based TRSM preparation part
 * for a one-dimensional work space
 */
static const char *trsmImPrep1D =
    "uint m0, k0;\n"
    "__local %s tempC[%lu];\n"
    "%s c[%u];\n"
    "const int lid = get_local_id(0);\n"
    "const int skew = lid %% %lu;\n"
    "%s"                                    // groups per Panel variable
    "uint blockN;\n"
    "uint x, y, imx, imy;\n"
    "uint2 coordA, coordB;\n"
    "\n"
    "const uint currN = get_global_id(0) / %u * %lu;\n"       // group ID
    "\n";

static const char *readRectBlock =
    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // just read with an optimized function
    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"           // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
    "}\n\n";

static const char *readRectBlockOpt =
    // just read with an optimized function
    "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";

static const char *readRectBlockTrans =
    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // read and transpose with an optimized function
    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"           // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    // read and transpose with slow function
    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
    "}\n\n";

static const char *readRectBlockTransOpt =
    // read and transpose with an optimized function
    "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";

static ssize_t
wrapper(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

static ssize_t
prepGenerator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra);

static void
assignKargs(KernelArg *args, const void *params, const void *extra);

static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs);

static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const void *args,
    const void *extra);


static void
imgPackMode(
    const void *extra,
    const SubproblemDim *dims,
    int dataID,
    unsigned int *packRate,
    clblasOrder *packOrder);

static SolverFlags
solverFlags(void);

static SolverOps solverOps = {
    wrapper,
    assignKargs,
    isFitToLDS,
    NULL,
    NULL,
    calcNrThreads,
    imgPackMode,
    solverFlags,
    NULL, //fixupArgs
    NULL, //getDefaultDecomp
    NULL, //getDecompList
    NULL,
    NULL
};

static CLBLASMpatExtra mpatExtra;

/* Prepare A kernel begin */

static const char *trsmPrepDecl =
    "void __kernel\n"
    "%ctrsmPrepare(\n"
    "    uint %c,\n"
    "    __global %s *A,\n"
    "    uint lda,\n"
    "    __write_only image2d_t imA,\n"
    "    uint startRow,\n"
    "    uint offA)\n";

/*
 * Template for the memory-object-based TRSM preparation part
 * for a one-dimensional work space
 */
static const char *trsmPrep1D =
    "__local %s tempA[%lu];\n"
    "__local %s tempC[%lu];\n"
    "int lid, gid;\n"
    "uint currM, k0;\n"
    "uint x, y, imx, imy;\n"
    "\n"
    "lid = get_local_id(0);\n"
    "gid = get_global_id(0) / %u;\n"      // group ID
    "A += offA;\n"
    "\n";

static const char *readSquareBlock =
    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // just read with an optimized function
    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"          // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
    "}\n\n";

static const char *readSquareBlockOpt =
    // just read with an optimized function
    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";

static const char *readSquareBlockTrans =
    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
    "if ((y == %lu) && (x == %lu)) {\n"
    // read and transpose with an optimized function
    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
    "}\n"
    "else {\n"
    "    %s((__local float4*)temp%c);\n"          // zeroing
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    // read and transpose with slow function
    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
    "}\n\n";

static const char *readSquareBlockTransOpt =
    // read and transpose with an optimized function
    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";


static bool
useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb)
{
    unsigned int vecLen;

    vecLen = sizeof(cl_float4) / dtypeSize(dtype);

    return (!(trb || isComplexType(dtype) || (dims[1].x % vecLen)));
}

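/*
 * Compute the pitch, in elements, of the local block of B.  When the
 * multiplication is done with a transposed block of B the pitch is based
 * on the block size along X, otherwise on bwidth; in both cases it is
 * rounded up to a whole number of float4 rows.
 */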
static size_t
calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul)
{
    size_t ret;
    size_t tsize;

    tsize = dtypeSize(dtype);
    ret = (transpMul) ? dim->x : dim->bwidth;
    ret = fl4RowWidth(ret, tsize) * sizeof(cl_float4) / tsize;

    return ret;
}

static void
genPrepareSquareBlock(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    DataType dtype,
    const CopyBufFuncs *copyFuncs,
    const ZeroFuncs *zeroFuncs,
    bool tra,
    char c,
    bool opt)
{
    char tmp[1024];
    size_t pitch;
    const char *readBlock;

    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
    if (opt) {
        readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
    }
    else {
        readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
        sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
                dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
                zeroFuncs->names[MATRIX_A], c,
                copyFuncs->readGeneric[MATRIX_A], c, pitch);
    }
    kgenAddStmt(ctx, tmp);
}

static void
genPrepZeroBlockC(
    struct KgenContext *ctx,
    const ZeroFuncs *zeroFuncs)
{
    char tmp[1024];
    sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_A]);
    kgenAddStmt(ctx, tmp);
}

static void
genWriteBlock(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const CopyBufFuncs *copyFuncs)
{
    char tmp[1024];

    sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n",
        copyFuncs->write, dim[0].y, dim[0].y, dim[0].y);
    kgenAddStmt(ctx, tmp);
}

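/*
 * getBufferPos() maps the linear block index n of a work group to the
 * (row, column) block coordinates of the corresponding block inside the
 * stored triangle of A by inverting the triangular-number formula; the
 * startRow adjustment first converts the group index into an absolute
 * index counted from the beginning of the triangle.
 */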
static void
getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer
{
    kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, "
                                                "uint width, uint *y, "
                                                "uint *x)\n");
    kgenBeginFuncBody(ctx);
    if (isU) {
        //n from beginning
        kgenAddStmt(ctx, "n += (2 * width - startRow + 1) * (startRow) / 2;\n");
        kgenAddStmt(ctx, "*y = trunc((2 * width + 1) - "
                               "sqrt((2 * width + 1) *"
                               "(2 * width + 1) - 8 * n)) / 2;\n");
        kgenAddStmt(ctx, "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n");
    }
    else {
        //n from beginning
        kgenAddStmt(ctx, "n += startRow * (startRow + 1) / 2;\n");
        kgenAddStmt(ctx, "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n");
        kgenAddStmt(ctx, "*x = n - (*y) * (*y + 1) / 2;\n");
    }
    kgenEndFuncBody(ctx);

    kgenAddBlankLine(ctx);
}

static void
genGetImagePos(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    DataType dtype,
    const char *blockName,
    bool tra) //n -> x,y image
{
    char tmp[1024];
    const char *parName;
    const char *op[2] = {"/", "%"};

    parName = (tra) ? "bpc" : "bpr";

    sprintf(tmp, "imy = %s %s %s * %lu;\n"
                 "imx = (%s %s %s) * %lu;\n",
            blockName, op[tra], parName, subdims[0].y,
            blockName, op[1 - tra], parName,
            subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4));
    kgenAddStmt(ctx, tmp);
}

// global memory to image converter
static ssize_t
prepGenerator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[1024];
    const char *typeName;
    CopyBufFuncs copyFuncs;
    ZeroFuncs zeroFuncs;
    char fpref;
    DataType dtype = kextra->dtype;
    KernelExtraFlags kflags = kextra->flags;
    ssize_t ret;
    size_t pitchAB;
    bool b;
    bool tra, trb, isU, transpMul;
    BlasGenSettings gset;

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
    isU = isMatrixUpper(kflags);

    // at first, generate needed declarations and auxiliary functions

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
                         BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE);
    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
                         ZF_MATRIX_A);

    //matrix inversion function
    genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU);

    //coordinates calculation
    getBufferPos(ctx, isU);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel

    sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName,
        typeName, typeName, typeName);

    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    transpMul = useTransposedMul(subdims, dtype, trb);
    if (!transpMul) {
        sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n",
                subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype)));
    }
    else {
        sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n",
                subdims[0].y);
    }
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then insert the
     * preparative code
     */
    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
    sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
            typeName, pitchAB * subdims[0].y, pgran->wgSize[0]);
    ret = kgenAddStmt(ctx, tmp);

    sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n",
            subdims[0].y, subdims[0].y - 1, subdims[0].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "currM *= %lu;\n"
            "k0 *= %lu;\n", subdims[0].y, subdims[0].y);
    kgenAddStmt(ctx, tmp);

    genGetImagePos(ctx, subdims, dtype, "gid", transpMul);

    kgenBeginBranch(ctx, "if (currM == k0)");
    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
                          tra, 'A', !(kextra->flags & KEXTRA_TAILS_M));
    genPrepZeroBlockC(ctx, &zeroFuncs);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);

    if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
        sprintf(tmp, "if (lid < %lu) {\n"
                     "    tempA[lid * %lu + lid] = %s;\n"
                     "}\n",
                subdims[0].bwidth, pitchAB, strOne(dtype));
        kgenAddStmt(ctx, tmp);
        kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
        kgenAddBlankLine(ctx);
    }

    sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
    kgenBeginBranch(ctx, tmp);
    sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? "
                                            "M - currM : %lu);\n",
            subdims[0].y, subdims[0].y);
    kgenAddStmt(ctx, tmp);
    kgenEndBranch(ctx, NULL);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenEndBranch(ctx, NULL);

    kgenBeginBranch(ctx, "else");
    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs, tra,
                          'C', !(kextra->flags & KEXTRA_TAILS_M));
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenEndBranch(ctx, NULL);

    genWriteBlock(ctx, subdims, &copyFuncs);
    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}

static void
genZeroResult(
    struct KgenContext *ctx,
    DataType dtype,
    const SubproblemDim *dims)
{
    unsigned int n;
    char tmp[1024];
    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);

    getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);

    sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
                 "    c[x] = 0;\n"
                 "}\n\n", n);

    kgenAddStmt(ctx, tmp);
}

static void
genPrepareRectBlock(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    DataType dtype,
    const CopyBufFuncs *copyFuncs,
    const ZeroFuncs *zeroFuncs,
    bool trb,
    char c,
    bool opt)
{
    char tmp[1024];
    size_t pitch;
    const char *readBlock;
    size_t bsizes[2] = {dim->bwidth, dim->x};

    /*
     * NOTE: when B is accessed in the non-transposed way, the block
     *       multiplication is done with a transposed block of B
     */
    pitch = calcPitchB(dim, dtype, !trb);
    if (opt) {
        readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
    }
    else {
        readBlock = (trb) ? readRectBlockTrans : readRectBlock;
        sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb],
                bsizes[1 - trb], bsizes[trb], bsizes[1 - trb],
                copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c,
                copyFuncs->readGeneric[MATRIX_B], c, pitch);
    }
    kgenAddStmt(ctx, tmp);
}

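/*
 * getNBlock() is the inverse of getBufferPos(): it maps the (row, column)
 * block coordinates inside the stored triangle of A back to the linear
 * block index counted from startRow.
 */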
static void
getNblock(struct KgenContext *ctx, bool isU) //x, y -> n
{
    kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, "
        "uint width, uint *n)\n");
    kgenBeginFuncBody(ctx);
    if (isU) {
        kgenAddStmt(ctx, "*n = ((2 * width - y + 1) * y - "
            "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n");
    }
    else {
        kgenAddStmt(ctx, "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n");
    }
    kgenEndFuncBody(ctx);
    kgenAddBlankLine(ctx);
}

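/*
 * Emit one block multiplication step: compute the linear block number of
 * the current block of A, convert it to image coordinates, and invoke the
 * generated block multiplier on that image block and the block of B staged
 * in tempC.
 */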
static void
genMultiplication(
    struct KgenContext *ctx,
    const SubproblemDim *dims,
    DataType dtype,
    const char *blkmulName,
    BlkMulFlags mulFlags)
{
    char tmp[1024];
    size_t u;
    unsigned int l1Pans;

    l1Pans = (unsigned int)(dims[0].x / dims[1].x);
    if (mulFlags & BLKMUL_TRANSPOSED_B) {
        u = 1;
    }
    else {
        u = matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft);
    }

    // find image position and invoke the multiplier
    sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, "
                           "(M + %lu) / %lu, &blockN);\n",
            dims[0].y, dims[0].y, dims[0].y, dims[0].y - 1, dims[0].y);
    kgenAddStmt(ctx, tmp);
    genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0);
    sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n"
                  "   (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
                  "   c, skew);\n",
            blkmulName, l1Pans, dims[1].y, l1Pans, dims[1].x, u);
    kgenAddStmt(ctx, tmp);
}

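/*
 * genReorderSolution() emits a small helper, reorderResult(), that rotates
 * the rows of the per-thread result tile 'skew' times; it compensates for
 * the row skew used by the block multiplier when BLKMUL_SKEW_ROW is set.
 */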
static void
genReorderSolution(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    const char *outTypeName,
    unsigned int colRegs)
{
    char tmp[1024], tmp1[1024];
    char *p;
    unsigned i;

    sprintf(tmp, "void\n"
                 "reorderResult(%s *c, int skew)",
            outTypeName);
    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    sprintf(tmp, "%s tmp;\n"
                 "int i, j;\n",
           outTypeName);
    kgenAddStmt(ctx, tmp);

    p = tmp1;
    for (i = 0; i < colRegs; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, colRegs, i, colRegs, i, i);
        p += strlen(p);
    }

    sprintf(tmp, "\n"
                 "for (i = 0; i < skew; i++) {\n"
                 "%s"
                 "}\n"
                 "\n",
            tmp1);
    kgenAddStmt(ctx, tmp);

    kgenEndFuncBody(ctx);
    kgenAddBlankLine(ctx);
}

static void
initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
{
    kvars->A = "imgA";
    kvars->B = "B";

    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
        kvars->coordA = "coordA.x";
    }
    else {
        kvars->coordA = "coordA.y";
    }
    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
        kvars->coordB = "coordB.x";
    }
    else {
        kvars->coordB = "coordB.y";
    }

    kvars->sizeM = "M";
    kvars->sizeN = "N";
    kvars->sizeK = "origM";
}

// image based kernel generator
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CLBLASKernExtra kextraTmp = *kextra;
    char tmp[1024], tmp1[1024];
    char blkmul[FUNC_NAME_MAXLEN];
    char updateResFn[FUNC_NAME_MAXLEN];
    char updateResGenericFn[FUNC_NAME_MAXLEN];
    char updateResFnRev[FUNC_NAME_MAXLEN];
    char updateResGenericFnRev[FUNC_NAME_MAXLEN];
    char copyPLFn[FUNC_NAME_MAXLEN];
    char *s1 = "";
    const char *typeName;
    CopyBufFuncs copyFuncs;
    ZeroFuncs zeroFuncs;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    BlkMulFlags mulFlags;
    size_t pitchAB;
    size_t u;
    bool b;
    bool isU;
    bool areTails;
    const char *outTypeName;
    unsigned int nrRegs, colRegs;
    KernelExtraFlags kflags = kextra->flags;
    size_t tsize;
    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    UpdateResultFlags upFlags;
    int tra, trb;
    unsigned int l1Pans;
    char vect[2] = {'y', 'x'};

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tsize = dtypeSize(dtype);
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
    isU = isMatrixUpper(kflags);

    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;

    /*
     * Force generation of the transposed version of the block reading
     * function, with the subsequent multiplication done on a transposed
     * block of B, to reduce LDS bank conflicts without using a column
     * skew. Temporarily flip the column-major order flag for that.
     */
    if (useTransposedMul(subdims, dtype, trb)) {
        if (kflags & KEXTRA_COLUMN_MAJOR) {
            kflags &= ~KEXTRA_COLUMN_MAJOR;
        }
        else {
            kflags |= KEXTRA_COLUMN_MAJOR;
        }
        mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B;
        u = subdims[1].y;
    }
    else {
        mulFlags = BLKMUL_SKEW_COLUMN;
        u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype));
    }

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    kextraTmp.flags = kflags;
    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = &kextraTmp;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }

    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B);
    /*
     * The temporary kernel extra was needed only to produce the inverted
     * block B read. Restore the original one, and restore kflags as well.
     */
    gset.kextra = kextra;
    kflags = kextra->flags;

    // functions updating result
    // for the final result
    generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
                       updateResGenericFn);
    // for intermediate result after blocks modification
    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
    genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
                           updateResGenericFnRev);
    // for heaping before multiplying on inverted block
    upFlags = UPRES_USE_LDS;
    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
        upFlags |= UPRES_COLUMN_MAJOR;
    }
    updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL);
    kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
    kgenAddBlankLine(ctx);

    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
                         ZF_MATRIX_B | ZF_MATRIX_C);

    // block multiplication function
    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_BUFFER;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);

        return -EOVERFLOW;
    }

    kgenAddBlankLine(ctx);
    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // block number calculation
    getNblock(ctx, isU);

    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
    if (isComplexType(dtype)) {
        colRegs = (unsigned int)subdims[1].x;
    }
    else {
        colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize);
    }

    if (mulFlags & BLKMUL_SKEW_ROW) {
        genReorderSolution(ctx, subdims, outTypeName, colRegs);
    }

    // now, generate the kernel

    if (kflags & KEXTRA_SIDE_RIGHT) {
        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
            fpref, 'N', 'M', typeName, typeName, typeName, typeName);
    }
    else {
        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
            fpref, 'M', 'N', typeName, typeName, typeName, typeName);
    }

    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
        sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n",
                subdims[0].y / (sizeof(cl_float4) / tsize));
    }
    else {
        sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n",
                subdims[0].y);
    }
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then insert the
     * preparative code
     */
    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);

    sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x,
        outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "B += offB;\n");
    sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
                 "coordB.%c = 0;\n\n",
            vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
    kgenAddStmt(ctx, tmp);

    /*
     * The B matrix is divided into panels; each work group
     * multiplies such a panel by the whole matrix A.
     */

    // top level loop over M
    if (isU) {
        sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y,
                subdims[0].y); //last block start
        sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)",
                tmp1, subdims[0].y, subdims[0].y);
        ret = kgenBeginBranch(ctx, tmp);
    }
    else {
        sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)",
                subdims[0].y);
        ret = kgenBeginBranch(ctx, tmp);
    }

    sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n"
                 "coordA.%c = 0;\n\n",
            vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
    kgenAddStmt(ctx, tmp);

    genZeroResult(ctx, dtype, subdims);

    // loop over K
    if (isU) {
        sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)",
            subdims[0].bwidth, subdims[0].bwidth);
    }
    else {
        sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
            subdims[0].bwidth);
    }
    ret = kgenBeginBranch(ctx, tmp);

    genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
                        trb, 'C', !areTails);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);

    // multiplication in the adjusting loop
    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenEndBranch(ctx, NULL); // loop over K
    kgenAddBlankLine(ctx);

    if (mulFlags & BLKMUL_SKEW_ROW) {
        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
    }
    kgenAddStmt(ctx, "k0 = m0;\n");

    genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
                                  updateResGenericFnRev, true);

    genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    genZeroResult(ctx, dtype, subdims);

    // multiplication on the inverted block
    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
    if (mulFlags & BLKMUL_SKEW_ROW) {
        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
    }

    // write back the tile evaluated
    upFlags = UPRES_EXCEED_PROBLEM_CONDITION;
    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) {
        upFlags |= UPRES_COLUMN_MAJOR;
    }
    genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
                                updateResGenericFn, NULL);

    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // end external loops over panels of matrix A
    kgenEndBranch(ctx, NULL);
    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}

static ssize_t
wrapper(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
        return generator(buf, buflen, subdims, pgran, extra);
    }
    else {
        return prepGenerator(buf, buflen, subdims, pgran, extra);
    }
}

static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;

    (void)extra;

    if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) {
        if (blasArgs->side == clblasLeft) {
           initSizeKarg(&args[0], blasArgs->K);
           initSizeKarg(&args[1], blasArgs->N);
        }
        else {
           initSizeKarg(&args[0], blasArgs->M);
           initSizeKarg(&args[1], blasArgs->K);
        }
        assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
        initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
        initSizeKarg(&args[5], blasArgs->ldb.matrix);
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[6], blasArgs->offsetM);
            initSizeKarg(&args[7], blasArgs->M + blasArgs->offsetM);
        }
        else {
            initSizeKarg(&args[6], blasArgs->offsetN);
            initSizeKarg(&args[7], blasArgs->N + blasArgs->offsetN);
        }
        initSizeKarg(&args[8], blasArgs->offBX);
    }
    else {
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[0], blasArgs->M);
        }
        else {
            initSizeKarg(&args[0], blasArgs->N);
        }
        initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
        initSizeKarg(&args[2], blasArgs->lda.matrix);
        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
        if (blasArgs->side == clblasLeft) {
            initSizeKarg(&args[4], blasArgs->offsetM);
        }
        else {
            initSizeKarg(&args[4], blasArgs->offsetN);
        }
        initSizeKarg(&args[5], blasArgs->offA);
    }
}

static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    cl_ulong sizeA, sizeB, size;
    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;

    /*
     * The prepare kernel requires two square local blocks, and the main
     * kernel requires two rectangular ones. The larger of the two block
     * sizes is checked against the available LDS.
     */

    sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
    sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
    size = (sizeA > sizeB) ? sizeA : sizeB;

    return (2 * size * dtypeSize(dtype) <= ldsSize);
}

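/*
 * For the A-preparation kernel the number of work groups equals the number
 * of blocks in the processed part of the triangle (the sum of an arithmetic
 * series over the block rows), so the global size is that count multiplied
 * by the work-group size; the compute kernel uses the standard global size
 * calculation over the problem dimensions.
 */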
static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *dims,
    const PGranularity *pgran,
    const void *args,
    const void *extra)
{
    SubproblemDim globDim, offDim;
    const CLBlasKargs *kargs = (const CLBlasKargs*)args;
    size_t width, startBlock, finishBlock;
    bool isU = (kargs->uplo == clblasUpper) ^
        (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight);

    (void)extra;

    width = kargs->K;
    width = (width + dims[0].bwidth - 1) / dims[0].bwidth;
    kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false);
    kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true);

    startBlock = offDim.y / dims[0].bwidth;
    finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth;

    if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
        if (isU) {
            threads[0] = ((2 * width - startBlock - finishBlock + 1) *
                (finishBlock - startBlock) / 2) * pgran->wgSize[0];
        }
        else {
            threads[0] = ((1 + finishBlock + startBlock) *
                (finishBlock - startBlock) / 2) * pgran->wgSize[0];
        }
        threads[1] = 0;
    }
    else {
        calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x);
    }
}

static void
imgPackMode(
    const void *extra,
    const SubproblemDim *dims,
    int dataID,
    unsigned int *packRate,
    clblasOrder *packOrder)
{
    bool trb;
    const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;

    (void)dataID;

    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
    if (trb || isComplexType(kextra->dtype)) {
        *packOrder = clblasRowMajor;
        *packRate = (unsigned int)dims[0].y;
    }
    else {
        *packOrder = clblasColumnMajor;
        *packRate = (unsigned int)dims[0].y;
    }
}

static SolverFlags
solverFlags(void)
{
    return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
}

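/*
 * Registers this solver as the "Image based block trsm" memory pattern:
 * matrix A is consumed through an OpenCL image and matrix B through a
 * regular buffer, with a two-level (compute-unit / thread) decomposition.
 */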
void
initTrsmImgPattern(MemoryPattern *mempat)
{
    mempat->name = "Image based block trsm";
    mempat->nrLevels = 2;
    mempat->cuLevel = 0;
    mempat->thLevel = 1;
    mempat->sops = &solverOps;
    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
    mpatExtra.bMset = CLMEM_LEVEL_LDS;
    mpatExtra.mobjA = CLMEM_IMAGE;
    mpatExtra.mobjB = CLMEM_BUFFER;
    mempat->extra = &mpatExtra;
}