1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * trmm image based generator
20  */
21 
22 #include <string.h>
23 #include <stdio.h>
24 #include <math.h>
25 #include <ctype.h>
26 
27 #include <clBLAS.h>
28 #include <matrix_dims.h>
29 #include <blas_mempat.h>
30 #include <clkern.h>
31 #include <clblas-internal.h>
32 #include <dis_warning.h>
33 
34 #include "blas_kgen_legacy.h"
35 #include "../gen_helper.h"
36 #include "gen_helper_legacy.h"
37 #include "trxm_common_legacy.h"
38 
/* Extra memory pattern description; filled in by initTrmmImgPattern(). */
static CLBLASMpatExtra mpatExtra;
40 
/* Generate the main image based trmm computing kernel. */
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra);

/* Generate the image preparation kernels (copy blocks of A or B to images). */
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra);
56 
57 static ssize_t
genWrapper(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)58 genWrapper(
59     char *buf,
60     size_t buflen,
61     const struct SubproblemDim *subdims,
62     const struct PGranularity *pgran,
63     void *extra)
64 {
65     CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
66     if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
67         return generator(buf, buflen, subdims, pgran, extra);
68     }
69     else {
70         return preparator(buf, buflen, subdims, pgran, extra);
71     }
72 }
73 
/* Fill in the kernel argument list for the computing/preparation kernels. */
static void
assignKargs(KernelArg *args, const void *params, const void *extra);

/* Check that the result tile fits into the available local memory. */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs);

/* Report the work space flags of this solver. */
static SolverFlags
solverFlags(void);

/* Compute global/local work sizes for the computing and preparation
   kernels. */
static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra);

/* Report the relative performance estimation of this pattern. */
static int getPerf(
    unsigned int kflags,
    const void *args);
98 
/*
 * Solver operations table for the image based trmm pattern.
 * NOTE(review): entries correspond to the SolverOps callback slots in
 * declaration order; NULL slots are optional callbacks this pattern
 * does not provide.
 */
static SolverOps imgSops = {
    genWrapper,
    assignKargs,
    isFitToLDS,
    getPerf,
    NULL,
    calcNrThreads,
    NULL,
    solverFlags,
    NULL, //fixupKargs
    NULL, //getDefaultDecomp
    NULL, //getDecompList
    NULL,
    NULL
};
114 
115 static void
imgToCopyBufFuncs(CopyBufFuncs * bufFuncs,const CopyImgFuncs * imgFuncs,KernelExtraFlags kflags)116 imgToCopyBufFuncs(
117     CopyBufFuncs *bufFuncs,
118     const CopyImgFuncs *imgFuncs,
119     KernelExtraFlags kflags)
120 {
121     memcpy(bufFuncs->write, imgFuncs->localToImage, FUNC_NAME_MAXLEN);
122     if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
123         memcpy(bufFuncs->read[MATRIX_A],
124                imgFuncs->globalToLocalTransposed[MATRIX_A], FUNC_NAME_MAXLEN);
125         memcpy(bufFuncs->readGeneric[MATRIX_A],
126                imgFuncs->globalToLocalTransposedGeneric[MATRIX_A],
127                FUNC_NAME_MAXLEN);
128     }
129     else {
130         memcpy(bufFuncs->read[MATRIX_A],
131                imgFuncs->globalToLocal[MATRIX_A], FUNC_NAME_MAXLEN);
132         memcpy(bufFuncs->readGeneric[MATRIX_A],
133                imgFuncs->globalToLocalGeneric[MATRIX_A],
134                FUNC_NAME_MAXLEN);
135     }
136 }
137 
/*
 * Generate the preparation kernel copying blocks of the triangular
 * matrix A into the image 'imgA'.
 *
 * Each work group handles one block. When the block is fully inside the
 * matrix, aligned and needs no transposing/conjugating, it is written to
 * the image directly from the global memory; otherwise it is staged
 * through the local memory where padding, zeroing of the "other"
 * triangle and the triangular fixup take place.
 *
 * ctx          - generator context the kernel source is emitted to
 * subdims      - subproblem dimensions; subdims[1] defines the block size
 * kflags       - kernel extra flags (uplo, access order, conjugation, ...)
 * dtype        - matrix element data type
 * copyImgFuncs - names of the generated copy helper functions
 * pgran        - parallelism granularity (work group sizes)
 */
static void
genPrepKernelA(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    KernelExtraFlags kflags,
    DataType dtype,
    CopyImgFuncs *copyImgFuncs,
    const PGranularity *pgran)
{
    char tmp[4096];
    bool isBranch = false;
    size_t localBufSize;
    unsigned int tsize, vecLen;
    const char *typeName;
    CopyBufFuncs copyBufFuncs;
    char fpref;

    fpref = dtypeToBlasPrefix(dtype);
    typeName = dtypeBuiltinType(dtype);
    tsize = dtypeSize(dtype);
    // elements per float4 image texel
    vecLen = sizeof(cl_float4) / tsize;
    localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
    localBufSize *= vecLen;
    imgToCopyBufFuncs(&copyBufFuncs, copyImgFuncs, kflags);

    sprintf(tmp, "void __kernel\n"
                 "%cprepareImageA(\n"
                 "    uint M,\n"
                 "    __global %s *A,\n"
                 "    uint lda,\n"
                 "    __write_only image2d_t imgA,\n"
                 "    uint startM,\n"
                 "    uint origM,\n"
                 "    uint offA)\n",
            fpref, typeName);
    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    kgenDeclareGroupID(ctx, "gid", pgran);
    kgenDeclareLocalID(ctx, "lid", pgran);
    // block coordinates: row of blocks (currM) and column position (k0)
    sprintf(tmp, "const uint bpr = (origM + %lu) / %lu;\n"
                 "uint currM = startM + (gid / bpr) * %lu;\n"
                 "uint k0 = (gid %% bpr) * %lu;\n"
                 "uint x, y;\n"
                 "__local %s tempA[%lu];\n"
                 "bool processed = false;\n\n",
            subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].y,
            subdims[1].bwidth, typeName, localBufSize);
    kgenAddStmt(ctx, tmp);

    kgenAddStmt(ctx, "A += offA;\n");
    // fast path: only possible without transposing/conjugating
    if (!(isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A) ||
          isMatrixConj(kflags, MATRIX_A))) {

        // restrict the fast path to blocks entirely in the stored triangle
        if (isMatrixUpper(kflags)) {
            sprintf(tmp, "if (k0 >= currM + %lu)", subdims[1].y);
        }
        else {
            sprintf(tmp, "if (k0 + %lu <= currM)", subdims[1].bwidth);
        }
        kgenBeginBranch(ctx, tmp);
        sprintf(tmp, "if ((currM + %lu <= M + startM) && "
                         "(k0 + %lu <= origM) && %d) {\n"
                     // write directly to an image from the global memory
                     "    %s(imgA, k0 / %u, currM - startM, (GPtr)A, "
                            "currM, k0, lda);\n"
                     "    processed = true;\n"
                     "}\n",
                subdims[1].y, subdims[1].bwidth,
                (kflags & KEXTRA_NO_COPY_VEC_A) == 0,
                copyImgFuncs->globalToImage[MATRIX_A], vecLen);

        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "if (!processed)");
        isBranch = true;
    }

    // now, zeroing blocks entirely located in the "other" triangle
    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "if (k0 + %lu <= currM) {\n"
                     "    %s((__local float4*)tempA);\n"
                     "}\n",
                subdims[1].bwidth, copyImgFuncs->zeroBlock[MATRIX_A]);
    }
    else {
        sprintf(tmp, "if (k0 >= currM + %lu) {\n"
                     "    %s((__local float4*)tempA);\n"
                     "}\n",
                subdims[1].y, copyImgFuncs->zeroBlock[MATRIX_A]);
    }
    kgenAddStmt(ctx, tmp);

    // useful block path, reading data from the global memory to the local one
    kgenBeginBranch(ctx, "else");
    kgenAddStmt(ctx, "M += startM;\n");
    genPrepareTrxmBlockA(ctx, subdims, dtype, &copyBufFuncs,
                         (ZeroFuncs*)copyImgFuncs->zeroBlock,
                         kflags, "origM");
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddStmt(ctx, "M -= startM;\n");
    genTriangMatrBlock(ctx, subdims, dtype, kflags);
    kgenEndBranch(ctx, NULL);

    // and write to the image
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    sprintf(tmp, "%s(imgA, k0 / %u, currM - startM, (LPtr)tempA);\n",
            copyImgFuncs->localToImage[MATRIX_A], vecLen);
    kgenAddStmt(ctx, tmp);
    if (isBranch) {
        kgenEndBranch(ctx, NULL);
    }

    kgenEndFuncBody(ctx);
}
254 
/*
 * Generate the preparation kernel copying blocks of the matrix B into
 * the image 'imgB'.
 *
 * Each work group handles one block. Fully aligned, non-transposed
 * blocks go directly from the global memory to the image; all other
 * cases (transposed access, unaligned tails, pure padding) are staged
 * through the local memory first.
 *
 * ctx          - generator context the kernel source is emitted to
 * subdims      - subproblem dimensions; subdims[1] defines the block size
 * dtype        - matrix element data type
 * copyImgFuncs - names of the generated copy helper functions
 * pgran        - parallelism granularity (work group sizes)
 * kflags       - kernel extra flags
 */
static void
genPrepKernelB(
    struct KgenContext *ctx,
    const SubproblemDim *subdims,
    DataType dtype,
    CopyImgFuncs *copyImgFuncs,
    const PGranularity *pgran,
    KernelExtraFlags kflags)
{
    char tmp[4096];
    size_t localBufSize;
    unsigned int tsize, vecLen;
    const char *typeName;
    char fpref;

    // kernel prologue: block coordinates and the local staging buffer
    const char *funcHead =
        "bool trb, aligned;\n"
        "const uint bpr = (origM + %lu) / %lu;\n"
        "const uint n = startN + (gid / bpr) * %lu;\n"
        "const uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "B += offB;\n"
        "trb = (order == clblasRowMajor) ^ (side == clblasRight);\n"
        "N += startN;\n";

    const char *funcBody =
        "//copy matrix B block\n"
        "y = n + %u <= N ? %u : N - n;\n"
        "x = k + %u <= origM ? %u : origM - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "if (aligned && !trb) {\n"
        "    %s(imgB, k / %u, n - startN, (GPtr)B, n, k, ldb);\n"
        "}\n"
        "else {\n"
        "    if (n >= N) {\n"
                // just zero, this is padding related part
        "        %s((__local float4*)temp);\n"
        "    }\n"
        "    else if (!aligned) {\n"
        "        // zero local memory\n"
        "        %s((__local float4*)temp);\n"
        "        barrier(CLK_LOCAL_MEM_FENCE);\n"
        "        if (trb) {\n"
        "            // generic transposed global to local\n"
        "            %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
        "        }\n"
        "        else {\n"
        "            // generic global to local\n"
        "            %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
        "        }\n"
        "    }\n"
        "    else {\n"
        "        if (trb) {//transposed, aligned\n"
        "            // optimized transposed global to local\n"
        "            %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
        "        }\n"
        "    }\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgB, k / %u, n - startN, (LPtr)temp);\n"
        "}\n"
        "\n";

    fpref = dtypeToBlasPrefix(dtype);
    typeName = dtypeBuiltinType(dtype);
    tsize = dtypeSize(dtype);
    // elements per float4 image texel
    vecLen = sizeof(cl_float4) / tsize;
    localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
    localBufSize *= vecLen;

    sprintf(tmp, "void __kernel\n"
                 "%cprepareImageB(\n"
                 "    clblasOrder order,\n"
                 "    clblasSide side,\n"
                 "    uint N,\n"
                 "    __global %s *B,\n"
                 "    uint ldb,\n"
                 "    __write_only image2d_t imgB,\n"
                 "    uint startN,\n"
                 "    uint origM,\n"
                 "    uint offB)\n",
            fpref, typeName);
    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    kgenDeclareGroupID(ctx, "gid", pgran);
    sprintf(tmp, funcHead,
            subdims[1].bwidth - 1, subdims[1].bwidth,
            subdims[1].x, subdims[1].bwidth,
            typeName, localBufSize);
    kgenAddStmt(ctx, tmp);

    sprintf(tmp, funcBody,
            subdims[1].x, subdims[1].x, // y = n + dy <= N ?...
            subdims[1].bwidth,
            subdims[1].bwidth, // x = k + bw <= M ?...
            subdims[1].bwidth,
            subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
            (kflags & KEXTRA_NO_COPY_VEC_B) == 0,
            copyImgFuncs->globalToImage[MATRIX_B],
            vecLen,
            copyImgFuncs->zeroBlock[MATRIX_B],
            copyImgFuncs->zeroBlock[MATRIX_B],
            copyImgFuncs->globalToLocalTransposedGeneric[MATRIX_B],
            subdims[1].bwidth,
            copyImgFuncs->globalToLocalGeneric[MATRIX_B],
            subdims[1].bwidth,
            copyImgFuncs->globalToLocalTransposed[MATRIX_B],
            copyImgFuncs->localToImage[MATRIX_B],
            vecLen);
    kgenAddStmt(ctx, tmp);

    kgenEndFuncBody(ctx);
}
370 
371 static void
declareMainKernel(struct KgenContext * ctx,DataType dtype,KernelExtraFlags kflags,const PGranularity * pgran)372 declareMainKernel(
373     struct KgenContext *ctx,
374     DataType dtype,
375     KernelExtraFlags kflags,
376     const PGranularity *pgran)
377 {
378     char tmp[4048];
379     char fpref;
380     const char *typeName;
381     char coordNames[2] = {'M', 'N'};
382     int side = ((kflags & KEXTRA_SIDE_RIGHT) != 0);
383 
384     fpref = dtypeToBlasPrefix(dtype);
385     typeName = dtypeBuiltinType(dtype);
386     sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n"
387                  "void __kernel\n"
388                  "%ctrmmImg(\n"
389                  "    uint %c,\n"
390                  "    uint %c,\n"
391                  "    const %s alpha,\n"
392                  "    const __read_only image2d_t A,\n"
393                  "    const __read_only image2d_t B,\n"
394                  "    __global %s *C,\n"
395                  "    uint ldb,\n"
396                  "    const uint start%c,\n"
397                  "    const uint start%c,\n"
398                  "    const uint origM,\n"
399                  "    const uint offB)\n",
400              pgran->wgSize[0], pgran->wgSize[1],  fpref, coordNames[side],
401              coordNames[1 - side], typeName, typeName, coordNames[side],
402              coordNames[1 - side]);
403 
404     kgenDeclareFunction(ctx, tmp);
405 }
406 
/*
 * Preparation function for the images based kernel generator.
 *
 * Emits one of the two image preparation kernels into 'buf', depending
 * on the kernel type recorded in 'extra': the A-matrix kernel
 * (CLBLAS_PREP_A_KERNEL) or the B-matrix kernel (everything else).
 *
 * Returns the emitted source size plus one on success, -ENOMEM if the
 * generator context cannot be created, or -EOVERFLOW if the output did
 * not fit into 'buflen' bytes.
 */
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CopyImgFuncs copyImgFuncs;
    BlasGenSettings gset;
    ssize_t ret;
    bool b;

    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
    memset(&gset, 0, sizeof(gset));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // declare typed union pointers; doubles need the fp64 pragma
    b = isDoubleBasedType(kextra->dtype);
    kgenDeclareUptrs(ctx, b);
    // only the B kernel takes clblasOrder/clblasSide arguments
    if (kextra->kernType == CLBLAS_PREP_B_KERNEL) {
        declareBlasEnums(ctx);
    }

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    // generate necessary memory to image copying functions
    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_TRMM, &gset);
    kgenAddBlankLine(ctx);

    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
        genPrepKernelA(ctx, subdims, kextra->flags, kextra->dtype,
                       &copyImgFuncs, pgran);
    }
    else {
        genPrepKernelB(ctx, subdims, kextra->dtype, &copyImgFuncs, pgran,
                       kextra->flags);
    }

    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
462 
463 static void
initKernelVarNames(KernelVarNames * kvars,KernelExtraFlags kflags)464 initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
465 {
466     kvars->A = "imgA";
467     kvars->B = "imgB";
468     if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
469         kvars->coordA = "coordA.x";
470     }
471     else {
472         kvars->coordA = "coordA.y";
473     }
474     if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) {
475         kvars->coordB = "coordB.x";
476     }
477     else {
478         kvars->coordB = "coordB.y";
479     }
480     kvars->sizeM = "M";
481     kvars->sizeN = "N";
482     kvars->sizeK = "K";
483 }
484 
/*
 * Generate the main image based trmm computing kernel.
 *
 * The kernel multiplies tiles of A (triangular) and B fetched from
 * images, accumulating the result in private registers, and finally
 * updates the output matrix C in the global memory. The K loop is
 * bounded by the triangle: for the upper case it runs backwards from
 * the end, for the lower case it stops just past the diagonal block.
 *
 * Returns the emitted source size plus one on success, -ENOMEM if the
 * generator context cannot be created, or -EOVERFLOW on emission
 * failure / output overflow.
 */
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;
    // is the iteration over N, N at the top level
    const char *typeName;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    KernelExtraFlags kflags = kextra->flags;
    int tra, trb;
    char coordNames[2] = {'M', 'N'};
    char vect[2] = {'y', 'x'};

    // per work item start coordinates and the vectorized K extent
    const char *coordConstants =
        "const uint workItemM = startM + get_global_id(0) * %lu;\n"
        "const uint workItemN = startN + get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (origM + %u) / %u;\n";

    /*
     *  template for image based trmm preparation part
     *  for two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM - startM);\n"
        "int2 coordB = (int2)(0, workItemN - startN);\n"
        "%s c[%u];\n\n";

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kflags);

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    // register pitch of one result row, in elements
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
                                             sizeof(cl_float4) / tsize;
    }

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    // now, generate the kernel
    declareMainKernel(ctx, dtype, kflags, pgran);
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants,
            subdims[1].y, subdims[1].x, subdims[1].y,
            vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // check if offset exceeds matrix
    kgenAddStmt(ctx, "if ((workItemM >= startM + M) ||"
                         "(workItemN >= startN + N)) {\n"
                     "    return;\n"
                     "}\n");

    // zero C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n",
            nrRegs);
    kgenAddStmt(ctx, tmp);

    // loop over K
    if (isMatrixUpper(kflags)) {
        // upper triangle: iterate backwards from the last block
        sprintf(tmp, "coordA.x = vectK - %lu;\n"
                     "coordB.x = coordA.x;\n",
                subdims[1].bwidth / vecLen);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "for (k0 = ((workItemM/%lu)*%lu)/%u; "
                          "k0 < vectK; k0 += %lu)",
                subdims[0].bwidth, subdims[0].bwidth, vecLen,
                subdims[1].bwidth / vecLen);
    }
    else {
        size_t dk;

        // lower triangle: stop shortly after the diagonal block
        dk = (subdims[1].y > subdims[1].bwidth) ? subdims[1].y :
                                                  subdims[1].bwidth;
        dk = dk / vecLen + 1;
        sprintf(tmp, "for (k0 = 0; "
                          "k0 < min((workItemM+%u)/%u + %lu, vectK); "
                          "k0 += %lu)",
                vecLen - 1, vecLen, dk, subdims[1].bwidth / vecLen);
    }
    kgenBeginBranch(ctx, tmp);

    // emit the inlined block multiplication over images
    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE |
                    BLKMUL_AVOID_AND;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    if (isMatrixUpper(kflags)) {
        // In this case loop is inverted to avoid 'random' skews
        sprintf(tmp, "\ncoordA.x -= %lu;\n"
                     "coordB.x -= %lu;\n",
                subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    }
    else {
        sprintf(tmp, "\ncoordA.x += %lu;\n"
                     "coordB.x += %lu;\n",
                subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    }
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);
    // reorder the given solution
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    // emit one register-rotation snippet per result column group,
    // undoing the row skew introduced by the block multiplier
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1)
                                         * regPitch / outVecLen + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, regPitch / outVecLen,
                i, regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n",
                 outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    // write back the tile evaluated
    tra = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM - startM;\n"
                 "coordB.%c = workItemN - startN;\n\n",
            vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);
    kgenBeginBranch(ctx, NULL);
    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_C);
    sprintf(tmp, "__global %s *B = C + offB + start%c * ldb + start%c;\n\n",
            typeName, coordNames[trb], coordNames[1 - trb]);

    kgenAddStmt(ctx, tmp);
    generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, NULL, NULL);
    kgenEndBranch(ctx, NULL);
    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
703 
704 
705 
/*
 * Set up the OpenCL kernel argument list for the computing and both
 * preparation kernels.
 *
 * The argument order must match the signatures produced by
 * declareMainKernel(), genPrepKernelA() and genPrepKernelB(). For the
 * right side multiplication the roles of M/N and of their offsets are
 * swapped via the sizes[]/offs[] lookup.
 */
static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
    int side = (blasArgs->side == clblasRight);
    size_t sizes[2] = {blasArgs->M, blasArgs->N};
    size_t offs[2] = {blasArgs->offsetM, blasArgs->offsetN};

    (void)extra;

    switch (blasArgs->kernType) {
    case CLBLAS_COMPUTING_KERNEL:
        initSizeKarg(&args[0], blasArgs->M);
        initSizeKarg(&args[1], blasArgs->N);
        assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
        INIT_KARG(&args[3], blasArgs->scimage[0]);
        INIT_KARG(&args[4], blasArgs->scimage[1]);
        initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
        initSizeKarg(&args[6], blasArgs->ldb.matrix);
        initSizeKarg(&args[7], blasArgs->offsetM);
        initSizeKarg(&args[8], blasArgs->offsetN);
        // K holds the original size of the triangular matrix
        initSizeKarg(&args[9], blasArgs->K);
        initSizeKarg(&args[10], blasArgs->offBX);
        break;
    case CLBLAS_PREP_A_KERNEL:
        initSizeKarg(&args[0], sizes[side]);
        initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
        initSizeKarg(&args[2], blasArgs->lda.matrix);
        INIT_KARG(&args[3], blasArgs->scimage[0]);
        initSizeKarg(&args[4], offs[side]);
        initSizeKarg(&args[5], blasArgs->K);
        initSizeKarg(&args[6], blasArgs->offA);
        break;
    case CLBLAS_PREP_B_KERNEL:
        INIT_KARG(&args[0], blasArgs->order);
        INIT_KARG(&args[1], blasArgs->side);
        initSizeKarg(&args[2], sizes[1 - side]);
        initMemobjKarg(&args[3], blasArgs->B, NULL, 0, 0);
        initSizeKarg(&args[4], blasArgs->ldb.matrix);
        INIT_KARG(&args[5], blasArgs->scimage[1]);
        initSizeKarg(&args[6], offs[1 - side]);
        initSizeKarg(&args[7], blasArgs->K);
        initSizeKarg(&args[8], blasArgs->offBX);
        break;
    default:
        //this should not happen
        break;
    }
}
755 
756 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)757 isFitToLDS(
758     SubproblemDim *dim,
759     DataType dtype,
760     cl_ulong ldsSize,
761     const void *kernelArgs)
762 {
763     cl_ulong size;
764     const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
765     size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side);
766     return (size * dtypeSize(dtype) <= ldsSize);
767 }
768 
769 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * extra)770 calcNrThreads(
771     size_t threads[2],
772     const SubproblemDim *subdims,
773     const PGranularity *pgran,
774     const void *args,
775     const void *extra)
776 {
777     const CLBlasKargs *kargs = args;
778     size_t m, n, k;
779     (void)extra;
780 
781     //form inner subdims with respect of multiplication side
782     if (kargs->side == clblasRight) {
783         m = kargs->N;
784         n = kargs->M;
785         //original N was stored in K
786         k = kargs->K;
787     }
788     else {
789         m = kargs->M;
790         n = kargs->N;
791         //original M was stored in K
792         k = kargs->K;
793     }
794 
795     if (kargs->kernType != CLBLAS_COMPUTING_KERNEL) {
796         size_t whole, part;
797         size_t nrGroups;
798 
799         // each thread gets one block
800         if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
801             whole = m;
802             part = subdims[0].itemY;
803         }
804         else {
805             whole = n;
806             part = subdims[0].itemX;
807         }
808 
809         nrGroups = whole / part + (whole % part != 0);
810         nrGroups *= (k / subdims[0].bwidth +
811                     (k % subdims[0].bwidth != 0));
812         threads[0] = pgran->wgSize[0] * nrGroups;
813         threads[1] = pgran->wgSize[1];
814     }
815     else {
816         calcGlobalThreads(threads, &subdims[0], pgran, m, n);
817     }
818 }
819 
820 static SolverFlags
solverFlags(void)821 solverFlags(void)
822 {
823     return (SF_WSPACE_2D);
824 }
825 
826 void
initTrmmImgPattern(MemoryPattern * mempat)827 initTrmmImgPattern(MemoryPattern *mempat)
828 {
829     mempat->name = "Image based block trmm";
830     mempat->nrLevels = 2;
831     mempat->cuLevel = 0;
832     mempat->thLevel = 1;
833     mempat->sops = &imgSops;
834 
835     mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
836     mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
837     mpatExtra.mobjA = CLMEM_IMAGE;
838     mpatExtra.mobjB = CLMEM_IMAGE;
839     mempat->extra = &mpatExtra;
840 }
841 
842 static int
getPerf(unsigned int kflags,const void * args)843 getPerf( unsigned int kflags,
844     const void *args)
845 {
846     DUMMY_ARG_USAGE(kflags);
847     DUMMY_ARG_USAGE(args);
848 
849     return PPERF_POOR;
850 }
851