1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
/*
 * Common helpers shared by the BLAS-related
 * kernel generators (legacy part).
 */
22 
23 #include <string.h>
24 #include <stdio.h>
25 #include <assert.h>
26 
27 #include <list.h>
28 #include <clblas_stddef.h>
29 
30 #include <matrix_props.h>
31 #include <matrix_dims.h>
32 #include <dis_warning.h>
33 
34 #include "blas_kgen_legacy.h"
35 
/*
 * Add the clblas enumeration typedefs (order, transpose, uplo, diag and
 * side) to the source being generated. The whole text is handed to the
 * generator context as one statement.
 */
void
declareBlasEnums(struct KgenContext *ctx)
{
    static const char enumDecls[] =
        "typedef enum clblasOrderEnum {\n"
        "   clblasRowMajor,\n"
        "   clblasColumnMajor\n"
        "} clblasOrder;\n"
        "\n"
        "typedef enum clblasTransposeEnum {\n"
        "   clblasNoTrans,\n"
        "   clblasTrans,\n"
        "   clblasConjTrans\n"
        "} clblasTranspose;\n"
        "\n"
        "typedef enum clblasUploEnum {\n"
        "   clblasUpper,\n"
        "   clblasLower\n"
        "} clblasUplo;\n"
        "\n"
        "typedef enum clblasDiagEnum {\n"
        "   clblasUnit,\n"
        "   clblasNonUnit\n"
        "} clblasDiag;\n"
        "\n"
        "typedef enum clblasSideEnum {\n"
        "   clblasLeft,\n"
        "   clblasRight\n"
        "} clblasSide;\n\n";

    kgenAddStmt(ctx, enumDecls);
}
66 
67 static unsigned int
getTmpVecLen(const BlasGenSettings * gset,UpdateResultFlags uflags,const char ** vecName)68 getTmpVecLen(
69     const BlasGenSettings *gset,
70     UpdateResultFlags uflags,
71     const char **vecName)
72 {
73     const CLBLASKernExtra *kextra = gset->kextra;
74     unsigned int vecLen;
75 
76     if (isComplexType(kextra->dtype) || (uflags & (UPRES_GENERIC |
77                                          UPRES_NO_VECTORIZATION))) {
78         vecLen = 1;
79     }
80     else {
81         vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? kextra->vecLenC :
82                                                        kextra->vecLen;
83         getVectorTypeName(kextra->dtype, vecLen, vecName, NULL);
84     }
85 
86     return vecLen;
87 }
88 
89 static void
updateOptimResultGen(struct KgenContext * ctx,const BlasGenSettings * gset,unsigned int wvlen,unsigned int pitch,unsigned int regOff,const char * ldName,UpdateResultOp op,UpdateResultFlags flags,const char * cachedName)90 updateOptimResultGen(
91     struct KgenContext *ctx,
92     const BlasGenSettings *gset,
93     unsigned int wvlen,
94     unsigned int pitch,
95     unsigned int regOff,
96     const char *ldName,
97     UpdateResultOp op,
98     UpdateResultFlags flags,
99     const char *cachedName)
100 {
101     char tmp[1024];
102     int tra, isDouble;
103     bool useReg = true;
104     char *regRole;
105     char dst[80], src[80];
106     char vchunkTmp[64], vchunkReg[64];
107     unsigned int sizes[2];
108     unsigned int i, j, k;
109     unsigned int off;
110     const char *vfield;
111     DataType dtype = gset->kextra->dtype;
112     bool isPrivDest = ((flags & UPRES_PRIV_DEST) != 0);
113     unsigned int vecLen;     // vector length of the result's register block
114     // vector length to update with at immediate operations
115     unsigned int uplen;
116     // vector length of the temporary storage location
117     unsigned int tmpVecLen;
118     const char *ptrName;
119 
120     sizes[0] = (unsigned int)gset->subdims[1].y;
121     sizes[1] = (unsigned int)gset->subdims[1].x;
122 
123     j = 0;
124     tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
125     isDouble = isDoubleBasedType(dtype);
126     vfield = dtypeUPtrField(dtype);
127     vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
128                                                    gset->kextra->vecLen;
129     tmpVecLen = getTmpVecLen(gset, flags, NULL);
130     getVectorTypeName(dtype, wvlen, NULL, &ptrName);
131     if (isComplexType(dtype)) {
132         vecLen = 1;
133     }
134     uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
135 
136     /*
137      * Pass recursively over the major dimension with power of 2 vectors.
138      * If the used type size is less then the current vector size,
139      * use assembling/disassembling into/from a temporary vector. This is
140      * for trying to increase effectiveness of operations with the global
141      * memory due to vectorization.
142      */
143     if (wvlen > sizes[1 - tra]) {
144         wvlen /= 2;
145         updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
146                              op, flags, cachedName);
147         return;
148     }
149 
150     if (wvlen == 1) {
151         kgenAddStmt(ctx, "// Copy with single words\n");
152     }
153     else {
154         const char *s = (isDouble) ? "double" : "float";
155 
156         sprintf(tmp, "// Copy with %s%d vectors\n", s, wvlen);
157         kgenAddStmt(ctx, tmp);
158     }
159 
160     for (i = 0; i < sizes[tra]; i++) {
161         unsigned int roff;
162 
163         if (tra) {
164             roff = regOff + i;
165         }
166         else {
167             roff = regOff + i * pitch;
168         }
169 
170         for (j = 0; j < sizes[1 - tra] / wvlen; j++) {
171             if (wvlen > uplen) {
172                 if (isPrivDest) {
173                     sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
174                     sprintf(tmp, "tmp%s = uC.%s[%u];\n",
175                             vchunkTmp, ptrName, j);
176                     kgenAddStmt(ctx, tmp);
177                 }
178                 else {
179                     // assemble vector
180                     for (k = 0; k < wvlen; k += uplen) {
181                         off = (tra) ? (roff + k * pitch) : (roff + k);
182                         sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
183                         sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
184                         sprintf(tmp, "tmp%s = c[%u]%s;\n",
185                                 vchunkTmp, off / vecLen, vchunkReg);
186                         kgenAddStmt(ctx, tmp);
187                     }
188                 }
189             }
190 
191             if (isPrivDest && (wvlen > uplen)) {
192                 // disassemble temporary vector and do immediate result update
193                 for (k = 0; k < wvlen; k += uplen) {
194                     off = (tra) ? (roff + k * pitch) : (roff + k);
195                     sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
196                     sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
197                     sprintf(src, "tmp%s", vchunkTmp);
198                     sprintf(dst, "c[%u]%s", off / vecLen, vchunkReg);
199                     genUpdateResultSingle(ctx, dst, src, gset, op, flags);
200                 }
201             }
202             else {
203                 if (wvlen > uplen) {
204                     sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
205                     sprintf(src, "tmp%s", vchunkTmp);
206                     useReg = false;
207                 }
208 
209                 if (!isPrivDest) {
210                     sprintf(dst, "uC.%s[%u]", ptrName, j);
211                     if (cachedName) {
212                         char *p = dst + strlen(dst);
213                         strcat(p, " = ");
214                         p = dst + strlen(dst);
215                         sprintf(p, cachedName, i, j);
216                     }
217                     regRole = src;
218                 }
219                 else {
220                     useReg = true;
221                     regRole = dst;
222                     sprintf(src, "uC.%s[%u]", ptrName, j);
223                 }
224 
225                 if (useReg) {
226                     sprintfVecChunk(vchunkReg, vecLen, uplen, roff % vecLen);
227                     sprintf(regRole, "c[%u]%s", roff / vecLen, vchunkReg);
228                 }
229 
230                 genUpdateResultSingle(ctx, dst, src, gset, op, flags);
231             }
232 
233             // update register offset
234             if (tra) {
235                 roff += wvlen * pitch;
236             }
237             else {
238                 roff += wvlen;
239             }
240         }
241 
242         // move the destination pointer to the next line
243         if ((i != sizes[tra] - 1)) {
244             sprintf(tmp, "uC.%s += %s;\n", vfield, ldName);
245             kgenAddStmt(ctx, tmp);
246             if (tra) {
247                 kgenAddBlankLine(ctx);
248             }
249         }
250     }
251 
252     if (j * wvlen != sizes[1 - tra]) {
253         // increment pointers
254         if (tra) {
255             regOff += j * wvlen * pitch;
256         }
257         else {
258             regOff += j * wvlen;
259         }
260 
261         sprintf(tmp, "\n"
262                      "uC.%s = tmpC.%s + %u;\n"
263                      "tmpC = uC;\n",
264                 vfield, vfield, j * wvlen);
265         kgenAddStmt(ctx, tmp);
266 
267         // go down
268         sizes[1 - tra] -= j * wvlen;
269         wvlen /= 2;
270         updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
271                              op, flags, cachedName);
272     }
273 }
274 
275 static void
updateGenericResultGen(struct KgenContext * ctx,const BlasGenSettings * gset,size_t pitch,UpresVarNames * uvars,UpdateResultOp op,UpdateResultFlags flags,const char * cachedName)276 updateGenericResultGen(
277     struct KgenContext *ctx,
278     const BlasGenSettings *gset,
279     size_t pitch,
280     UpresVarNames* uvars,
281     UpdateResultOp op,
282     UpdateResultFlags flags,
283     const char *cachedName)
284 {
285     char tmp[1024], dst[128], src[128];
286     const char *boundNames[2] = {uvars->nrRows, uvars->nrCols};
287     const char *vecType = NULL;
288     const char *vFieldVectorized;
289     DataType dtype = gset->kextra->dtype;
290     unsigned int wvlen;
291     unsigned int sizes[2];
292     const char*  vfield = dtypeUPtrField(dtype);
293     bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
294     bool row = ((flags & UPRES_TAIL_ROW));
295     bool col = ((flags & UPRES_TAIL_COL));
296     bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0);
297     int l0;
298     int l1;
299     unsigned int vecLen;     // vector length of the result's register block
300     // vector length to update with at immediate operations
301     unsigned int uplen;
302     // vector length of the temporary storage location
303     char vchunkReg[64];
304     bool revert = false;
305 
306     vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
307                                                    gset->kextra->vecLen;
308     if (isComplexType(dtype)) {
309         vecLen = 1;
310     }
311     uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
312     uplen = 1;
313 
314 
315     sizes[0] = (unsigned int)gset->subdims[1].y;
316     sizes[1] = (unsigned int)gset->subdims[1].x;
317 
318     if (iwc) {
319         const char* l0var =  boundNames[tra];
320         revert =  (tra && col) || (!tra && row);
321 
322         if (revert) {
323             sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld);
324         }
325         else {
326             sprintf(tmp, "\n");
327         }
328         kgenAddStmt(ctx, tmp);
329 
330     }
331     wvlen = getTmpVecLen(gset, flags, &vecType);
332     getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized);
333     sprintf(tmp, "res.%s = c;\n", vFieldVectorized);
334     kgenAddStmt(ctx, tmp);
335 
336     if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) {
337         char offStr[64];
338         char *p = offStr;
339 
340         offStr[0] = '\0';
341         if (flags & UPRES_TAIL_ROW) {
342             sprintf(offStr, " + (%u - %s) * %lu",
343                     sizes[0], uvars->nrRows, pitch);
344             p += strlen(offStr);
345         }
346         if (flags & UPRES_TAIL_COL) {
347             sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols);
348         }
349         if (iwc) {
350             sprintf(tmp, "res.%s = uC.%s%s;\n", vfield, vfield, offStr);
351             sprintf(tmp, "\n");
352         }
353         else {
354             sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr);
355         }
356         kgenAddStmt(ctx, tmp);
357 
358     }
359     if (iwc) {
360         int l0st = 1; int l0en = sizes[tra];
361         int l1st = 1; int l1en = sizes[1-tra];
362 
363         const char* l0var =  boundNames[tra];
364         const char* l1var = boundNames[1-tra];
365 
366         for (l0 = l0en; l0 >= l0st; l0--) {
367 
368             sprintf(tmp, "if (%s) ",l0var);
369             kgenBeginBranch(ctx, tmp);
370 
371             sprintf(tmp, "switch (%s)", l1var);
372             kgenBeginBranch(ctx, tmp);
373 
374             for (l1 = l1en; l1 >= l1st; l1--) {
375                 int resId;
376 
377                 sprintf(tmp, "case %d:\n", l1);
378                 kgenAddStmt(ctx, tmp);
379 
380                 if (tra) {
381                     resId = (row)
382                              ? (l1en-l1)*(int)pitch
383                              : (l1-l1st)*(int)pitch;
384 
385                     resId += (col)? (l0-l0st): (l0en-l0);
386                 }
387                 else {
388                     ///////////////////////////
389                     resId = (row)
390                             ? (l0-l0st)*(int)pitch
391                             : (l0en-l0)*(int)pitch;
392                     resId += (col)? (l1en-l1) : (l1-l1st);
393                 }
394 
395                 if ((tra && row) || (!tra && col)) {
396                      sprintf(dst, "uC.%s[(%s+%d) %% %i]",
397                              vfield, l1var, (l1en - l1),  (int)l1en);
398                 }
399                 else {
400                    sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st));
401                 }
402                 sprintfVecChunk(vchunkReg, vecLen, uplen, resId % vecLen);
403                 sprintf(src, "c[%u]%s", resId / vecLen, vchunkReg);
404 
405                 if (flags & UPRES_PRIV_DEST) {
406                     genUpdateResultSingle(ctx, src, dst, gset, op, flags);
407                 }
408                 else {
409                     genUpdateResultSingle(ctx, dst, src, gset, op, flags);
410                 }
411             }
412             kgenEndBranch(ctx, NULL);
413 
414             if (revert) {
415                 sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld);
416             }
417             else {
418                 sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld);
419             }
420 
421             kgenAddStmt(ctx, tmp);
422 
423             sprintf(tmp, "%s--;\n", l0var);
424             kgenAddStmt(ctx, tmp);
425             kgenEndBranch(ctx, NULL);
426         }
427 
428     }
429     else {
430 
431         sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]);
432         kgenBeginBranch(ctx, tmp);
433         sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]);
434         kgenBeginBranch(ctx, tmp);
435         sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld);
436         if (cachedName) {
437             unsigned int i;
438             char tmpcachedName[80] = " = ";
439             strcat(tmpcachedName, cachedName);
440             for (i = 3; i < strlen(tmpcachedName); i++) {
441                 if (strncmp(tmpcachedName+i, "%u", 2) == 0) {
442                     tmpcachedName[i+1] = 's';
443                 }
444             }
445             sprintf(tmp, tmpcachedName, "i", "[j]");
446             strcat(dst, tmp);
447         }
448         if (tra) {
449             sprintf(src, "res.%s[j * %lu + i]", vfield, pitch);
450         }
451         else {
452             sprintf(src, "res.%s[i * %lu + j]", vfield, pitch);
453         }
454         if (flags & UPRES_PRIV_DEST) {
455             genUpdateResultSingle(ctx, src, dst, gset, op, flags);
456         }
457         else {
458             genUpdateResultSingle(ctx, dst, src, gset, op, flags);
459         }
460         kgenEndBranch(ctx, NULL);
461         kgenEndBranch(ctx, NULL);
462     }
463 }
464 
465 int
updateResultGenOld(struct KgenContext * ctx,const BlasGenSettings * gset,UpdateResultOp op,UpdateResultFlags flags,const UpresVarNames * uvarNames)466 updateResultGenOld(
467     struct KgenContext *ctx,
468     const BlasGenSettings *gset,
469     UpdateResultOp op,
470     UpdateResultFlags flags,
471     const UpresVarNames *uvarNames)
472 {
473     char tmp[1024];
474     char *p = tmp;
475     const char *typeName;
476     const char *vecType = NULL;
477     const char *vfield;
478     const char *suff1;
479     const char *suff2;
480     int ret = 0;
481     unsigned int sizes[2];
482     bool generic, tra;
483     unsigned int wvlen;     // length of vectors to copy with
484     unsigned int uplen;     // length of vectors to update result with
485     size_t pitch;
486     char LG;
487     DataType dtype = gset->kextra->dtype;
488     unsigned int vecLen;
489     bool isInlined = (flags & UPRES_INLINE);
490     UpresVarNames uvars;
491 
492     vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
493                                                    gset->kextra->vecLen;
494     sizes[0] = (unsigned int)gset->subdims[1].y;
495     sizes[1] = (unsigned int)gset->subdims[1].x;
496 
497     if (isComplexType(dtype)) {
498         vecLen = 1;
499     }
500 
501     if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) {
502         return -EINVAL;
503     }
504 
505     tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
506     generic = ((flags & UPRES_GENERIC) != 0);
507     typeName = dtypeBuiltinType(dtype);
508     vfield = dtypeUPtrField(dtype);
509     pitch = roundUp(sizes[1], vecLen);
510 
511     // select write vectorization
512     wvlen = getTmpVecLen(gset, flags, &vecType);
513     uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
514 
515     suff1 = (generic) ? "Generic" : "";
516     suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : "";
517     LG = (flags & UPRES_USE_LDS) ? 'L' : 'G';
518 
519     if (!isInlined) {
520         const char *outTypeName;
521         const char *memPref = (flags & UPRES_USE_LDS) ? "__local" :
522                                                            "__global";
523 
524         getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName);
525 
526         // define the function
527         sprintf(tmp, "void\n"
528                      "updateResult%s%s%c(\n"
529                      "    %s %s *C,\n"
530                      "    %s *c,\n"
531                      "    %s alpha,\n"
532                      "    uint startRow,\n"
533                      "    uint startCol,\n"
534                      "    uint ld",
535                      suff1, suff2, LG, memPref, typeName,
536                      outTypeName, typeName);
537 
538         p += strlen(p);
539         if (flags & UPRES_WITH_BETA) {
540             sprintf(p, ",\n    %s beta", typeName);
541             p += strlen(p);
542         }
543         if (generic) {
544             sprintf(p, ",\n    uint nrRows,\n"
545                        "    uint nrCols");
546         }
547 
548         uvars.result = "C";
549         uvars.ld = "ld";
550         uvars.startRow = "startRow";
551         uvars.startCol = "startCol";
552         uvars.nrRows = "nrRows";
553         uvars.nrCols = "nrCols";
554 
555         strcat(p, ")\n");
556         kgenDeclareFunction(ctx, tmp);
557         kgenBeginFuncBody(ctx);
558     }
559     else {
560         memcpy(&uvars, uvarNames, sizeof(uvars));
561     }
562 
563     // declare local variables
564     sprintf(tmp, "%cPtr uC;\n", LG);
565     kgenAddStmt(ctx, tmp);
566     if (generic) {
567         kgenAddStmt(ctx, "int i, j;\n"
568                          "PPtr res;\n");
569     }
570     else {
571         /*
572          * temporary pointer to pass correctly over the
573          * destination array since destination rows can be
574          * not aligned on a vector bound
575          */
576         if (sizes[1 - tra] % wvlen != 0) {
577             sprintf(tmp, "%cPtr tmpC;\n", LG);
578             kgenAddStmt(ctx, tmp);
579         }
580         if (wvlen > uplen) {
581             sprintf(tmp, "%s tmp;\n", vecType);
582             kgenAddStmt(ctx, tmp);
583         }
584     }
585     if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) {
586         declareComplexMultParts(ctx, "alpha", typeName);
587         if (flags & UPRES_WITH_BETA) {
588             declareComplexMultParts(ctx, "beta", typeName);
589         }
590 
591     }
592     kgenAddBlankLine(ctx);
593 
594     if (tra) {
595         sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
596                 vfield, uvars.result, uvars.startCol, uvars.ld,
597                 uvars.startRow);
598     }
599     else {
600         sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
601                 vfield, uvars.result, uvars.startRow, uvars.ld,
602                 uvars.startCol);
603     }
604     kgenAddStmt(ctx, tmp);
605 
606     if ((sizes[1 - tra] % wvlen != 0) && !generic) {
607         kgenAddStmt(ctx, "tmpC = uC;\n");
608     }
609     ret = kgenAddBlankLine(ctx);
610 
611     if (generic) {
612         updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags,
613                                uvarNames ? uvarNames->cachedName : NULL);
614     }
615     else {
616         updateOptimResultGen(ctx, gset, wvlen, (unsigned int)pitch, 0, uvars.ld,
617                            op, flags, uvarNames ? uvarNames->cachedName : NULL);
618     }
619 
620     if (!isInlined) {
621         ret = kgenEndFuncBody(ctx);
622     }
623 
624     return (ret) ? -EOVERFLOW : 0;
625 }
626