1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * common stuff for blas related
20 * kernel generators, legacy part
21 */
22
23 #include <string.h>
24 #include <stdio.h>
25 #include <assert.h>
26
27 #include <list.h>
28 #include <clblas_stddef.h>
29
30 #include <matrix_props.h>
31 #include <matrix_dims.h>
32 #include <dis_warning.h>
33
34 #include "blas_kgen_legacy.h"
35
/*
 * Add the typedefs of the clblas enumerations used by generated kernels
 * (clblasOrder, clblasTranspose, clblasUplo, clblasDiag and clblasSide)
 * to the source accumulated in the generator context.
 *
 * ctx - generator context the declarations are appended to
 *
 * The whole text is passed to kgenAddStmt() as a single statement.
 */
void
declareBlasEnums(struct KgenContext *ctx)
{
    static const char blasEnumDecls[] =
        "typedef enum clblasOrderEnum {\n"
        " clblasRowMajor,\n"
        " clblasColumnMajor\n"
        "} clblasOrder;\n"
        "\n"
        "typedef enum clblasTransposeEnum {\n"
        " clblasNoTrans,\n"
        " clblasTrans,\n"
        " clblasConjTrans\n"
        "} clblasTranspose;\n"
        "\n"
        "typedef enum clblasUploEnum {\n"
        " clblasUpper,\n"
        " clblasLower\n"
        "} clblasUplo;\n"
        "\n"
        "typedef enum clblasDiagEnum {\n"
        " clblasUnit,\n"
        " clblasNonUnit\n"
        "} clblasDiag;\n"
        "\n"
        "typedef enum clblasSideEnum {\n"
        " clblasLeft,\n"
        " clblasRight\n"
        "} clblasSide;\n\n";

    kgenAddStmt(ctx, blasEnumDecls);
}
66
67 static unsigned int
getTmpVecLen(const BlasGenSettings * gset,UpdateResultFlags uflags,const char ** vecName)68 getTmpVecLen(
69 const BlasGenSettings *gset,
70 UpdateResultFlags uflags,
71 const char **vecName)
72 {
73 const CLBLASKernExtra *kextra = gset->kextra;
74 unsigned int vecLen;
75
76 if (isComplexType(kextra->dtype) || (uflags & (UPRES_GENERIC |
77 UPRES_NO_VECTORIZATION))) {
78 vecLen = 1;
79 }
80 else {
81 vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? kextra->vecLenC :
82 kextra->vecLen;
83 getVectorTypeName(kextra->dtype, vecLen, vecName, NULL);
84 }
85
86 return vecLen;
87 }
88
89 static void
updateOptimResultGen(struct KgenContext * ctx,const BlasGenSettings * gset,unsigned int wvlen,unsigned int pitch,unsigned int regOff,const char * ldName,UpdateResultOp op,UpdateResultFlags flags,const char * cachedName)90 updateOptimResultGen(
91 struct KgenContext *ctx,
92 const BlasGenSettings *gset,
93 unsigned int wvlen,
94 unsigned int pitch,
95 unsigned int regOff,
96 const char *ldName,
97 UpdateResultOp op,
98 UpdateResultFlags flags,
99 const char *cachedName)
100 {
101 char tmp[1024];
102 int tra, isDouble;
103 bool useReg = true;
104 char *regRole;
105 char dst[80], src[80];
106 char vchunkTmp[64], vchunkReg[64];
107 unsigned int sizes[2];
108 unsigned int i, j, k;
109 unsigned int off;
110 const char *vfield;
111 DataType dtype = gset->kextra->dtype;
112 bool isPrivDest = ((flags & UPRES_PRIV_DEST) != 0);
113 unsigned int vecLen; // vector length of the result's register block
114 // vector length to update with at immediate operations
115 unsigned int uplen;
116 // vector length of the temporary storage location
117 unsigned int tmpVecLen;
118 const char *ptrName;
119
120 sizes[0] = (unsigned int)gset->subdims[1].y;
121 sizes[1] = (unsigned int)gset->subdims[1].x;
122
123 j = 0;
124 tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
125 isDouble = isDoubleBasedType(dtype);
126 vfield = dtypeUPtrField(dtype);
127 vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
128 gset->kextra->vecLen;
129 tmpVecLen = getTmpVecLen(gset, flags, NULL);
130 getVectorTypeName(dtype, wvlen, NULL, &ptrName);
131 if (isComplexType(dtype)) {
132 vecLen = 1;
133 }
134 uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
135
136 /*
137 * Pass recursively over the major dimension with power of 2 vectors.
138 * If the used type size is less then the current vector size,
139 * use assembling/disassembling into/from a temporary vector. This is
140 * for trying to increase effectiveness of operations with the global
141 * memory due to vectorization.
142 */
143 if (wvlen > sizes[1 - tra]) {
144 wvlen /= 2;
145 updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
146 op, flags, cachedName);
147 return;
148 }
149
150 if (wvlen == 1) {
151 kgenAddStmt(ctx, "// Copy with single words\n");
152 }
153 else {
154 const char *s = (isDouble) ? "double" : "float";
155
156 sprintf(tmp, "// Copy with %s%d vectors\n", s, wvlen);
157 kgenAddStmt(ctx, tmp);
158 }
159
160 for (i = 0; i < sizes[tra]; i++) {
161 unsigned int roff;
162
163 if (tra) {
164 roff = regOff + i;
165 }
166 else {
167 roff = regOff + i * pitch;
168 }
169
170 for (j = 0; j < sizes[1 - tra] / wvlen; j++) {
171 if (wvlen > uplen) {
172 if (isPrivDest) {
173 sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
174 sprintf(tmp, "tmp%s = uC.%s[%u];\n",
175 vchunkTmp, ptrName, j);
176 kgenAddStmt(ctx, tmp);
177 }
178 else {
179 // assemble vector
180 for (k = 0; k < wvlen; k += uplen) {
181 off = (tra) ? (roff + k * pitch) : (roff + k);
182 sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
183 sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
184 sprintf(tmp, "tmp%s = c[%u]%s;\n",
185 vchunkTmp, off / vecLen, vchunkReg);
186 kgenAddStmt(ctx, tmp);
187 }
188 }
189 }
190
191 if (isPrivDest && (wvlen > uplen)) {
192 // disassemble temporary vector and do immediate result update
193 for (k = 0; k < wvlen; k += uplen) {
194 off = (tra) ? (roff + k * pitch) : (roff + k);
195 sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
196 sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
197 sprintf(src, "tmp%s", vchunkTmp);
198 sprintf(dst, "c[%u]%s", off / vecLen, vchunkReg);
199 genUpdateResultSingle(ctx, dst, src, gset, op, flags);
200 }
201 }
202 else {
203 if (wvlen > uplen) {
204 sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
205 sprintf(src, "tmp%s", vchunkTmp);
206 useReg = false;
207 }
208
209 if (!isPrivDest) {
210 sprintf(dst, "uC.%s[%u]", ptrName, j);
211 if (cachedName) {
212 char *p = dst + strlen(dst);
213 strcat(p, " = ");
214 p = dst + strlen(dst);
215 sprintf(p, cachedName, i, j);
216 }
217 regRole = src;
218 }
219 else {
220 useReg = true;
221 regRole = dst;
222 sprintf(src, "uC.%s[%u]", ptrName, j);
223 }
224
225 if (useReg) {
226 sprintfVecChunk(vchunkReg, vecLen, uplen, roff % vecLen);
227 sprintf(regRole, "c[%u]%s", roff / vecLen, vchunkReg);
228 }
229
230 genUpdateResultSingle(ctx, dst, src, gset, op, flags);
231 }
232
233 // update register offset
234 if (tra) {
235 roff += wvlen * pitch;
236 }
237 else {
238 roff += wvlen;
239 }
240 }
241
242 // move the destination pointer to the next line
243 if ((i != sizes[tra] - 1)) {
244 sprintf(tmp, "uC.%s += %s;\n", vfield, ldName);
245 kgenAddStmt(ctx, tmp);
246 if (tra) {
247 kgenAddBlankLine(ctx);
248 }
249 }
250 }
251
252 if (j * wvlen != sizes[1 - tra]) {
253 // increment pointers
254 if (tra) {
255 regOff += j * wvlen * pitch;
256 }
257 else {
258 regOff += j * wvlen;
259 }
260
261 sprintf(tmp, "\n"
262 "uC.%s = tmpC.%s + %u;\n"
263 "tmpC = uC;\n",
264 vfield, vfield, j * wvlen);
265 kgenAddStmt(ctx, tmp);
266
267 // go down
268 sizes[1 - tra] -= j * wvlen;
269 wvlen /= 2;
270 updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
271 op, flags, cachedName);
272 }
273 }
274
275 static void
updateGenericResultGen(struct KgenContext * ctx,const BlasGenSettings * gset,size_t pitch,UpresVarNames * uvars,UpdateResultOp op,UpdateResultFlags flags,const char * cachedName)276 updateGenericResultGen(
277 struct KgenContext *ctx,
278 const BlasGenSettings *gset,
279 size_t pitch,
280 UpresVarNames* uvars,
281 UpdateResultOp op,
282 UpdateResultFlags flags,
283 const char *cachedName)
284 {
285 char tmp[1024], dst[128], src[128];
286 const char *boundNames[2] = {uvars->nrRows, uvars->nrCols};
287 const char *vecType = NULL;
288 const char *vFieldVectorized;
289 DataType dtype = gset->kextra->dtype;
290 unsigned int wvlen;
291 unsigned int sizes[2];
292 const char* vfield = dtypeUPtrField(dtype);
293 bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
294 bool row = ((flags & UPRES_TAIL_ROW));
295 bool col = ((flags & UPRES_TAIL_COL));
296 bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0);
297 int l0;
298 int l1;
299 unsigned int vecLen; // vector length of the result's register block
300 // vector length to update with at immediate operations
301 unsigned int uplen;
302 // vector length of the temporary storage location
303 char vchunkReg[64];
304 bool revert = false;
305
306 vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
307 gset->kextra->vecLen;
308 if (isComplexType(dtype)) {
309 vecLen = 1;
310 }
311 uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
312 uplen = 1;
313
314
315 sizes[0] = (unsigned int)gset->subdims[1].y;
316 sizes[1] = (unsigned int)gset->subdims[1].x;
317
318 if (iwc) {
319 const char* l0var = boundNames[tra];
320 revert = (tra && col) || (!tra && row);
321
322 if (revert) {
323 sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld);
324 }
325 else {
326 sprintf(tmp, "\n");
327 }
328 kgenAddStmt(ctx, tmp);
329
330 }
331 wvlen = getTmpVecLen(gset, flags, &vecType);
332 getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized);
333 sprintf(tmp, "res.%s = c;\n", vFieldVectorized);
334 kgenAddStmt(ctx, tmp);
335
336 if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) {
337 char offStr[64];
338 char *p = offStr;
339
340 offStr[0] = '\0';
341 if (flags & UPRES_TAIL_ROW) {
342 sprintf(offStr, " + (%u - %s) * %lu",
343 sizes[0], uvars->nrRows, pitch);
344 p += strlen(offStr);
345 }
346 if (flags & UPRES_TAIL_COL) {
347 sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols);
348 }
349 if (iwc) {
350 sprintf(tmp, "res.%s = uC.%s%s;\n", vfield, vfield, offStr);
351 sprintf(tmp, "\n");
352 }
353 else {
354 sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr);
355 }
356 kgenAddStmt(ctx, tmp);
357
358 }
359 if (iwc) {
360 int l0st = 1; int l0en = sizes[tra];
361 int l1st = 1; int l1en = sizes[1-tra];
362
363 const char* l0var = boundNames[tra];
364 const char* l1var = boundNames[1-tra];
365
366 for (l0 = l0en; l0 >= l0st; l0--) {
367
368 sprintf(tmp, "if (%s) ",l0var);
369 kgenBeginBranch(ctx, tmp);
370
371 sprintf(tmp, "switch (%s)", l1var);
372 kgenBeginBranch(ctx, tmp);
373
374 for (l1 = l1en; l1 >= l1st; l1--) {
375 int resId;
376
377 sprintf(tmp, "case %d:\n", l1);
378 kgenAddStmt(ctx, tmp);
379
380 if (tra) {
381 resId = (row)
382 ? (l1en-l1)*(int)pitch
383 : (l1-l1st)*(int)pitch;
384
385 resId += (col)? (l0-l0st): (l0en-l0);
386 }
387 else {
388 ///////////////////////////
389 resId = (row)
390 ? (l0-l0st)*(int)pitch
391 : (l0en-l0)*(int)pitch;
392 resId += (col)? (l1en-l1) : (l1-l1st);
393 }
394
395 if ((tra && row) || (!tra && col)) {
396 sprintf(dst, "uC.%s[(%s+%d) %% %i]",
397 vfield, l1var, (l1en - l1), (int)l1en);
398 }
399 else {
400 sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st));
401 }
402 sprintfVecChunk(vchunkReg, vecLen, uplen, resId % vecLen);
403 sprintf(src, "c[%u]%s", resId / vecLen, vchunkReg);
404
405 if (flags & UPRES_PRIV_DEST) {
406 genUpdateResultSingle(ctx, src, dst, gset, op, flags);
407 }
408 else {
409 genUpdateResultSingle(ctx, dst, src, gset, op, flags);
410 }
411 }
412 kgenEndBranch(ctx, NULL);
413
414 if (revert) {
415 sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld);
416 }
417 else {
418 sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld);
419 }
420
421 kgenAddStmt(ctx, tmp);
422
423 sprintf(tmp, "%s--;\n", l0var);
424 kgenAddStmt(ctx, tmp);
425 kgenEndBranch(ctx, NULL);
426 }
427
428 }
429 else {
430
431 sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]);
432 kgenBeginBranch(ctx, tmp);
433 sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]);
434 kgenBeginBranch(ctx, tmp);
435 sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld);
436 if (cachedName) {
437 unsigned int i;
438 char tmpcachedName[80] = " = ";
439 strcat(tmpcachedName, cachedName);
440 for (i = 3; i < strlen(tmpcachedName); i++) {
441 if (strncmp(tmpcachedName+i, "%u", 2) == 0) {
442 tmpcachedName[i+1] = 's';
443 }
444 }
445 sprintf(tmp, tmpcachedName, "i", "[j]");
446 strcat(dst, tmp);
447 }
448 if (tra) {
449 sprintf(src, "res.%s[j * %lu + i]", vfield, pitch);
450 }
451 else {
452 sprintf(src, "res.%s[i * %lu + j]", vfield, pitch);
453 }
454 if (flags & UPRES_PRIV_DEST) {
455 genUpdateResultSingle(ctx, src, dst, gset, op, flags);
456 }
457 else {
458 genUpdateResultSingle(ctx, dst, src, gset, op, flags);
459 }
460 kgenEndBranch(ctx, NULL);
461 kgenEndBranch(ctx, NULL);
462 }
463 }
464
465 int
updateResultGenOld(struct KgenContext * ctx,const BlasGenSettings * gset,UpdateResultOp op,UpdateResultFlags flags,const UpresVarNames * uvarNames)466 updateResultGenOld(
467 struct KgenContext *ctx,
468 const BlasGenSettings *gset,
469 UpdateResultOp op,
470 UpdateResultFlags flags,
471 const UpresVarNames *uvarNames)
472 {
473 char tmp[1024];
474 char *p = tmp;
475 const char *typeName;
476 const char *vecType = NULL;
477 const char *vfield;
478 const char *suff1;
479 const char *suff2;
480 int ret = 0;
481 unsigned int sizes[2];
482 bool generic, tra;
483 unsigned int wvlen; // length of vectors to copy with
484 unsigned int uplen; // length of vectors to update result with
485 size_t pitch;
486 char LG;
487 DataType dtype = gset->kextra->dtype;
488 unsigned int vecLen;
489 bool isInlined = (flags & UPRES_INLINE);
490 UpresVarNames uvars;
491
492 vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
493 gset->kextra->vecLen;
494 sizes[0] = (unsigned int)gset->subdims[1].y;
495 sizes[1] = (unsigned int)gset->subdims[1].x;
496
497 if (isComplexType(dtype)) {
498 vecLen = 1;
499 }
500
501 if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) {
502 return -EINVAL;
503 }
504
505 tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
506 generic = ((flags & UPRES_GENERIC) != 0);
507 typeName = dtypeBuiltinType(dtype);
508 vfield = dtypeUPtrField(dtype);
509 pitch = roundUp(sizes[1], vecLen);
510
511 // select write vectorization
512 wvlen = getTmpVecLen(gset, flags, &vecType);
513 uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
514
515 suff1 = (generic) ? "Generic" : "";
516 suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : "";
517 LG = (flags & UPRES_USE_LDS) ? 'L' : 'G';
518
519 if (!isInlined) {
520 const char *outTypeName;
521 const char *memPref = (flags & UPRES_USE_LDS) ? "__local" :
522 "__global";
523
524 getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName);
525
526 // define the function
527 sprintf(tmp, "void\n"
528 "updateResult%s%s%c(\n"
529 " %s %s *C,\n"
530 " %s *c,\n"
531 " %s alpha,\n"
532 " uint startRow,\n"
533 " uint startCol,\n"
534 " uint ld",
535 suff1, suff2, LG, memPref, typeName,
536 outTypeName, typeName);
537
538 p += strlen(p);
539 if (flags & UPRES_WITH_BETA) {
540 sprintf(p, ",\n %s beta", typeName);
541 p += strlen(p);
542 }
543 if (generic) {
544 sprintf(p, ",\n uint nrRows,\n"
545 " uint nrCols");
546 }
547
548 uvars.result = "C";
549 uvars.ld = "ld";
550 uvars.startRow = "startRow";
551 uvars.startCol = "startCol";
552 uvars.nrRows = "nrRows";
553 uvars.nrCols = "nrCols";
554
555 strcat(p, ")\n");
556 kgenDeclareFunction(ctx, tmp);
557 kgenBeginFuncBody(ctx);
558 }
559 else {
560 memcpy(&uvars, uvarNames, sizeof(uvars));
561 }
562
563 // declare local variables
564 sprintf(tmp, "%cPtr uC;\n", LG);
565 kgenAddStmt(ctx, tmp);
566 if (generic) {
567 kgenAddStmt(ctx, "int i, j;\n"
568 "PPtr res;\n");
569 }
570 else {
571 /*
572 * temporary pointer to pass correctly over the
573 * destination array since destination rows can be
574 * not aligned on a vector bound
575 */
576 if (sizes[1 - tra] % wvlen != 0) {
577 sprintf(tmp, "%cPtr tmpC;\n", LG);
578 kgenAddStmt(ctx, tmp);
579 }
580 if (wvlen > uplen) {
581 sprintf(tmp, "%s tmp;\n", vecType);
582 kgenAddStmt(ctx, tmp);
583 }
584 }
585 if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) {
586 declareComplexMultParts(ctx, "alpha", typeName);
587 if (flags & UPRES_WITH_BETA) {
588 declareComplexMultParts(ctx, "beta", typeName);
589 }
590
591 }
592 kgenAddBlankLine(ctx);
593
594 if (tra) {
595 sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
596 vfield, uvars.result, uvars.startCol, uvars.ld,
597 uvars.startRow);
598 }
599 else {
600 sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
601 vfield, uvars.result, uvars.startRow, uvars.ld,
602 uvars.startCol);
603 }
604 kgenAddStmt(ctx, tmp);
605
606 if ((sizes[1 - tra] % wvlen != 0) && !generic) {
607 kgenAddStmt(ctx, "tmpC = uC;\n");
608 }
609 ret = kgenAddBlankLine(ctx);
610
611 if (generic) {
612 updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags,
613 uvarNames ? uvarNames->cachedName : NULL);
614 }
615 else {
616 updateOptimResultGen(ctx, gset, wvlen, (unsigned int)pitch, 0, uvars.ld,
617 op, flags, uvarNames ? uvarNames->cachedName : NULL);
618 }
619
620 if (!isInlined) {
621 ret = kgenEndFuncBody(ctx);
622 }
623
624 return (ret) ? -EOVERFLOW : 0;
625 }
626