1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17 /*
18 * trsv trtri generator -
19 *
20 * This kernel solves the triangular system of equations with only 1 work-group.
21 * This is terribly slow and forms the weakest link in the chain.
22 * It solves 1 variable per work-item. So, the size of the triangle that can be solved
23 * is limited by the hardware's MAX_WORKGROUP_SIZE.
24 * The "chain" for solving larger systems of equations involve a "gemv" operation
25 * which can be exploited by "xtrsv.c". However, the current "gemv" implementation
26 * does NOT support "single complex" and "double complex" data types.
27 * So, to give complete support, another "trsv_gemv" generator will be used.
28 */
29
30 #include <string.h>
31 #include <stdio.h>
32 #include <assert.h>
33 #include <clblas_stddef.h>
34 #include <clBLAS.h>
35 #include <blas_mempat.h>
36 #include <clkern.h>
37 #include <clblas-internal.h>
38 #include <trsv.clT>
39 #include <solution_seq.h>
40 //#include "blas_kgen.h"
41
42 #include <kprintf.hpp>
43
44 //#define DEBUG_TRSV_TRTRI
45
46 extern "C"
47 unsigned int dtypeSize(DataType type);
48
49
50 static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2)
51
52
53 static SolverFlags
solverFlags(void)54 solverFlags(void)
55 {
56 #ifdef DEBUG_TRSV_TRTRI
57 printf("TRSV TRTRI solverFlags(): solverFlags callen......\n");
58 #endif
59
60 return (SF_WSPACE_1D);
61 }
62
63 static void
64 calcNrThreads(
65 size_t threads[2],
66 const SubproblemDim *subdims,
67 const PGranularity *pgran,
68 const void *args,
69 const void *extra);
70
71 static ssize_t
72 generator(
73 char *buf,
74 size_t buflen,
75 const struct SubproblemDim *subdims,
76 const struct PGranularity *pgran,
77 void *extra);
78
79
80 static void
81 assignKargs(KernelArg *args, const void *params, const void*);
82
83 extern "C"
84 void initTrsvDefaultPattern(MemoryPattern *mempat);
85
86 static void
87 setBuildOpts(
88 char * buildOptStr,
89 const void *kArgs);
90
91 static bool
92 isFitToLDS(
93 SubproblemDim *dim,
94 DataType dtype,
95 cl_ulong ldsSize,
96 const void *kernelArgs);
97
98 static ssize_t
99 generator_tbsv(
100 char *buf,
101 size_t buflen,
102 const struct SubproblemDim *subdims,
103 const struct PGranularity *pgran,
104 void *extra);
105
106 static SolverOps trsvOps = {
107 generator,
108 assignKargs,
109 isFitToLDS,
110 NULL, // Prepare Translate Dims
111 NULL, // Inner Decomposition Axis
112 calcNrThreads,
113 NULL, // Image related
114 solverFlags,
115 NULL,
116 NULL,
117 NULL,
118 setBuildOpts,
119 NULL
120 };
121
122 static void
setBuildOpts(char * buildOptStr,const void * args)123 setBuildOpts(
124 char * buildOptStr,
125 const void *args)
126 {
127 const SolutionStep *step = (const SolutionStep *)args;
128 const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
129 if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
130 {
131 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
132 #ifdef DEBUG_TRSV_TRTRI
133 printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
134 #endif
135 }
136 if( kargs->pigFuncID == CLBLAS_TPSV)
137 {
138 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
139 #ifdef DEBUG_TRSV_TRTRI
140 printf("TPSV TRTRI: Setting build options ... PACKED\n");
141 #endif
142 }
143 if( kargs->pigFuncID == CLBLAS_TBSV)
144 {
145 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED");
146 #ifdef DEBUG_TRSV_TRTRI
147 printf("TBSV TRTRI: Setting build options .. BANDED\n");
148 #endif
149 }
150 return;
151 }
152
153 static CLBLASMpatExtra mpatExtra;
154
155 extern "C"
initTrsvDefaultPattern(MemoryPattern * mempat)156 void initTrsvDefaultPattern(MemoryPattern *mempat)
157 {
158 #ifdef DEBUG_TRSV_TRTRI
159 printf("TRSV TRTRI: initTRSVDefaultPattern called with mempat = 0x%p\n", (void*)mempat);
160 #endif
161
162 mempat->name = "Triangular matrix solver - Only 1 workgroup";
163 mempat->nrLevels = 2;
164 mempat->cuLevel = 0;
165 mempat->thLevel = 1;
166 mempat->sops = &trsvOps;
167
168 mpatExtra.aMset = CLMEM_LEVEL_L2;
169 mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
170 mpatExtra.mobjA = CLMEM_BUFFER; // == No images
171 mpatExtra.mobjB = CLMEM_BUFFER; // == No images
172 mempat->extra = &mpatExtra;
173
174 Prefix[TYPE_FLOAT] = 'S';
175 Prefix[TYPE_DOUBLE] = 'D';
176 Prefix[TYPE_COMPLEX_FLOAT] = 'C';
177 Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
178 }
179
180 //
181 // Read comments atop "isFitToLDS()"
182 // This function is required by "isFitLDS()"
183 //
getTargetWidth(size_t theight,size_t blk_size,size_t vwidth)184 static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth)
185 {
186 cl_ulong nLoops_v, nLoops;
187 //
188 // NOTE: This function should be called only for Non-Transpose cases
189 // NOTE: Does not check if the block size is suitable for our purposes
190 // NOTE:
191 nLoops_v = (theight * theight) / blk_size;
192 nLoops = nLoops_v / vwidth;
193 if (nLoops == 0)
194 {
195 return 0;
196 }
197 return theight/nLoops;
198 }
199
200 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)201 calcNrThreads(
202 size_t threads[2],
203 const SubproblemDim *subdims,
204 const PGranularity *pgran,
205 const void *args,
206 const void *_extra)
207 {
208 size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
209 CLBlasKargs *kargs = (CLBlasKargs *)args;
210 #ifdef DEBUG_TRSV_TRTRI
211 printf("TRSV TRTRI: calcNrThreads() called \n");
212 #endif
213 int blocks = 1;
214
215 _extra = _extra; // Dummy- to avoid warnings
216
217 #ifdef DEBUG_TRSV_TRTRI
218 printf("blocks : %d\n", blocks);
219 #endif
220
221 if (((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) ||
222 ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans)))
223 {
224 if (subdims->y > BLOCKSIZE)
225 {
226 // These little kernels cannot handle arbitrary numbers
227 printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y);
228 threads[0] = 0;
229 threads[1] = 0;
230 return;
231 }
232 } else {
233 if (subdims->y > 1024)
234 {
235 // These little kernels cannot handle arbitrary numbers
236 printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y);
237 threads[0] = 0;
238 threads[1] = 0;
239 return;
240 }
241 }
242
243 threads[0] = blocks * BLOCKSIZE;
244 threads[1] = 1;
245 #ifdef DEBUG_TRSV_TRTRI
246 printf("pgran-wgSize[0] : %d, globalthreads[0] : %lu\n", pgran->wgSize[0], threads[0]);
247 #endif
248 return;
249 }
250
251 //
252 // FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF
253 // FIXME: Return correct return value - Needs change in KPRINTF
254 //
255 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)256 generator(
257 char *buf,
258 size_t buflen,
259 const struct SubproblemDim *subdims,
260 const struct PGranularity *pgran,
261 void *extra)
262 {
263 char tempTemplate[32*1024];
264 char vector_size_trans[10], triangle_height[10];
265
266 pgran = pgran; // Dummy- to avoid warnings
267
268 if (buf == NULL) // PENDING: Return correct buffer size
269 {
270 buflen = (32 * 1024 * sizeof(char));
271 return (ssize_t)buflen;
272 }
273
274 CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
275 SolutionStep *step = container_of( pgran , pgran, SolutionStep); // NOTE: using container_of() to get pigFuncID
276 CLBlasKargs* kargs = (CLBlasKargs*) &(step->args);
277
278 if(kargs->pigFuncID == CLBLAS_TBSV)
279 {
280 return generator_tbsv(buf, buflen, subdims, pgran, extra);
281 }
282
283 #ifdef DEBUG_TRSV_TRTRI
284 printf("TRSV GENERATOR called....\n");
285
286 if((( extraFlags->flags & KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A )))
287 {
288 printf("A is trans or CONJ-TRANS\n");
289 }
290 else
291 {
292 printf("A is noTrans...\n");
293 }
294 #endif
295
296 clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
297 clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
298 clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
299 //bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0);
300
301 // unity and doConj handled in setKernelArgs
302 if ( order == clblasRowMajor )
303 {
304 order = clblasColumnMajor;
305 if ( trans == clblasNoTrans)
306 {
307 trans = clblasTrans;
308 }
309 else if ( trans == clblasTrans )
310 {
311 trans = clblasNoTrans;
312 }
313 else // clblasConjTrans
314 {
315 trans = clblasNoTrans;
316 }
317 uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
318 }
319
320 if ( trans == clblasNoTrans)
321 {
322 ( uplo == clblasLower )?
323 (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) :
324 (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel));
325 }
326 else // Transpose cases...
327 {
328 ( uplo == clblasLower )?
329 (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) :
330 (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel));
331 }
332
333 #ifdef DEBUG_TRSV_TRTRI
334 printf("dataType : %c\n", Prefix[extraFlags->dtype]);
335 #endif
336
337 // FIXME: VECTORSIZE HARD CODED
338 // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
339 unsigned int vecLenA = extraFlags->vecLenA;
340
341 #ifdef DEBUG_TRSV_TRTRI
342 printf("Vector length used : %d\n\n", vecLenA);
343 #endif
344
345 bool doVLOAD = false;
346 if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A )
347 {
348 doVLOAD = true;
349 #ifdef DEBUG_TRSV_TRTRI
350 printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
351 #endif
352 }
353 else
354 {
355 #ifdef DEBUG_TRSV_TRTRI
356 printf("Using Aligned Data Pointer .........................\n");
357 #endif
358 }
359 kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
360
361 if (trans != clblasNoTrans)
362 {
363 sprintf( vector_size_trans, "%u", vecLenA );
364 sprintf( triangle_height, "%ld", subdims[0].y );
365 #ifdef DEBUG_TRSV_TRTRI
366 printf("vector size trans = %s\n", vector_size_trans);
367 #endif
368 kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
369 kobj.put("%TRIANGLE_HEIGHT", triangle_height);
370 }
371 kobj.spit((char*)buf, tempTemplate);
372 return (32 * 1024 * sizeof(char));
373 }
374
375 static void
assignKargs(KernelArg * args,const void * params,const void *)376 assignKargs(KernelArg *args, const void *params, const void*)
377 {
378 CLBlasKargs *blasArgs = (CLBlasKargs*)params;
379 cl_int inc;
380 cl_int unity, doConj;
381
382 INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument
383 INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument
384 initSizeKarg(&args[2], blasArgs->N);
385 inc = blasArgs->ldb.vector;
386 INIT_KARG(&args[3], inc);
387 unity = (blasArgs->diag == clblasUnit);
388 INIT_KARG(&args[4], unity);
389 initSizeKarg(&args[5], blasArgs->lda.matrix);
390 doConj = (blasArgs->transA == clblasConjTrans);
391 #ifdef DEBUG_TRSV_TRTRI
392 printf("TRMV TRTRI: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc);
393 printf("TRMV TRTRI: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow);
394 #endif
395 INIT_KARG(&args[6], doConj);
396 INIT_KARG(&args[7], blasArgs->startRow);
397 INIT_KARG(&args[8], blasArgs->endRow);
398 initSizeKarg(&args[9], blasArgs->offa);
399 initSizeKarg(&args[10], blasArgs->offBX);
400
401 if( blasArgs->pigFuncID == CLBLAS_TBSV)
402 {
403 initSizeKarg(&args[11], blasArgs->K);
404 }
405 return;
406 }
407
408 /*
409 * isFitToLDS() is based on the "trsv_gemv" counterpart than the kernel corresponding to TRTRI
410 * The Kernels corersponding to TRTRI are run with only 1 Workgroup.
411 * So, it really does not matter at all.
412 * But, if dim[0].y selected by the library changes between TRTRI and TRSV_GEMV, results will go
413 * wrong. So, by using the same "isFitToLDS" function, we will indirectly force the library to
414 * choose the same "SubproblemDim" for both cases.
415 */
416 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)417 isFitToLDS(
418 SubproblemDim *dim,
419 DataType dtype,
420 cl_ulong ldsSize,
421 const void *kernelArgs)
422 {
423 CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs;
424 size_t MAXBLOCKSIZE = 256;
425 cl_ulong maxSize;
426
427 if (
428 ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) ||
429 ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor))
430 )
431 {
432 //
433 // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type?
434 //
435 cl_ulong tw;
436
437 tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
438 if (tw == 0)
439 {
440 do {
441 MAXBLOCKSIZE /= 2;
442 tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
443 } while((MAXBLOCKSIZE > 1) && (tw == 0));
444 }
445 #ifdef DEBUG_TRSV_TRTRI
446 printf("TRSV TRTRI: isFitLDS() tw = %lu\n", tw);
447 #endif
448 maxSize = (1+4+tw)*dtypeSize(dtype) + MAXBLOCKSIZE*dtypeSize(dtype)*4;
449 #ifdef DEBUG_TRSV_TRTRI
450 printf("TRSV TRTRI: isFitLDS() maxSize = %lu, ldsSize = %lu, Y=%lu\n", maxSize, ldsSize, dim[0].y);
451 #endif
452 return (maxSize < ldsSize);
453 }
454
455 //
456 // The remaining kernels use "TriangleWidth" amount of local memory for storing the RHS.
457 // We will assume "dim[0].y" to be the "TriangleWidth"
458 //
459 MAXBLOCKSIZE = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y;
460 maxSize = (dim[0].y + MAXBLOCKSIZE)*dtypeSize(dtype);
461 return (maxSize < ldsSize);
462 }
463
464 static ssize_t
generator_tbsv(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)465 generator_tbsv(
466 char *buf,
467 size_t buflen,
468 const struct SubproblemDim *subdims,
469 const struct PGranularity *pgran,
470 void *extra)
471 {
472 char tempTemplate[32*1024];
473 char vector_size_trans[10], triangle_height[10];
474
475 pgran = pgran; // Dummy- to avoid warnings
476
477 if (buf == NULL) // PENDING: Return correct buffer size
478 {
479 buflen = (32 * 1024 * sizeof(char));
480 return (ssize_t)buflen;
481 }
482
483 CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
484
485 clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
486 clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
487 clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
488
489 // unity and doConj handled in setKernelArgs
490 if ( order == clblasColumnMajor )
491 {
492 if ( trans == clblasNoTrans)
493 {
494 trans = clblasTrans;
495 }
496 else if ( trans == clblasTrans )
497 {
498 trans = clblasNoTrans;
499 }
500 else // clblasConjTrans
501 {
502 trans = clblasNoTrans;
503 }
504 uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
505 }
506
507 if ( trans == clblasNoTrans)
508 {
509 ( uplo == clblasLower )?
510 (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) :
511 (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel));
512 }
513 else // Transpose cases...
514 {
515 ( uplo == clblasLower )?
516 (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) :
517 (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel));
518 }
519
520 unsigned int vecLenA = extraFlags->vecLenA;
521
522 bool doVLOAD = false;
523 if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A )
524 {
525 doVLOAD = true;
526 #ifdef DEBUG_TRSV_TRTRI
527 printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
528 #endif
529 }
530 else
531 {
532 #ifdef DEBUG_TRSV_TRTRI
533 printf("Using Aligned Data Pointer .........................\n");
534 #endif
535 }
536 kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
537
538 if (trans != clblasNoTrans)
539 {
540 sprintf( vector_size_trans, "%u", vecLenA );
541 sprintf( triangle_height, "%ld", subdims[0].y );
542 kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
543 kobj.put("%TRIANGLE_HEIGHT", triangle_height);
544 }
545 kobj.spit((char*)buf, tempTemplate);
546 return (32 * 1024 * sizeof(char));
547 }
548
549