1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 /*
18  * trsv trtri generator -
19  *
20  * This kernel solves the triangular system of equations with only 1 work-group.
21  * This is terribly slow and forms the weakest link in the chain.
22  * It solves 1 variable per work-item. So, the size of the triangle that can be solved
23  * is limited by the hardware's MAX_WORKGROUP_SIZE.
24  * The "chain" for solving larger systems of equations involve a "gemv" operation
25  * which can be exploited by "xtrsv.c". However, the current "gemv" implementation
26  * does NOT support "single complex" and "double complex" data types.
27  * So, to give complete support, another "trsv_gemv" generator will be used.
28  */
29 
30 #include <string.h>
31 #include <stdio.h>
32 #include <assert.h>
33 #include <clblas_stddef.h>
34 #include <clBLAS.h>
35 #include <blas_mempat.h>
36 #include <clkern.h>
37 #include <clblas-internal.h>
38 #include <trsv.clT>
39 #include <solution_seq.h>
40 //#include "blas_kgen.h"
41 
42 #include <kprintf.hpp>
43 
44 //#define DEBUG_TRSV_TRTRI
45 
46 extern "C"
47 unsigned int dtypeSize(DataType type);
48 
49 
50 static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2)
51 
52 
53 static SolverFlags
solverFlags(void)54 solverFlags(void)
55 {
56     #ifdef DEBUG_TRSV_TRTRI
57     printf("TRSV TRTRI solverFlags(): solverFlags callen......\n");
58     #endif
59 
60     return (SF_WSPACE_1D);
61 }
62 
63 static void
64 calcNrThreads(
65     size_t threads[2],
66     const SubproblemDim *subdims,
67     const PGranularity *pgran,
68     const void *args,
69     const void *extra);
70 
71 static ssize_t
72 generator(
73    char *buf,
74    size_t buflen,
75    const struct SubproblemDim *subdims,
76    const struct PGranularity *pgran,
77    void *extra);
78 
79 
80 static void
81 assignKargs(KernelArg *args, const void *params, const void*);
82 
83 extern "C"
84 void initTrsvDefaultPattern(MemoryPattern *mempat);
85 
86 static void
87 setBuildOpts(
88     char * buildOptStr,
89     const void *kArgs);
90 
91 static bool
92 isFitToLDS(
93     SubproblemDim *dim,
94     DataType dtype,
95     cl_ulong ldsSize,
96     const void *kernelArgs);
97 
98 static ssize_t
99 generator_tbsv(
100    char *buf,
101    size_t buflen,
102    const struct SubproblemDim *subdims,
103    const struct PGranularity *pgran,
104    void *extra);
105 
106 static SolverOps trsvOps = {
107     generator,
108     assignKargs,
109     isFitToLDS,
110     NULL, // Prepare Translate Dims
111     NULL, // Inner Decomposition Axis
112     calcNrThreads,
113     NULL, // Image related
114     solverFlags,
115     NULL,
116     NULL,
117     NULL,
118     setBuildOpts,
119     NULL
120 };
121 
122 static void
setBuildOpts(char * buildOptStr,const void * args)123 setBuildOpts(
124     char * buildOptStr,
125     const void *args)
126 {
127     const SolutionStep *step = (const SolutionStep *)args;
128     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
129     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
130     {
131         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
132         #ifdef DEBUG_TRSV_TRTRI
133         printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
134         #endif
135     }
136     if( kargs->pigFuncID == CLBLAS_TPSV)
137     {
138         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
139         #ifdef DEBUG_TRSV_TRTRI
140             printf("TPSV TRTRI: Setting build options ... PACKED\n");
141         #endif
142     }
143     if( kargs->pigFuncID == CLBLAS_TBSV)
144     {
145         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED");
146         #ifdef DEBUG_TRSV_TRTRI
147         printf("TBSV TRTRI: Setting build options .. BANDED\n");
148         #endif
149     }
150     return;
151 }
152 
153 static CLBLASMpatExtra mpatExtra;
154 
155 extern "C"
initTrsvDefaultPattern(MemoryPattern * mempat)156 void initTrsvDefaultPattern(MemoryPattern *mempat)
157 {
158     #ifdef DEBUG_TRSV_TRTRI
159     printf("TRSV TRTRI: initTRSVDefaultPattern called with mempat = 0x%p\n", (void*)mempat);
160     #endif
161 
162     mempat->name = "Triangular matrix solver - Only 1 workgroup";
163     mempat->nrLevels = 2;
164     mempat->cuLevel = 0;
165     mempat->thLevel = 1;
166     mempat->sops = &trsvOps;
167 
168     mpatExtra.aMset = CLMEM_LEVEL_L2;
169     mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
170     mpatExtra.mobjA = CLMEM_BUFFER; // == No images
171     mpatExtra.mobjB = CLMEM_BUFFER; // == No images
172     mempat->extra = &mpatExtra;
173 
174     Prefix[TYPE_FLOAT] = 'S';
175     Prefix[TYPE_DOUBLE] = 'D';
176     Prefix[TYPE_COMPLEX_FLOAT] = 'C';
177     Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
178 }
179 
180 //
181 // Read comments atop "isFitToLDS()"
182 // This function is required by "isFitLDS()"
183 //
getTargetWidth(size_t theight,size_t blk_size,size_t vwidth)184 static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth)
185 {
186     cl_ulong nLoops_v, nLoops;
187     //
188     // NOTE: This function should be called only for Non-Transpose cases
189     // NOTE: Does not check if the block size is suitable for our purposes
190     // NOTE:
191     nLoops_v = (theight * theight) / blk_size;
192     nLoops = nLoops_v / vwidth;
193     if (nLoops == 0)
194     {
195         return 0;
196     }
197     return theight/nLoops;
198 }
199 
200 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)201 calcNrThreads(
202     size_t threads[2],
203     const SubproblemDim *subdims,
204     const PGranularity *pgran,
205     const void *args,
206     const void *_extra)
207 {
208     size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
209     CLBlasKargs *kargs = (CLBlasKargs *)args;
210     #ifdef DEBUG_TRSV_TRTRI
211     printf("TRSV TRTRI: calcNrThreads() called \n");
212     #endif
213     int blocks = 1;
214 
215     _extra = _extra; // Dummy- to avoid warnings
216 
217     #ifdef DEBUG_TRSV_TRTRI
218     printf("blocks : %d\n", blocks);
219     #endif
220 
221     if (((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) ||
222        ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans)))
223      {
224         if (subdims->y > BLOCKSIZE)
225         {
226             // These little kernels cannot handle arbitrary numbers
227             printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y);
228             threads[0] = 0;
229             threads[1] = 0;
230             return;
231         }
232     } else {
233         if (subdims->y > 1024)
234         {
235             // These little kernels cannot handle arbitrary numbers
236             printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y);
237             threads[0] = 0;
238             threads[1] = 0;
239             return;
240         }
241     }
242 
243     threads[0] = blocks * BLOCKSIZE;
244     threads[1] = 1;
245     #ifdef DEBUG_TRSV_TRTRI
246     printf("pgran-wgSize[0] : %d, globalthreads[0]  : %lu\n", pgran->wgSize[0], threads[0]);
247     #endif
248     return;
249 }
250 
251 //
252 // FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF
253 // FIXME: Return correct return value - Needs change in KPRINTF
254 //
255 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)256 generator(
257    char *buf,
258    size_t buflen,
259    const struct SubproblemDim *subdims,
260    const struct PGranularity *pgran,
261    void *extra)
262 {
263     char tempTemplate[32*1024];
264     char vector_size_trans[10], triangle_height[10];
265 
266     pgran = pgran; // Dummy- to avoid warnings
267 
268     if (buf == NULL) // PENDING: Return correct buffer size
269     {
270         buflen = (32 * 1024 * sizeof(char));
271         return (ssize_t)buflen;
272     }
273 
274     CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
275     SolutionStep *step = container_of( pgran , pgran, SolutionStep);    // NOTE: using container_of() to get pigFuncID
276     CLBlasKargs* kargs = (CLBlasKargs*) &(step->args);
277 
278     if(kargs->pigFuncID == CLBLAS_TBSV)
279     {
280         return generator_tbsv(buf, buflen, subdims, pgran, extra);
281     }
282 
283     #ifdef DEBUG_TRSV_TRTRI
284      printf("TRSV GENERATOR called....\n");
285 
286     if((( extraFlags->flags &  KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A )))
287     {
288         printf("A is trans or CONJ-TRANS\n");
289     }
290     else
291     {
292         printf("A is noTrans...\n");
293     }
294     #endif
295 
296     clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
297     clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
298     clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
299     //bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0);
300 
301     // unity and doConj handled in setKernelArgs
302     if ( order == clblasRowMajor )
303     {
304         order = clblasColumnMajor;
305         if ( trans == clblasNoTrans)
306         {
307             trans = clblasTrans;
308         }
309         else if ( trans == clblasTrans )
310         {
311             trans = clblasNoTrans;
312         }
313         else // clblasConjTrans
314         {
315             trans = clblasNoTrans;
316         }
317         uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
318     }
319 
320     if ( trans == clblasNoTrans)
321     {
322         ( uplo == clblasLower )?
323                     (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) :
324                     (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel));
325     }
326     else // Transpose cases...
327     {
328         ( uplo == clblasLower )?
329                     (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) :
330                     (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel));
331     }
332 
333     #ifdef DEBUG_TRSV_TRTRI
334     printf("dataType : %c\n", Prefix[extraFlags->dtype]);
335     #endif
336 
337     // FIXME: VECTORSIZE HARD CODED
338     // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
339     unsigned int vecLenA = extraFlags->vecLenA;
340 
341     #ifdef DEBUG_TRSV_TRTRI
342     printf("Vector length used : %d\n\n", vecLenA);
343     #endif
344 
345     bool doVLOAD = false;
346     if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
347     {
348         doVLOAD = true;
349         #ifdef DEBUG_TRSV_TRTRI
350             printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
351         #endif
352     }
353     else
354     {
355         #ifdef DEBUG_TRSV_TRTRI
356             printf("Using Aligned Data Pointer .........................\n");
357         #endif
358     }
359     kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
360 
361     if (trans != clblasNoTrans)
362     {
363         sprintf( vector_size_trans, "%u", vecLenA );
364         sprintf( triangle_height, "%ld", subdims[0].y );
365         #ifdef DEBUG_TRSV_TRTRI
366         printf("vector size trans = %s\n", vector_size_trans);
367         #endif
368         kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
369         kobj.put("%TRIANGLE_HEIGHT", triangle_height);
370     }
371     kobj.spit((char*)buf, tempTemplate);
372     return (32 * 1024 * sizeof(char));
373 }
374 
375 static void
assignKargs(KernelArg * args,const void * params,const void *)376 assignKargs(KernelArg *args, const void *params, const void*)
377 {
378     CLBlasKargs *blasArgs = (CLBlasKargs*)params;
379     cl_int inc;
380     cl_int unity, doConj;
381 
382     INIT_KARG(&args[0], blasArgs->A);     //A - input matrix - argument
383     INIT_KARG(&args[1], blasArgs->B);     //x - result buffer = _xnew argument
384     initSizeKarg(&args[2], blasArgs->N);
385     inc = blasArgs->ldb.vector;
386     INIT_KARG(&args[3], inc);
387     unity = (blasArgs->diag == clblasUnit);
388     INIT_KARG(&args[4], unity);
389     initSizeKarg(&args[5], blasArgs->lda.matrix);
390     doConj = (blasArgs->transA == clblasConjTrans);
391     #ifdef DEBUG_TRSV_TRTRI
392     printf("TRMV TRTRI: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc);
393     printf("TRMV TRTRI: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow);
394     #endif
395     INIT_KARG(&args[6], doConj);
396     INIT_KARG(&args[7], blasArgs->startRow);
397     INIT_KARG(&args[8], blasArgs->endRow);
398     initSizeKarg(&args[9], blasArgs->offa);
399     initSizeKarg(&args[10], blasArgs->offBX);
400 
401     if( blasArgs->pigFuncID == CLBLAS_TBSV)
402     {
403         initSizeKarg(&args[11], blasArgs->K);
404     }
405     return;
406 }
407 
408 /*
409  * isFitToLDS() is based on the "trsv_gemv" counterpart than the kernel corresponding to TRTRI
410  * The Kernels corersponding to TRTRI are run with only 1 Workgroup.
411  * So, it really does not matter at all.
412  * But, if dim[0].y selected by the library changes between TRTRI and TRSV_GEMV, results will go
413  * wrong. So, by using the same "isFitToLDS" function, we will indirectly force the library to
414  * choose the same "SubproblemDim" for both cases.
415  */
416 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)417 isFitToLDS(
418     SubproblemDim *dim,
419     DataType dtype,
420     cl_ulong ldsSize,
421     const void *kernelArgs)
422 {
423     CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs;
424     size_t MAXBLOCKSIZE = 256;
425     cl_ulong maxSize;
426 
427     if  (
428             ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) ||
429             ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor))
430         )
431     {
432         //
433         // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type?
434         //
435         cl_ulong tw;
436 
437         tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
438         if (tw == 0)
439         {
440             do {
441                 MAXBLOCKSIZE /= 2;
442                 tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
443             } while((MAXBLOCKSIZE > 1) && (tw == 0));
444         }
445         #ifdef DEBUG_TRSV_TRTRI
446         printf("TRSV TRTRI: isFitLDS() tw = %lu\n", tw);
447         #endif
448         maxSize = (1+4+tw)*dtypeSize(dtype) + MAXBLOCKSIZE*dtypeSize(dtype)*4;
449         #ifdef DEBUG_TRSV_TRTRI
450         printf("TRSV TRTRI: isFitLDS() maxSize = %lu, ldsSize = %lu, Y=%lu\n", maxSize, ldsSize, dim[0].y);
451         #endif
452         return (maxSize < ldsSize);
453     }
454 
455     //
456     // The remaining kernels use "TriangleWidth" amount of local memory for storing the RHS.
457     // We will assume "dim[0].y" to be the "TriangleWidth"
458     //
459     MAXBLOCKSIZE = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y;
460     maxSize = (dim[0].y + MAXBLOCKSIZE)*dtypeSize(dtype);
461     return (maxSize < ldsSize);
462 }
463 
464 static ssize_t
generator_tbsv(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)465 generator_tbsv(
466    char *buf,
467    size_t buflen,
468    const struct SubproblemDim *subdims,
469    const struct PGranularity *pgran,
470    void *extra)
471 {
472     char tempTemplate[32*1024];
473     char vector_size_trans[10], triangle_height[10];
474 
475     pgran = pgran; // Dummy- to avoid warnings
476 
477     if (buf == NULL) // PENDING: Return correct buffer size
478     {
479         buflen = (32 * 1024 * sizeof(char));
480         return (ssize_t)buflen;
481     }
482 
483     CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
484 
485     clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
486     clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
487     clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
488 
489     // unity and doConj handled in setKernelArgs
490     if ( order == clblasColumnMajor )
491     {
492         if ( trans == clblasNoTrans)
493         {
494             trans = clblasTrans;
495         }
496         else if ( trans == clblasTrans )
497         {
498             trans = clblasNoTrans;
499         }
500         else // clblasConjTrans
501         {
502             trans = clblasNoTrans;
503         }
504         uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
505     }
506 
507     if ( trans == clblasNoTrans)
508     {
509         ( uplo == clblasLower )?
510                     (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) :
511                     (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel));
512     }
513     else // Transpose cases...
514     {
515         ( uplo == clblasLower )?
516                     (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) :
517                     (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel));
518     }
519 
520     unsigned int vecLenA = extraFlags->vecLenA;
521 
522     bool doVLOAD = false;
523     if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
524     {
525         doVLOAD = true;
526         #ifdef DEBUG_TRSV_TRTRI
527             printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
528         #endif
529     }
530     else
531     {
532         #ifdef DEBUG_TRSV_TRTRI
533             printf("Using Aligned Data Pointer .........................\n");
534         #endif
535     }
536     kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
537 
538     if (trans != clblasNoTrans)
539     {
540         sprintf( vector_size_trans, "%u", vecLenA );
541         sprintf( triangle_height, "%ld", subdims[0].y );
542         kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
543         kobj.put("%TRIANGLE_HEIGHT", triangle_height);
544     }
545     kobj.spit((char*)buf, tempTemplate);
546     return (32 * 1024 * sizeof(char));
547 }
548 
549