1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 /*
18  * SYR2 Generator
19  */
20 
21 #include <string.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <clblas_stddef.h>
25 #include <clBLAS.h>
26 #include <blas_mempat.h>
27 #include <clkern.h>
28 #include <clblas-internal.h>
29 #include "blas_kgen.h"
30 
31 #include <kprintf.hpp>
32 #include <syr2_her2.clT>
33 #include <solution_seq.h>
34 //#define DEBUG_SYR2
35 
36 extern "C"
37 unsigned int dtypeSize(DataType type);
38 
39 
40 static char Prefix[4];
41 
42 static SolverFlags
solverFlags(void)43 solverFlags(void)
44 {
45 	#ifdef DEBUG_SYR2
46 	printf("solverFlags called......\n");
47 	#endif
48 
49     return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
50 }
51 
52 static void
53 calcNrThreads(
54     size_t threads[2],
55     const SubproblemDim *subdims,
56     const PGranularity *pgran,
57     const void *args,
58     const void *extra);
59 
60 static ssize_t
61 generator(
62    char *buf,
63    size_t buflen,
64    const struct SubproblemDim *subdims,
65    const struct PGranularity *pgran,
66    void *extra);
67 
68 
69 static void
70 assignKargs(KernelArg *args, const void *params, const void*);
71 
72 extern "C"
73 void initSyr2DefaultPattern(MemoryPattern *mempat);
74 
75 static  KernelExtraFlags
76 selectVectorization(
77     void *kargs,
78     unsigned int vlen );
79 
80 static void
81 setBuildOpts(
82     char * buildOptStr,
83     const void *kArgs);
84 
85 static bool
86 isFitToLDS(
87     SubproblemDim *dim,
88     DataType dtype,
89     cl_ulong ldsSize,
90     const void *kernelArgs);
91 
92 static SolverOps syr2Ops = {
93     generator,
94     assignKargs,
95     isFitToLDS,
96     NULL, // Prepare Translate Dims
97     NULL, // Inner Decomposition Axis
98     calcNrThreads,
99     NULL,
100     solverFlags,
101 	NULL,
102 	NULL,
103 	NULL,
104 	setBuildOpts,
105 	selectVectorization
106 };
107 
108 static  KernelExtraFlags
selectVectorization(void * args,unsigned int vlen)109 selectVectorization(
110 	void *args,
111 	unsigned int vlen )
112 {
113 	KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
114 	CLBlasKargs *kargs  = (CLBlasKargs *)args;
115 
116 	if(kargs->uplo == clblasUpper)
117 	{
118 		if( (kargs->N) % vlen)
119         {
120 			kflags = KEXTRA_NO_COPY_VEC_A;
121 		}
122 	}
123 
124     if( kargs->pigFuncID == CLBLAS_SPR2 )
125     {
126         kflags = KEXTRA_NO_COPY_VEC_A;      // Packed-case never do aligned access
127     }
128 
129 	return kflags;
130 }
131 
132 
133 static void
setBuildOpts(char * buildOptStr,const void * args)134 setBuildOpts(
135     char * buildOptStr,
136     const void *args)
137 {
138 	const SolutionStep *step = (const SolutionStep *)args;
139     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
140 	if ( kargs->dtype == TYPE_DOUBLE )
141 	{
142 		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
143 		#ifdef DEBUG_SYR2
144 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
145 		#endif
146 	}
147     if( kargs->pigFuncID == CLBLAS_SPR2 )
148     {
149         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
150     }
151 
152 	return;
153 }
154 
155 static CLBLASMpatExtra mpatExtra;
156 
157 extern "C"
initSyr2DefaultPattern(MemoryPattern * mempat)158 void initSyr2DefaultPattern(MemoryPattern *mempat)
159 {
160 	#ifdef DEBUG_SYR2
161 	printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat);
162 	fflush(stdout);
163 	#endif
164 
165     mempat->name = "LDS based syr";
166     mempat->nrLevels = 2;
167     mempat->cuLevel = 0;
168     mempat->thLevel = 1;
169     mempat->sops = &syr2Ops;
170 
171     mpatExtra.aMset = 0;
172     mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector
173 	//mpatExtra.cMset = CLMEM_LEVEL_LDS; // For "y" vector
174     mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
175     mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
176 	//mpatExtra.mobjC = CLMEM_GLOBAL_MEMORY;
177 
178     mempat->extra = &mpatExtra;
179 
180 	Prefix[TYPE_FLOAT] = 'S';
181 	Prefix[TYPE_DOUBLE] = 'D';
182 }
183 
184 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)185 calcNrThreads(
186     size_t threads[2],
187     const SubproblemDim *subdims,
188     const PGranularity *pgran,
189     const void *args,
190     const void *_extra)
191 {
192 	int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
193 	#ifdef DEBUG_SYR2
194 	printf("calcNrThreads called from syr2_lds.cpp\n");
195 	#endif
196 
197     const CLBlasKargs *kargs = (const CLBlasKargs *)args;
198 	const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra;
199 
200 	clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
201 
202     if ( order == clblasRowMajor )
203     {
204         order = clblasColumnMajor;
205     }
206 	#ifdef DEBUG_SYR2
207 	printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
208 	#endif
209 	size_t TARGETROWS = subdims->y ;
210 
211 	#ifdef DEBUG_SYR2
212 	printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS);
213 	#endif
214 
215 	size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1;
216 	#ifdef DEBUG_SYR2
217 	printf("blocks : %d\n", blocks);
218 	#endif
219 
220 	threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
221 	#ifdef DEBUG_SYR2
222 	printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
223 	#endif
224 	threads[1] = 1;
225 }
226 
227 
228 //
229 // FIXME: Report correct return value - Needs change in KPRINTF
230 //
231 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)232 generator(
233    char *buf,
234    size_t buflen,
235    const struct SubproblemDim *subdims,
236    const struct PGranularity *pgran,
237    void *extra)
238 {
239 	int BLOCKSIZE  = pgran->wgSize[0];
240 	char tempTemplate[32*1024];
241 	char targetRows[10], blockSize[10];
242 
243 	if ( buf == NULL) // return buffer size
244 	{
245 		buflen = (64 * 1024 * sizeof(char));
246 		return (ssize_t)buflen;
247 	}
248 	CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
249 
250 	#ifdef DEBUG_SYR2
251  	printf("SYR2 GENERATOR called....\n");
252 	#endif
253 
254 	clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
255 	clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
256 
257 
258 	if ((subdims->y % extraFlags->vecLenA) != 0)
259 	{
260 		printf("WARNING: SYR2: generator: TARGETROWS must be divisible by Vector Length\n");
261 		return 0;
262 	}
263 
264 	size_t TARGETROWS = 0;
265 	if(order == clblasColumnMajor)
266 	{
267 		( uplo == clblasLower )?
268 			     (strcpy(tempTemplate, (char*)syr2_her2_CL_kernel)) : (strcpy(tempTemplate, (char*)syr2_her2_CU_kernel));
269 	}
270 	else
271 	{
272 		printf("WARNING: SYR2: Rowmajor order is implemented in columnMajor. This part should never get executed.\n");
273 		return 0;
274 	}
275 
276 	TARGETROWS = subdims->y;
277 	if ((BLOCKSIZE % TARGETROWS) != 0)
278 	{
279 		printf("WARNING: SYR2: generator: Invalid Block Size\n");
280 		return 0;
281 	}
282 
283 	#ifdef DEBUG_SYR2
284 	printf("dataType : %c\n", Prefix[extraFlags->dtype]);
285 	#endif
286 
287 	// FIXME: VECTORSIZE HARD CODED
288 	// FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
289     unsigned int vecLenA = extraFlags->vecLenA;
290 
291 	#ifdef DEBUG_SYR2
292 	printf("Vector length used : %d\n\n", vecLenA);
293 	#endif
294 
295 	bool doVLOAD = false;
296 	if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
297 	{
298 		doVLOAD = true;
299 		#ifdef DEBUG_SYR2
300 			printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
301 		#endif
302 	}
303 	else
304 	{
305 		#ifdef DEBUG_SYR2
306 			printf("Using Aligned Data Pointer .........................\n");
307 		#endif
308 	}
309 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
310 
311 	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
312 	sprintf( blockSize, "%d", BLOCKSIZE );
313 
314 	#ifdef DEBUG_SYR2
315     printf("TARGET ROWS = %s\n", targetRows);
316     printf("BLOCK SIZE = %s\n", blockSize);
317 	#endif
318 
319     kobj.put("%TARGET_ROWS", (const char *)targetRows);
320     kobj.put("%BLOCKSIZE", (const char *) blockSize);
321     kobj.spit((char*)buf, tempTemplate);
322 
323 	return (64 * 1024 * sizeof(char));
324     // return 0;//(ret < 0) ? -EOVERFLOW : ret;
325 }
326 
327 /*
328 ( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
329 */
330 static void
assignKargs(KernelArg * args,const void * params,const void *)331 assignKargs(KernelArg *args, const void *params, const void*)
332 {
333     CLBlasKargs *blasArgs = (CLBlasKargs*)params;
334     cl_int inc;
335 
336     INIT_KARG(&args[0], blasArgs->A); 	//A - input/output matrix - argument
337     INIT_KARG(&args[1], blasArgs->B); 	//X - x vector
338 	INIT_KARG(&args[2], blasArgs->C); 	//Y - y vector
339 	initSizeKarg(&args[3], blasArgs->N);
340 	initSizeKarg(&args[4], blasArgs->offBX);
341     inc = blasArgs->ldb.vector;
342     INIT_KARG(&args[5], inc);
343 	initSizeKarg(&args[6], blasArgs->offCY);
344 	inc = blasArgs->ldc.vector;
345 	INIT_KARG(&args[7], inc);
346     initSizeKarg(&args[8], blasArgs->offa);
347 	initSizeKarg(&args[9], blasArgs->lda.matrix);
348     assignScalarKarg(&args[10], &(blasArgs->alpha), blasArgs->dtype);
349 	return;
350 }
351 
352 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)353 isFitToLDS(
354     SubproblemDim *dim,
355     DataType dtype,
356     cl_ulong ldsSize,
357     const void *kernelArgs)
358 {
359     cl_ulong maxSize;
360     CLBlasKargs *blasArgs;
361 
362 	blasArgs = (CLBlasKargs *)kernelArgs;
363 
364 	// 4  buffers for xShared, yShared, xSharedTrans and ySharedTrans and 2 integers for the values of iShared and jShared.
365 
366 	maxSize = (dim->y * 4 * sizeof(dtype)) + (2 * sizeof(int));
367 
368     return ((maxSize) <= ldsSize);
369 
370 }
371 //#undef DEBUG_SYR2
372 
373