1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17 /*
18 * SYR2 Generator
19 */
20
21 #include <string.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <clblas_stddef.h>
25 #include <clBLAS.h>
26 #include <blas_mempat.h>
27 #include <clkern.h>
28 #include <clblas-internal.h>
29 #include "blas_kgen.h"
30
31 #include <kprintf.hpp>
32 #include <syr2_her2.clT>
33 #include <solution_seq.h>
34 //#define DEBUG_SYR2
35
36 extern "C"
37 unsigned int dtypeSize(DataType type);
38
39
40 static char Prefix[4];
41
42 static SolverFlags
solverFlags(void)43 solverFlags(void)
44 {
45 #ifdef DEBUG_SYR2
46 printf("solverFlags called......\n");
47 #endif
48
49 return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
50 }
51
52 static void
53 calcNrThreads(
54 size_t threads[2],
55 const SubproblemDim *subdims,
56 const PGranularity *pgran,
57 const void *args,
58 const void *extra);
59
60 static ssize_t
61 generator(
62 char *buf,
63 size_t buflen,
64 const struct SubproblemDim *subdims,
65 const struct PGranularity *pgran,
66 void *extra);
67
68
69 static void
70 assignKargs(KernelArg *args, const void *params, const void*);
71
72 extern "C"
73 void initSyr2DefaultPattern(MemoryPattern *mempat);
74
75 static KernelExtraFlags
76 selectVectorization(
77 void *kargs,
78 unsigned int vlen );
79
80 static void
81 setBuildOpts(
82 char * buildOptStr,
83 const void *kArgs);
84
85 static bool
86 isFitToLDS(
87 SubproblemDim *dim,
88 DataType dtype,
89 cl_ulong ldsSize,
90 const void *kernelArgs);
91
92 static SolverOps syr2Ops = {
93 generator,
94 assignKargs,
95 isFitToLDS,
96 NULL, // Prepare Translate Dims
97 NULL, // Inner Decomposition Axis
98 calcNrThreads,
99 NULL,
100 solverFlags,
101 NULL,
102 NULL,
103 NULL,
104 setBuildOpts,
105 selectVectorization
106 };
107
108 static KernelExtraFlags
selectVectorization(void * args,unsigned int vlen)109 selectVectorization(
110 void *args,
111 unsigned int vlen )
112 {
113 KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
114 CLBlasKargs *kargs = (CLBlasKargs *)args;
115
116 if(kargs->uplo == clblasUpper)
117 {
118 if( (kargs->N) % vlen)
119 {
120 kflags = KEXTRA_NO_COPY_VEC_A;
121 }
122 }
123
124 if( kargs->pigFuncID == CLBLAS_SPR2 )
125 {
126 kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access
127 }
128
129 return kflags;
130 }
131
132
133 static void
setBuildOpts(char * buildOptStr,const void * args)134 setBuildOpts(
135 char * buildOptStr,
136 const void *args)
137 {
138 const SolutionStep *step = (const SolutionStep *)args;
139 const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
140 if ( kargs->dtype == TYPE_DOUBLE )
141 {
142 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
143 #ifdef DEBUG_SYR2
144 printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
145 #endif
146 }
147 if( kargs->pigFuncID == CLBLAS_SPR2 )
148 {
149 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
150 }
151
152 return;
153 }
154
155 static CLBLASMpatExtra mpatExtra;
156
157 extern "C"
initSyr2DefaultPattern(MemoryPattern * mempat)158 void initSyr2DefaultPattern(MemoryPattern *mempat)
159 {
160 #ifdef DEBUG_SYR2
161 printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat);
162 fflush(stdout);
163 #endif
164
165 mempat->name = "LDS based syr";
166 mempat->nrLevels = 2;
167 mempat->cuLevel = 0;
168 mempat->thLevel = 1;
169 mempat->sops = &syr2Ops;
170
171 mpatExtra.aMset = 0;
172 mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector
173 //mpatExtra.cMset = CLMEM_LEVEL_LDS; // For "y" vector
174 mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
175 mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
176 //mpatExtra.mobjC = CLMEM_GLOBAL_MEMORY;
177
178 mempat->extra = &mpatExtra;
179
180 Prefix[TYPE_FLOAT] = 'S';
181 Prefix[TYPE_DOUBLE] = 'D';
182 }
183
184 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)185 calcNrThreads(
186 size_t threads[2],
187 const SubproblemDim *subdims,
188 const PGranularity *pgran,
189 const void *args,
190 const void *_extra)
191 {
192 int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
193 #ifdef DEBUG_SYR2
194 printf("calcNrThreads called from syr2_lds.cpp\n");
195 #endif
196
197 const CLBlasKargs *kargs = (const CLBlasKargs *)args;
198 const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra;
199
200 clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
201
202 if ( order == clblasRowMajor )
203 {
204 order = clblasColumnMajor;
205 }
206 #ifdef DEBUG_SYR2
207 printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
208 #endif
209 size_t TARGETROWS = subdims->y ;
210
211 #ifdef DEBUG_SYR2
212 printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS);
213 #endif
214
215 size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1;
216 #ifdef DEBUG_SYR2
217 printf("blocks : %d\n", blocks);
218 #endif
219
220 threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
221 #ifdef DEBUG_SYR2
222 printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
223 #endif
224 threads[1] = 1;
225 }
226
227
228 //
229 // FIXME: Report correct return value - Needs change in KPRINTF
230 //
231 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)232 generator(
233 char *buf,
234 size_t buflen,
235 const struct SubproblemDim *subdims,
236 const struct PGranularity *pgran,
237 void *extra)
238 {
239 int BLOCKSIZE = pgran->wgSize[0];
240 char tempTemplate[32*1024];
241 char targetRows[10], blockSize[10];
242
243 if ( buf == NULL) // return buffer size
244 {
245 buflen = (64 * 1024 * sizeof(char));
246 return (ssize_t)buflen;
247 }
248 CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
249
250 #ifdef DEBUG_SYR2
251 printf("SYR2 GENERATOR called....\n");
252 #endif
253
254 clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
255 clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
256
257
258 if ((subdims->y % extraFlags->vecLenA) != 0)
259 {
260 printf("WARNING: SYR2: generator: TARGETROWS must be divisible by Vector Length\n");
261 return 0;
262 }
263
264 size_t TARGETROWS = 0;
265 if(order == clblasColumnMajor)
266 {
267 ( uplo == clblasLower )?
268 (strcpy(tempTemplate, (char*)syr2_her2_CL_kernel)) : (strcpy(tempTemplate, (char*)syr2_her2_CU_kernel));
269 }
270 else
271 {
272 printf("WARNING: SYR2: Rowmajor order is implemented in columnMajor. This part should never get executed.\n");
273 return 0;
274 }
275
276 TARGETROWS = subdims->y;
277 if ((BLOCKSIZE % TARGETROWS) != 0)
278 {
279 printf("WARNING: SYR2: generator: Invalid Block Size\n");
280 return 0;
281 }
282
283 #ifdef DEBUG_SYR2
284 printf("dataType : %c\n", Prefix[extraFlags->dtype]);
285 #endif
286
287 // FIXME: VECTORSIZE HARD CODED
288 // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
289 unsigned int vecLenA = extraFlags->vecLenA;
290
291 #ifdef DEBUG_SYR2
292 printf("Vector length used : %d\n\n", vecLenA);
293 #endif
294
295 bool doVLOAD = false;
296 if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A )
297 {
298 doVLOAD = true;
299 #ifdef DEBUG_SYR2
300 printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
301 #endif
302 }
303 else
304 {
305 #ifdef DEBUG_SYR2
306 printf("Using Aligned Data Pointer .........................\n");
307 #endif
308 }
309 kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
310
311 sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
312 sprintf( blockSize, "%d", BLOCKSIZE );
313
314 #ifdef DEBUG_SYR2
315 printf("TARGET ROWS = %s\n", targetRows);
316 printf("BLOCK SIZE = %s\n", blockSize);
317 #endif
318
319 kobj.put("%TARGET_ROWS", (const char *)targetRows);
320 kobj.put("%BLOCKSIZE", (const char *) blockSize);
321 kobj.spit((char*)buf, tempTemplate);
322
323 return (64 * 1024 * sizeof(char));
324 // return 0;//(ret < 0) ? -EOVERFLOW : ret;
325 }
326
327 /*
328 ( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
329 */
330 static void
assignKargs(KernelArg * args,const void * params,const void *)331 assignKargs(KernelArg *args, const void *params, const void*)
332 {
333 CLBlasKargs *blasArgs = (CLBlasKargs*)params;
334 cl_int inc;
335
336 INIT_KARG(&args[0], blasArgs->A); //A - input/output matrix - argument
337 INIT_KARG(&args[1], blasArgs->B); //X - x vector
338 INIT_KARG(&args[2], blasArgs->C); //Y - y vector
339 initSizeKarg(&args[3], blasArgs->N);
340 initSizeKarg(&args[4], blasArgs->offBX);
341 inc = blasArgs->ldb.vector;
342 INIT_KARG(&args[5], inc);
343 initSizeKarg(&args[6], blasArgs->offCY);
344 inc = blasArgs->ldc.vector;
345 INIT_KARG(&args[7], inc);
346 initSizeKarg(&args[8], blasArgs->offa);
347 initSizeKarg(&args[9], blasArgs->lda.matrix);
348 assignScalarKarg(&args[10], &(blasArgs->alpha), blasArgs->dtype);
349 return;
350 }
351
352 static bool
isFitToLDS(SubproblemDim * dim,DataType dtype,cl_ulong ldsSize,const void * kernelArgs)353 isFitToLDS(
354 SubproblemDim *dim,
355 DataType dtype,
356 cl_ulong ldsSize,
357 const void *kernelArgs)
358 {
359 cl_ulong maxSize;
360 CLBlasKargs *blasArgs;
361
362 blasArgs = (CLBlasKargs *)kernelArgs;
363
364 // 4 buffers for xShared, yShared, xSharedTrans and ySharedTrans and 2 integers for the values of iShared and jShared.
365
366 maxSize = (dim->y * 4 * sizeof(dtype)) + (2 * sizeof(int));
367
368 return ((maxSize) <= ldsSize);
369
370 }
371 //#undef DEBUG_SYR2
372
373