1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 /*
18  * rotg generator
19  */
20 //#define DEBUG_ROTG
21 
22 #include <string.h>
23 #include <stdio.h>
24 #include <assert.h>
25 #include <clblas_stddef.h>
26 #include <clBLAS.h>
27 #include <blas_mempat.h>
28 #include <clkern.h>
29 #include <clblas-internal.h>
30 #include "blas_kgen.h"
31 #include <kprintf.hpp>
32 #include <rotg.clT>
33 #include <solution_seq.h>
34 #include "blas_subgroup.h"
35 #include "gen_helper.h"
36 
37 extern "C"
38 unsigned int dtypeSize(DataType type);
39 
40 
41 static char Prefix[4];
42 
43 static SolverFlags
solverFlags(void)44 solverFlags(void)
45 {
46     return (SF_WSPACE_1D);
47 }
48 
49 static void
50 calcNrThreads(
51     size_t threads[2],
52     const SubproblemDim *subdims,
53     const PGranularity *pgran,
54     const void *args,
55     const void *extra);
56 
57 static ssize_t
58 generator(
59    char *buf,
60    size_t buflen,
61    const struct SubproblemDim *subdims,
62    const struct PGranularity *pgran,
63    void *extra);
64 
65 
66 static void
67 assignKargs(KernelArg *args, const void *params, const void* extra );
68 
69 extern "C"
70 void initRotgRegisterPattern(MemoryPattern *mempat);
71 
72 static void
73 setBuildOpts(
74     char * buildOptStr,
75     const void *kArgs);
76 
77 static SolverOps rotgOps = {
78     generator,
79     assignKargs,
80     NULL,
81     NULL, // Prepare Translate Dims
82     NULL, // Inner Decomposition Axis
83     calcNrThreads,
84     NULL,
85     solverFlags,
86 	NULL,
87 	NULL,
88 	NULL,
89 	setBuildOpts,
90 	NULL
91 };
92 
93 static void
setBuildOpts(char * buildOptStr,const void * args)94 setBuildOpts(
95     char * buildOptStr,
96     const void *args)
97 {
98 	const SolutionStep *step = (const SolutionStep *)args;
99     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
100 	if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
101 		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
102 	}
103 	if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
104 	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
105 	}
106 
107 	return;
108 }
109 
110 
111 static CLBLASMpatExtra mpatExtra;
112 
113 extern "C"
initRotgRegisterPattern(MemoryPattern * mempat)114 void initRotgRegisterPattern(MemoryPattern *mempat)
115 {
116 	#ifdef DEBUG_ROTG
117 	printf("initRegPattern called with mempat = 0x%p\n", mempat);
118 	#endif
119 
120 	fflush(stdout);
121     mempat->name = "Register accumulation based swap";
122     mempat->nrLevels = 2;
123     mempat->cuLevel = 0;
124     mempat->thLevel = 1;
125     mempat->sops = &rotgOps;
126 
127     mpatExtra.aMset = CLMEM_LEVEL_L2;
128     mpatExtra.bMset = CLMEM_LEVEL_L2;
129     mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
130     mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
131     mempat->extra = &mpatExtra;
132 
133 	Prefix[TYPE_FLOAT] = 'S';
134 	Prefix[TYPE_DOUBLE] = 'D';
135 	Prefix[TYPE_COMPLEX_FLOAT] = 'C';
136 	Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
137 }
138 
139 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)140 calcNrThreads(
141     size_t threads[2],
142     const SubproblemDim *subdims,
143     const PGranularity *pgran,
144     const void *args,
145     const void *_extra)
146 {
147 	int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
148 	DUMMY_ARGS_USAGE_3(subdims, _extra, args);
149 
150 	size_t blocks = 1;  // Only 1 work-group is enough
151 	#ifdef DEBUG_ROTG
152 	printf("blocks : %d\n", blocks);
153 	#endif
154 
155 	threads[0] = blocks * BLOCKSIZE;
156 	#ifdef DEBUG_ROTG
157 	printf("pgran-wgSize[0] : %d, globalthreads[0]  : %d\n", pgran->wgSize[0], threads[0]);
158 	#endif
159 	threads[1] = 1;
160 }
161 
162 //
163 // FIXME: Report correct return value - Needs change in KPRINTF
164 //
165 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)166 generator(
167    char *buf,
168    size_t buflen,
169    const struct SubproblemDim *subdims,
170    const struct PGranularity *pgran,
171    void *extra)
172 {
173 
174 	CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
175 	DUMMY_ARGS_USAGE_2(subdims, pgran);
176 	char tempTemplate[32*1024];
177 
178 	if ( buf == NULL) // return buffer size
179 	{
180 		buflen = (32 * 1024 * sizeof(char));
181         return (ssize_t)buflen;
182 	}
183 
184 	#ifdef DEBUG_ROTG
185 	printf("dataType : %c\n", Prefix[extraFlags->dtype]);
186 	#endif
187 
188     strcpy( tempTemplate, (char*)rotg_kernel );
189 
190 	kprintf kobj( Prefix[extraFlags->dtype], 1, false, false);
191     kobj.spit((char*)buf, tempTemplate);
192 
193     return (32 * 1024 * sizeof(char));
194 }
195 
196 /*
197 __kernel void %PREFIXrotg_kernel( __global %TYPE *_A, __global %TYPE *_B, __global %PTYPE *_C,
198                                 __global %TYPE *_S, uint offa, uint offb, uint offc, uint offs )
199 
200 */
201 static void
assignKargs(KernelArg * args,const void * params,const void *)202 assignKargs(KernelArg *args, const void *params, const void* )
203 {
204     CLBlasKargs *blasArgs = (CLBlasKargs*)params;
205 
206     INIT_KARG(&args[0], blasArgs->A);
207 	INIT_KARG(&args[1], blasArgs->B);
208 	INIT_KARG(&args[2], blasArgs->C);
209     INIT_KARG(&args[3], blasArgs->D);
210     initSizeKarg(&args[4], blasArgs->offa);
211     initSizeKarg(&args[5], blasArgs->offb);
212     initSizeKarg(&args[6], blasArgs->offc);
213     initSizeKarg(&args[7], blasArgs->offd);
214 
215 	return;
216 }
217