1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17 /*
18 * rotg generator
19 */
20 //#define DEBUG_ROTG
21
22 #include <string.h>
23 #include <stdio.h>
24 #include <assert.h>
25 #include <clblas_stddef.h>
26 #include <clBLAS.h>
27 #include <blas_mempat.h>
28 #include <clkern.h>
29 #include <clblas-internal.h>
30 #include "blas_kgen.h"
31 #include <kprintf.hpp>
32 #include <rotg.clT>
33 #include <solution_seq.h>
34 #include "blas_subgroup.h"
35 #include "gen_helper.h"
36
37 extern "C"
38 unsigned int dtypeSize(DataType type);
39
40
41 static char Prefix[4];
42
43 static SolverFlags
solverFlags(void)44 solverFlags(void)
45 {
46 return (SF_WSPACE_1D);
47 }
48
49 static void
50 calcNrThreads(
51 size_t threads[2],
52 const SubproblemDim *subdims,
53 const PGranularity *pgran,
54 const void *args,
55 const void *extra);
56
57 static ssize_t
58 generator(
59 char *buf,
60 size_t buflen,
61 const struct SubproblemDim *subdims,
62 const struct PGranularity *pgran,
63 void *extra);
64
65
66 static void
67 assignKargs(KernelArg *args, const void *params, const void* extra );
68
69 extern "C"
70 void initRotgRegisterPattern(MemoryPattern *mempat);
71
72 static void
73 setBuildOpts(
74 char * buildOptStr,
75 const void *kArgs);
76
77 static SolverOps rotgOps = {
78 generator,
79 assignKargs,
80 NULL,
81 NULL, // Prepare Translate Dims
82 NULL, // Inner Decomposition Axis
83 calcNrThreads,
84 NULL,
85 solverFlags,
86 NULL,
87 NULL,
88 NULL,
89 setBuildOpts,
90 NULL
91 };
92
93 static void
setBuildOpts(char * buildOptStr,const void * args)94 setBuildOpts(
95 char * buildOptStr,
96 const void *args)
97 {
98 const SolutionStep *step = (const SolutionStep *)args;
99 const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
100 if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
101 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
102 }
103 if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
104 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
105 }
106
107 return;
108 }
109
110
111 static CLBLASMpatExtra mpatExtra;
112
113 extern "C"
initRotgRegisterPattern(MemoryPattern * mempat)114 void initRotgRegisterPattern(MemoryPattern *mempat)
115 {
116 #ifdef DEBUG_ROTG
117 printf("initRegPattern called with mempat = 0x%p\n", mempat);
118 #endif
119
120 fflush(stdout);
121 mempat->name = "Register accumulation based swap";
122 mempat->nrLevels = 2;
123 mempat->cuLevel = 0;
124 mempat->thLevel = 1;
125 mempat->sops = &rotgOps;
126
127 mpatExtra.aMset = CLMEM_LEVEL_L2;
128 mpatExtra.bMset = CLMEM_LEVEL_L2;
129 mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
130 mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
131 mempat->extra = &mpatExtra;
132
133 Prefix[TYPE_FLOAT] = 'S';
134 Prefix[TYPE_DOUBLE] = 'D';
135 Prefix[TYPE_COMPLEX_FLOAT] = 'C';
136 Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
137 }
138
139 static void
calcNrThreads(size_t threads[2],const SubproblemDim * subdims,const PGranularity * pgran,const void * args,const void * _extra)140 calcNrThreads(
141 size_t threads[2],
142 const SubproblemDim *subdims,
143 const PGranularity *pgran,
144 const void *args,
145 const void *_extra)
146 {
147 int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
148 DUMMY_ARGS_USAGE_3(subdims, _extra, args);
149
150 size_t blocks = 1; // Only 1 work-group is enough
151 #ifdef DEBUG_ROTG
152 printf("blocks : %d\n", blocks);
153 #endif
154
155 threads[0] = blocks * BLOCKSIZE;
156 #ifdef DEBUG_ROTG
157 printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]);
158 #endif
159 threads[1] = 1;
160 }
161
162 //
163 // FIXME: Report correct return value - Needs change in KPRINTF
164 //
165 static ssize_t
generator(char * buf,size_t buflen,const struct SubproblemDim * subdims,const struct PGranularity * pgran,void * extra)166 generator(
167 char *buf,
168 size_t buflen,
169 const struct SubproblemDim *subdims,
170 const struct PGranularity *pgran,
171 void *extra)
172 {
173
174 CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
175 DUMMY_ARGS_USAGE_2(subdims, pgran);
176 char tempTemplate[32*1024];
177
178 if ( buf == NULL) // return buffer size
179 {
180 buflen = (32 * 1024 * sizeof(char));
181 return (ssize_t)buflen;
182 }
183
184 #ifdef DEBUG_ROTG
185 printf("dataType : %c\n", Prefix[extraFlags->dtype]);
186 #endif
187
188 strcpy( tempTemplate, (char*)rotg_kernel );
189
190 kprintf kobj( Prefix[extraFlags->dtype], 1, false, false);
191 kobj.spit((char*)buf, tempTemplate);
192
193 return (32 * 1024 * sizeof(char));
194 }
195
196 /*
197 __kernel void %PREFIXrotg_kernel( __global %TYPE *_A, __global %TYPE *_B, __global %PTYPE *_C,
198 __global %TYPE *_S, uint offa, uint offb, uint offc, uint offs )
199
200 */
201 static void
assignKargs(KernelArg * args,const void * params,const void *)202 assignKargs(KernelArg *args, const void *params, const void* )
203 {
204 CLBlasKargs *blasArgs = (CLBlasKargs*)params;
205
206 INIT_KARG(&args[0], blasArgs->A);
207 INIT_KARG(&args[1], blasArgs->B);
208 INIT_KARG(&args[2], blasArgs->C);
209 INIT_KARG(&args[3], blasArgs->D);
210 initSizeKarg(&args[4], blasArgs->offa);
211 initSizeKarg(&args[5], blasArgs->offb);
212 initSizeKarg(&args[6], blasArgs->offc);
213 initSizeKarg(&args[7], blasArgs->offd);
214
215 return;
216 }
217