1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include<omp.h>
6 #define type double
7
8 #define STR1(X) #X
9 #define STR(X) STR1(X)
10 #define STRINGIFY(X,Y) X ## Y
11 #define CON(X,Y) STRINGIFY(X,Y)
12 #define KDir kernels
13
14 #include "includes/ourmacros.h"
15
16
17 /*
18 __device__ __inline__ double ld_gbl_cg(const double *addr) {
19 double return_value;
20 asm("ld.global.cg.f64 %0, [%1];" : "=d"(return_value) : "l"(addr));
21 return return_value;
22 }*/
23
24 extern __shared__ type tile[];
25
fvimatchl32_main(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int size0,const int plain,const int sbp,const unsigned short * __restrict__ offset,type alpha,type beta)26 __device__ __forceinline__ void fvimatchl32_main(const type * Atmp, type * Btmp,const int lda1,const int ldb1,const int size0, const int plain, const int sbp, const unsigned short * __restrict__ offset, type alpha, type beta)
27 {
28
29 const int id = threadIdx.x;
30 const int i2 = id / 32;
31 const int rest = id %32;
32 const type *Adisp = Atmp +i2 * lda1;
33 const int tile_disp = i2 * (sbp);
34 int regs[8];int j = rest;
35 #pragma unroll 8
36 for(int i = 0; i < 8; i++)
37 {
38 regs[i] = offset[j];
39 j+=32;
40 if(j >= plain)
41 break;
42 }
43 #pragma unroll
44 for( int j = rest; j < plain; j+=32){
45 //tile[tile_disp+j] = ld_gbl_cg(&Adisp[j]);
46 tile[tile_disp+j] = Adisp[j];
47 }
48 __syncthreads();
49 type *Bdisp = &Btmp[i2 * ldb1];
50 const int tile_disp1 = i2 * size0;
51 j = rest;
52 #pragma unroll 8
53 for(int i = 0; i < 8; i++)
54 {
55 Bdisp[j] = beta*Bdisp[j] + alpha* tile[regs[i]+tile_disp1];
56 j+=32;
57 if(j >= plain)
58 break;
59 }
60 }
61
62 __device__ __forceinline__
fvimatchl32_rem(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int size0,const int plain,const int sbp,const int ilimit,const int olimit,const unsigned short int * __restrict__ offset,type alpha,type beta)63 void fvimatchl32_rem(const type * Atmp, type * Btmp, const int lda1, const int ldb1,const int remainderx,const int remaindery,const int size0, const int plain, const int sbp, const int ilimit, const int olimit, const unsigned short int* __restrict__ offset, type alpha, type beta)
64 {
65
66 int id = threadIdx.x;
67 int i2 = id / 32;
68 int rest = id %32;
69 if(i2 < remaindery){
70 const type *Adisp = &Atmp[i2 * lda1];
71 const int tile_disp = i2 * (sbp);
72 #pragma unroll
73 for( int j = rest; j < ilimit; j+=32){
74 tile[tile_disp+j] = Adisp[j];
75 }
76 }
77 __syncthreads();
78 if(i2 >= remainderx)
79 return;
80 type *Bdisp = &Btmp[i2 * ldb1];
81 int regs[8];int j = rest;
82 #pragma unroll 8
83 for(int i = 0; i < 8; i++)
84 {
85 if(j >= olimit)
86 break;
87 regs[i] = offset[j];
88 j+=32;
89 }
90 const int tile_disp1 = i2 * size0;
91 j = rest;
92 #pragma unroll 8
93 for(int i = 0; i < 8; i++)
94 {
95 if(j >= olimit)
96 break;
97 Bdisp[j] = beta*Bdisp[j]+ alpha* tile[regs[i]+tile_disp1];
98 j+=32;
99 }
100 }
fvimatchl32_main_coars(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int size0,const int plain,const int sbp,const unsigned short * __restrict__ offset,const int acoars,const int bcoars,const int size,type alpha,type beta)101 __device__ __forceinline__ void fvimatchl32_main_coars(const type * Atmp, type * Btmp,const int lda1,const int ldb1,const int size0, const int plain, const int sbp, const unsigned short * __restrict__ offset, const int acoars, const int bcoars, const int size, type alpha, type beta)
102 {
103
104 const int id = threadIdx.x;
105 const int i2 = id / 32;
106 const int rest = id %32;
107 const int tile_disp = i2 * (sbp);
108 int regs[8];
109 for(int c = 0; c < size; c++)
110 {
111 int j = rest;
112 const type *Adisp = Atmp +i2 * lda1 + c*acoars;
113 #pragma unroll 8
114 for(int i = 0; i < 8; i++)
115 {
116 regs[i] = offset[j];
117 j+=32;
118 if(j >= plain)
119 break;
120 }
121 #pragma unroll
122 for(j = rest; j < plain; j+=32){
123 tile[tile_disp+j] = Adisp[j];
124 }
125 __syncthreads();
126 type *Bdisp = Btmp + i2 * ldb1 + c*bcoars;
127 const int tile_disp1 = i2 * size0;
128 j = rest;
129 #pragma unroll 8
130 for(int i = 0; i < 8; i++)
131 {
132 Bdisp[j] = alpha* tile[regs[i]+tile_disp1] + beta*Bdisp[j];
133 j+=32;
134 if(j >= plain)
135 break;
136 }
137 __syncthreads();
138 }
139 }
140
141 __device__ __forceinline__
fvimatchl32_rem_coars(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int size0,const int plain,const int sbp,const int ilimit,const int olimit,const unsigned short int * __restrict__ offset,const int acoars,const int bcoars,const int size,type alpha,type beta)142 void fvimatchl32_rem_coars(const type * Atmp, type * Btmp, const int lda1, const int ldb1,const int remainderx,const int remaindery,const int size0, const int plain, const int sbp, const int ilimit, const int olimit, const unsigned short int* __restrict__ offset, const int acoars, const int bcoars, const int size, type alpha, type beta)
143 {
144
145 int id = threadIdx.x;
146 int i2 = id / 32;
147 int rest = id %32;
148 for(int c = 0; c< size; c++)
149 {
150 __syncthreads();
151 if(i2 < remaindery){
152 const type *Adisp = Atmp + i2 * lda1 + c*acoars;
153 const int tile_disp = i2 * (sbp);
154 #pragma unroll
155 for( int j = rest; j < ilimit; j+=32){
156 tile[tile_disp+j] = Adisp[j];
157 }
158 }
159 __syncthreads();
160 if(i2 >= remainderx)
161 continue;
162 type *Bdisp = Btmp + i2 * ldb1 + c*bcoars;
163 int regs[8];
164 int j = rest;
165 #pragma unroll 8
166 for(int i = 0; i < 8; i++)
167 {
168 if(j >= olimit)
169 break;
170 regs[i] = offset[j];
171 j+=32;
172 }
173 const int tile_disp1 = i2 * size0;
174 j = rest;
175 #pragma unroll 8
176 for(int i = 0; i < 8; i++)
177 {
178 if(j >= olimit)
179 break;
180 Bdisp[j] = alpha* tile[regs[i]+tile_disp1] + beta*Bdisp[j];
181 j+=32;
182 }
183 }
184 }
185
186 #define FNAME fvimatchl32.h
187 #include "includes/macro.h"
188 #undef FNAME
189 #define FNAME fvimatchl32_coars.h
190 #include "includes/macro.h"
191 #undef FNAME
192
193
fvimatchl32CallerWrapper(const int ndim,const type * __restrict__ A,type * B,const int size0,const int param0,const int param1,const int numthreads1,const int numblocks,const int numthreads,const int shmsize,const int * __restrict__ lda_s,const int * __restrict__ ldb_s,const int * __restrict__ idx_s,const int remainder1,const int remainder2,const int lda_kernel1,const int ldb_kernel1,unsigned short int * offset,const int ilimit,const int olimit,const int plain,const int sbp,const int ldal,const int ldap2l,const int acoars,const int bcoars,const int size,type alpha,type beta)194 void fvimatchl32CallerWrapper(const int ndim,const type * __restrict__ A, type * B,const int size0, const int param0, const int param1, const int numthreads1, const int numblocks, const int numthreads, const int shmsize
195 , const int * __restrict__ lda_s, const int* __restrict__ ldb_s, const int* __restrict__ idx_s
196 , const int remainder1, const int remainder2, const int lda_kernel1, const int ldb_kernel1, unsigned short int* offset,
197 const int ilimit, const int olimit, const int plain, const int sbp, const int ldal, const int ldap2l, const int acoars, const int bcoars, const int size,type alpha,type beta
198 )
199 {
200 #ifdef printd
201 printf("ndim = %d, size = %d, numblocks = %d, numthreads = %d, acoars = %d, bcoars = %d\n", ndim, size, numblocks, numthreads, acoars, bcoars);
202 #endif
203
204 if(size > 0)
205 {
206 #ifdef printd
207 printf("Coarsening... No. of blocks = %d\n", numblocks/size);
208 #endif
209 dim3 thread_blocks(numblocks/size, 1, 1);
210 switch(ndim)
211 {
212 EXPANDDIMS(fvimatchl32_coars_kernel_, thread_blocks, numthreads, shmsize, (A, B,size0, ldal, ldap2l,param0, param1,plain,sbp, lda_s,ldb_s,idx_s,remainder1,remainder2,ilimit,olimit,lda_kernel1, ldb_kernel1,offset, acoars, bcoars, size, alpha, beta))
213 default:
214 {
215 }
216 }
217 }
218 else
219 {
220 dim3 thread_blocks(numblocks, 1, 1);
221 switch(ndim)
222 {
223 EXPANDDIMS(fvimatchl32_kernel_, thread_blocks, numthreads, shmsize, (A, B,size0, ldal, ldap2l,param0, param1,plain,sbp, lda_s,ldb_s,idx_s,remainder1,remainder2,ilimit,olimit,lda_kernel1, ldb_kernel1,offset, alpha, beta))
224 default:
225 {
226 }
227 }
228 }
229 }
230
231 void swap(int array[], int ind1, int ind2);
232
233 int cancoarsen(int *lda, int newndim);
234
235
236 extern "C"
fvimatchl32_transpose_kernel(int ndim,type * A,type * B,const int * lda,const int * ldb,const int * params,const int * rperm,type alpha,type beta)237 void fvimatchl32_transpose_kernel(int ndim, type *A, type *B, const int *lda, const int *ldb, const int* params, const int * rperm, type alpha, type beta)
238 {
239
240 //printf("l32 normal\n");
241
242 // int numBlocks = computeNumBlocksCode ;
243 #ifdef printd
244 printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
245 printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6]);
246 printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
247 printf("\n R perm: %d \t %d \t %d\t %d\t %d\n", rperm[0], rperm[1], rperm[2], rperm[3], rperm[4]);
248 #endif
249
250
251 int numBlocks = params[6];//((size[1] + 8 -1)/8) * size[2] * ((size[3] + 8 -1)/8) * size[4] ;
252
253 int *d_lda_s, *d_ldb_s, *d_idx_s;
254 const int size0 = lda[0];
255 const int remainder1 = lda[1] % params[0];
256 const int remainder2 = lda[params[3]] % params[0];
257 int lda_s[20], ldb_s[20], idx_s[20], temp[20];
258 lda_s[0] = 1;
259 ldb_s[0] = 1;
260 int i, blockA=params[0];
261 int blockB = blockA;
262 idx_s[1] = (lda[1] + blockA - 1) / blockA;
263 lda_s[1] = lda_s[0] * lda[0];
264 ldb_s[1] = ldb_s[0] * ldb[0];
265 for(i = 2; i < ndim; i++)
266 {
267 if( i == params[3])
268 {
269 idx_s[i] = (lda[i] + blockA - 1)/blockA;
270 }
271 else
272 {
273 idx_s[i] = lda[i];
274
275 }
276 lda_s[i] = lda_s[i-1] * lda[i-1];
277 ldb_s[i] = ldb_s[i-1] * ldb[i-1];
278 }
279
280
281 for(i = 1; i < ndim; i++)
282 {
283 temp[i] = ldb_s[rperm[i]];
284 }
285 const int lda_kernel1 = lda_s[params[3]];
286 const int ldb_kernel1 = ldb_s[params[4]];
287
288 lda_s[1] *= blockA;
289 lda_s[params[3]] *= blockA;
290 temp[1] *= blockA;
291 temp[params[3]] *= blockA;
292
293 unsigned short int offset[9000];
294 unsigned short int *d_offset;
295 unsigned short limit = lda[0] * params[0];
296 int tlimit = -1;
297 for(i = 0; i < limit ; i++)
298 {
299 offset[i] = (i/lda[0]) * (lda[0] * params[0] + params[1]) +(i%lda[0]);
300 if(i / lda[0] >= remainder2 && tlimit == -1) tlimit = i;
301 }
302 #ifdef printd
303 printf("Offset memory size = %d \n", limit);
304 #endif
305
306 if(params[3] != 2)
307 {
308 swap(idx_s, 2, params[3]);
309 swap(lda_s, 2, params[3]);
310 swap(temp, 2, params[3]);
311 }
312 int newndim = ndim;
313
314 int acoars = -1, bcoars = -1, size = -1;
315 #ifndef NOCOARSEN
316 int noblock = 3;// (params[3] != 2);
317 int cd = cancoarsen(idx_s+ noblock, ndim- noblock);
318
319 if(cd >= 0)
320 {
321 #ifdef printd
322 printf("cd = %d, noblock = %d, ndim-noblock = %d\n", cd, noblock, ndim-noblock);
323 #endif
324 acoars = lda_s[noblock+cd];
325 bcoars = temp[noblock+cd];
326 size = idx_s[noblock+cd];
327 for(int j = cd+1; j < newndim; j++)
328 {
329 idx_s[noblock+j-1] = idx_s[noblock+j];
330 lda_s[noblock+j-1] = lda_s[noblock+j];
331 temp[noblock+j-1] = temp[noblock+j];
332
333 }
334 newndim--;
335 }
336 #endif
337
338 newndim--;
339
340 SAFECUDAMALLOC(&d_offset,limit*sizeof(short));
341 SAFECUDAMEMCPY(d_offset, offset,limit*sizeof(short), cudaMemcpyHostToDevice);
342 SAFECUDAMALLOC(&d_lda_s,newndim*sizeof(int));
343 SAFECUDAMALLOC(&d_ldb_s,newndim*sizeof(int));
344 SAFECUDAMALLOC(&d_idx_s,newndim*sizeof(int));
345 SAFECUDAMEMCPY(d_idx_s, idx_s+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
346 SAFECUDAMEMCPY(d_lda_s, lda_s+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
347 SAFECUDAMEMCPY(d_ldb_s, temp+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
348
349 const int ilimit = remainder1 * size0;
350 const int olimit = remainder2 * size0;
351 const int plain = params[0] * size0;
352 const int sbp = plain+params[1];
353 const int ldal = (lda[1] - remainder1)/blockA;
354 const int ldp2l = (lda[params[3]] - remainder2)/blockB;
355 /*
356 #ifdef MODEL
357 printf("\t%d\t%d\t", plain, blockA);
358 printf("\t%d\t%d\t", plain/32, plain%32);
359 double f1, f2, f3, f4, f;
360 int minlimit = min(ilimit, olimit);
361 printf("\tf1=%lf\t", f1 = ((plain/32) + (double)(plain%32) /32)/ (int)((plain+31)/32));
362 printf("\tf2=%lf\t", f2 = ((ilimit/32) + (double)(ilimit%32) /32)/ (int)(max(1,(ilimit+31)/32)));
363 printf("\tf3=%lf\t", f3 = ((olimit/32) + (double)(olimit%32) /32)/ (int)(max(1,(olimit+31)/32)));
364 printf("\tf4=%lf\t", f4 = ((minlimit/32) + (double)(minlimit%32) /32)/ (int)(max(1,(minlimit+31)/32)));
365 //printf("\t%d\t%d\t", lda[1], ldb[1]);
366 int asize = lda[1];
367 int bsize = lda[1];
368 //printf("\t%d\t%d\t%d\t%d\t", asize/blockA, asize%blockA, bsize/blockB,bsize%blockB );
369 //int amax = min(blockA, 32);
370 //int bmax = min(blockB, 32);
371 int amax = blockA;
372 int bmax = blockB;
373 printf("\tf=%lf\t", f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax)));
374 printf("\t%lf\t", f);
375 #endif
376 */
377
378 #ifdef NOHTIME
379 #include "includes/nohtimestart.h"
380 #endif
381 fvimatchl32CallerWrapper(newndim, A, B,lda[0],params[0], params[1], params[3]-1,
382 numBlocks, params[2], params[5]*sizeof(type)
383 , d_lda_s,d_ldb_s,d_idx_s
384 ,remainder1,remainder2,lda_kernel1, ldb_kernel1, d_offset, ilimit, olimit, plain, sbp, ldal, ldp2l, acoars, bcoars, size, alpha, beta);
385
386
387 #ifdef NOHTIME
388 #include "includes/nohtimestop.h"
389 #endif
390
391
392 {cudaError_t err = cudaGetLastError();
393 if(err != cudaSuccess){
394 printf("\nKernel ERROR in fvimatchl32: %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
395 //exit(-1);
396 }}
397 cudaFree(d_lda_s);
398 cudaFree(d_ldb_s);
399 cudaFree(d_idx_s);
400 cudaFree(d_offset);
401 }
402
403
404
405