1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include<omp.h>
6 #define type double
7 
8 #define STR1(X) #X
9 #define STR(X) STR1(X)
10 #define STRINGIFY(X,Y) X ## Y
11 #define CON(X,Y) STRINGIFY(X,Y)
12 #define KDir kernels
13 
14 #include "includes/ourmacros.h"
15 
16 
17 /*
18    __device__ __inline__ double ld_gbl_cg(const double *addr) {
19    double return_value;
20    asm("ld.global.cg.f64 %0, [%1];" : "=d"(return_value) : "l"(addr));
21    return return_value;
22    }*/
23 
24 extern __shared__ type tile[];
25 
fvimatchl32_main(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int size0,const int plain,const int sbp,const unsigned short * __restrict__ offset,type alpha,type beta)26 __device__ __forceinline__ void fvimatchl32_main(const  type *   Atmp, type *  Btmp,const  int lda1,const  int ldb1,const int size0, const int  plain, const int sbp, const unsigned  short * __restrict__ offset, type alpha, type beta)
27 {
28 
29 	const int id = threadIdx.x;
30 	const int i2 = id / 32;
31 	const int rest = id %32;
32 	const type *Adisp = Atmp +i2 * lda1;
33 	const int tile_disp = i2 * (sbp);
34 	int regs[8];int j = rest;
35 #pragma unroll 8
36 	for(int i = 0; i < 8; i++)
37 	{
38 		regs[i] = offset[j];
39 		j+=32;
40 		if(j >= plain)
41 			break;
42 	}
43 #pragma unroll
44 	for( int j = rest; j < plain; j+=32){
45 		//tile[tile_disp+j] =  ld_gbl_cg(&Adisp[j]);
46 		tile[tile_disp+j] =  Adisp[j];
47 	}
48 	__syncthreads();
49 	type *Bdisp = &Btmp[i2 * ldb1];
50 	const  int tile_disp1 = i2 * size0;
51 	j = rest;
52 #pragma unroll 8
53 	for(int i = 0; i < 8; i++)
54 	{
55 		Bdisp[j] = beta*Bdisp[j] + alpha* tile[regs[i]+tile_disp1];
56 		j+=32;
57 		if(j >= plain)
58 			break;
59 	}
60 }
61 
62 	__device__ __forceinline__
fvimatchl32_rem(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int size0,const int plain,const int sbp,const int ilimit,const int olimit,const unsigned short int * __restrict__ offset,type alpha,type beta)63 void fvimatchl32_rem(const type *  Atmp, type *  Btmp, const int lda1, const int ldb1,const  int remainderx,const  int remaindery,const  int size0, const int plain, const int sbp, const int ilimit, const int olimit, const unsigned short int* __restrict__ offset, type alpha, type beta)
64 {
65 
66 	int id = threadIdx.x;
67 	int i2 = id / 32;
68 	int rest = id %32;
69 	if(i2 < remaindery){
70 		const type *Adisp = &Atmp[i2 * lda1];
71 		const int tile_disp = i2 * (sbp);
72 #pragma unroll
73 		for( int  j = rest; j < ilimit; j+=32){
74 			tile[tile_disp+j] =  Adisp[j];
75 		}
76 	}
77 	__syncthreads();
78 	if(i2 >= remainderx)
79 		return;
80 	type *Bdisp = &Btmp[i2 * ldb1];
81 	int regs[8];int j = rest;
82 #pragma unroll 8
83 	for(int i = 0; i < 8; i++)
84 	{
85 		if(j >= olimit)
86 			break;
87 		regs[i] = offset[j];
88 		j+=32;
89 	}
90 	const  int tile_disp1 = i2 * size0;
91 	j = rest;
92 #pragma unroll 8
93 	for(int i = 0; i < 8; i++)
94 	{
95 		if(j >= olimit)
96 			break;
97 		Bdisp[j] = beta*Bdisp[j]+ alpha* tile[regs[i]+tile_disp1];
98 		j+=32;
99 	}
100 }
fvimatchl32_main_coars(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int size0,const int plain,const int sbp,const unsigned short * __restrict__ offset,const int acoars,const int bcoars,const int size,type alpha,type beta)101 __device__ __forceinline__ void fvimatchl32_main_coars(const  type *   Atmp, type *  Btmp,const  int lda1,const  int ldb1,const int size0, const int  plain, const int sbp, const unsigned  short * __restrict__ offset, const int acoars, const int bcoars, const int size, type alpha, type beta)
102 {
103 
104 	const int id = threadIdx.x;
105 	const int i2 = id / 32;
106 	const int rest = id %32;
107 	const int tile_disp = i2 * (sbp);
108 	int regs[8];
109 	for(int c = 0; c < size; c++)
110 	{
111 		int j = rest;
112 		const type *Adisp = Atmp +i2 * lda1 + c*acoars;
113 #pragma unroll 8
114 		for(int i = 0; i < 8; i++)
115 		{
116 			regs[i] = offset[j];
117 			j+=32;
118 			if(j >= plain)
119 				break;
120 		}
121 #pragma unroll
122 		for(j = rest; j < plain; j+=32){
123 			tile[tile_disp+j] =  Adisp[j];
124 		}
125 		__syncthreads();
126 		type *Bdisp = Btmp + i2 * ldb1 + c*bcoars;
127 		const  int tile_disp1 = i2 * size0;
128 		j = rest;
129 #pragma unroll 8
130 		for(int i = 0; i < 8; i++)
131 		{
132 			Bdisp[j] = alpha* tile[regs[i]+tile_disp1] + beta*Bdisp[j];
133 			j+=32;
134 			if(j >= plain)
135 				break;
136 		}
137 		__syncthreads();
138 	}
139 }
140 
141 	__device__ __forceinline__
fvimatchl32_rem_coars(const type * Atmp,type * Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int size0,const int plain,const int sbp,const int ilimit,const int olimit,const unsigned short int * __restrict__ offset,const int acoars,const int bcoars,const int size,type alpha,type beta)142 void fvimatchl32_rem_coars(const type *  Atmp, type *  Btmp, const int lda1, const int ldb1,const  int remainderx,const  int remaindery,const  int size0, const int plain, const int sbp, const int ilimit, const int olimit, const unsigned short int* __restrict__ offset, const int acoars, const int bcoars, const int size, type alpha, type beta)
143 {
144 
145 	int id = threadIdx.x;
146 	int i2 = id / 32;
147 	int rest = id %32;
148 	for(int c = 0; c< size; c++)
149 	{
150 		__syncthreads();
151 		if(i2 < remaindery){
152 			const type *Adisp = Atmp + i2 * lda1 + c*acoars;
153 			const int tile_disp = i2 * (sbp);
154 #pragma unroll
155 			for( int  j = rest; j < ilimit; j+=32){
156 				tile[tile_disp+j] =  Adisp[j];
157 			}
158 		}
159 		__syncthreads();
160 		if(i2 >= remainderx)
161 			continue;
162 		type *Bdisp = Btmp + i2 * ldb1 + c*bcoars;
163 		int regs[8];
164 		int j = rest;
165 #pragma unroll 8
166 		for(int i = 0; i < 8; i++)
167 		{
168 			if(j >= olimit)
169 				break;
170 			regs[i] = offset[j];
171 			j+=32;
172 		}
173 		const  int tile_disp1 = i2 * size0;
174 		j = rest;
175 #pragma unroll 8
176 		for(int i = 0; i < 8; i++)
177 		{
178 			if(j >= olimit)
179 				break;
180 			Bdisp[j] = alpha* tile[regs[i]+tile_disp1] + beta*Bdisp[j];
181 			j+=32;
182 		}
183 	}
184 }
185 
186 #define FNAME fvimatchl32.h
187 #include "includes/macro.h"
188 #undef FNAME
189 #define FNAME fvimatchl32_coars.h
190 #include "includes/macro.h"
191 #undef FNAME
192 
193 
fvimatchl32CallerWrapper(const int ndim,const type * __restrict__ A,type * B,const int size0,const int param0,const int param1,const int numthreads1,const int numblocks,const int numthreads,const int shmsize,const int * __restrict__ lda_s,const int * __restrict__ ldb_s,const int * __restrict__ idx_s,const int remainder1,const int remainder2,const int lda_kernel1,const int ldb_kernel1,unsigned short int * offset,const int ilimit,const int olimit,const int plain,const int sbp,const int ldal,const int ldap2l,const int acoars,const int bcoars,const int size,type alpha,type beta)194 void fvimatchl32CallerWrapper(const int ndim,const  type * __restrict__  A, type * B,const int size0, const  int param0, const int param1, const int numthreads1, const int numblocks, const int numthreads, const int shmsize
195 		, const int * __restrict__ lda_s, const int* __restrict__  ldb_s, const int* __restrict__ idx_s
196 		, const int remainder1,		const int remainder2, const int lda_kernel1, const int ldb_kernel1, unsigned short int* offset,
197 		const int ilimit, const int olimit, const int plain, const int sbp, const int ldal, const int ldap2l, const int acoars, const int bcoars, const int size,type alpha,type beta
198 		)
199 {
200 #ifdef printd
201 	printf("ndim = %d, size = %d, numblocks = %d, numthreads = %d, acoars = %d, bcoars = %d\n", ndim, size, numblocks, numthreads, acoars, bcoars);
202 #endif
203 
204 	if(size > 0)
205 	{
206 #ifdef printd
207 		printf("Coarsening... No. of blocks = %d\n", numblocks/size);
208 #endif
209 		dim3 thread_blocks(numblocks/size, 1, 1);
210 		switch(ndim)
211 		{
212 			EXPANDDIMS(fvimatchl32_coars_kernel_, thread_blocks, numthreads, shmsize, (A,  B,size0, ldal, ldap2l,param0, param1,plain,sbp, lda_s,ldb_s,idx_s,remainder1,remainder2,ilimit,olimit,lda_kernel1, ldb_kernel1,offset, acoars, bcoars, size, alpha, beta))
213 			default:
214 			{
215 			}
216 		}
217 	}
218 	else
219 	{
220 		dim3 thread_blocks(numblocks, 1, 1);
221 		switch(ndim)
222 		{
223 			EXPANDDIMS(fvimatchl32_kernel_, thread_blocks, numthreads, shmsize, (A,  B,size0, ldal, ldap2l,param0, param1,plain,sbp, lda_s,ldb_s,idx_s,remainder1,remainder2,ilimit,olimit,lda_kernel1, ldb_kernel1,offset, alpha, beta))
224 			default:
225 			{
226 			}
227 		}
228 	}
229 }
230 
231 void swap(int array[], int ind1, int ind2);
232 
233 int  cancoarsen(int *lda, int newndim);
234 
235 
236 	extern "C"
fvimatchl32_transpose_kernel(int ndim,type * A,type * B,const int * lda,const int * ldb,const int * params,const int * rperm,type alpha,type beta)237 void  fvimatchl32_transpose_kernel(int ndim,   type *A, type *B,  const int *lda, const int *ldb, const int* params, const int * rperm, type alpha, type beta)
238 {
239 
240 	//printf("l32 normal\n");
241 
242 	// int numBlocks = computeNumBlocksCode ;
243 #ifdef printd
244 	printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
245 	printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6]);
246 	printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
247 	printf("\n         R perm: %d \t %d \t %d\t %d\t %d\n", rperm[0], rperm[1], rperm[2], rperm[3], rperm[4]);
248 #endif
249 
250 
251 	int numBlocks = params[6];//((size[1] + 8 -1)/8) * size[2] * ((size[3] + 8 -1)/8) * size[4] ;
252 
253 	int *d_lda_s, *d_ldb_s,  *d_idx_s;
254 	const int size0 = lda[0];
255 	const int remainder1 = lda[1] % params[0];
256 	const int remainder2 = lda[params[3]] % params[0];
257 	int lda_s[20], ldb_s[20], idx_s[20], temp[20];
258 	lda_s[0] = 1;
259 	ldb_s[0] = 1;
260 	int i, blockA=params[0];
261 	int blockB = blockA;
262 	idx_s[1] = (lda[1] + blockA - 1) / blockA;
263 	lda_s[1] = lda_s[0] * lda[0];
264 	ldb_s[1] = ldb_s[0] * ldb[0];
265 	for(i = 2; i < ndim; i++)
266 	{
267 		if( i == params[3])
268 		{
269 			idx_s[i] = (lda[i] + blockA - 1)/blockA;
270 		}
271 		else
272 		{
273 			idx_s[i] = lda[i];
274 
275 		}
276 		lda_s[i] = lda_s[i-1] * lda[i-1];
277 		ldb_s[i] = ldb_s[i-1] * ldb[i-1];
278 	}
279 
280 
281 	for(i = 1; i < ndim; i++)
282 	{
283 		temp[i] = ldb_s[rperm[i]];
284 	}
285 	const int lda_kernel1 = lda_s[params[3]];
286 	const  int ldb_kernel1 = ldb_s[params[4]];
287 
288 	lda_s[1] *= blockA;
289 	lda_s[params[3]] *= blockA;
290 	temp[1] *= blockA;
291 	temp[params[3]] *= blockA;
292 
293 	unsigned short int offset[9000];
294 	unsigned short int *d_offset;
295 	unsigned short limit = lda[0] * params[0];
296 	int tlimit = -1;
297 	for(i = 0; i < limit ; i++)
298 	{
299 		offset[i] = (i/lda[0]) * (lda[0] * params[0] + params[1])     +(i%lda[0]);
300 		if(i / lda[0] >= remainder2 && tlimit == -1) tlimit = i;
301 	}
302 #ifdef printd
303 	printf("Offset memory size = %d \n", limit);
304 #endif
305 
306 	if(params[3] != 2)
307 	{
308 		swap(idx_s, 2, params[3]);
309 		swap(lda_s, 2, params[3]);
310 		swap(temp, 2, params[3]);
311 	}
312 	int newndim = ndim;
313 
314 	int acoars = -1, bcoars = -1, size = -1;
315 #ifndef NOCOARSEN
316 	int noblock = 3;// (params[3] != 2);
317 	int cd = cancoarsen(idx_s+ noblock, ndim- noblock);
318 
319 	if(cd >= 0)
320 	{
321 #ifdef printd
322 		printf("cd = %d, noblock = %d, ndim-noblock = %d\n", cd, noblock, ndim-noblock);
323 #endif
324 		acoars = lda_s[noblock+cd];
325 		bcoars = temp[noblock+cd];
326 		size = idx_s[noblock+cd];
327 		for(int j = cd+1; j < newndim; j++)
328 		{
329 			idx_s[noblock+j-1] = idx_s[noblock+j];
330 			lda_s[noblock+j-1] = lda_s[noblock+j];
331 			temp[noblock+j-1] = temp[noblock+j];
332 
333 		}
334 		newndim--;
335 	}
336 #endif
337 
338 	newndim--;
339 
340 	SAFECUDAMALLOC(&d_offset,limit*sizeof(short));
341 	SAFECUDAMEMCPY(d_offset, offset,limit*sizeof(short), cudaMemcpyHostToDevice);
342 	SAFECUDAMALLOC(&d_lda_s,newndim*sizeof(int));
343 	SAFECUDAMALLOC(&d_ldb_s,newndim*sizeof(int));
344 	SAFECUDAMALLOC(&d_idx_s,newndim*sizeof(int));
345 	SAFECUDAMEMCPY(d_idx_s, idx_s+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
346 	SAFECUDAMEMCPY(d_lda_s, lda_s+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
347 	SAFECUDAMEMCPY(d_ldb_s, temp+1,newndim*sizeof(int), cudaMemcpyHostToDevice);
348 
349 	const int ilimit = remainder1 * size0;
350 	const int olimit = remainder2 * size0;
351 	const int plain = params[0] * size0;
352 	const int sbp = plain+params[1];
353 	const int ldal = (lda[1] - remainder1)/blockA;
354 	const int ldp2l = (lda[params[3]] - remainder2)/blockB;
355 /*
356 #ifdef MODEL
357 printf("\t%d\t%d\t", plain, blockA);
358 printf("\t%d\t%d\t", plain/32, plain%32);
359 double f1, f2, f3, f4, f;
360 int minlimit = min(ilimit, olimit);
361 printf("\tf1=%lf\t", f1 =  ((plain/32)  + (double)(plain%32) /32)/ (int)((plain+31)/32));
362 printf("\tf2=%lf\t", f2 =  ((ilimit/32)  + (double)(ilimit%32) /32)/ (int)(max(1,(ilimit+31)/32)));
363 printf("\tf3=%lf\t", f3 =  ((olimit/32)  + (double)(olimit%32) /32)/ (int)(max(1,(olimit+31)/32)));
364 printf("\tf4=%lf\t", f4 =  ((minlimit/32)  + (double)(minlimit%32) /32)/ (int)(max(1,(minlimit+31)/32)));
365 //printf("\t%d\t%d\t", lda[1], ldb[1]);
366 int asize = lda[1];
367 int bsize = lda[1];
368 //printf("\t%d\t%d\t%d\t%d\t", asize/blockA, asize%blockA, bsize/blockB,bsize%blockB );
369 //int amax = min(blockA, 32);
370 //int bmax = min(blockB, 32);
371 int amax = blockA;
372 int bmax = blockB;
373 printf("\tf=%lf\t", f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax)));
374 printf("\t%lf\t", f);
375 #endif
376 */
377 
378 #ifdef NOHTIME
379 #include "includes/nohtimestart.h"
380 #endif
381 	fvimatchl32CallerWrapper(newndim, A,  B,lda[0],params[0], params[1], params[3]-1,
382 			numBlocks, params[2], params[5]*sizeof(type)
383 			, d_lda_s,d_ldb_s,d_idx_s
384 			,remainder1,remainder2,lda_kernel1, ldb_kernel1, d_offset, ilimit, olimit, plain, sbp, ldal, ldp2l, acoars, bcoars, size, alpha, beta);
385 
386 
387 #ifdef NOHTIME
388 #include "includes/nohtimestop.h"
389 #endif
390 
391 
392 	{cudaError_t err = cudaGetLastError();
393 		if(err != cudaSuccess){
394 			printf("\nKernel ERROR in fvimatchl32: %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
395 			//exit(-1);
396 		}}
397 	cudaFree(d_lda_s);
398 	cudaFree(d_ldb_s);
399 	cudaFree(d_idx_s);
400 	cudaFree(d_offset);
401 }
402 
403 
404 
405