1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include <omp.h>
6 #define type double
7 
8 #define STR1(X) #X
9 #define STR(X) STR1(X)
10 #define STRINGIFY(X,Y) X ## Y
11 #define CON(X,Y) STRINGIFY(X,Y)
12 #define KDir kernels
13 
14 
15 #include "includes/ourmacros.h"
16 
17 extern __shared__ type tile[];
fvinomatchg32_main(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1)18 __device__ __forceinline__ void fvinomatchg32_main(const type * __restrict__ Atmp,  type * __restrict__ Btmp,const  int lda1,const  int ldb1)
19 {
20 #define TILE_DIM_X  32
21 #define TILE_DIM_Y  32
22 #define THREADS_PER_ROW  8
23 #define SHM2 33
24 	int id = threadIdx.x;
25 	int rowId = id%32;
26 	int colId = id/TILE_DIM_X;
27 #ifdef printd
28 	if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0)
29 		printf("lda1 = %d, ldb1 = %d\n", lda1, ldb1);
30 #endif
31 	for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
32 	{
33 		tile[j * SHM2 + rowId] = Atmp[rowId + j * lda1];
34 	}
35 	__syncthreads();
36 	for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
37 	{
38 		//Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
39 		Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
40 	}
41 #undef SHM2
42 }
43 
44 __device__ __forceinline__
fvinomatchg32_rem(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery)45 void fvinomatchg32_rem(const type * __restrict__ Atmp,  type * __restrict__ Btmp, const int lda1, const int ldb1, const  int remainderx,const  int remaindery)
46 {
47 #define TILE_DIM_X  32
48 #define TILE_DIM_Y  32
49 #define THREADS_PER_ROW  8
50 	int id = threadIdx.x;
51 #define SHM2 33
52 	int rowId = id%32;
53 	int colId = id/TILE_DIM_X;
54 
55 	int limit = remaindery > TILE_DIM_Y ? TILE_DIM_Y :  remaindery;
56 	if(rowId < remainderx)
57 		for(int j = colId; j < limit; j+= THREADS_PER_ROW)
58 		{
59 			tile[j * SHM2 + rowId] = Atmp[rowId + j * lda1];
60 		}
61 	__syncthreads();
62 	if(rowId >= remaindery)
63 		return;
64 	limit = remainderx > TILE_DIM_Y ? TILE_DIM_Y :  remainderx;
65 	for(int j = colId; j < limit; j+= THREADS_PER_ROW)
66 	{
67 		Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
68 	}
69 #undef SHM2
70 }
fvinomatchg32_main_coars(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int acoars,const int bcoars,const int size)71 __device__ __forceinline__ void fvinomatchg32_main_coars(const type * __restrict__ Atmp,  type * __restrict__ Btmp,const  int lda1,const  int ldb1, const int acoars, const int bcoars, const int size)
72 {
73 #define TILE_DIM_X  32
74 #define TILE_DIM_Y  32
75 #define THREADS_PER_ROW  8
76 #define SHM2 33
77 	int id = threadIdx.x;
78 	int rowId = id%32;
79 	int colId = id/TILE_DIM_X;
80 #ifdef printd
81 	if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0)
82 		printf("lda1 = %d, ldb1 = %d\n", lda1, ldb1);
83 #endif
84 	for(int i = 0; i < size; i++)
85 	{
86 		for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
87 		{
88 			tile[j * SHM2 + rowId] = Atmp[i*acoars+rowId + j * lda1];
89 		}
90 		__syncthreads();
91 		for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
92 		{
93 			Btmp[i*bcoars +j * ldb1 + rowId] =tile[rowId * SHM2 + j];
94 		}
95 		__syncthreads();
96 	}
97 #undef SHM2
98 }
99 
100 	__device__ __forceinline__
fvinomatchg32_rem_coars(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int acoars,const int bcoars,const int size)101 void fvinomatchg32_rem_coars(const type * __restrict__ Atmp,  type * __restrict__ Btmp, const int lda1, const int ldb1, const  int remainderx,const  int remaindery, const int acoars, const int bcoars, const int size)
102 {
103 #define TILE_DIM_X  32
104 #define TILE_DIM_Y  32
105 #define THREADS_PER_ROW  8
106 	int id = threadIdx.x;
107 #define SHM2 33
108 	int rowId = id%32;
109 	int colId = id/TILE_DIM_X;
110 
111 #ifdef printd
112 //if(threadIdx.x == 0 && blockIdx.x%5 == 0)
113 //printf("lda1 = %d, ldb1 = %d, remainderx = %d, remaindery = %d, acoars = %d, bcoars = %d, size = %d\n", lda1, ldb1, remainderx, remaindery, acoars, bcoars, size);
114 #endif
115 	for(int i = 0; i < size; i++)
116 	{
117 		if(rowId < remainderx)
118 			for(int j = colId; j < remaindery; j+= THREADS_PER_ROW)
119 			{
120 				tile[j * SHM2 + rowId] = Atmp[i*acoars+rowId + j * lda1];
121 			}
122 		__syncthreads();
123 		if(rowId < remaindery)
124 		for(int j = colId; j < remainderx; j+= THREADS_PER_ROW)
125 		{
126 			Btmp[i*bcoars+j * ldb1 + rowId] =tile[rowId * SHM2 + j];
127 		}
128 		__syncthreads();
129 	}
130 #undef SHM2
131 }
132 #define FNAME fvinomatchg32.h
133 #include "includes/macro.h"
134 #undef FNAME
135 #define FNAME fvinomatchg32_coars.h
136 #include "includes/macro.h"
137 #undef FNAME
138 
139 
140 
fvinomatchg32_CallerWrapper(int ndim,const type * __restrict__ A,type * B,const int numblocks,const int numthreads,const int shmsize,const int * __restrict__ lda_s,const int * __restrict__ ldb_s,const int * __restrict__ idx_s,const int coarsa,const int coarsb,const int lda_kernel1,const int ldb_kernel1,const int inputrem,const int outputrem,const int remainderx,const int remaindery,const int size)141 void fvinomatchg32_CallerWrapper(int ndim, const type * __restrict__ A, type * B,  const int numblocks, const int numthreads, const int shmsize
142 		, const int * __restrict__ lda_s, const int* __restrict__ ldb_s, const int* __restrict__ idx_s
143 		, const int coarsa, const int coarsb, const int lda_kernel1, const int ldb_kernel1,  const int inputrem, const int outputrem, const int remainderx, const int remaindery, const int size)
144 {
145 		#ifdef printd
146 			printf("ndim  = %d, size = %d, lda1 = %d, ldb1 = %d, irem = %d, orem = %d\n", ndim, size, lda_kernel1, ldb_kernel1, inputrem, outputrem);
147 		#endif
148 
149 	if(size > 0)
150 	{
151 		#ifdef printd
152 			printf("Coarsening... No. of blocks = %d\n", numblocks/size);
153 		#endif
154 		dim3 thread_blocks(numblocks/size, 1, 1);
155 		switch(ndim)
156 		{
157 			EXPANDDIMS(fvinomatchg32_coars_kernel_, thread_blocks, numthreads, shmsize, (A,  B, lda_s,ldb_s, idx_s, coarsa,coarsb, inputrem, outputrem, lda_kernel1, ldb_kernel1, remainderx, remaindery, size))
158 			default:
159 			{
160 			}
161 		}
162 	}
163 	else
164 	{
165 		dim3 thread_blocks(numblocks, 1, 1);
166 		switch(ndim)
167 		{
168 			EXPANDDIMS(fvinomatchg32_kernel_, thread_blocks, numthreads, shmsize, (A, B, lda_s,ldb_s, idx_s, inputrem, outputrem, lda_kernel1, ldb_kernel1, remainderx, remaindery))
169 			default:
170 			{
171 			}
172 		}
173 	}
174 }
175 void swap(int array[], int ind1, int ind2);
176 
177 int  cancoarsen(int *lda, int newndim);
178 	extern "C"
fvinomatchg32_transpose_kernel(int ndim,const type * A,type * B,const int * lda,const int * ldb,const int * params,const int * rperm)179 void  fvinomatchg32_transpose_kernel(int ndim, const type *A, type *B,  const int *lda, const int *ldb, const int* params, const int * rperm)
180 {
181 	// int numBlocks = computeNumBlocksCode ;
182 #ifdef printd
183 	printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
184 	printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d \t %d \t %d \t %d \t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10]);
185 	printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
186 	printf("\nR perm: %d \t %d \t %d\t %d\t %d\n", rperm[0], rperm[1], rperm[2], rperm[3], rperm[4]);
187 #endif
188 
189 	//#ifndef orig1
190 
191 	int numBlocks = params[6];//((size[1] + 8 -1)/8) * size[2] * ((size[3] + 8 -1)/8) * size[4] ;
192 
193 	//	printf("\n%d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
194 	const int blockA = params[0];
195 	const int blockB = params[0];
196 	int irem, orem;
197 	//const int shm = params[5] * params[10];
198 	int *d_lda_s, *d_ldb_s,  *d_idx_s;
199 	const int remainder1 = lda[0] % 32;
200 	const int remainder2 = lda[params[7]] % 32;
201 	if(remainder1 == 0) irem = lda[0];
202 	else irem = (lda[0] - remainder1)/blockA;
203 	if(remainder2 == 0) orem = ldb[0];
204 	else orem = (ldb[0] - remainder2)/blockB;
205 
206 	int lda_s[20], ldb_s[20], idx_s[20], temp[20];
207 	lda_s[0] = 1;
208 	ldb_s[0] = 1;
209 
210 	int i;
211 	idx_s[0] = (lda[0] + blockA - 1) / blockA;
212 	for(i = 1; i < ndim; i++)
213 	{
214 		if( i == params[7])
215 		{
216 			idx_s[i] = (lda[i] + blockB - 1)/blockB;
217 		}
218 		else
219 		{
220 			idx_s[i] = lda[i];
221 		}
222 		lda_s[i] = lda_s[i-1] * lda[i-1];
223 		ldb_s[i] = ldb_s[i-1] * ldb[i-1];
224 	}
225 	for(i = 0; i < ndim; i++)
226 	{
227 		temp[i] = ldb_s[rperm[i]];
228 	}
229 
230 	const int lda_kernel1 =  lda_s[params[7]]; //lda,0
231 	const  int ldb_kernel1 = ldb_s[params[8]]; //ldb,0
232 	lda_s[0] *= params[0];
233         temp[0] *= params[0];
234         lda_s[params[7]] *= params[0];
235         temp[params[7]] *= params[0];
236 
237 	if(params[7] != 1)
238 	{
239 		swap(idx_s, 1, params[7]);
240 		swap(lda_s, 1, params[7]);
241 		swap(temp, 1, params[7]);
242 	}
243 	int newndim = ndim;
244 
245 	int acoars = -1, bcoars = -1, size = -1;
246 #ifndef NOCOARSEN
247 	int cd = cancoarsen(idx_s+2, ndim-2);
248 	if(cd >= 0)
249 	{
250 #ifdef printd
251 		printf("cd = %d\n", cd);
252 #endif
253 		acoars = lda_s[2+cd];
254 		bcoars = temp[2+cd];
255 		size = idx_s[2+cd];
256 		for(int j = cd+1; j < newndim; j++)
257 		{
258 			idx_s[2+j-1] = idx_s[2+j];
259 			lda_s[2+j-1] = lda_s[2+j];
260 			temp[2+j-1] = temp[2+j];
261 
262 		}
263 		newndim--;
264 	}
265 #endif
266 
267 	SAFECUDAMALLOC(&d_lda_s,newndim*sizeof(int));
268 	SAFECUDAMALLOC(&d_ldb_s,newndim*sizeof(int));
269 	SAFECUDAMALLOC(&d_idx_s,newndim*sizeof(int));
270 	SAFECUDAMEMCPY(d_idx_s, idx_s,newndim*sizeof(int), cudaMemcpyHostToDevice);
271 	SAFECUDAMEMCPY(d_lda_s, lda_s,newndim*sizeof(int), cudaMemcpyHostToDevice);
272 	SAFECUDAMEMCPY(d_ldb_s, temp,newndim*sizeof(int), cudaMemcpyHostToDevice);
273 
274 #ifdef MODEL
275 printf("\t%d\t%d\t", lda_kernel1, ldb_kernel1);
276 printf("\t%d\t%d\t%d\t%d\t", lda[0]/32,lda[0]%32, ldb[0]/32,ldb[0]%32 );
277 printf("\t%lf\t", ((lda[0]/32) * (ldb[0]/32) + (double)(lda[0]/32) * (ldb[0]%32) /32+ (double)(lda[0]%32) * (ldb[0]/32) /32 + (double)(lda[0]%32) * (ldb[0]%32) /(32*32) )/ (int)(((lda[0]+31)/32) * ((ldb[0]+31)/32)));
278 //printf("\t%lf\t%d %d \t", ((lda[0]/32) * (ldb[0]/32) + (double)(lda[0]/32) * (ldb[0]%32) /32+ (double)(lda[0]%32) * (ldb[0]/32) /32 + (double)(lda[0]%32) * (ldb[0]%32) /(32*32) ), (int)((lda[0]+31)/32) * ((ldb[0]+31)/32), ldb[0]);
279 #endif
280 
281 #ifdef NOHTIME
282 #include "includes/nohtimestart.h"
283 #endif
284 	fvinomatchg32_CallerWrapper(newndim, A,  B,
285 			numBlocks, params[2],
286 			params[10] * params[5]*sizeof(type)
287 			, d_lda_s,d_ldb_s,d_idx_s
288 			, acoars, bcoars
289 			, lda_kernel1, ldb_kernel1,irem, orem, remainder1, remainder2, size);
290 
291 
292 #ifdef NOHTIME
293 #include "includes/nohtimestop.h"
294 #endif
295 	cudaDeviceSynchronize();
296 
297 	{cudaError_t err = cudaGetLastError();
298 		if(err != cudaSuccess){
299 			printf("\nKernel ERROR in fvi_nomatch_g32: %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
300 			//exit(-1);
301 		}}
302 	cudaFree(d_lda_s);
303 	cudaFree(d_ldb_s);
304 	cudaFree(d_idx_s);
305 }
306 
307 
308 
309