1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include <omp.h>
6 #define type double
7
8 #define STR1(X) #X
9 #define STR(X) STR1(X)
10 #define STRINGIFY(X,Y) X ## Y
11 #define CON(X,Y) STRINGIFY(X,Y)
12 #define KDir kernels
13
14
15 #include "includes/ourmacros.h"
16
17 extern __shared__ type tile[];
fvinomatchg32_main(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1)18 __device__ __forceinline__ void fvinomatchg32_main(const type * __restrict__ Atmp, type * __restrict__ Btmp,const int lda1,const int ldb1)
19 {
20 #define TILE_DIM_X 32
21 #define TILE_DIM_Y 32
22 #define THREADS_PER_ROW 8
23 #define SHM2 33
24 int id = threadIdx.x;
25 int rowId = id%32;
26 int colId = id/TILE_DIM_X;
27 #ifdef printd
28 if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0)
29 printf("lda1 = %d, ldb1 = %d\n", lda1, ldb1);
30 #endif
31 for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
32 {
33 tile[j * SHM2 + rowId] = Atmp[rowId + j * lda1];
34 }
35 __syncthreads();
36 for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
37 {
38 //Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
39 Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
40 }
41 #undef SHM2
42 }
43
44 __device__ __forceinline__
fvinomatchg32_rem(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery)45 void fvinomatchg32_rem(const type * __restrict__ Atmp, type * __restrict__ Btmp, const int lda1, const int ldb1, const int remainderx,const int remaindery)
46 {
47 #define TILE_DIM_X 32
48 #define TILE_DIM_Y 32
49 #define THREADS_PER_ROW 8
50 int id = threadIdx.x;
51 #define SHM2 33
52 int rowId = id%32;
53 int colId = id/TILE_DIM_X;
54
55 int limit = remaindery > TILE_DIM_Y ? TILE_DIM_Y : remaindery;
56 if(rowId < remainderx)
57 for(int j = colId; j < limit; j+= THREADS_PER_ROW)
58 {
59 tile[j * SHM2 + rowId] = Atmp[rowId + j * lda1];
60 }
61 __syncthreads();
62 if(rowId >= remaindery)
63 return;
64 limit = remainderx > TILE_DIM_Y ? TILE_DIM_Y : remainderx;
65 for(int j = colId; j < limit; j+= THREADS_PER_ROW)
66 {
67 Btmp[j * ldb1 + rowId] =tile[rowId * SHM2 + j];
68 }
69 #undef SHM2
70 }
fvinomatchg32_main_coars(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int acoars,const int bcoars,const int size)71 __device__ __forceinline__ void fvinomatchg32_main_coars(const type * __restrict__ Atmp, type * __restrict__ Btmp,const int lda1,const int ldb1, const int acoars, const int bcoars, const int size)
72 {
73 #define TILE_DIM_X 32
74 #define TILE_DIM_Y 32
75 #define THREADS_PER_ROW 8
76 #define SHM2 33
77 int id = threadIdx.x;
78 int rowId = id%32;
79 int colId = id/TILE_DIM_X;
80 #ifdef printd
81 if(blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.x == 0)
82 printf("lda1 = %d, ldb1 = %d\n", lda1, ldb1);
83 #endif
84 for(int i = 0; i < size; i++)
85 {
86 for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
87 {
88 tile[j * SHM2 + rowId] = Atmp[i*acoars+rowId + j * lda1];
89 }
90 __syncthreads();
91 for(int j = colId; j < TILE_DIM_Y; j+= THREADS_PER_ROW)
92 {
93 Btmp[i*bcoars +j * ldb1 + rowId] =tile[rowId * SHM2 + j];
94 }
95 __syncthreads();
96 }
97 #undef SHM2
98 }
99
100 __device__ __forceinline__
fvinomatchg32_rem_coars(const type * __restrict__ Atmp,type * __restrict__ Btmp,const int lda1,const int ldb1,const int remainderx,const int remaindery,const int acoars,const int bcoars,const int size)101 void fvinomatchg32_rem_coars(const type * __restrict__ Atmp, type * __restrict__ Btmp, const int lda1, const int ldb1, const int remainderx,const int remaindery, const int acoars, const int bcoars, const int size)
102 {
103 #define TILE_DIM_X 32
104 #define TILE_DIM_Y 32
105 #define THREADS_PER_ROW 8
106 int id = threadIdx.x;
107 #define SHM2 33
108 int rowId = id%32;
109 int colId = id/TILE_DIM_X;
110
111 #ifdef printd
112 //if(threadIdx.x == 0 && blockIdx.x%5 == 0)
113 //printf("lda1 = %d, ldb1 = %d, remainderx = %d, remaindery = %d, acoars = %d, bcoars = %d, size = %d\n", lda1, ldb1, remainderx, remaindery, acoars, bcoars, size);
114 #endif
115 for(int i = 0; i < size; i++)
116 {
117 if(rowId < remainderx)
118 for(int j = colId; j < remaindery; j+= THREADS_PER_ROW)
119 {
120 tile[j * SHM2 + rowId] = Atmp[i*acoars+rowId + j * lda1];
121 }
122 __syncthreads();
123 if(rowId < remaindery)
124 for(int j = colId; j < remainderx; j+= THREADS_PER_ROW)
125 {
126 Btmp[i*bcoars+j * ldb1 + rowId] =tile[rowId * SHM2 + j];
127 }
128 __syncthreads();
129 }
130 #undef SHM2
131 }
132 #define FNAME fvinomatchg32.h
133 #include "includes/macro.h"
134 #undef FNAME
135 #define FNAME fvinomatchg32_coars.h
136 #include "includes/macro.h"
137 #undef FNAME
138
139
140
fvinomatchg32_CallerWrapper(int ndim,const type * __restrict__ A,type * B,const int numblocks,const int numthreads,const int shmsize,const int * __restrict__ lda_s,const int * __restrict__ ldb_s,const int * __restrict__ idx_s,const int coarsa,const int coarsb,const int lda_kernel1,const int ldb_kernel1,const int inputrem,const int outputrem,const int remainderx,const int remaindery,const int size)141 void fvinomatchg32_CallerWrapper(int ndim, const type * __restrict__ A, type * B, const int numblocks, const int numthreads, const int shmsize
142 , const int * __restrict__ lda_s, const int* __restrict__ ldb_s, const int* __restrict__ idx_s
143 , const int coarsa, const int coarsb, const int lda_kernel1, const int ldb_kernel1, const int inputrem, const int outputrem, const int remainderx, const int remaindery, const int size)
144 {
145 #ifdef printd
146 printf("ndim = %d, size = %d, lda1 = %d, ldb1 = %d, irem = %d, orem = %d\n", ndim, size, lda_kernel1, ldb_kernel1, inputrem, outputrem);
147 #endif
148
149 if(size > 0)
150 {
151 #ifdef printd
152 printf("Coarsening... No. of blocks = %d\n", numblocks/size);
153 #endif
154 dim3 thread_blocks(numblocks/size, 1, 1);
155 switch(ndim)
156 {
157 EXPANDDIMS(fvinomatchg32_coars_kernel_, thread_blocks, numthreads, shmsize, (A, B, lda_s,ldb_s, idx_s, coarsa,coarsb, inputrem, outputrem, lda_kernel1, ldb_kernel1, remainderx, remaindery, size))
158 default:
159 {
160 }
161 }
162 }
163 else
164 {
165 dim3 thread_blocks(numblocks, 1, 1);
166 switch(ndim)
167 {
168 EXPANDDIMS(fvinomatchg32_kernel_, thread_blocks, numthreads, shmsize, (A, B, lda_s,ldb_s, idx_s, inputrem, outputrem, lda_kernel1, ldb_kernel1, remainderx, remaindery))
169 default:
170 {
171 }
172 }
173 }
174 }
175 void swap(int array[], int ind1, int ind2);
176
177 int cancoarsen(int *lda, int newndim);
178 extern "C"
fvinomatchg32_transpose_kernel(int ndim,const type * A,type * B,const int * lda,const int * ldb,const int * params,const int * rperm)179 void fvinomatchg32_transpose_kernel(int ndim, const type *A, type *B, const int *lda, const int *ldb, const int* params, const int * rperm)
180 {
181 // int numBlocks = computeNumBlocksCode ;
182 #ifdef printd
183 printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
184 printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d \t %d \t %d \t %d \t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10]);
185 printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
186 printf("\nR perm: %d \t %d \t %d\t %d\t %d\n", rperm[0], rperm[1], rperm[2], rperm[3], rperm[4]);
187 #endif
188
189 //#ifndef orig1
190
191 int numBlocks = params[6];//((size[1] + 8 -1)/8) * size[2] * ((size[3] + 8 -1)/8) * size[4] ;
192
193 // printf("\n%d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
194 const int blockA = params[0];
195 const int blockB = params[0];
196 int irem, orem;
197 //const int shm = params[5] * params[10];
198 int *d_lda_s, *d_ldb_s, *d_idx_s;
199 const int remainder1 = lda[0] % 32;
200 const int remainder2 = lda[params[7]] % 32;
201 if(remainder1 == 0) irem = lda[0];
202 else irem = (lda[0] - remainder1)/blockA;
203 if(remainder2 == 0) orem = ldb[0];
204 else orem = (ldb[0] - remainder2)/blockB;
205
206 int lda_s[20], ldb_s[20], idx_s[20], temp[20];
207 lda_s[0] = 1;
208 ldb_s[0] = 1;
209
210 int i;
211 idx_s[0] = (lda[0] + blockA - 1) / blockA;
212 for(i = 1; i < ndim; i++)
213 {
214 if( i == params[7])
215 {
216 idx_s[i] = (lda[i] + blockB - 1)/blockB;
217 }
218 else
219 {
220 idx_s[i] = lda[i];
221 }
222 lda_s[i] = lda_s[i-1] * lda[i-1];
223 ldb_s[i] = ldb_s[i-1] * ldb[i-1];
224 }
225 for(i = 0; i < ndim; i++)
226 {
227 temp[i] = ldb_s[rperm[i]];
228 }
229
230 const int lda_kernel1 = lda_s[params[7]]; //lda,0
231 const int ldb_kernel1 = ldb_s[params[8]]; //ldb,0
232 lda_s[0] *= params[0];
233 temp[0] *= params[0];
234 lda_s[params[7]] *= params[0];
235 temp[params[7]] *= params[0];
236
237 if(params[7] != 1)
238 {
239 swap(idx_s, 1, params[7]);
240 swap(lda_s, 1, params[7]);
241 swap(temp, 1, params[7]);
242 }
243 int newndim = ndim;
244
245 int acoars = -1, bcoars = -1, size = -1;
246 #ifndef NOCOARSEN
247 int cd = cancoarsen(idx_s+2, ndim-2);
248 if(cd >= 0)
249 {
250 #ifdef printd
251 printf("cd = %d\n", cd);
252 #endif
253 acoars = lda_s[2+cd];
254 bcoars = temp[2+cd];
255 size = idx_s[2+cd];
256 for(int j = cd+1; j < newndim; j++)
257 {
258 idx_s[2+j-1] = idx_s[2+j];
259 lda_s[2+j-1] = lda_s[2+j];
260 temp[2+j-1] = temp[2+j];
261
262 }
263 newndim--;
264 }
265 #endif
266
267 SAFECUDAMALLOC(&d_lda_s,newndim*sizeof(int));
268 SAFECUDAMALLOC(&d_ldb_s,newndim*sizeof(int));
269 SAFECUDAMALLOC(&d_idx_s,newndim*sizeof(int));
270 SAFECUDAMEMCPY(d_idx_s, idx_s,newndim*sizeof(int), cudaMemcpyHostToDevice);
271 SAFECUDAMEMCPY(d_lda_s, lda_s,newndim*sizeof(int), cudaMemcpyHostToDevice);
272 SAFECUDAMEMCPY(d_ldb_s, temp,newndim*sizeof(int), cudaMemcpyHostToDevice);
273
274 #ifdef MODEL
275 printf("\t%d\t%d\t", lda_kernel1, ldb_kernel1);
276 printf("\t%d\t%d\t%d\t%d\t", lda[0]/32,lda[0]%32, ldb[0]/32,ldb[0]%32 );
277 printf("\t%lf\t", ((lda[0]/32) * (ldb[0]/32) + (double)(lda[0]/32) * (ldb[0]%32) /32+ (double)(lda[0]%32) * (ldb[0]/32) /32 + (double)(lda[0]%32) * (ldb[0]%32) /(32*32) )/ (int)(((lda[0]+31)/32) * ((ldb[0]+31)/32)));
278 //printf("\t%lf\t%d %d \t", ((lda[0]/32) * (ldb[0]/32) + (double)(lda[0]/32) * (ldb[0]%32) /32+ (double)(lda[0]%32) * (ldb[0]/32) /32 + (double)(lda[0]%32) * (ldb[0]%32) /(32*32) ), (int)((lda[0]+31)/32) * ((ldb[0]+31)/32), ldb[0]);
279 #endif
280
281 #ifdef NOHTIME
282 #include "includes/nohtimestart.h"
283 #endif
284 fvinomatchg32_CallerWrapper(newndim, A, B,
285 numBlocks, params[2],
286 params[10] * params[5]*sizeof(type)
287 , d_lda_s,d_ldb_s,d_idx_s
288 , acoars, bcoars
289 , lda_kernel1, ldb_kernel1,irem, orem, remainder1, remainder2, size);
290
291
292 #ifdef NOHTIME
293 #include "includes/nohtimestop.h"
294 #endif
295 cudaDeviceSynchronize();
296
297 {cudaError_t err = cudaGetLastError();
298 if(err != cudaSuccess){
299 printf("\nKernel ERROR in fvi_nomatch_g32: %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
300 //exit(-1);
301 }}
302 cudaFree(d_lda_s);
303 cudaFree(d_ldb_s);
304 cudaFree(d_idx_s);
305 }
306
307
308
309