1 #include<iostream>
2 #include "ourproj.h"
3 #include "TransposeSpec.h"
4 #include "ParameterTuner.cpp"
5 #include <cstring>
6 #include <omp.h>
7 #include <fstream>
8 #include <time.h>
9 #include <immintrin.h>
10 #include <xmmintrin.h>
11 #include <cuComplex.h>
12 #include <complex.h>
13
14 #include <stdlib.h>
15 #include <cuda_runtime.h>
16 #include <complex.h>
17
18 #include "ourinclude.h"
19
20 #ifdef type
21 #undef type
22 #endif
23
24 #ifndef type
25 #define type double
26 #endif
27
28 int getoff(int index,const int * dims,const int * stride, int n);
29
30
transpose_check(int n,const type * A,type * B,const type alpha,const type beta,const int * lda,int * perm)31 void transpose_check(int n, const type *A, type *B, const type alpha, const type beta, const int *lda, int *perm)
32 {
33 int i = 0, j;
34 int r_perm[100], lda_s[100], ldb_s[100], temp[100] ;
35 lda_s[0] = 1;
36 ldb_s[0] = 1;
37 long size = 1;
38 for(i = 0; i < n; i++)
39 {
40 for(j = 0; j < n; j++)
41 {
42 if(perm[j] == i)
43 {
44 r_perm[i] = j;
45 break;
46 }
47 }
48 size *= lda[i];
49 }
50 for(i = 1; i < n; i++)
51 {
52 lda_s[i] = lda_s[i-1] * lda[i-1];
53 ldb_s[i] = ldb_s[i-1] * lda[perm[i-1]];
54 }
55 for(i = 0; i < n; i++)
56 {
57 temp[i] = ldb_s[r_perm[i]];
58 }
59
60 for(i=0; i < size; i++)
61 {
62 B[getoff(i, lda, temp, n)] = A[i];
63 }
64 }
65
66
67
68 void fuseIndices(TransposeSpec &spec);
69 using namespace std;
70
71 //template <typename tensortype>
72 //void ourproj_tranpose(int ndim, int *dims, int *perm, tensortype *input, tensortype *output, TensorType dataType = 1, tensortype alpha = 0.0, tensortype beta = 1.0){
73 //template <typename tensortype>
74 extern "C"
ttlg_transpose(int ndim,int * dims,int * perm,double * input,double * output,double alpha=1.0,double beta=0.0)75 void ttlg_transpose(int ndim, int *dims, int *perm, double *input, double *output,double alpha = 1.0, double beta = 0.0){
76 #ifdef INPUT
77 cout <<"Dims: ";
78 for(int i = 0; i < ndim; i++)
79 {
80 cout << dims[i] <<" ";
81 }
82 cout <<"\nPerm: ";
83 for(int i = 0; i < ndim; i++)
84 {
85 cout << perm[i] <<" ";
86 }
87 cout <<"\n";
88 #endif
89
90 TensorType dataType = (TensorType) 1;
91
92 TransposeSpec *spec = new TransposeSpec(ndim, dims, perm,dataType );
93
94 fuseIndices(*spec); //sets the fused indices to the spec
95 spec->setDataType(dataType);
96 spec->setBeta(beta);
97 spec->setAlpha(alpha);
98
99
100 ParameterTuner *parameterTuner = new ParameterTuner();
101 Parameters & myParams = parameterTuner->tune(*spec);
102
103 int * sizes = spec->getSizes();
104 int n = spec->getNdim();
105 if(n == 1 || n == 0)//simply copy out
106 {
107 #ifdef printd
108 cout <<"\n1 D\n";
109 #endif
110 int ndim = 1;
111 int lda[1];
112 lda[0] = sizes[0];
113 #ifdef NOHTIME
114 #include "includes/nohtimestart.h"
115 #endif
116
117 cudaMemcpy(output, input, sizes[0] * sizeof(input[0]), cudaMemcpyDeviceToDevice);
118 #ifdef NOHTIME
119 #include "includes/nohtimestop.h"
120 #endif
121
122 }
123 else
124 {
125 #ifdef BW
126 cout << myParams . getBW()<<"\t";
127 #endif
128 #ifdef EFF
129 cout << myParams . getWarpEfficiency()<<"\t";
130 #endif
131 #ifdef TIME
132 cout << myParams. getTime();
133 #endif
134
135 int params[12];
136 int *perm_new = spec->getPermutation();
137 #ifdef printd
138 cout <<endl<< myParams. getSharedMemSize1()<<"\t";
139 cout<< myParams.getTileSize()<<"\t";
140 cout<< myParams.getPaddingSize()<<"\t";
141 cout<< myParams.getTbSize()<<endl;
142 cout<< myParams.getNumElementsProcessedPerBlock()<<endl;
143 #endif
144 params[0] = myParams.getTileSize();
145 params[1] = myParams.getPaddingSize();
146 params[2] = myParams.getTbSize();
147 params[5] = myParams. getSharedMemSize1();
148 int ldb[20];
149 int rperm[20];
150 #ifdef printd
151 cout <<"New dims: ";
152 #endif
153 for(int i = 0; i < n; i++)
154 {
155 #ifdef printd
156 cout << sizes[i] <<" ";
157 #endif
158 ldb[i] = sizes[perm_new[i]];
159 for(int j = 0; j < n; j++)
160 {
161 if(perm_new[i] == j)
162 {
163 rperm[j] = i;
164 }
165 }
166 }
167
168
169 #ifdef printd
170 cout << "\n";
171 #endif
172 #ifdef printd
173 cout << "Perm: ";
174 for(int j = 0; j < n; j++)
175 {
176 cout <<perm_new[j]<<" ";
177 }
178 cout << "\n";
179 #endif
180 params[3]=perm_new[1];
181 params[4]= rperm[1];
182
183 int tmp;
184 BlockingCase caseId = parameterTuner->getCaseId();
185 switch(caseId.getMode())
186 {
187 case BlockingCase::FVI_MATCH_AND_LESST32:
188 #ifdef printd
189 cout << "...BlockingCase::FVI_MATCH_AND_LESST32 \n";
190 #endif
191 tmp = (sizes[1] + params[0] -1)/params[0];
192 for(int j = 2; j < n; j++)
193 {
194 if(perm_new[1] == j && sizes[0] < 128)
195 {
196 tmp *= (sizes[j] + params[0] -1)/params[0];
197 }
198 else
199 {
200 tmp *= sizes[j];
201 }
202 }
203 params[6] = tmp;
204 fvimatchl32_transpose_kernel(n, input, output, sizes, ldb,params, rperm, alpha, beta);
205
206 break;
207 case BlockingCase::FVI_MATCH_AND_GREATERT32:
208 #ifdef printd
209 cout << "...BlockingCase::FVI_MATCH_AND_GREATERT32 \n";
210 #endif
211 if(sizes[0] < 128)
212 {
213 params[0] = 4;
214 tmp = (sizes[1] + params[0] -1)/params[0];
215 }
216 else//need fix
217 {
218 params[0] = 1;
219 tmp = sizes[1];
220 //tmp = (sizes[1] + params[0] -1)/params[0];
221 }
222 for(int j = 2; j < n; j++)
223 {
224 if(perm_new[1] == j && sizes[0] < 128)
225 {
226 tmp *= (sizes[j] + params[0] -1)/params[0];
227 }
228 else
229 {
230 tmp *= sizes[j];
231 }
232
233 }
234 params[6] = tmp;
235 #ifndef notall
236 if(sizes[0] >= 128)
237 fvimatchg32_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, alpha, beta);
238 else
239 fvimatchg32_blocking_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, alpha, beta);
240
241 #endif
242 break;
243 case BlockingCase::FVI_NOMATCH_GENERAL:
244 params[3] = myParams.getBlockAIndex();
245 //params[1] = myParams.getPaddingSize();
246 params[4] = myParams.getBlockBIndex();
247 params[7] = myParams.getNumElementsProcessedPerBlock();
248 params[8] = myParams.getNumElementsProcessedPerBlock1();
249 params[11] = myParams.getTileSize1();
250
251 //cout <<"\n...Here "<< params[11];
252 if(params[0] == 1)
253 {
254 tmp = 1;
255 }
256 else
257 {
258 tmp = (sizes[params[3]] + params[0] -1)/params[0];
259 }
260 #ifdef printd
261 cout <<"\nBa = "<< params[3]<<"\tBb = "<<params[4]<<"\ttile1 = "<<params[0]<<"\ttile2 = "<<params[11]<<"\t tmp = "<<tmp<<"\n";
262 #endif
263 for(int j = params[3]+1; j < n; j++)
264 {
265 if(rperm[j] < params[4])
266 {
267 //tmp *= (sizes[j] + params[0] -1)/params[0];
268 }
269 else if(rperm[j] == params[4])
270 {
271 if(params[11] == 1)
272 tmp *= 1;
273 else
274 tmp *= (sizes[j] + params[11] -1)/params[11];
275 }
276 else
277 {
278 tmp *= sizes[j];
279 }
280 #ifdef printd
281 cout <<"j = "<<j<<" tmp = "<<tmp<<"\n";
282 #endif
283
284 }
285 //cout <<"\ntmp = "<<tmp<<"\n";
286 params[6] = tmp;
287 params[10] = myParams.getSharedMemSize2();
288 fvinomatchgeneral_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, rperm, alpha, beta);
289 break;
290 case BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP:
291 params[3] = myParams.getBlockAIndex();
292 params[4] = myParams.getBlockBIndex();
293 params[7] = myParams.getNumElementsProcessedPerBlock();
294 params[8] = myParams.getNumElementsProcessedPerBlock1();
295 params[11] = myParams.getTileSize1();
296 tmp = 1;
297 if(params[0] != 1)
298 {
299 if(rperm[params[3]] == params[4])
300 {
301 tmp = (sizes[params[3]] + params[0] -1)/params[0];
302 }
303 else
304 {
305 tmp = (sizes[params[3]] + params[0] -1)/params[0];
306 }
307 }
308 //else tmp = ;
309 #ifdef printd
310 cout <<"\nBa = "<< params[3]<<"\tBb = "<<params[4]<<"\ttile1 = "<<params[0]<<"\ttile2 = "<<params[11]<<"\t tmp = "<<tmp<<"\n";
311 #endif
312 for(int j = params[3]+1; j < n; j++)
313 {
314 if(rperm[j] < params[4])
315 {
316 }
317 else if(rperm[j] == params[4])
318 {
319 if(params[11] == 1)
320 {
321 }
322 else
323 {
324 tmp *= (sizes[j] + params[11] -1)/params[11];
325 }
326 }
327 else
328 {
329 tmp *= sizes[j];
330 }
331 #ifdef printd
332 cout <<"j = "<<j<<" tmp = "<<tmp<<"\n";
333 #endif
334 }
335 //cout <<"\ntmp = "<<tmp<<"\n";
336 params[6] = tmp;
337 params[10] = myParams.getSharedMemSize2();
338 fvigeneralolap_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, rperm, alpha, beta);
339 break;
340 /* case BlockingCase::FVI_NOMATCH_AND_GREATERT32:
341 #ifdef printd
342 cout << "...BlockingCase::FVI_NOMATCH_AND_GREATERT32 \n";
343 #endif
344 tmp = (sizes[0] + params[0] -1)/params[0];
345 for(int j = 1; j < n; j++)
346 {
347 if(perm_new[0] == j)
348 {
349 tmp *= (sizes[j] + params[0] -1)/params[0];
350 }
351 else
352 {
353 tmp *= sizes[j];
354 }
355
356 }
357 params[6] = tmp;
358 params[10] = myParams. getSharedMemSize2();
359 params[7]= perm_new[0];
360 params[8]= rperm[0];
361 params[9]= 0;//perm_new[0];
362 params[1] = myParams.getNumElementsProcessedPerBlock();
363 #ifndef notall
364 fvinomatchg32_transpose_kernel(n, input, output, sizes, ldb,params, rperm, alpha, beta);
365 #endif
366 break;
367 */
368 } }
369 }
370
fuseIndices(TransposeSpec & spec)371 void fuseIndices(TransposeSpec &spec){
372 int ndim = spec.getNdim();
373 int *perm = spec.getPermutation();
374 int *sizes = spec.getSizes();
375 int rperm[100];
376 for(int i = 0; i < ndim; i++)
377 {
378 for(int j = i+1; j < ndim; j++)
379 {
380 if(perm[i] == j)
381 {
382 rperm[j] = i;
383 }
384 }
385 }
386 for(int i = 0; i < ndim; i++)
387 {
388 if(sizes[i] == 1)
389 {
390 for(int j = i+1; j < ndim; j++)
391 {
392 sizes[j-1] = sizes[j];
393 }
394 int ind = -1;
395 for(int j = 0; j < ndim; j++)
396 {
397 if(perm[j] > i)
398 perm[j]--;
399 else if (perm[j] == i)
400 ind = j;
401 }
402 //for(int j = rperm[i] + 1; j < ndim; j++)
403 for(int j = ind + 1; j < ndim; j++)
404 {
405 perm[j-1] = perm[j];
406
407 }
408 ndim--;
409 i--;
410 }
411 }
412 for(int i = 0; i < ndim-1; i++)
413 {
414 //if((perm[i] == i) && (perm[i+1] == i+1))
415 if(perm[i] +1 == perm[i+1])
416 {
417 sizes[perm[i]] *= sizes[perm[i+1]];
418 for(int j = perm[i+1] + 1; j < ndim; j++)
419 {
420 sizes[j-1] = sizes[j];
421 }
422 for(int j = 0; j < ndim; j++)
423 {
424 if(perm[j] > perm[i])
425 perm[j]--;
426 }
427 for(int j = i+2; j < ndim; j++)
428 {
429 perm[j-1] = perm[j];
430 }
431 ndim--;
432 i--;
433 }
434 }
435 spec.setNdim(ndim);
436 }
transpose_init(type * A,int size)437 void transpose_init(type *A, int size)
438 {
439 int i = 0;
440 for(i = 0; i < size; i++)
441 {
442 A[i] = i;
443 }
444 }
transpose_equal(const type * A,const type * B,int total_size)445 int transpose_equal(const type *A, const type*B, int total_size){
446 int error = 0;
447 const type *Atmp= A;
448 const type *Btmp= B;
449 for(int i=0;i < total_size ; ++i){
450 type Aabs = (Atmp[i] < 0) ? -Atmp[i] : Atmp[i];
451 type Babs = (Btmp[i] < 0) ? -Btmp[i] : Btmp[i];
452 type max = (Aabs < Babs) ? Babs : Aabs;
453 type diff = (Aabs - Babs);
454 diff = (diff < 0) ? -diff : diff;
455 if(diff > 0){
456 type relError = (diff/max);
457 if(relError > 1.000000e-10){
458 //printf("i: %d relError: %.8e %e %e\n",i,relError,Atmp[i], Btmp[i]);
459 // exit(0);
460 error += 1;
461 }
462 }
463 }
464 printf("Number of errors: %d\t",error);
465 return (error > 0) ? 0 : 1;
466 }
467
468