1 #include<iostream>
2 #include "ourproj.h"
3 #include "TransposeSpec.h"
4 #include "ParameterTuner.cpp"
5 #include <cstring>
6 #include <omp.h>
7 #include <fstream>
8 #include <time.h>
9 #include <immintrin.h>
10 #include <xmmintrin.h>
11 #include <cuComplex.h>
12 #include <complex.h>
13 
14 #include <stdlib.h>
15 #include <cuda_runtime.h>
16 #include <complex.h>
17 
18 #include "ourinclude.h"
19 
20 #ifdef type
21 #undef type
22 #endif
23 
24 #ifndef type
25 #define type double
26 #endif
27 
28 int getoff(int index,const int * dims,const int * stride, int n);
29 
30 
transpose_check(int n,const type * A,type * B,const type alpha,const type beta,const int * lda,int * perm)31 void transpose_check(int n, const type *A, type *B, const type alpha, const type beta,  const int *lda,  int *perm)
32 {
33 	int i = 0, j;
34 	int r_perm[100], lda_s[100], ldb_s[100], temp[100] ;
35 	lda_s[0] = 1;
36 	ldb_s[0] = 1;
37 	long size = 1;
38 	for(i = 0; i < n; i++)
39 	{
40 		for(j = 0; j < n; j++)
41 		{
42 			if(perm[j] == i)
43 			{
44 				r_perm[i] = j;
45 				break;
46 			}
47 		}
48 		size *= lda[i];
49 	}
50 	for(i = 1; i < n; i++)
51 	{
52 		lda_s[i] = lda_s[i-1] * lda[i-1];
53 		ldb_s[i] = ldb_s[i-1] * lda[perm[i-1]];
54 	}
55        for(i = 0; i < n; i++)
56         {
57                 temp[i] = ldb_s[r_perm[i]];
58         }
59 
60 	for(i=0; i < size; i++)
61 	{
62 			B[getoff(i, lda, temp, n)] = A[i];
63 	}
64 }
65 
66 
67 
68 void fuseIndices(TransposeSpec &spec);
69 using namespace std;
70 
71 //template <typename tensortype>
72 //void ourproj_tranpose(int ndim, int *dims, int *perm, tensortype *input, tensortype *output, TensorType dataType = 1, tensortype alpha = 0.0, tensortype beta = 1.0){
73 //template <typename tensortype>
74 extern "C"
ttlg_transpose(int ndim,int * dims,int * perm,double * input,double * output,double alpha=1.0,double beta=0.0)75 void ttlg_transpose(int ndim, int *dims, int *perm, double *input, double *output,double alpha = 1.0, double beta = 0.0){
76 #ifdef INPUT
77 	cout <<"Dims: ";
78 		for(int i = 0; i < ndim; i++)
79 		{
80 			cout << dims[i] <<" ";
81 		}
82 	cout <<"\nPerm: ";
83 		for(int i = 0; i < ndim; i++)
84 		{
85 			cout << perm[i] <<" ";
86 		}
87 	cout <<"\n";
88 #endif
89 
90 	TensorType dataType = (TensorType) 1;
91 
92 	TransposeSpec *spec = new TransposeSpec(ndim, dims, perm,dataType );
93 
94 	fuseIndices(*spec); //sets the fused indices to the spec
95 	spec->setDataType(dataType);
96 	spec->setBeta(beta);
97 	spec->setAlpha(alpha);
98 
99 
100 	ParameterTuner *parameterTuner = new ParameterTuner();
101 	Parameters & myParams = parameterTuner->tune(*spec);
102 
103 	int * sizes = spec->getSizes();
104 	int n = spec->getNdim();
105 	if(n == 1 || n == 0)//simply copy out
106 	{
107 #ifdef printd
108 		cout <<"\n1 D\n";
109 #endif
110 		int ndim = 1;
111 		int lda[1];
112 		lda[0] = sizes[0];
113 #ifdef NOHTIME
114 #include "includes/nohtimestart.h"
115 #endif
116 
117 		cudaMemcpy(output, input, sizes[0] * sizeof(input[0]), cudaMemcpyDeviceToDevice);
118 #ifdef NOHTIME
119 #include "includes/nohtimestop.h"
120 #endif
121 
122 	}
123 	else
124 	{
125 #ifdef BW
126 cout << myParams . getBW()<<"\t";
127 #endif
128 #ifdef EFF
129 cout << myParams . getWarpEfficiency()<<"\t";
130 #endif
131 #ifdef TIME
132 cout << myParams. getTime();
133 #endif
134 
135 		int params[12];
136 		int *perm_new = spec->getPermutation();
137 #ifdef printd
138 		cout <<endl<< myParams. getSharedMemSize1()<<"\t";
139 		cout<< myParams.getTileSize()<<"\t";
140 		cout<< myParams.getPaddingSize()<<"\t";
141 		cout<< myParams.getTbSize()<<endl;
142 		cout<< myParams.getNumElementsProcessedPerBlock()<<endl;
143 #endif
144 		params[0] = myParams.getTileSize();
145 		params[1] = myParams.getPaddingSize();
146 		params[2] = myParams.getTbSize();
147 		params[5] = myParams. getSharedMemSize1();
148 		int ldb[20];
149 		int rperm[20];
150 #ifdef printd
151 		cout <<"New dims: ";
152 #endif
153 		for(int i = 0; i < n; i++)
154 		{
155 #ifdef printd
156 			cout << sizes[i] <<" ";
157 #endif
158 			ldb[i] = sizes[perm_new[i]];
159 			for(int j = 0; j < n; j++)
160 			{
161 				if(perm_new[i] == j)
162 				{
163 					rperm[j] = i;
164 				}
165 			}
166 		}
167 
168 
169 #ifdef printd
170 		cout << "\n";
171 #endif
172 #ifdef printd
173 		cout << "Perm: ";
174 		for(int j = 0; j < n; j++)
175 		{
176 			cout <<perm_new[j]<<" ";
177 		}
178 		cout << "\n";
179 #endif
180 		params[3]=perm_new[1];
181 		params[4]= rperm[1];
182 
183 		int tmp;
184 		BlockingCase caseId = parameterTuner->getCaseId();
185 		switch(caseId.getMode())
186 		{
187 			case BlockingCase::FVI_MATCH_AND_LESST32:
188 #ifdef printd
189 				cout << "...BlockingCase::FVI_MATCH_AND_LESST32 \n";
190 #endif
191 				tmp  = (sizes[1] + params[0] -1)/params[0];
192 				for(int j = 2; j < n; j++)
193 				{
194 					if(perm_new[1] == j && sizes[0] < 128)
195 					{
196 						tmp *=  (sizes[j] + params[0] -1)/params[0];
197 					}
198 					else
199 					{
200 						tmp *= sizes[j];
201 					}
202 				}
203 				params[6] = tmp;
204 					fvimatchl32_transpose_kernel(n, input, output, sizes, ldb,params, rperm, alpha, beta);
205 
206 				break;
207 			case BlockingCase::FVI_MATCH_AND_GREATERT32:
208 #ifdef printd
209 				cout << "...BlockingCase::FVI_MATCH_AND_GREATERT32 \n";
210 #endif
211 				if(sizes[0] < 128)
212 				{
213 					params[0] = 4;
214 					tmp  = (sizes[1] + params[0] -1)/params[0];
215 				}
216 				else//need fix
217 				{
218 					params[0] = 1;
219 					tmp = sizes[1];
220 					//tmp  = (sizes[1] + params[0] -1)/params[0];
221 				}
222 				for(int j = 2; j < n; j++)
223 				{
224 					if(perm_new[1] == j && sizes[0] < 128)
225 					{
226 						tmp *=  (sizes[j] + params[0] -1)/params[0];
227 					}
228 					else
229 					{
230 						tmp *= sizes[j];
231 					}
232 
233 				}
234 				params[6] = tmp;
235 #ifndef notall
236 				if(sizes[0] >= 128)
237 					fvimatchg32_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, alpha, beta);
238 				else
239 					fvimatchg32_blocking_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, alpha, beta);
240 
241 #endif
242 				break;
243 			case BlockingCase::FVI_NOMATCH_GENERAL:
244 				params[3] = myParams.getBlockAIndex();
245 				//params[1] = myParams.getPaddingSize();
246 				params[4] = myParams.getBlockBIndex();
247 				params[7] = myParams.getNumElementsProcessedPerBlock();
248 				params[8] = myParams.getNumElementsProcessedPerBlock1();
249 				params[11] = myParams.getTileSize1();
250 
251 				//cout <<"\n...Here "<< params[11];
252 				if(params[0] == 1)
253 				{
254 					tmp = 1;
255 				}
256 				else
257 				{
258 					tmp  = (sizes[params[3]] + params[0] -1)/params[0];
259 				}
260 #ifdef printd
261 				cout <<"\nBa = "<< params[3]<<"\tBb = "<<params[4]<<"\ttile1 = "<<params[0]<<"\ttile2 = "<<params[11]<<"\t tmp = "<<tmp<<"\n";
262 #endif
263 				for(int j = params[3]+1; j < n; j++)
264 				{
265 					if(rperm[j] < params[4])
266 					{
267 						//tmp *=  (sizes[j] + params[0] -1)/params[0];
268 					}
269 					else if(rperm[j] == params[4])
270 					{
271 						if(params[11] == 1)
272 							tmp *= 1;
273 						else
274 							tmp *=  (sizes[j] + params[11] -1)/params[11];
275 					}
276 					else
277 					{
278 						tmp *= sizes[j];
279 					}
280 #ifdef printd
281 					cout <<"j = "<<j<<" tmp = "<<tmp<<"\n";
282 #endif
283 
284 				}
285 				//cout <<"\ntmp = "<<tmp<<"\n";
286 				params[6] = tmp;
287 				params[10] = myParams.getSharedMemSize2();
288 				fvinomatchgeneral_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, rperm, alpha, beta);
289 				break;
290 			case BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP:
291 				params[3] = myParams.getBlockAIndex();
292 				params[4] = myParams.getBlockBIndex();
293 				params[7] = myParams.getNumElementsProcessedPerBlock();
294 				params[8] = myParams.getNumElementsProcessedPerBlock1();
295 				params[11] = myParams.getTileSize1();
296 				tmp = 1;
297 				if(params[0] != 1)
298 				{
299 					if(rperm[params[3]] == params[4])
300 					{
301 						tmp  = (sizes[params[3]] + params[0] -1)/params[0];
302 					}
303 					else
304 					{
305 						tmp  = (sizes[params[3]] + params[0] -1)/params[0];
306 					}
307 				}
308 				//else tmp = ;
309 #ifdef printd
310 				cout <<"\nBa = "<< params[3]<<"\tBb = "<<params[4]<<"\ttile1 = "<<params[0]<<"\ttile2 = "<<params[11]<<"\t tmp = "<<tmp<<"\n";
311 #endif
312 				for(int j = params[3]+1; j < n; j++)
313 				{
314 					if(rperm[j] < params[4])
315 					{
316 					}
317 					else if(rperm[j] == params[4])
318 					{
319 						if(params[11] == 1)
320 						{
321 						}
322 						else
323 						{
324 							tmp *=  (sizes[j] + params[11] -1)/params[11];
325 						}
326 					}
327 					else
328 					{
329 						tmp *= sizes[j];
330 					}
331 #ifdef printd
332 					cout <<"j = "<<j<<" tmp = "<<tmp<<"\n";
333 #endif
334 				}
335 				//cout <<"\ntmp = "<<tmp<<"\n";
336 				params[6] = tmp;
337 				params[10] = myParams.getSharedMemSize2();
338 				fvigeneralolap_transpose_kernel(n, input, output, sizes, ldb,params, perm_new, rperm, alpha, beta);
339 				break;
340 /*			case BlockingCase::FVI_NOMATCH_AND_GREATERT32:
341 #ifdef printd
342 				cout << "...BlockingCase::FVI_NOMATCH_AND_GREATERT32 \n";
343 #endif
344 				tmp  = (sizes[0] + params[0] -1)/params[0];
345 				for(int j = 1; j < n; j++)
346 				{
347 					if(perm_new[0] == j)
348 					{
349 						tmp *=  (sizes[j] + params[0] -1)/params[0];
350 					}
351 					else
352 					{
353 						tmp *= sizes[j];
354 					}
355 
356 				}
357 				params[6] = tmp;
358 				params[10] = myParams. getSharedMemSize2();
359 				params[7]= perm_new[0];
360 				params[8]= rperm[0];
361 				params[9]= 0;//perm_new[0];
362 				params[1] =  myParams.getNumElementsProcessedPerBlock();
363 #ifndef notall
364 				fvinomatchg32_transpose_kernel(n, input, output, sizes, ldb,params, rperm, alpha, beta);
365 #endif
366 				break;
367 				*/
368 		}	}
369 }
370 
fuseIndices(TransposeSpec & spec)371 void fuseIndices(TransposeSpec &spec){
372 	int ndim = spec.getNdim();
373 	int *perm = spec.getPermutation();
374 	int *sizes = spec.getSizes();
375 	int rperm[100];
376 	for(int i = 0; i <  ndim; i++)
377 	{
378 		for(int j = i+1; j <  ndim; j++)
379 		{
380 		if(perm[i] == j)
381 		{
382 			rperm[j] = i;
383 		}
384 		}
385 	}
386 	for(int i = 0; i <  ndim; i++)
387 	{
388 		if(sizes[i] == 1)
389 		{
390 			for(int j = i+1; j <  ndim; j++)
391 			{
392 				sizes[j-1] = sizes[j];
393 			}
394 			int ind = -1;
395 			for(int j = 0; j < ndim; j++)
396 			{
397 				if(perm[j] > i)
398 					perm[j]--;
399 				else if (perm[j] == i)
400 					ind = j;
401 			}
402 			//for(int j = rperm[i] + 1; j < ndim; j++)
403 			for(int j = ind + 1; j < ndim; j++)
404 			{
405 				perm[j-1] = perm[j];
406 
407 			}
408 			ndim--;
409 			i--;
410 		}
411 	}
412 	for(int i = 0; i <  ndim-1; i++)
413 	{
414 		//if((perm[i] == i) && (perm[i+1] == i+1))
415 		if(perm[i] +1  == perm[i+1])
416 		{
417 			sizes[perm[i]] *= sizes[perm[i+1]];
418 			for(int j = perm[i+1] + 1; j < ndim; j++)
419 			{
420 				sizes[j-1] = sizes[j];
421 			}
422 			for(int j = 0; j < ndim; j++)
423 			{
424 				if(perm[j] > perm[i])
425 					perm[j]--;
426 			}
427 			for(int j = i+2; j < ndim; j++)
428 			{
429 				perm[j-1] = perm[j];
430 			}
431 			ndim--;
432 			i--;
433 		}
434 	}
435 	spec.setNdim(ndim);
436 }
transpose_init(type * A,int size)437 void transpose_init(type *A, int size)
438 {
439 	int i = 0;
440 	for(i = 0; i < size; i++)
441 	{
442 		A[i] = i;
443 	}
444 }
transpose_equal(const type * A,const type * B,int total_size)445 int transpose_equal(const type *A, const type*B, int total_size){
446 	int error = 0;
447 	const type *Atmp= A;
448 	const type *Btmp= B;
449 	for(int i=0;i < total_size ; ++i){
450 		type Aabs = (Atmp[i] < 0) ? -Atmp[i] : Atmp[i];
451 		type Babs = (Btmp[i] < 0) ? -Btmp[i] : Btmp[i];
452 		type max =  (Aabs < Babs) ? Babs : Aabs;
453 		type diff = (Aabs - Babs);
454 		diff = (diff < 0) ? -diff : diff;
455 		if(diff > 0){
456 			type relError = (diff/max);
457 			if(relError > 1.000000e-10){
458 				//printf("i: %d relError: %.8e %e %e\n",i,relError,Atmp[i], Btmp[i]);
459 				//		             exit(0);
460 				error += 1;
461 			}
462 		}
463 	}
464 	printf("Number of errors: %d\t",error);
465 	return (error > 0) ? 0 : 1;
466 }
467 
468