1 #include<iostream>
2 #include "ourproj.h"
3 #include "TransposeSpec.h"
4 #include "ParameterTuner.cpp"
5 #include <cstring>
6 #include <omp.h>
7 #include <fstream>
8 #include <time.h>
9 #include <immintrin.h>
10 #include <xmmintrin.h>
11 #include <cuComplex.h>
12 #include <complex.h>
13 
14 #include <stdlib.h>
15 #include <cuda_runtime.h>
16 #include <complex.h>
17 
18 #include "ourinclude.h"
19 
/*
 * Element type used by this driver. Any definition of `type` that an earlier
 * header may have left behind is discarded so the driver always works on
 * doubles. (#undef on a name that is not defined is a no-op.)
 */
#undef type
#define type double

/*
 * Smaller of a and b.
 * Both arguments and the whole expansion are parenthesized so the macro stays
 * correct when an argument is itself an expression (e.g. MIN(x | y, z)) or
 * when the result is used inside a larger expression.
 * Note: each argument may be evaluated twice; do not pass expressions with
 * side effects.
 */
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
/* Capacity of the dim[] / permutation[] arrays, i.e. the largest rank accepted from the input file. */
enum { kMaxDims = 10 };

/* Print a diagnostic naming the failed step and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t err, const char* what)
{
	if(err != cudaSuccess)
	{
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

/* Discard the remainder of the current input line, regardless of its length. */
static void skipLine(FILE* f)
{
	int c;
	while((c = fgetc(f)) != EOF && c != '\n')
	{
	}
}

/*
 * Test/benchmark driver for ttlg_transpose.
 *
 * Usage: <program> <case-file>
 *
 * The case file is a whitespace-separated list of integers. Each test case is
 *     ndim  dim[0] .. dim[ndim-1]  permutation[0] .. permutation[ndim-1]
 * A record whose first integer is -1 is skipped up to the end of its line.
 *
 * For every case the driver
 *   1. fills a host array A with 0, 1, 2, ... and uploads it,
 *   2. calls ttlg_transpose on the device buffers and times the call
 *      (including the following cudaDeviceSynchronize) with omp_get_wtime,
 *   3. calls transpose_check with A and B_ref on the host
 *      (presumably producing the reference result in B_ref - defined elsewhere),
 *   4. copies the device result back and, unless NOERRORC is defined, passes
 *      it together with B_ref to transpose_equal,
 *   5. prints dims, permutation, elapsed seconds, 2*bytes/(seconds*1e9) and
 *      the buffer size in bytes on one line.
 *
 * Returns EXIT_SUCCESS when the whole file was processed, EXIT_FAILURE on a
 * usage error, unreadable file, malformed case or host allocation failure.
 * CUDA failures terminate the process with EXIT_FAILURE via checkCuda.
 */
int main(int argc, char* argv[])
{
	//#define type float
#define type double
	if(argc < 2)
	{
		fprintf(stderr, "Please input filename to read data\n");
		return EXIT_FAILURE;
	}
	FILE* f = fopen(argv[1], "r");
	if(!f)
	{
		fprintf(stderr, "Cannot open input file '%s'\n", argv[1]);
		return EXIT_FAILURE;
	}

	int dim[kMaxDims];          // e.g. {15, 20, 20, 19, 16, 16}
	int permutation[kMaxDims];  // e.g. {0, 5, 4, 1, 2, 3}
	int status = EXIT_SUCCESS;

	// Largest element count whose byte size still fits in size_t.
	const size_t maxElems = ((size_t)-1) / sizeof(type);

	// Request 8-byte shared memory banks. A failure here is only reported:
	// the call is a tuning hint, not a prerequisite for correctness.
	cudaError_t cfgErr = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
	if(cfgErr != cudaSuccess)
	{
		fprintf(stderr, "Warning: cudaDeviceSetSharedMemConfig failed: %s\n", cudaGetErrorString(cfgErr));
		(void)cudaGetLastError();  // clear it so later checks do not trip over it
	}

	int ndim;
	// Stop on end of file or on a token that is not an integer; checking the
	// fscanf result (rather than feof) avoids spinning forever on bad input.
	while(fscanf(f, "%d", &ndim) == 1)
	{
		if(ndim == -1)
		{
			skipLine(f);
			continue;
		}
		if(ndim < 1 || ndim > kMaxDims)
		{
			fprintf(stderr, "Invalid number of dimensions %d (expected 1..%d)\n", ndim, (int)kMaxDims);
			status = EXIT_FAILURE;
			break;
		}

		// Read the extents, rejecting non-positive values and products whose
		// byte size would overflow size_t.
		size_t total = 1;
		bool ok = true;
		for(int i = 0; ok && i < ndim; i++)
		{
			ok = fscanf(f, "%d", &dim[i]) == 1
				&& dim[i] > 0
				&& total <= maxElems / (size_t)dim[i];
			if(ok)
				total *= (size_t)dim[i];
		}
		for(int i = 0; ok && i < ndim; i++)
			ok = fscanf(f, "%d", &permutation[i]) == 1;
		if(!ok)
		{
			fprintf(stderr, "Malformed or truncated test case in input\n");
			status = EXIT_FAILURE;
			break;
		}

		for(int i = 0; i < ndim; i++)
			printf("%d ", dim[i]);
		printf("\t");
		for(int i = 0; i < ndim; i++)
			printf("%d ", permutation[i]);

		const size_t totalsize = total * sizeof(type);

		// B and B_ref are zero-initialised so that neither the upload of B nor
		// any beta-scaled read of the output buffers touches indeterminate data.
		type* A = (type*) malloc(totalsize);
		type* B = (type*) calloc(total, sizeof(type));
		type* B_ref = (type*) calloc(total, sizeof(type));
		if(!A || !B || !B_ref)
		{
			fprintf(stderr, "Memory error\n");
			free(A);
			free(B);
			free(B_ref);
			fclose(f);
			return EXIT_FAILURE;
		}
		// size_t index: total may exceed INT_MAX.
		for(size_t k = 0; k < total; ++k)
			A[k] = (type)k;

		type *d_A = NULL, *d_B = NULL;
		checkCuda(cudaMalloc(&d_A, totalsize), "cudaMalloc(d_A)");
		checkCuda(cudaMemcpy(d_A, A, totalsize, cudaMemcpyHostToDevice), "cudaMemcpy(A -> d_A)");
		checkCuda(cudaMalloc(&d_B, totalsize), "cudaMalloc(d_B)");
		//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		checkCuda(cudaMemcpy(d_B, B, totalsize, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");

		type alpha=1, beta=0;
		double start, tmpTime;
		start = omp_get_wtime();

		ttlg_transpose(ndim,dim, permutation, d_A, d_B ,alpha, beta);
		// The synchronize is inside the timed region so the measurement covers
		// completion of any device work the call queued.
		cudaError_t syncErr = cudaDeviceSynchronize();
		tmpTime = omp_get_wtime() - start;
		checkCuda(cudaGetLastError(), "ttlg_transpose (launch)");
		checkCuda(syncErr, "ttlg_transpose (execution)");
		fprintf(stdout, "\t%6.6lf\t", tmpTime);

		transpose_check(ndim,A, B_ref, alpha, beta, dim, permutation);
		checkCuda(cudaMemcpy(B, d_B,totalsize, cudaMemcpyDeviceToHost), "cudaMemcpy(d_B -> B)");

#ifdef printd
		// Dump at most the first 1000 elements of the result and of the input.
		const size_t shown = MIN((size_t)1000, total);
		cout<<"\n";
		for(size_t k = 0; k < shown; k++)
			cout <<B[k]<<" ";
		cout<<"\n";
		for(size_t k = 0; k < shown; k++)
			cout <<A[k]<<" ";
		cout<<"\n";
#endif
#ifndef NOERRORC
		transpose_equal(B,B_ref,total);
#endif
		// 2*totalsize: one read plus one write of the whole tensor. %zu is the
		// correct conversion for size_t.
		fprintf(stdout, "\t%6.2lf\t %zu\n",2*totalsize/(tmpTime*1000000000), totalsize);
		free(A);
		free(B);
		free(B_ref);
		checkCuda(cudaFree(d_A), "cudaFree(d_A)");
		checkCuda(cudaFree(d_B), "cudaFree(d_B)");
	}

	fclose(f);
	return status;
#undef type
}
139