1 #include<iostream>
2 #include "ourproj.h"
3 #include "TransposeSpec.h"
4 #include "ParameterTuner.cpp"
5 #include <cstring>
6 #include <omp.h>
7 #include <fstream>
8 #include <time.h>
9 #include <immintrin.h>
10 #include <xmmintrin.h>
11 #include <cuComplex.h>
12 #include <complex.h>
13
14 #include <stdlib.h>
15 #include <cuda_runtime.h>
16 #include <complex.h>
17
18 #include "ourinclude.h"
19
20 #ifdef type
21 #undef type
22 #endif
23
24 #ifndef type
25 #define type double
26 #endif
27
// Smaller of the two arguments. Both arguments and the whole expansion are
// parenthesized so that expression arguments (e.g. `x & 1`) keep their own
// precedence. Each argument may be evaluated twice: avoid side effects.
#define MIN(a,b) ((a) < (b) ? (a) : (b))
main(int argc,char * argv[])29 int main(int argc, char* argv[])
30 {
31 FILE * f;
32 int dim[10];// = {15, 20, 20, 19, 16, 16};
33 int permutation[10];// = {0, 5, 4, 1, 2, 3};
34 int i;
35 //#define type float
36 #define type double
37 if(argc < 2)
38 {
39 fprintf(stderr, "Please input filename to read data\n");
40 exit(0);
41 }
42 f = fopen(argv[1], "r");
43 char line[255];
44 char temp[50];
45 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
46 int icase=-1;
47 int ndim;
48 while(1)
49 {
50 bool finish = false;
51 size_t total = 1;
52 fscanf(f, "%d", &ndim);
53 if(feof(f)){
54 break;}
55 if(ndim == -1)
56 {
57 char line[255];
58 fgets(line, 255, f);
59 continue;
60 }
61 for(i = 0; i < ndim; i++)
62 {
63 fscanf(f, "%d", &dim[i]);
64 if(feof(f)){
65 finish = true;
66 break;}
67 printf("%d ", dim[i]);
68 total *= dim[i];
69 }
70 printf("\t");
71 for(i = 0; i < ndim; i++)
72 {
73 fscanf(f, "%d", &permutation[i]);
74 printf("%d ", permutation[i]);
75 }
76
77 size_t totalsize = total * sizeof(type);
78
79 type* A = (type*) malloc(totalsize);
80 if(!A) {cout <<"Memory error "; exit(0);}
81 type* B = (type*) malloc(totalsize);
82 if(!B) {cout <<"Memory error "; exit(0);}
83 type* B_ref = (type *) malloc(totalsize);
84 if(!B_ref) {cout <<"Memory error "; exit(0);}
85 type *d_A, *d_B;
86 for(i=0; i < total ; ++i){
87 A[i] = (type)i;
88 //B[i] = (type)-i;
89 }
90
91 cudaMalloc(&d_A, totalsize);
92 //cout <<"A addr: " <<d_A;
93 cudaMemcpy(d_A, A, totalsize, cudaMemcpyHostToDevice);
94 cudaMalloc(&d_B, totalsize);
95 //cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
96 //cout <<"\nB addr: " <<d_B;
97 cudaMemcpy(d_B, B, totalsize, cudaMemcpyHostToDevice);
98
99 type alpha=1, beta=0;
100 TensorType mytype = doubletype;
101 //TensorType mytype = floattype;
102 double start, tmpTime;
103 start = omp_get_wtime();
104
105 ttlg_transpose(ndim,dim, permutation, d_A, d_B ,alpha, beta);
106 cudaDeviceSynchronize();
107 tmpTime = omp_get_wtime() - start;
108 //cout << "\t"<<tmpTime<<"\t";
109 fprintf(stdout, "\t%6.6lf\t", tmpTime);
110 //
111 // //dCuTranspose_03241_15x20x20x16x304_reference(A, B_ref, sizes1, sizes1, ldb);
112 transpose_check(ndim,A, B_ref, alpha, beta, dim, permutation);
113 //tranpose5_check(A, B_ref, sizes1, sizes1, sizes1, perm1);
114 cudaMemcpy(B, d_B,totalsize, cudaMemcpyDeviceToHost);
115
116 #ifdef printd
117 cout<<"\n";
118 for(int i = 0; i < MIN(1000, total); i++)
119 cout <<B[i]<<" ";
120 cout<<"\n";
121 for(int i = 0; i < MIN(1000, total); i++)
122 cout <<A[i]<<" ";
123 cout<<"\n";
124 #endif
125 #ifndef NOERRORC
126 transpose_equal(B,B_ref,total);
127 // fprintf(stdout, "\t%f",tmpTime);
128 #endif
129 fprintf(stdout, "\t%6.2lf\t %u\n",2*totalsize/(tmpTime*1000000000), totalsize);
130 free(A);
131 free(B);
132 free(B_ref);
133 cudaFree(d_A);
134 cudaFree(d_B);
135
136 }
137 #undef type
138 }
139