// Copy modes taken by the CudaData_Upload_* / CudaData_Download routines in
// this file; the upload routines forward the value to their kernels.
// yxz, yzx, zxy and zyx are not implemented yet because nothing has needed
// them so far.
enum copy_mode {
  x,
  xx,
  xy,
  yx,
  xyz,
  xzy
};
2
3 #include "cuda_data_cu.h"
4 #include "cuda_wrapper_cu.h"
5 #include "cuda_data_kernel.cu"
6 #include <cstdio>
7
// Stages a host array of doubles in `buffer`, launches
// CudaData_Upload_Kernel_DoubleFloat to fill `dev_data` (passed as float*),
// then reads the result back and prints the summed squared difference
// against the host input.
//
//   host_data  source array, handed to CudaWrapper_UploadCudaData and read
//              as double* by the read-back check
//   dev_data   destination pointer, handed to the kernel as float*
//   n          three extents; n[1] and n[2] are multiplied into the element
//              count only when they are non-zero
//   mode       forwarded unchanged to the kernel
//   buffer     staging pointer: target of CudaWrapper_UploadCudaData and
//              input pointer of the kernel (presumably device memory of at
//              least the staged byte count - confirm against callers)
//
// Returns immediately when the element count is zero. CUDA errors seen after
// the launch or the synchronisation are reported on stderr.
//
// NOTE(review): the printf output and the read-back comparison look like
// leftover diagnostics (the sibling upload routines have none); they are kept
// so the observable output stays the same. The comparison pairs element i of
// the result with element i of the input, so it is presumably only meaningful
// when `mode` does not reorder elements - confirm.
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  // Empty array: the block count computed below would be zero, which CUDA
  // rejects as an invalid launch configuration, so stop before any transfer.
  if(nelem == 0) return;

  dim3 threads(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block width scales with the element count: 32, 64, 128 or 256 threads.
  if(nelem <= 128 * 30)
    threads.x = 32;
  else if(nelem <= 256 * 30)
    threads.x = 64;
  else if(nelem <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // Ceiling division gives the blocks needed along x, capped at 32000 ...
  grid.x = ((nelem - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // ... and grid.y grows until the launch holds at least nelem threads.
  while(grid.x * grid.y * threads.x < (unsigned) nelem) grid.y++;

  // Host-side scratch for the read-back check. Allocated on the heap: a
  // stack array sized by the element count can overflow the stack.
  float* debugdata = new float[nelem];

  const int size = nelem * sizeof(double);
  printf("size: %i (%i %i %i) (%i %i %i) %p\n", size, grid.x, grid.y, threads.x, n[0], n[1], n[2], buffer);
  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);

  // A rejected launch is only visible through the last-error state. Peek
  // (which does not clear it) so checks made by the caller still see it.
  const cudaError_t launch_status = cudaPeekAtLastError();
  // Block until the kernel has finished; this also returns execution errors.
  const cudaError_t sync_status = cudaDeviceSynchronize();

  if(launch_status != cudaSuccess || sync_status != cudaSuccess)
    fprintf(stderr, "CudaData_Upload_DoubleFloat: CUDA error: %s\n",
            cudaGetErrorString(launch_status != cudaSuccess ? launch_status : sync_status));

  // Half the staged byte count, i.e. nelem * sizeof(float).
  CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2);
  double sum = 0;
  printf("debugdata: ");

  for(int i = 0; i < nelem; i++) {
    const double diff = debugdata[i] - ((double*) host_data)[i];
    sum += diff * diff;
  }

  printf("%lf \n", sum);

  delete[] debugdata;
}
57
// Stages a host array of doubles in `buffer` and launches
// CudaData_Upload_Kernel_DoubleDouble to fill `dev_data` (passed as double*).
//
//   host_data  source array, handed to CudaWrapper_UploadCudaData together
//              with the byte count (element count * sizeof(double))
//   dev_data   destination pointer, handed to the kernel as double*
//   n          three extents; n[1] and n[2] are multiplied into the element
//              count only when they are non-zero
//   mode       forwarded unchanged to the kernel
//   buffer     staging pointer: target of CudaWrapper_UploadCudaData and
//              input pointer of the kernel (presumably device memory of at
//              least the staged byte count - confirm against callers)
//
// Returns immediately when the element count is zero. CUDA errors seen after
// the launch or the synchronisation are reported on stderr.
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  // Empty array: the block count computed below would be zero, which CUDA
  // rejects as an invalid launch configuration, so stop before any transfer.
  if(nelem == 0) return;

  dim3 threads(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block width scales with the element count: 32, 64, 128 or 256 threads.
  if(nelem <= 128 * 30)
    threads.x = 32;
  else if(nelem <= 256 * 30)
    threads.x = 64;
  else if(nelem <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // Ceiling division gives the blocks needed along x, capped at 32000 ...
  grid.x = ((nelem - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // ... and grid.y grows until the launch holds at least nelem threads.
  while(grid.x * grid.y * threads.x < (unsigned) nelem) grid.y++;

  const int nbytes = nelem * sizeof(double);

  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);

  // A rejected launch is only visible through the last-error state. Peek
  // (which does not clear it) so checks made by the caller still see it.
  const cudaError_t launch_status = cudaPeekAtLastError();
  // Block until the kernel has finished; this also returns execution errors.
  const cudaError_t sync_status = cudaDeviceSynchronize();

  if(launch_status != cudaSuccess || sync_status != cudaSuccess)
    fprintf(stderr, "CudaData_Upload_DoubleDouble: CUDA error: %s\n",
            cudaGetErrorString(launch_status != cudaSuccess ? launch_status : sync_status));
}
97
// Stages a host array of floats in `buffer` and launches
// CudaData_Upload_Kernel_FloatDouble to fill `dev_data` (passed as double*).
//
//   host_data  source array, handed to CudaWrapper_UploadCudaData together
//              with the byte count (element count * sizeof(float))
//   dev_data   destination pointer, handed to the kernel as double*
//   n          three extents; n[1] and n[2] are multiplied into the element
//              count only when they are non-zero
//   mode       forwarded unchanged to the kernel
//   buffer     staging pointer: target of CudaWrapper_UploadCudaData and
//              input pointer of the kernel (presumably device memory of at
//              least the staged byte count - confirm against callers)
//
// Returns immediately when the element count is zero. CUDA errors seen after
// the launch or the synchronisation are reported on stderr.
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  // Empty array: the block count computed below would be zero, which CUDA
  // rejects as an invalid launch configuration, so stop before any transfer.
  if(nelem == 0) return;

  dim3 threads(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block width scales with the element count: 32, 64, 128 or 256 threads.
  if(nelem <= 128 * 30)
    threads.x = 32;
  else if(nelem <= 256 * 30)
    threads.x = 64;
  else if(nelem <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // Ceiling division gives the blocks needed along x, capped at 32000 ...
  grid.x = ((nelem - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // ... and grid.y grows until the launch holds at least nelem threads.
  while(grid.x * grid.y * threads.x < (unsigned) nelem) grid.y++;

  const int nbytes = nelem * sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);

  // A rejected launch is only visible through the last-error state. Peek
  // (which does not clear it) so checks made by the caller still see it.
  const cudaError_t launch_status = cudaPeekAtLastError();
  // Block until the kernel has finished; this also returns execution errors.
  const cudaError_t sync_status = cudaDeviceSynchronize();

  if(launch_status != cudaSuccess || sync_status != cudaSuccess)
    fprintf(stderr, "CudaData_Upload_FloatDouble: CUDA error: %s\n",
            cudaGetErrorString(launch_status != cudaSuccess ? launch_status : sync_status));
}
137
// Stages a host array of floats in `buffer` and launches
// CudaData_Upload_Kernel_FloatFloat to fill `dev_data` (passed as float*).
//
//   host_data  source array, handed to CudaWrapper_UploadCudaData together
//              with the byte count (element count * sizeof(float))
//   dev_data   destination pointer, handed to the kernel as float*
//   n          three extents; n[1] and n[2] are multiplied into the element
//              count only when they are non-zero
//   mode       forwarded unchanged to the kernel
//   buffer     staging pointer: target of CudaWrapper_UploadCudaData and
//              input pointer of the kernel (presumably device memory of at
//              least the staged byte count - confirm against callers)
//
// Returns immediately when the element count is zero. CUDA errors seen after
// the launch or the synchronisation are reported on stderr.
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  // Empty array: the block count computed below would be zero, which CUDA
  // rejects as an invalid launch configuration, so stop before any transfer.
  if(nelem == 0) return;

  dim3 threads(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block width scales with the element count: 32, 64, 128 or 256 threads.
  if(nelem <= 128 * 30)
    threads.x = 32;
  else if(nelem <= 256 * 30)
    threads.x = 64;
  else if(nelem <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // Ceiling division gives the blocks needed along x, capped at 32000 ...
  grid.x = ((nelem - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // ... and grid.y grows until the launch holds at least nelem threads.
  while(grid.x * grid.y * threads.x < (unsigned) nelem) grid.y++;

  const int nbytes = nelem * sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);

  // A rejected launch is only visible through the last-error state. Peek
  // (which does not clear it) so checks made by the caller still see it.
  const cudaError_t launch_status = cudaPeekAtLastError();
  // Block until the kernel has finished; this also returns execution errors.
  const cudaError_t sync_status = cudaDeviceSynchronize();

  if(launch_status != cudaSuccess || sync_status != cudaSuccess)
    fprintf(stderr, "CudaData_Upload_FloatFloat: CUDA error: %s\n",
            cudaGetErrorString(launch_status != cudaSuccess ? launch_status : sync_status));
}
177
// Stages a host array of ints in `buffer` and launches
// CudaData_Upload_Kernel_IntInt to fill `dev_data` (passed as int*).
//
//   host_data  source array, handed to CudaWrapper_UploadCudaData together
//              with the byte count (element count * sizeof(int))
//   dev_data   destination pointer, handed to the kernel as int*
//   n          three extents; n[1] and n[2] are multiplied into the element
//              count only when they are non-zero
//   mode       forwarded unchanged to the kernel
//   buffer     staging pointer: target of CudaWrapper_UploadCudaData and
//              input pointer of the kernel (presumably device memory of at
//              least the staged byte count - confirm against callers)
//
// Returns immediately when the element count is zero. CUDA errors seen after
// the launch or the synchronisation are reported on stderr.
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  // Empty array: the block count computed below would be zero, which CUDA
  // rejects as an invalid launch configuration, so stop before any transfer.
  if(nelem == 0) return;

  dim3 threads(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block width scales with the element count: 32, 64, 128 or 256 threads.
  if(nelem <= 128 * 30)
    threads.x = 32;
  else if(nelem <= 256 * 30)
    threads.x = 64;
  else if(nelem <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // Ceiling division gives the blocks needed along x, capped at 32000 ...
  grid.x = ((nelem - 1) + threads.x) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // ... and grid.y grows until the launch holds at least nelem threads.
  while(grid.x * grid.y * threads.x < (unsigned) nelem) grid.y++;

  const int nbytes = nelem * sizeof(int);

  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);

  // A rejected launch is only visible through the last-error state. Peek
  // (which does not clear it) so checks made by the caller still see it.
  const cudaError_t launch_status = cudaPeekAtLastError();
  // Block until the kernel has finished; this also returns execution errors.
  const cudaError_t sync_status = cudaDeviceSynchronize();

  if(launch_status != cudaSuccess || sync_status != cudaSuccess)
    fprintf(stderr, "CudaData_Upload_IntInt: CUDA error: %s\n",
            cudaGetErrorString(launch_status != cudaSuccess ? launch_status : sync_status));
}
217
// Placeholder with an empty body: calling it performs no work and leaves all
// of its arguments untouched. Presumably meant to become the download
// counterpart of the CudaData_Upload_* routines - confirm before relying on it.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
  // Nothing is implemented; the casts only mark the parameters as unused.
  (void) host_data;
  (void) dev_data;
  (void) host_size;
  (void) dev_size;
  (void) n;
  (void) mode;
  (void) buffer;
}
221