// Selector passed as the `mode` argument of the CudaData_Upload_* functions,
// which forward it unchanged to their conversion kernels.
// yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
enum copy_mode {
  x,
  xx,
  xy,
  yx,
  xyz,
  xzy
};
2 
#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>
#include <vector>
7 
// Uploads an array of doubles from the host and converts it to floats.
//
// The raw doubles are first copied into `buffer` with
// CudaWrapper_UploadCudaData; CudaData_Upload_Kernel_DoubleFloat is then
// launched with `buffer` as its double* input and `dev_data` as its float*
// output. After the kernel has finished, `dev_data` is downloaded again and
// the summed squared difference to `host_data` is printed to stdout.
// NOTE(review): the printf/download section looks like leftover debugging
// output; it is kept so that the observable behaviour does not change.
//
//   host_data  source array, read as double
//   dev_data   destination array, handed to the kernel as float*
//   n          three extents; n[1] and n[2] are ignored when they are 0
//   mode       forwarded unchanged to the kernel
//   buffer     staging area receiving the raw doubles (presumably device
//              memory large enough for all elements -- confirm with caller)
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Total number of elements; a 0 in n[1] or n[2] marks an unused dimension.
  unsigned count = n[0];

  if(n[1] > 0) count *= n[1];

  if(n[2] > 0) count *= n[2];

  // Nothing to transfer. Without this guard grid.x below evaluates to 0,
  // which is not a valid launch configuration.
  if(count == 0) return;

  // Block size grows with the amount of data.
  dim3 threads(1, 1, 1);

  if(count <= 128 * 30)
    threads.x = 32;
  else if(count <= 256 * 30)
    threads.x = 64;
  else if(count <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // One thread per element: cap grid.x at 32000 blocks and add rows in
  // grid.y until the whole array is covered.
  dim3 grid(1, 1, 1);
  grid.x = (count + threads.x - 1) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < count) grid.y++;

  // Heap storage for the read-back check; a runtime-sized stack array is not
  // standard C++ and overflows the stack for large arrays.
  std::vector<float> debugdata(count);

  const int size = static_cast<int>(count * sizeof(double));   // bytes of double input
  printf("size: %i (%i %i %i) (%i %i %i) %p\n", size,
         static_cast<int>(grid.x), static_cast<int>(grid.y), static_cast<int>(threads.x),
         static_cast<int>(n[0]), static_cast<int>(n[1]), static_cast<int>(n[2]), buffer);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();

  // size / 2 == count * sizeof(float): the converted array is half as large.
  CudaWrapper_DownloadCudaData(&debugdata[0], dev_data, size / 2);

  const double* host_values = static_cast<const double*>(host_data);
  double sum = 0;
  printf("debugdata: ");

  for(unsigned i = 0; i < count; i++) {
    const double diff = debugdata[i] - host_values[i];
    sum += diff * diff;
  }

  printf("%lf \n", sum);
}
57 
// Uploads an array of doubles from the host into a double array.
//
// The raw data is copied into `buffer` with CudaWrapper_UploadCudaData;
// CudaData_Upload_Kernel_DoubleDouble is then launched with `buffer` as its
// input and `dev_data` as its output, and the call waits for the kernel.
//
//   host_data  source array, uploaded as count * sizeof(double) bytes
//   dev_data   destination array, handed to the kernel as double*
//   n          three extents; n[1] and n[2] are ignored when they are 0
//   mode       forwarded unchanged to the kernel
//   buffer     staging area receiving the raw input (presumably device
//              memory large enough for all elements -- confirm with caller)
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Total number of elements; a 0 in n[1] or n[2] marks an unused dimension.
  unsigned count = n[0];

  if(n[1] > 0) count *= n[1];

  if(n[2] > 0) count *= n[2];

  // Nothing to transfer. Without this guard grid.x below evaluates to 0,
  // which is not a valid launch configuration.
  if(count == 0) return;

  // Block size grows with the amount of data.
  dim3 threads(1, 1, 1);

  if(count <= 128 * 30)
    threads.x = 32;
  else if(count <= 256 * 30)
    threads.x = 64;
  else if(count <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // One thread per element: cap grid.x at 32000 blocks and add rows in
  // grid.y until the whole array is covered.
  dim3 grid(1, 1, 1);
  grid.x = (count + threads.x - 1) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < count) grid.y++;

  const int size = static_cast<int>(count * sizeof(double));   // bytes to upload

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
97 
// Uploads an array of floats from the host and converts it to doubles.
//
// The raw data is copied into `buffer` with CudaWrapper_UploadCudaData;
// CudaData_Upload_Kernel_FloatDouble is then launched with `buffer` as its
// float* input and `dev_data` as its double* output, and the call waits for
// the kernel.
//
//   host_data  source array, uploaded as count * sizeof(float) bytes
//   dev_data   destination array, handed to the kernel as double*
//   n          three extents; n[1] and n[2] are ignored when they are 0
//   mode       forwarded unchanged to the kernel
//   buffer     staging area receiving the raw input (presumably device
//              memory large enough for all elements -- confirm with caller)
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Total number of elements; a 0 in n[1] or n[2] marks an unused dimension.
  unsigned count = n[0];

  if(n[1] > 0) count *= n[1];

  if(n[2] > 0) count *= n[2];

  // Nothing to transfer. Without this guard grid.x below evaluates to 0,
  // which is not a valid launch configuration.
  if(count == 0) return;

  // Block size grows with the amount of data.
  dim3 threads(1, 1, 1);

  if(count <= 128 * 30)
    threads.x = 32;
  else if(count <= 256 * 30)
    threads.x = 64;
  else if(count <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // One thread per element: cap grid.x at 32000 blocks and add rows in
  // grid.y until the whole array is covered.
  dim3 grid(1, 1, 1);
  grid.x = (count + threads.x - 1) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < count) grid.y++;

  const int size = static_cast<int>(count * sizeof(float));   // bytes to upload

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
137 
// Uploads an array of floats from the host into a float array.
//
// The raw data is copied into `buffer` with CudaWrapper_UploadCudaData;
// CudaData_Upload_Kernel_FloatFloat is then launched with `buffer` as its
// input and `dev_data` as its output, and the call waits for the kernel.
//
//   host_data  source array, uploaded as count * sizeof(float) bytes
//   dev_data   destination array, handed to the kernel as float*
//   n          three extents; n[1] and n[2] are ignored when they are 0
//   mode       forwarded unchanged to the kernel
//   buffer     staging area receiving the raw input (presumably device
//              memory large enough for all elements -- confirm with caller)
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Total number of elements; a 0 in n[1] or n[2] marks an unused dimension.
  unsigned count = n[0];

  if(n[1] > 0) count *= n[1];

  if(n[2] > 0) count *= n[2];

  // Nothing to transfer. Without this guard grid.x below evaluates to 0,
  // which is not a valid launch configuration.
  if(count == 0) return;

  // Block size grows with the amount of data.
  dim3 threads(1, 1, 1);

  if(count <= 128 * 30)
    threads.x = 32;
  else if(count <= 256 * 30)
    threads.x = 64;
  else if(count <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // One thread per element: cap grid.x at 32000 blocks and add rows in
  // grid.y until the whole array is covered.
  dim3 grid(1, 1, 1);
  grid.x = (count + threads.x - 1) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < count) grid.y++;

  const int size = static_cast<int>(count * sizeof(float));   // bytes to upload

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
177 
// Uploads an array of ints from the host into an int array.
//
// The raw data is copied into `buffer` with CudaWrapper_UploadCudaData;
// CudaData_Upload_Kernel_IntInt is then launched with `buffer` as its input
// and `dev_data` as its output, and the call waits for the kernel.
//
//   host_data  source array, uploaded as count * sizeof(int) bytes
//   dev_data   destination array, handed to the kernel as int*
//   n          three extents; n[1] and n[2] are ignored when they are 0
//   mode       forwarded unchanged to the kernel
//   buffer     staging area receiving the raw input (presumably device
//              memory large enough for all elements -- confirm with caller)
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Total number of elements; a 0 in n[1] or n[2] marks an unused dimension.
  unsigned count = n[0];

  if(n[1] > 0) count *= n[1];

  if(n[2] > 0) count *= n[2];

  // Nothing to transfer. Without this guard grid.x below evaluates to 0,
  // which is not a valid launch configuration.
  if(count == 0) return;

  // Block size grows with the amount of data.
  dim3 threads(1, 1, 1);

  if(count <= 128 * 30)
    threads.x = 32;
  else if(count <= 256 * 30)
    threads.x = 64;
  else if(count <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // One thread per element: cap grid.x at 32000 blocks and add rows in
  // grid.y until the whole array is covered.
  dim3 grid(1, 1, 1);
  grid.x = (count + threads.x - 1) / threads.x;

  if(grid.x > 32000)
    grid.x = 32000;

  while(grid.x * grid.y * threads.x < count) grid.y++;

  const int size = static_cast<int>(count * sizeof(int));   // bytes to upload

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
217 
// Currently a stub: the body performs no work, so no argument is read and
// nothing is copied. NOTE(review): presumably meant as the download
// counterpart of the CudaData_Upload_* functions -- confirm before relying
// on it.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
  // The casts only mark every parameter as deliberately unused.
  (void) host_data;
  (void) dev_data;
  (void) host_size;
  (void) dev_size;
  (void) n;
  (void) mode;
  (void) buffer;
}
221