/* When compiling the CUDA code, we do not want to include all of ecm-impl.h. */
2 #define _DO_NOT_INCLUDE_ECM_IMPL_H
3 #include "ecm-gpu.h"
4 #include <gmp.h>
5 #include "cudakernel.h"
6
7 #ifndef __CUDACC__
8 #error "This file should only be compiled with nvcc"
9 #endif
10
11 __constant__ __device__ digit_t d_invNcst;
12 __device__ biguint_t d_Ncst;
13 __device__ biguint_t d_3Ncst;
14 __device__ biguint_t d_Mcst;
15
16
17 #define errCheck(err) cuda_errCheck (err, __FILE__, __LINE__)
18 #define cudaMalloc(d, size) errCheck (cudaMalloc (d, size))
19 #define cudaMemcpyHtoD(d, h, size) errCheck (cudaMemcpy ((void *) d, \
20 (void *) h, size, cudaMemcpyHostToDevice))
21 #define cudaMemcpyDtoH(h, d, size) errCheck (cudaMemcpy ((void *) h, \
22 (void *) d, size, cudaMemcpyDeviceToHost))
23 #define cudaMemcpyCst(d, h, size) errCheck (cudaMemcpyToSymbol (d, h, size))
24
25
26 /******************************/
27 /* Host code handling the GPU */
28 /******************************/
29
/* Print a diagnostic (file, line and CUDA error string) on stderr and
 * abort the process if a CUDA runtime call returned an error.
 * Normally invoked through the errCheck()/cudaMalloc()/cudaMemcpy* macros
 * above, which supply __FILE__ and __LINE__. */
inline void cuda_errCheck (cudaError err, const char *file, const int line)
{
  if (err == cudaSuccess)
    return;

  fprintf(stderr, "%s(%i) : Error cuda : %s.\n",
          file, line, cudaGetErrorString( err) );
  exit(EXIT_FAILURE);
}
39
/* Empty kernel: the first call to a __global__ function initializes the
 * device, so this no-op is launched once from select_and_init_GPU to pay
 * that cost up front rather than inside the timed computation. */
__global__ void Cuda_Init_Device ()
{
}
44
/* Given the compute capability (as major.minor), return the number of
 * blocks to be run on one multiprocessor.  The minor revision number is
 * accepted for completeness but does not affect the result. */
extern "C"
unsigned int
getNumberOfBlockPerMultiProcessor (int major, int minor)
{
  switch (major)
    {
      /* For 2.0 and 2.1, limited by the maximum number of threads per MP
       * and by the number of available registers (23 registers are needed
       * per thread). */
      case 2:
        return 1;
      /* For 3.x and 5.x, limited by the maximum number of threads per MP.
       * We assume that for newer compute capabilities the properties of
       * the GPU won't decrease. */
      default:
        return 2;
    }
}
69
/* Select and initialize the GPU.
 *
 * device: CUDA device number to use, or -1 to let the runtime choose.
 * number_of_curves: in/out.  If *number_of_curves == 0 on entry, a value
 *   is computed from the device properties; otherwise the requested value
 *   is rounded up to a multiple of ECM_GPU_CURVES_BY_BLOCK.
 * verbose: if nonzero, print information about the chosen device.
 *
 * Returns 0 on success, -1 on error (a message is printed on stderr).
 * NOTE(review): the errCheck'd calls near the end exit the process on
 * failure instead of returning -1. */
extern "C"
int
select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose)
{
  cudaDeviceProp deviceProp;
  cudaError_t err;

  /* A specific device was requested on the command line. */
  if (device!=-1)
    {
      if (verbose)
          fprintf (stdout, "GPU: device %d is required.\n", device);

      err = cudaSetDevice(device);
      if (err != cudaSuccess)
        {
          fprintf (stderr, "GPU: Error: Could not use device %d\n", device);
          fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
          return -1;
        }
    }

  /* Retrieve the device actually in use (the runtime's default one when
   * no device was explicitly required above). */
  err = cudaGetDevice (&device);
  if (err != cudaSuccess)
    {
      fprintf (stderr, "GPU: Error: no active device.\n");
      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
      return -1;
    }

  err = cudaGetDeviceProperties (&deviceProp, device);
  if (err != cudaSuccess)
    {
      fprintf (stderr, "GPU: Error while getting device's properties.\n");
      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
      return -1;
    }

  if (verbose)
    {
      printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n"
              "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d "
              "maxRegsPerBlock = %d\n", device, deviceProp.name,
              deviceProp.major, deviceProp.minor,
              deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock,
              deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock);
    }


  if (*number_of_curves == 0) /* choose the number of curves automatically */
    {
      /* blocks-per-MP * number of MPs * curves per block */
      unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK;
      n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor);
      *number_of_curves = n * deviceProp.multiProcessorCount * m;
    }
  else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0)
    {
      /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK:
       * round the requested value up to the next multiple. */
      *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) *
                                                      ECM_GPU_CURVES_BY_BLOCK;
      if (verbose)
          fprintf(stderr, "GPU: the requested number of curves has been "
                          "modified to %u\n", *number_of_curves);
    }

  /* The first call to a __global__ function initializes the device; do it
   * now with an empty kernel.  cudaDeviceScheduleYield must be set before
   * that first launch. */
  errCheck (cudaSetDeviceFlags (cudaDeviceScheduleYield));
  Cuda_Init_Device<<<1, 1>>> ();
  errCheck (cudaGetLastError());

  if (verbose)
    {
      /* Report which architecture the main kernel was compiled for.  A
       * failure here is non-fatal: the report is simply skipped. */
      struct cudaFuncAttributes kernelAttr;
      err = cudaFuncGetAttributes (&kernelAttr, Cuda_Ell_DblAdd);
      if (err == cudaSuccess)
        {
          printf ("GPU: Using device code targeted for architecture compile_%d\n"
                  "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n"
                  "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n",
                  kernelAttr.binaryVersion, kernelAttr.ptxVersion,
                  kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs,
                  kernelAttr.sharedSizeBytes);
        }
    }

  return 0;
}
156
/* Run the whole GPU computation: upload the per-curve data, execute one
 * Cuda_Ell_DblAdd launch per bit of s (a double-and-add chain driven by
 * the bits of s, from most significant down), and download the results.
 *
 * h_N, h_3N, h_M, h_invN: modulus-related constants; copied into the
 *   device symbols d_Ncst, d_3Ncst, d_Mcst and d_invNcst.
 * h_xarray/h_zarray, h_x2array/h_z2array: host arrays of number_of_curves
 *   elements each; uploaded before the loop.  On return h_xarray/h_zarray
 *   contain the results read back from the device.
 * s: the scalar whose bits drive the double-and-add loop.
 * firstinvd: per-call parameter forwarded to the kernel (its semantics
 *   are defined in the kernel code, not visible here).
 *
 * Returns the elapsed time in milliseconds between the start and stop
 * CUDA events (i.e. including transfers and all kernel launches). */
extern "C"
float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
                 biguint_t *h_xarray, biguint_t *h_zarray,
                 biguint_t *h_x2array, biguint_t *h_z2array, mpz_t s,
                 unsigned int firstinvd, unsigned int number_of_curves,
                 int verbose)
{
  /* Events used only for timing the whole call. */
  cudaEvent_t start, stop;
  cudaEventCreate (&start);
  cudaEventCreate (&stop);
  cudaEventRecord (start, 0);

  size_t j;
  int i;
  float elltime = 0.0;
  /* Device-side working arrays: (xA,zA) and (xB,zB) point pairs. */
  biguint_t *d_xA, *d_zA, *d_xB, *d_zB;

#define MAXEVENTS 2
#define DEPTH_EVENT 32
  cudaEvent_t event[MAXEVENTS];   // Space for some cuda Event Handles
  long nEventsRecorded = 0;       // Remember how many events are recorded
  long eventrecordix = 0;         // Remember index of next event to record
  long eventsyncix;               // Remember index of oldest recorded event
                                  // (set before first use once an event
                                  // has been recorded)

  size_t array_size = sizeof(biguint_t) * number_of_curves;

  /* One thread per digit, ECM_GPU_CURVES_BY_BLOCK curves per block;
   * number_of_curves is a multiple of ECM_GPU_CURVES_BY_BLOCK (enforced
   * by select_and_init_GPU). */
  dim3 dimBlock (ECM_GPU_NB_DIGITS, ECM_GPU_CURVES_BY_BLOCK);
  dim3 dimGrid (number_of_curves/ ECM_GPU_CURVES_BY_BLOCK);
  if (verbose)
    {
      /* NOTE(review): number_of_curves is unsigned int; %u would be the
       * exact conversion here instead of %d. */
      fprintf(stdout, "GPU: Block: %ux%ux%u Grid: %ux%ux%u "
              "(%d parallel curves)\n", dimBlock.x, dimBlock.y, dimBlock.z,
              dimGrid.x, dimGrid.y, dimGrid.z, number_of_curves);
    }

  /* Create a pair of events to pace ourselves */
  for (i=0; i<MAXEVENTS; i++)
    errCheck (cudaEventCreateWithFlags (&event[i],
                              cudaEventBlockingSync|cudaEventDisableTiming));

  cudaMalloc (&d_xA, array_size);
  cudaMalloc (&d_zA, array_size);
  cudaMalloc (&d_xB, array_size);
  cudaMalloc (&d_zB, array_size);

  /* Copy into the gpu memory */
  cudaMemcpyCst (d_invNcst, (void *) &h_invN, sizeof(digit_t));
  cudaMemcpyCst (d_Ncst, (void *) h_N, sizeof(biguint_t));
  cudaMemcpyCst (d_3Ncst, (void *) h_3N, sizeof(biguint_t));
  cudaMemcpyCst (d_Mcst, (void *) h_M, sizeof(biguint_t));

  cudaMemcpyHtoD (d_xA, h_xarray, array_size);
  cudaMemcpyHtoD (d_zA, h_zarray, array_size);
  cudaMemcpyHtoD (d_xB, h_x2array, array_size);
  cudaMemcpyHtoD (d_zB, h_z2array, array_size);

#ifdef PRINT_REMAINING_ITER
  unsigned int jmod = 100000000;
#endif

  /* Double-and-add loop: it calls the GPU for each bits of s.  Iterate
   * from the most significant bit downward; the value of bit j-1 decides
   * which point pair plays the "doubled" role in this step. */
  for (j = mpz_sizeinbase (s, 2) - 1; j>0; j-- )
    {
      if (mpz_tstbit (s, j-1) == 1)
        Cuda_Ell_DblAdd<<<dimGrid,dimBlock>>>(d_xB, d_zB, d_xA, d_zA, firstinvd);
      else
        Cuda_Ell_DblAdd<<<dimGrid,dimBlock>>>(d_xA, d_zA, d_xB, d_zB, firstinvd);

      /* Pace entry of events. Less overhead to enter an event every few */
      /* iterations. But, if you exceed the depth of NVIDIA's kernel queue, */
      /* it will busy-loop! */
      /* Enter an event every DEPTH_EVENT iteration */
      if (j % DEPTH_EVENT == 0)
        {
          cudaEventRecord(event[eventrecordix]);
          if (nEventsRecorded == 0)
            eventsyncix = eventrecordix;
          nEventsRecorded += 1;
          eventrecordix = (eventrecordix+1)%MAXEVENTS;
        }

      /* If both event slots are in flight, block (yielding the CPU, see
       * cudaEventBlockingSync) until the oldest one completes before
       * queueing more kernels. */
      if (nEventsRecorded == MAXEVENTS)
        {
          cudaEventSynchronize(event[eventsyncix]);
          nEventsRecorded -= 1;
          eventsyncix = (eventsyncix+1)%MAXEVENTS;
        }

#ifdef PRINT_REMAINING_ITER
      /* Progress report, printed with decreasing frequency as j shrinks.
       * NOTE(review): j is size_t; %zu would be the portable conversion
       * instead of %lu. */
      if (j < 100000000) jmod = 10000000;
      if (j < 10000000)  jmod = 1000000;
      if (j < 1000000)   jmod = 100000;
      if (j < 100000)    jmod = 10000;
      if (j % jmod == 0)
        printf("%lu iterations to go\n", j);
#endif
    }

  /* If an error occurs during the kernel calls in the loop */
  errCheck (cudaGetLastError());

  /* Wait for the last recorded events (i.e. drain the pacing queue) */
  while (nEventsRecorded != 0)
    {
      cudaEventSynchronize(event[eventsyncix]);
      nEventsRecorded -= 1;
      eventsyncix = (eventsyncix+1)%MAXEVENTS;
    }

  /* Get the results back from device memory (blocking copies, so the
   * remaining queued kernels finish first) */
  cudaMemcpyDtoH (h_xarray, d_xA, array_size);
  cudaMemcpyDtoH (h_zarray, d_zA, array_size);

  /* Clean up our events and our stream handle */
  for (i=0; i<MAXEVENTS; i++)
    errCheck (cudaEventDestroy(event[i]));


  cudaFree ((void *) d_xA);
  cudaFree ((void *) d_zA);
  cudaFree ((void *) d_xB);
  cudaFree ((void *) d_zB);

  cudaEventRecord (stop, 0);
  cudaEventSynchronize (stop);

  cudaEventElapsedTime (&elltime, start, stop);

  errCheck (cudaEventDestroy (start));
  errCheck (cudaEventDestroy (stop));

  return elltime;
}
290
291
292
293 /***************/
294 /* Device code */
295 /***************/
296
297 #if defined(_MSC_VER)
298 # define ASM asm volatile
299 #else
300 # define ASM asm __volatile__
301 #endif
302
303 #define __add_cc(r,a,b) ASM ("add.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
304 #define __addc_cc(r,a,b) ASM ("addc.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
305 #define __sub_cc(r,a,b) ASM ("sub.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
306
307 #define __addcy(carry) ASM ("addc.s32 %0, 0, 0;": "=r"(carry))
308 #define __addcy2(carry) ASM ("addc.cc.s32 %0, %0, 0;": "+r"(carry))
309
310 #define __subcy(carry) ASM ("subc.s32 %0, 0, 0;": "=r"(carry))
311 #define __subcy2(carry) ASM ("subc.s32 %0, %0, 0;": "+r"(carry))
312
313 #define __mul_lo(r,a,b) ASM("mul.lo.u32 %0, %1, %2;": "=r"(r): "r"(a),"r"(b))
314 #define __mul_hi(r,a,b) ASM("mul.hi.u32 %0, %1, %2;": "=r"(r): "r"(a),"r"(b))
315 #define __mad_lo_cc(r,a,b) ASM("mad.lo.cc.u32 %0, %1, %2, %0;":\
316 "+r"(r): "r"(a),"r"(b))
317 #define __madc_hi_cc(r,a,b) ASM("madc.hi.cc.u32 %0, %1, %2, %0;":\
318 "+r"(r):"r"(a),"r"(b))
319
320 #ifdef __CUDA_ARCH__
321 #if __CUDA_ARCH__ >= 200
322 #include "cudakernel_default.cu"
323 #else
324 #error "Unsupported architecture"
325 #endif
326 #endif
327