/* When compiling the CUDA code, we do not want to include all of ecm-impl.h */
#define _DO_NOT_INCLUDE_ECM_IMPL_H
#include "ecm-gpu.h"
#include <gmp.h>
#include "cudakernel.h"

#ifndef __CUDACC__
#error "This file should only be compiled with nvcc"
#endif

__constant__ __device__ digit_t d_invNcst;
__device__ biguint_t d_Ncst;
__device__ biguint_t d_3Ncst;
__device__ biguint_t d_Mcst;

#define errCheck(err) cuda_errCheck (err, __FILE__, __LINE__)
#define cudaMalloc(d, size) errCheck (cudaMalloc (d, size))
#define cudaMemcpyHtoD(d, h, size) errCheck (cudaMemcpy ((void *) d, \
                                    (void *) h, size, cudaMemcpyHostToDevice))
#define cudaMemcpyDtoH(h, d, size) errCheck (cudaMemcpy ((void *) h, \
                                    (void *) d, size, cudaMemcpyDeviceToHost))
#define cudaMemcpyCst(d, h, size) errCheck (cudaMemcpyToSymbol (d, h, size))
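
/* Usage note (illustrative): thanks to the wrappers above, a call site later
 * in this file can simply write, e.g.,
 *
 *   cudaMalloc (&d_buf, size);             // aborts with file/line on error
 *   cudaMemcpyHtoD (d_buf, h_buf, size);   // idem for host-to-device copies
 *
 * (d_buf, h_buf and size are placeholder names); the returned cudaError_t is
 * routed through cuda_errCheck below. */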


/******************************/
/* Host code handling the GPU */
/******************************/

inline void cuda_errCheck (cudaError err, const char *file, const int line)
{
  if (err != cudaSuccess)
  {
    fprintf (stderr, "%s(%i) : CUDA error : %s.\n",
             file, line, cudaGetErrorString (err));
    exit (EXIT_FAILURE);
  }
}

/* The first call to a global function initializes the device */
__global__ void Cuda_Init_Device ()
{
}

/* Given the compute capability (as major.minor), return the number of blocks
 * to be run on one multiprocessor. */
extern "C"
unsigned int
getNumberOfBlockPerMultiProcessor (int major, int minor)
{
  /* For 2.0 and 2.1, limited by the maximum number of threads per MP and by
   * the number of available registers (23 registers are needed per thread).
   */
  if (major == 2)
    return 1;
  /* For 3.0, 3.2, 3.5 and 3.7, limited by the maximum number of threads per
   * MP. */
  else if (major == 3)
    return 2;
  /* For 5.0, 5.2 and 5.3, limited by the maximum number of threads per MP. */
  else if (major == 5)
    return 2;
  /* We assume that the properties of the GPU will not decrease for newer
   * compute capabilities. */
  else
    return 2;
}

extern "C"
int
select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose)
{
  cudaDeviceProp deviceProp;
  cudaError_t err;

  if (device != -1)
    {
      if (verbose)
          fprintf (stdout, "GPU: device %d was requested.\n", device);

      err = cudaSetDevice (device);
      if (err != cudaSuccess)
        {
          fprintf (stderr, "GPU: Error: Could not use device %d\n", device);
          fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString (err));
          return -1;
        }
    }

  err = cudaGetDevice (&device);
  if (err != cudaSuccess)
    {
      fprintf (stderr, "GPU: Error: no active device.\n");
      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString (err));
      return -1;
    }

  err = cudaGetDeviceProperties (&deviceProp, device);
  if (err != cudaSuccess)
    {
      fprintf (stderr, "GPU: Error while getting the device's properties.\n");
      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString (err));
      return -1;
    }

  if (verbose)
    {
      printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n"
              "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d "
              "maxRegsPerBlock = %d\n", device, deviceProp.name,
              deviceProp.major, deviceProp.minor,
              deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock,
              deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock);
    }

  if (*number_of_curves == 0) /* automatically choose the number of curves */
    {
      unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK;
      n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor);
      *number_of_curves = n * deviceProp.multiProcessorCount * m;
    }
  else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0)
    {
      /* number_of_curves must be a multiple of ECM_GPU_CURVES_BY_BLOCK */
      *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) *
                                                        ECM_GPU_CURVES_BY_BLOCK;
      if (verbose)
          fprintf (stderr, "GPU: the requested number of curves has been "
                           "modified to %u\n", *number_of_curves);
    }
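
  /* Worked example (numbers purely illustrative): on a compute capability 6.x
   * device with 13 multiprocessors and, say, ECM_GPU_CURVES_BY_BLOCK == 32,
   * the automatic choice above gives 2 * 13 * 32 = 832 curves, while a user
   * request of 1000 curves would be rounded up to the next multiple of 32,
   * i.e. 1024. */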

  /* The first call to a global function initializes the device */
  errCheck (cudaSetDeviceFlags (cudaDeviceScheduleYield));
  Cuda_Init_Device<<<1, 1>>> ();
  errCheck (cudaGetLastError ());

  if (verbose)
  {
    struct cudaFuncAttributes kernelAttr;
    err = cudaFuncGetAttributes (&kernelAttr, Cuda_Ell_DblAdd);
    if (err == cudaSuccess)
    {
      printf ("GPU: Using device code targeted for architecture compile_%d\n"
              "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n"
              "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n",
              kernelAttr.binaryVersion, kernelAttr.ptxVersion,
              kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs,
              kernelAttr.sharedSizeBytes);
    }
  }

  return 0;
}

extern "C"
float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
                 biguint_t *h_xarray, biguint_t *h_zarray,
                 biguint_t *h_x2array, biguint_t *h_z2array, mpz_t s,
                 unsigned int firstinvd, unsigned int number_of_curves,
                 int verbose)
{
  cudaEvent_t start, stop;
  cudaEventCreate (&start);
  cudaEventCreate (&stop);
  cudaEventRecord (start, 0);

  size_t j;
  int i;
  float elltime = 0.0;
  biguint_t *d_xA, *d_zA, *d_xB, *d_zB;

#define MAXEVENTS 2
#define DEPTH_EVENT 32
  cudaEvent_t event[MAXEVENTS];   /* Space for some CUDA event handles */
  long nEventsRecorded = 0;       /* How many events are currently recorded */
  long eventrecordix = 0;         /* Index of the next event to record */
  long eventsyncix;               /* Index of the oldest recorded event */

  size_t array_size = sizeof (biguint_t) * number_of_curves;

  dim3 dimBlock (ECM_GPU_NB_DIGITS, ECM_GPU_CURVES_BY_BLOCK);
  dim3 dimGrid (number_of_curves / ECM_GPU_CURVES_BY_BLOCK);
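
  /* Launch geometry: the x dimension of a block is ECM_GPU_NB_DIGITS
   * (presumably one thread per digit of a residue) and the y dimension is
   * ECM_GPU_CURVES_BY_BLOCK (one curve per row of threads), so the grid
   * needs number_of_curves / ECM_GPU_CURVES_BY_BLOCK blocks. */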
  if (verbose)
    {
      fprintf (stdout, "GPU: Block: %ux%ux%u Grid: %ux%ux%u "
               "(%u parallel curves)\n", dimBlock.x, dimBlock.y, dimBlock.z,
               dimGrid.x, dimGrid.y, dimGrid.z, number_of_curves);
    }

  /* Create a pair of events to pace ourselves */
  for (i = 0; i < MAXEVENTS; i++)
    errCheck (cudaEventCreateWithFlags (&event[i],
                              cudaEventBlockingSync|cudaEventDisableTiming));

  cudaMalloc (&d_xA, array_size);
  cudaMalloc (&d_zA, array_size);
  cudaMalloc (&d_xB, array_size);
  cudaMalloc (&d_zB, array_size);

  /* Copy into the GPU memory */
  cudaMemcpyCst (d_invNcst, (void *) &h_invN, sizeof (digit_t));
  cudaMemcpyCst (d_Ncst, (void *) h_N, sizeof (biguint_t));
  cudaMemcpyCst (d_3Ncst, (void *) h_3N, sizeof (biguint_t));
  cudaMemcpyCst (d_Mcst, (void *) h_M, sizeof (biguint_t));

  cudaMemcpyHtoD (d_xA, h_xarray, array_size);
  cudaMemcpyHtoD (d_zA, h_zarray, array_size);
  cudaMemcpyHtoD (d_xB, h_x2array, array_size);
  cudaMemcpyHtoD (d_zB, h_z2array, array_size);

#ifdef PRINT_REMAINING_ITER
  unsigned int jmod = 100000000;
#endif

  /* Double-and-add loop: the GPU kernel is called once for each bit of s */
  for (j = mpz_sizeinbase (s, 2) - 1; j > 0; j--)
  {
    if (mpz_tstbit (s, j-1) == 1)
      Cuda_Ell_DblAdd<<<dimGrid,dimBlock>>>(d_xB, d_zB, d_xA, d_zA, firstinvd);
    else
      Cuda_Ell_DblAdd<<<dimGrid,dimBlock>>>(d_xA, d_zA, d_xB, d_zB, firstinvd);
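
    /* The two calls differ only in the order in which the (x,z) pairs are
     * passed: selecting that order from the current bit of s is the
     * conditional swap of a Montgomery ladder, where each step doubles one
     * point and computes the differential addition of the two. */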

    /* Pace the recording of events.  It is cheaper to record an event only
     * every few iterations, but if the depth of NVIDIA's kernel queue is
     * exceeded, the host will busy-loop!
     * Record an event every DEPTH_EVENT iterations. */
    if (j % DEPTH_EVENT == 0)
    {
      cudaEventRecord (event[eventrecordix]);
      if (nEventsRecorded == 0)
        eventsyncix = eventrecordix;
      nEventsRecorded += 1;
      eventrecordix = (eventrecordix+1) % MAXEVENTS;
    }

    if (nEventsRecorded == MAXEVENTS)
    {
      cudaEventSynchronize (event[eventsyncix]);
      nEventsRecorded -= 1;
      eventsyncix = (eventsyncix+1) % MAXEVENTS;
    }
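
    /* With MAXEVENTS == 2 and DEPTH_EVENT == 32, the host therefore queues at
     * most roughly 2 * 32 = 64 kernel launches ahead of the device before
     * blocking on the oldest event (cudaEventBlockingSync makes the wait
     * yield the CPU rather than spin). */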

#ifdef PRINT_REMAINING_ITER
    if (j < 100000000) jmod = 10000000;
    if (j < 10000000)  jmod =  1000000;
    if (j < 1000000)   jmod =   100000;
    if (j < 100000)    jmod =    10000;
    if (j % jmod == 0)
      printf ("%zu iterations to go\n", j);
#endif
  }

  /* Catch any error that occurred during the kernel calls in the loop */
  errCheck (cudaGetLastError ());

  /* Wait for the last recorded events */
  while (nEventsRecorded != 0)
  {
    cudaEventSynchronize (event[eventsyncix]);
    nEventsRecorded -= 1;
    eventsyncix = (eventsyncix+1) % MAXEVENTS;
  }

  /* Get the results back from device memory */
  cudaMemcpyDtoH (h_xarray, d_xA, array_size);
  cudaMemcpyDtoH (h_zarray, d_zA, array_size);

  /* Clean up our events */
  for (i = 0; i < MAXEVENTS; i++)
    errCheck (cudaEventDestroy (event[i]));

  cudaFree ((void *) d_xA);
  cudaFree ((void *) d_zA);
  cudaFree ((void *) d_xB);
  cudaFree ((void *) d_zB);

  cudaEventRecord (stop, 0);
  cudaEventSynchronize (stop);

  cudaEventElapsedTime (&elltime, start, stop);

  errCheck (cudaEventDestroy (start));
  errCheck (cudaEventDestroy (stop));

  return elltime;
}


/***************/
/* Device code */
/***************/

#if defined(_MSC_VER)
#  define ASM asm volatile
#else
#  define ASM asm __volatile__
#endif

#define __add_cc(r,a,b) ASM ("add.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
#define __addc_cc(r,a,b) ASM ("addc.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
#define __sub_cc(r,a,b) ASM ("sub.cc.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))

#define __addcy(carry) ASM ("addc.s32 %0, 0, 0;": "=r"(carry))
#define __addcy2(carry) ASM ("addc.cc.s32 %0, %0, 0;": "+r"(carry))

#define __subcy(carry) ASM ("subc.s32 %0, 0, 0;": "=r"(carry))
#define __subcy2(carry) ASM ("subc.s32 %0, %0, 0;": "+r"(carry))

#define __mul_lo(r,a,b) ASM ("mul.lo.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
#define __mul_hi(r,a,b) ASM ("mul.hi.u32 %0, %1, %2;": "=r"(r): "r"(a), "r"(b))
#define __mad_lo_cc(r,a,b) ASM ("mad.lo.cc.u32 %0, %1, %2, %0;":\
                                                      "+r"(r): "r"(a), "r"(b))
#define __madc_hi_cc(r,a,b) ASM ("madc.hi.cc.u32 %0, %1, %2, %0;":\
                                                      "+r"(r): "r"(a), "r"(b))
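
/* Illustrative sketch only (not used by the kernels): the carry macros above
 * are meant to be chained in straight-line code.  For instance, adding two
 * 4-word numbers and capturing the outgoing carry could be written as follows
 * (the function name and the fixed length are assumptions for this example):
 *
 *   __device__ void example_add4 (digit_t *r, const digit_t *a,
 *                                 const digit_t *b, digit_t *carry)
 *   {
 *     __add_cc  (r[0], a[0], b[0]);   // low word: add and set the carry flag
 *     __addc_cc (r[1], a[1], b[1]);   // add with carry in, set carry out
 *     __addc_cc (r[2], a[2], b[2]);
 *     __addc_cc (r[3], a[3], b[3]);
 *     __addcy   (*carry);             // store the final carry (0 or 1)
 *   }
 *
 * The mul/mad macros are chained in the same way to accumulate the partial
 * products of multi-word multiplications. */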

#ifdef __CUDA_ARCH__
  #if __CUDA_ARCH__ >= 200
    #include "cudakernel_default.cu"
  #else
    #error "Unsupported architecture"
  #endif
#endif