1 /*            Default code for GPU                  */
2 /* A compute capability of 2.0 at least is required */
3 
4 
/* Propagate all outstanding per-digit carries until none remain.
   A  - one digit of the big integer per thread (thread t owns A[t]).
   cy - signed carry for each digit; cy[t] is folded into A[t+1] on the
        next iteration (the neighbour reads it as cy[thm1]).
   Loops as long as any thread in the warp still has a nonzero carry.
   NOTE(review): legacy __any (no _sync) — removed for sm_70+; porting to
   newer architectures needs __any_sync with an explicit member mask.
   The wrap-around of (threadIdx.x - 1) for thread 0 relies on unsigned
   arithmetic, which is only correct if ECM_GPU_NB_DIGITS is a power of
   two — TODO confirm. */
__device__ void Cuda_Fully_Normalize (biguint_t A, bigint_t cy)
{
  carry_t cytemp;
  /* Index of the previous digit; loop-invariant, so computed once. */
  unsigned int thm1 = (threadIdx.x - 1) % ECM_GPU_NB_DIGITS;

  while(__any(cy[threadIdx.x])!=0)
  {
    /* Consume the carry produced by the previous digit's owner. */
    cytemp = cy[thm1];

    __add_cc(A[threadIdx.x], A[threadIdx.x], cytemp);

    /* Replace our own carry with the carry/borrow out of this add;
       the sign of cytemp decides whether it was an add or a borrow. */
    if (cytemp >= 0)
      __addcy(cy[threadIdx.x]);
    else /* if (cytemp < 0) */
      __subcy(cy[threadIdx.x]);
  }
}
23 
/* Compute Rmod <- A + B */
/* Input: 0 <= A, B < 3*N */
/* Output: 0 <= Rmod < 6*N */
__device__ void Cuda_Add_mod
(biguint_t Rmod, bigint_t cy, const biguint_t A, const biguint_t B)
{
  const unsigned int next = (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;

  /* The three steps below are linked through the hardware carry flag,
     so their order must not change. */
  __add_cc (Rmod[threadIdx.x], A[threadIdx.x], B[threadIdx.x]);
  __addcy2(Rmod[next]);            /* fold the carry into the next digit  */
  __addcy (cy[next]);              /* record the carry left after that    */
  Cuda_Fully_Normalize (Rmod, cy); /* flush any remaining carries         */
}
36 
/* Compute Rmod <- Rmod + A (in place) */
/* Input: 0 <= Rmod, A < 3*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= Rmod < 3*N, 0 < A < 7*N ) */
/* Output: 0 <= Rmod < 6*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= Rmod < 10*N) */
__device__ void Cuda_Add_mod
(biguint_t Rmod, bigint_t cy, const biguint_t A)
{
  const unsigned int next = (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;

  /* Carry-flag chain: keep these three statements in this exact order. */
  __add_cc (Rmod[threadIdx.x], Rmod[threadIdx.x], A[threadIdx.x]);
  __addcy2(Rmod[next]);            /* propagate the carry to the next digit */
  __addcy (cy[next]);              /* stash the resulting carry out          */
  Cuda_Fully_Normalize (Rmod, cy);
}
52 
/* Compute Rmod <- Rmod - B */
/* Input: 0 <= Rmod, B < 3*N */
/* Output: 0 <= Rmod < 6*N */
__device__ void Cuda_Sub_mod
(biguint_t Rmod, bigint_t cy, const biguint_t B, const digit_t N3thdx)
{
  digit_t digit = Rmod[threadIdx.x];
  carry_t carry = 0;

  /* Add this thread's digit of 3*N before subtracting, so the result of
     the subtraction stays non-negative.  The four asm steps share the
     hardware carry flag and must stay in order. */
  __add_cc (digit, digit, N3thdx);
  __addcy (carry);
  __sub_cc (digit, digit, B[threadIdx.x]);
  __subcy2 (carry);

  /* Publish the digit and its signed carry, then flush carries. */
  Rmod[threadIdx.x] = digit;
  cy[threadIdx.x] = carry;
  Cuda_Fully_Normalize (Rmod, cy);
}
71 
/* Perform one step of REDC (Montgomery reduction interleaved with the
   schoolbook product): accumulate a*b into r, cancel the low digit with a
   multiple of N, and shift the whole number right by one digit.
   r    - running residue, one digit per thread
   cy   - per-digit carries, consumed later by Cuda_Fully_Normalize
   a, b - the two digits multiplied in this step
   Nthdx - this thread's digit of N
   invN  - REDC constant; multiplied by r[0], presumably -N^-1 mod
           2^SIZE_DIGIT — definition not in view, confirm.
   NOTE(review): reads of r[0] and r[thp1] cross thread boundaries with no
   explicit barrier — this relies on warp-synchronous execution of the
   digit threads (pre-Volta lockstep); verify on newer architectures.
   The _cc macro sequences are carry-flag chained: do not reorder. */
__device__ void Cuda_Mulmod_step
(biguint_t r, bigint_t cy, digit_t a, digit_t b, const digit_t Nthdx,
 const digit_t invN)
{
  digit_t t;
  digit_t reg_hi = 0;
  /* Index of the next digit (wraps at the top). */
  unsigned int thp1= (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  carry_t reg_cy = cy[thp1];

  /* r += a*b: low halves into r, high halves into reg_hi, carry into reg_cy. */
  __mad_lo_cc(r[threadIdx.x],a,b);
  __madc_hi_cc(reg_hi,a,b);
  __addcy2(reg_cy);

  /* t = invN * r[0]; adding t*N then makes the low digit vanish (REDC). */
  __mul_lo(t, invN, r[0]);
  __mad_lo_cc(r[threadIdx.x],t,Nthdx);
  __madc_hi_cc(reg_hi,t,Nthdx);
  __addcy2(reg_cy);

  /* make one round of normalize + a right shift at the same time */
  __add_cc(r[threadIdx.x],r[thp1],reg_hi);
  __addc_cc(r[thp1],r[thp1],reg_cy);
  __addcy(cy[thp1]);
}
96 
/* Compute r <- 2*a */
/* Input: 0 <= a < 3*N */
/* Output: 0 <= r < 3*N */
/* NOTE(review): no reduction is performed below, so the output bound looks
   like 6*N rather than the stated 3*N — confirm against callers. */
__device__ void Cuda_Dbl_mod
(biguint_t r, biguint_t a)
{
  const unsigned int next = (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;

  /* Double this thread's digit, then push the carry into the next digit;
     the two steps are linked by the hardware carry flag. */
  asm ("add.cc.u32 %0, %1, %1;" : "=r"(r[threadIdx.x]) : "r"(a[threadIdx.x]));
  __addcy2(r[next]);
}
107 
108 
/* Compute r <- A*b */
/* Input: 0 < b < 2^SIZE_DIGIT, 0 <= A < 6*N */
/* Output: 0 <= r < 7*N */
/* Single-digit multiply followed by one REDC round: multiply every digit of
   A by b, cancel the low digit with a multiple of N, shift right one digit,
   then normalize carries.
   Nthdx - this thread's digit of N; invN - REDC constant multiplied by
   r[0], presumably -N^-1 mod 2^SIZE_DIGIT (definition not in view).
   NOTE(review): r[0] and r[thp1] are read across threads with no barrier —
   relies on warp-synchronous digit threads; the carry-flag-chained macro
   sequences below must not be reordered. */
__device__ void Cuda_Mulint_mod
(biguint_t r, bigint_t cy, biguint_t A, digit_t b, const digit_t Nthdx,
 const digit_t invN)
{
  digit_t t;
  digit_t reg_hi;
  /* Index of the next digit (wraps at the top). */
  unsigned int thp1= (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  digit_t reg_A = A[threadIdx.x];
  carry_t reg_cy;

  /* r = A*b, split into low (r) and high (reg_hi) halves per digit. */
  __mul_lo(r[threadIdx.x],reg_A,b);
  __mul_hi(reg_hi,reg_A,b);

  /* t = invN * r[0]; adding t*N cancels the low digit (one REDC round).
     reg_cy is written (not accumulated) by __addcy here. */
  __mul_lo(t, invN, r[0]);
  __mad_lo_cc(r[threadIdx.x],t,Nthdx);
  __madc_hi_cc(reg_hi,t,Nthdx);
  __addcy(reg_cy);

  /* make one round of normalize + a right shift at the same time */
  __add_cc(r[threadIdx.x],r[thp1],reg_hi);
  __addc_cc(r[thp1],r[thp1],reg_cy);
  __addcy(cy[thp1]);

  Cuda_Fully_Normalize(r,cy);
}
137 
/* Compute mul <- A*B (Montgomery representation) */
/* Input: 0 <= A, B < 6*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= A < 6*N, 0 < B < 10*N ) */
/* Output: 0 <= mul < 3*N */
/* r is scratch space (one digit per thread); cy holds per-digit carries. */
__device__ void Cuda_Mul_mod
(biguint_t mul, bigint_t cy, const biguint_t A, const biguint_t B, biguint_t r,
 const digit_t Nthdx, const digit_t invN)
{
  const digit_t a_digit = A[threadIdx.x];
  int i;

  /* Start from a zeroed accumulator. */
  r[threadIdx.x] = 0;

  /* Schoolbook product interleaved with REDC: one step per digit of B. */
  for (i = 0; i < ECM_GPU_NB_DIGITS; i++)
    Cuda_Mulmod_step (r, cy, a_digit, B[i], Nthdx, invN);

  /* Flush remaining carries, then copy the scratch result out. */
  Cuda_Fully_Normalize (r, cy);
  mul[threadIdx.x] = r[threadIdx.x];
}
159 
/* Compute mul <- A^2: thin wrapper over Cuda_Mul_mod with both operands
   equal.  Same contracts as Cuda_Mul_mod (r is scratch, cy holds carries,
   Nthdx/invN are the per-digit modulus and the REDC constant). */
__device__ void Cuda_Square_mod
(biguint_t mul, bigint_t cy, const biguint_t A, biguint_t r,
 const digit_t Nthdx, const digit_t invN)
{
  Cuda_Mul_mod (mul, cy, A, A, r, Nthdx, invN);
}
166 
/*
  Compute simultaneously:
  (xarg : zarg ) <- [2](xarg : zarg)
  (xarg2 : zarg2 ) <- (xarg : zarg) + (xarg2 : zarg2)

  One Montgomery-ladder step (double + differential add) on a Montgomery
  curve, in X:Z coordinates.  Thread layout: threadIdx.x selects the digit
  (ECM_GPU_NB_DIGITS per number), threadIdx.y selects the curve within the
  block; idx1 picks this curve's numbers in the global arrays.
  firstinvd: idx1 + firstinvd is used as the small multiplier d in the
  doubling formula (see Cuda_Mulint_mod call below).
  NOTE(review): there is no __syncthreads() anywhere — correctness relies on
  the digit threads of one curve executing warp-synchronously (the shared
  buffers are declared VOL, presumably volatile); verify on sm_70+.
*/
__global__ void
Cuda_Ell_DblAdd (biguint_t *xAarg, biguint_t *zAarg, biguint_t *xBarg,
                                       biguint_t *zBarg, unsigned int firstinvd)
{
  /* Per-curve scratch: carries plus one result buffer ... */
  __shared__ VOL digit_t b_temp_r[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL carry_t b_cy[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];

  /* ... and four working residues (t, u, v, w) per curve. */
  __shared__ VOL digit_t b_t[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_u[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_v[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_w[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];

  /* Views into this curve's row of each shared buffer. */
  VOL digit_t *t=b_t[threadIdx.y];
  VOL digit_t *u=b_u[threadIdx.y];
  VOL digit_t *v=b_v[threadIdx.y];
  VOL digit_t *w=b_w[threadIdx.y];
  VOL digit_t *temp_r=b_temp_r[threadIdx.y];
  VOL carry_t *cy=b_cy[threadIdx.y];

  /* Init of shared variables */
  const unsigned int idx1=blockIdx.x*blockDim.y+threadIdx.y;
  //unsigned int t1=threadIdx.x+1;
  cy[threadIdx.x]=0;

  /* Load this curve's four coordinates, one digit per thread:
     B = point being added, A = point being doubled. */
  w[threadIdx.x]=xBarg[idx1][threadIdx.x];
  v[threadIdx.x]=zBarg[idx1][threadIdx.x];
  temp_r[threadIdx.x]=xAarg[idx1][threadIdx.x];
  u[threadIdx.x]=zAarg[idx1][threadIdx.x];

  /* Per-digit constants; d_Ncst / d_3Ncst / d_invNcst are presumably
     __constant__ copies of N, 3*N and the REDC constant — defined
     elsewhere, confirm. */
  const digit_t Nthdx = d_Ncst[threadIdx.x];
  const digit_t N3thdx = d_3Ncst[threadIdx.x];
  const digit_t invN = d_invNcst;

  Cuda_Add_mod(t, cy, v, w);           /* C=zB+xB */
  Cuda_Sub_mod(v, cy, w, N3thdx);      /* D=zB-xB */
  Cuda_Add_mod(w, cy, u, temp_r);      /* A=zA+xA */
  Cuda_Sub_mod(u, cy, temp_r, N3thdx); /* B=zA-xA */

  Cuda_Mul_mod(t, cy, t, u, temp_r, Nthdx, invN); /* CB=C*B=(zB+xB)(zA-xA) */
  Cuda_Mul_mod(v, cy, v, w, temp_r, Nthdx, invN); /* DA=D*A=(zB-xB)(zA+xA) */

  Cuda_Square_mod(w, cy, w, temp_r, Nthdx, invN); /* AA=A^2 */
  Cuda_Square_mod(u, cy, u, temp_r, Nthdx, invN); /* BB=B^2 */

  /* Doubled point: new xA = AA*BB. */
  Cuda_Mul_mod(temp_r, cy, u, w, temp_r, Nthdx, invN); /* AA*BB */
  xAarg[idx1][threadIdx.x]=temp_r[threadIdx.x];

  Cuda_Sub_mod (w, cy, u, N3thdx); /* K= AA-BB */
  Cuda_Mulint_mod (temp_r, cy, w, idx1 + firstinvd, Nthdx, invN); /* d*K */
  Cuda_Add_mod (u, cy, temp_r); /* BB+d*K */

  /* Doubled point: new zA = K*(BB+d*K). */
  Cuda_Mul_mod (w, cy, w, u, temp_r, Nthdx, invN); /* K*(BB+d*K) */
  zAarg[idx1][threadIdx.x]=w[threadIdx.x];

  /* Differential addition of the two points. */
  Cuda_Add_mod(w, cy, v, t);       /* DA+CB mod N */
  Cuda_Sub_mod(v, cy, t, N3thdx);  /* DA-CB mod N */

  Cuda_Square_mod(w, cy, w, temp_r, Nthdx, invN); /* (DA+CB)^2 mod N */
  Cuda_Square_mod(v, cy, v, temp_r, Nthdx, invN); /* (DA-CB)^2 mod N */

  /* z0=1 so there is nothing to compute for z0*(DA+CB)^2 */
  Cuda_Dbl_mod(temp_r, v); /* x0=2 x0*(DA-CB)^2 */

  /* Added point: new xB, zB. */
  xBarg[idx1][threadIdx.x]=w[threadIdx.x];
  zBarg[idx1][threadIdx.x]=temp_r[threadIdx.x];
}
238 
239