/* Default code for GPU */
/* A compute capability of 2.0 at least is required */


/* Propagate all pending carries/borrows in cy until A is fully normalized.
   One thread per digit: each iteration folds the carry of digit i-1 into
   digit i, then records the new outgoing carry, until no thread in the warp
   still holds a non-zero cy.
   NOTE(review): uses the legacy mask-less __any() vote (file targets compute
   capability 2.0); on SM70+ this would need __any_sync with an explicit mask. */
__device__ void Cuda_Fully_Normalize (biguint_t A, bigint_t cy)
{
  carry_t cytemp;
  unsigned int thm1;

  /* Loop while any thread still has an unresolved carry/borrow. */
  while(__any(cy[threadIdx.x])!=0)
  {
    /* Index of the previous digit. The unsigned wrap at threadIdx.x==0
       yields ECM_GPU_NB_DIGITS-1 only if ECM_GPU_NB_DIGITS divides 2^32
       (i.e. is a power of two) -- TODO confirm that invariant. */
    thm1 = (threadIdx.x - 1) % ECM_GPU_NB_DIGITS;
    cytemp = cy[thm1];

    /* A[i] += cy[i-1], setting the hardware carry flag. */
    __add_cc(A[threadIdx.x], A[threadIdx.x], cytemp);

    /* Capture the outgoing carry: a non-negative cytemp can only generate a
       carry, a negative one only a borrow. */
    if (cytemp >= 0)
      __addcy(cy[threadIdx.x]);
    else /* if (cytemp < 0) */
      __subcy(cy[threadIdx.x]);
  }
}

/* Compute Rmod <- A + B */
/* Input: 0 <= A, B < 3*N */
/* Output: 0 <= Rmod < 6*N */
/* Digit-wise addition Rmod[i] <- A[i] + B[i]; each thread deposits its
   carry-out into the next digit, then the shared carry array is resolved by
   Cuda_Fully_Normalize. One thread per digit; the cross-digit writes assume
   warp-synchronous execution -- NOTE(review): pre-Volta semantics, confirm. */
__device__ void Cuda_Add_mod
  (biguint_t Rmod, bigint_t cy, const biguint_t A, const biguint_t B)
{
  /* Index of the next digit (wraps around past the top digit). */
  unsigned int thp1 = (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  __add_cc (Rmod[threadIdx.x], A[threadIdx.x], B[threadIdx.x]); /* sets CF */
  __addcy2(Rmod[thp1]); /* fold this thread's carry into the next digit */
  __addcy (cy[thp1]);   /* record any carry out of that add for normalization */
  Cuda_Fully_Normalize (Rmod, cy);
}

/* Compute Rmod <- Rmod + A */
/* Input: 0 <= Rmod, A < 3*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= Rmod < 3*N, 0 < A < 7*N ) */
/* Output: 0 <= Rmod < 6*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= Rmod < 10*N) */
/* In-place digit-wise addition Rmod[i] <- Rmod[i] + A[i]; carry handling is
   identical to the two-operand Cuda_Add_mod: the per-thread carry is folded
   into the next digit and the residual carry resolved at the end. */
__device__ void Cuda_Add_mod
  (biguint_t Rmod, bigint_t cy, const biguint_t A)
{
  /* Index of the next digit (wraps around past the top digit). */
  unsigned int thp1 = (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  __add_cc (Rmod[threadIdx.x], Rmod[threadIdx.x], A[threadIdx.x]); /* sets CF */
  //__addcy (cy[threadIdx.x]);
  __addcy2(Rmod[thp1]); /* propagate this thread's carry into the next digit */
  __addcy (cy[thp1]);   /* keep the carry-out for Cuda_Fully_Normalize */
  Cuda_Fully_Normalize (Rmod, cy);
}

/* Compute Rmod <- Rmod - B */
/* Input: 0 <= Rmod, B < 3*N */
/* Output: 0 <= Rmod < 6*N */
/* Digit-wise modular subtraction: Rmod[i] <- Rmod[i] + 3N[i] - B[i], where
   N3thdx is this thread's precomputed digit of 3*N. Adding 3*N first keeps
   the result non-negative under the documented bound Rmod, B < 3*N.
   The net carry/borrow of the two flag-chained operations is accumulated in
   reg_cy and resolved by Cuda_Fully_Normalize. */
__device__ void Cuda_Sub_mod
  (biguint_t Rmod, bigint_t cy, const biguint_t B, const digit_t N3thdx)
{
  digit_t reg_Rmod = Rmod[threadIdx.x]; /* work on a register copy */
  carry_t reg_cy = 0;

  __add_cc (reg_Rmod, reg_Rmod, N3thdx);         /* += digit of 3*N, sets CF */
  __addcy (reg_cy);                              /* reg_cy <- carry-out */
  __sub_cc (reg_Rmod, reg_Rmod, B[threadIdx.x]); /* -= B[i], sets borrow */
  __subcy2 (reg_cy);                             /* fold borrow into reg_cy */

  Rmod[threadIdx.x] = reg_Rmod;
  cy[threadIdx.x] = reg_cy;
  Cuda_Fully_Normalize (Rmod, cy);
}

/* Perform one step of REDC */
/* One step of REDC (Montgomery reduction): fold the digit product a*b into
   the running residue r, add t*N with t chosen so the low digit of r becomes
   zero mod 2^SIZE_DIGIT, then shift r one digit right (each thread picks up
   the next digit's value). Nthdx is this thread's digit of N; invN is the
   modular inverse factor used to pick t -- presumably -1/N mod 2^SIZE_DIGIT,
   TODO confirm against the host-side setup. */
__device__ void Cuda_Mulmod_step
  (biguint_t r, bigint_t cy, digit_t a, digit_t b, const digit_t Nthdx,
   const digit_t invN)
{
  digit_t t;
  digit_t reg_hi = 0;
  unsigned int thp1= (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  carry_t reg_cy = cy[thp1]; /* take over the pending carry of digit i+1 */

  /* r[i] += lo(a*b); reg_hi = hi(a*b) + carry; track overall carry. */
  __mad_lo_cc(r[threadIdx.x],a,b);
  __madc_hi_cc(reg_hi,a,b);
  __addcy2(reg_cy);

  /* t = lo(invN * r[0]); adding t*N then zeroes the low digit of r. */
  __mul_lo(t, invN, r[0]);
  __mad_lo_cc(r[threadIdx.x],t,Nthdx);
  __madc_hi_cc(reg_hi,t,Nthdx);
  __addcy2(reg_cy);

  /* make one round of normalize + a right shift at the same time */
  __add_cc(r[threadIdx.x],r[thp1],reg_hi);
  __addc_cc(r[thp1],r[thp1],reg_cy);
  __addcy(cy[thp1]);
}

/* Compute r <- 2*a */
/* Input: 0 <= a < 3*N */
/* Output: 0 <= r < 3*N */
/* Double a residue digit-wise: r[i] <- 2*a[i], with each thread's carry
   added into the following digit. */
__device__ void Cuda_Dbl_mod
  (biguint_t r, biguint_t a)
{
  /* Index of the next digit (wraps around past the top digit). */
  unsigned int thp1= (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  /* r[i] = a[i] + a[i], setting the carry flag. */
  asm ("add.cc.u32 %0, %1, %1;" : "=r"(r[threadIdx.x]) : "r"(a[threadIdx.x]));
  /* Fold the carry into the next digit.
     NOTE(review): unlike the other helpers there is no carry array and no
     Cuda_Fully_Normalize here -- presumably safe under the documented bound
     a < 3*N; confirm. */
  __addcy2(r[thp1]);
}


/* Compute r <- A*b */
/* Input: 0 < b < 2^SIZE_DIGIT, 0 <= A < 6*N */
/* Output: 0 <= r < 7*N */
/* Multiply a multi-digit residue by a single digit: r <- A*b, with one
   REDC-style round folded in so the result stays bounded (see the header
   comment: 0 <= r < 7*N for 0 <= A < 6*N). */
__device__ void Cuda_Mulint_mod
  (biguint_t r, bigint_t cy, biguint_t A, digit_t b, const digit_t Nthdx,
   const digit_t invN)
{
  digit_t t;
  digit_t reg_hi;
  unsigned int thp1= (threadIdx.x + 1) % ECM_GPU_NB_DIGITS;
  digit_t reg_A = A[threadIdx.x]; /* this thread's digit of A */
  carry_t reg_cy;                 /* written (not read) by __addcy below */

  /* Digit-wise product: r[i] = lo(A[i]*b), reg_hi = hi(A[i]*b). */
  __mul_lo(r[threadIdx.x],reg_A,b);
  __mul_hi(reg_hi,reg_A,b);

  /* One reduction round: t = lo(invN * r[0]) so r[0] + lo(t*N[0]) == 0
     mod 2^SIZE_DIGIT; add t*N digit-wise. */
  __mul_lo(t, invN, r[0]);
  __mad_lo_cc(r[threadIdx.x],t,Nthdx);
  __madc_hi_cc(reg_hi,t,Nthdx);
  __addcy(reg_cy); /* capture the carry-out of the mad chain */

  /* make one round of normalize + a right shift at the same time */
  __add_cc(r[threadIdx.x],r[thp1],reg_hi);
  __addc_cc(r[thp1],r[thp1],reg_cy);
  __addcy(cy[thp1]);

  Cuda_Fully_Normalize(r,cy);
}

/* Compute r <- A*B */
/* Input: 0 <= A, B < 6*N */
/* (except when it follows Cuda_Mulint_mod, 0 <= A < 6*N, 0 < B < 10*N ) */
/* Output: 0 <= r < 3*N */
/* Montgomery product mul <- A*B (reduced; see header comment for bounds).
   r is caller-provided scratch space holding the running accumulator; one
   REDC step per digit of B, then a final carry normalization. */
__device__ void Cuda_Mul_mod
  (biguint_t mul, bigint_t cy, const biguint_t A, const biguint_t B, biguint_t r,
   const digit_t Nthdx, const digit_t invN)
{
  const digit_t a_digit = A[threadIdx.x]; /* this thread's digit of A */
  int step;

  /* Start from a zero accumulator and fold in one digit of B per step. */
  r[threadIdx.x] = 0;
  for (step = 0; step < ECM_GPU_NB_DIGITS; step++)
    Cuda_Mulmod_step (r, cy, a_digit, B[step], Nthdx, invN);

  /* Resolve the outstanding carries, then publish the result. */
  Cuda_Fully_Normalize (r, cy);
  mul[threadIdx.x] = r[threadIdx.x];
}
/* Compute mul <- A^2 by delegating to the general Montgomery product;
   r is scratch space with the same role and bounds as in Cuda_Mul_mod. */
__device__ void Cuda_Square_mod
  (biguint_t mul, bigint_t cy, const biguint_t A, biguint_t r,
   const digit_t Nthdx, const digit_t invN)
{
  Cuda_Mul_mod (mul, cy, A, A, r, Nthdx, invN);
}

/*
  Compute simultaneously:
  (xarg : zarg ) <- [2](xarg : zarg)
  (xarg2 : zarg2 ) <- (xarg : zarg) + (xarg2 : zarg2)
*/
/* One simultaneous double-and-add step of the Montgomery ladder for a batch
   of curves: the A point is doubled in place and the B point receives the
   differential sum (see the per-line formulas below).
   Expected launch layout (from the shared-array shapes and idx1): threadIdx.x
   indexes the digit of a residue (blockDim.x == ECM_GPU_NB_DIGITS),
   threadIdx.y the curve within the block (blockDim.y ==
   ECM_GPU_CURVES_BY_BLOCK); the global curve index is
   blockIdx.x*blockDim.y + threadIdx.y.
   firstinvd offsets the per-curve coefficient d (d = idx1 + firstinvd). */
__global__ void
Cuda_Ell_DblAdd (biguint_t *xAarg, biguint_t *zAarg, biguint_t *xBarg,
                 biguint_t *zBarg, unsigned int firstinvd)
{
  /* Shared-memory scratch: one row of digits per curve in this block. */
  __shared__ VOL digit_t b_temp_r[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL carry_t b_cy[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];

  __shared__ VOL digit_t b_t[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_u[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_v[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];
  __shared__ VOL digit_t b_w[ECM_GPU_CURVES_BY_BLOCK][ECM_GPU_NB_DIGITS];

  /* Views of this curve's rows of the shared scratch arrays. */
  VOL digit_t *t=b_t[threadIdx.y];
  VOL digit_t *u=b_u[threadIdx.y];
  VOL digit_t *v=b_v[threadIdx.y];
  VOL digit_t *w=b_w[threadIdx.y];
  VOL digit_t *temp_r=b_temp_r[threadIdx.y];
  VOL carry_t *cy=b_cy[threadIdx.y];

  /* Init of shared variables */
  const unsigned int idx1=blockIdx.x*blockDim.y+threadIdx.y; /* global curve */
  //unsigned int t1=threadIdx.x+1;
  cy[threadIdx.x]=0;

  /* Load this curve's coordinates: w=xB, v=zB, temp_r=xA, u=zA. */
  w[threadIdx.x]=xBarg[idx1][threadIdx.x];
  v[threadIdx.x]=zBarg[idx1][threadIdx.x];
  temp_r[threadIdx.x]=xAarg[idx1][threadIdx.x];
  u[threadIdx.x]=zAarg[idx1][threadIdx.x];

  /* Per-thread digits of N and 3*N plus the REDC constant, from device
     constant memory. */
  const digit_t Nthdx = d_Ncst[threadIdx.x];
  const digit_t N3thdx = d_3Ncst[threadIdx.x];
  const digit_t invN = d_invNcst;

  Cuda_Add_mod(t, cy, v, w);           /* C=zB+xB */
  Cuda_Sub_mod(v, cy, w, N3thdx);      /* D=zB-xB */
  Cuda_Add_mod(w, cy, u, temp_r);      /* A=zA+xA */
  Cuda_Sub_mod(u, cy, temp_r, N3thdx); /* B=zA-xA */

  Cuda_Mul_mod(t, cy, t, u, temp_r, Nthdx, invN); /* CB=C*B=(zB+xB)(zA-xA) */
  Cuda_Mul_mod(v, cy, v, w, temp_r, Nthdx, invN); /* DA=D*A=(zB-xB)(zA+xA) */

  Cuda_Square_mod(w, cy, w, temp_r, Nthdx, invN); /* AA=A^2 */
  Cuda_Square_mod(u, cy, u, temp_r, Nthdx, invN); /* BB=B^2 */

  /* Doubling, x-coordinate: new xA = AA*BB. */
  Cuda_Mul_mod(temp_r, cy, u, w, temp_r, Nthdx, invN); /* AA*BB */
  xAarg[idx1][threadIdx.x]=temp_r[threadIdx.x];

  Cuda_Sub_mod (w, cy, u, N3thdx); /* K= AA-BB */
  /* Per-curve coefficient d = idx1 + firstinvd. */
  Cuda_Mulint_mod (temp_r, cy, w, idx1 + firstinvd, Nthdx, invN); /* d*K */
  Cuda_Add_mod (u, cy, temp_r); /* BB+d*K */

  /* Doubling, z-coordinate: new zA = K*(BB+d*K). */
  Cuda_Mul_mod (w, cy, w, u, temp_r, Nthdx, invN); /* K*(BB+d*K) */
  zAarg[idx1][threadIdx.x]=w[threadIdx.x];

  Cuda_Add_mod(w, cy, v, t);      /* DA+CB mod N */
  Cuda_Sub_mod(v, cy, t, N3thdx); /* DA-CB mod N */

  Cuda_Square_mod(w, cy, w, temp_r, Nthdx, invN); /* (DA+CB)^2 mod N */
  Cuda_Square_mod(v, cy, v, temp_r, Nthdx, invN); /* (DA-CB)^2 mod N */

  /* z0=1 so there is nothing to compute for z0*(DA+CB)^2 */
  Cuda_Dbl_mod(temp_r, v); /* x0=2 x0*(DA-CB)^2 */

  /* Differential addition result: new xB = (DA+CB)^2, new zB = 2*(DA-CB)^2. */
  xBarg[idx1][threadIdx.x]=w[threadIdx.x];
  zBarg[idx1][threadIdx.x]=temp_r[threadIdx.x];
}