1 /*
2 * Ed25519 field element
3 * (C) 2017 Ribose Inc
4 *
5 * Based on the public domain code from SUPERCOP ref10 by
6 * Peter Schwabe, Daniel J. Bernstein, Niels Duif, Tanja Lange, Bo-Yin Yang
7 *
8 * Botan is released under the Simplified BSD License (see license.txt)
9 */
10
11 #include <botan/internal/ed25519_fe.h>
12 #include <botan/internal/ed25519_internal.h>
13
14 namespace Botan {
15
16 //static
invert(const FE_25519 & z)17 FE_25519 FE_25519::invert(const FE_25519& z)
18 {
19 fe t0;
20 fe t1;
21 fe t2;
22 fe t3;
23
24 fe_sq(t0, z);
25 fe_sq_iter(t1, t0, 2);
26 fe_mul(t1, z, t1);
27 fe_mul(t0, t0, t1);
28 fe_sq(t2, t0);
29 fe_mul(t1, t1, t2);
30 fe_sq_iter(t2, t1, 5);
31 fe_mul(t1, t2, t1);
32 fe_sq_iter(t2, t1, 10);
33 fe_mul(t2, t2, t1);
34 fe_sq_iter(t3, t2, 20);
35 fe_mul(t2, t3, t2);
36 fe_sq_iter(t2, t2, 10);
37 fe_mul(t1, t2, t1);
38 fe_sq_iter(t2, t1, 50);
39 fe_mul(t2, t2, t1);
40 fe_sq_iter(t3, t2, 100);
41 fe_mul(t2, t3, t2);
42 fe_sq_iter(t2, t2, 50);
43 fe_mul(t1, t2, t1);
44 fe_sq_iter(t1, t1, 5);
45
46 fe_mul(t0, t1, t0);
47 return t0;
48 }
49
pow_22523(const fe & z)50 FE_25519 FE_25519::pow_22523(const fe& z)
51 {
52 fe t0;
53 fe t1;
54 fe t2;
55
56 fe_sq(t0, z);
57 fe_sq_iter(t1, t0, 2);
58 fe_mul(t1, z, t1);
59 fe_mul(t0, t0, t1);
60 fe_sq(t0, t0);
61 fe_mul(t0, t1, t0);
62 fe_sq_iter(t1, t0, 5);
63 fe_mul(t0, t1, t0);
64 fe_sq_iter(t1, t0, 10);
65 fe_mul(t1, t1, t0);
66 fe_sq_iter(t2, t1, 20);
67 fe_mul(t1, t2, t1);
68 fe_sq_iter(t1, t1, 10);
69 fe_mul(t0, t1, t0);
70 fe_sq_iter(t1, t0, 50);
71 fe_mul(t1, t1, t0);
72 fe_sq_iter(t2, t1, 100);
73 fe_mul(t1, t2, t1);
74 fe_sq_iter(t1, t1, 50);
75 fe_mul(t0, t1, t0);
76 fe_sq_iter(t0, t0, 2);
77
78 fe_mul(t0, t0, z);
79 return t0;
80 }
81
82 /*
83 h = f * g
84 Can overlap h with f or g.
85
86 Preconditions:
87 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
88 |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
89
90 Postconditions:
91 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
92 */
93
94 /*
95 Notes on implementation strategy:
96
97 Using schoolbook multiplication.
98 Karatsuba would save a little in some cost models.
99
100 Most multiplications by 2 and 19 are 32-bit precomputations;
101 cheaper than 64-bit postcomputations.
102
103 There is one remaining multiplication by 19 in the carry chain;
104 one *19 precomputation can be merged into this,
105 but the resulting data flow is considerably less clean.
106
107 There are 12 carries below.
108 10 of them are 2-way parallelizable and vectorizable.
109 Can get away with 11 carries, but then data flow is much deeper.
110
111 With tighter constraints on inputs can squeeze carries into int32.
112 */
113
114 //static
mul(const FE_25519 & f,const FE_25519 & g)115 FE_25519 FE_25519::mul(const FE_25519& f, const FE_25519& g)
116 {
117 const int32_t f0 = f[0];
118 const int32_t f1 = f[1];
119 const int32_t f2 = f[2];
120 const int32_t f3 = f[3];
121 const int32_t f4 = f[4];
122 const int32_t f5 = f[5];
123 const int32_t f6 = f[6];
124 const int32_t f7 = f[7];
125 const int32_t f8 = f[8];
126 const int32_t f9 = f[9];
127
128 const int32_t g0 = g[0];
129 const int32_t g1 = g[1];
130 const int32_t g2 = g[2];
131 const int32_t g3 = g[3];
132 const int32_t g4 = g[4];
133 const int32_t g5 = g[5];
134 const int32_t g6 = g[6];
135 const int32_t g7 = g[7];
136 const int32_t g8 = g[8];
137 const int32_t g9 = g[9];
138
139 const int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
140 const int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
141 const int32_t g3_19 = 19 * g3;
142 const int32_t g4_19 = 19 * g4;
143 const int32_t g5_19 = 19 * g5;
144 const int32_t g6_19 = 19 * g6;
145 const int32_t g7_19 = 19 * g7;
146 const int32_t g8_19 = 19 * g8;
147 const int32_t g9_19 = 19 * g9;
148 const int32_t f1_2 = 2 * f1;
149 const int32_t f3_2 = 2 * f3;
150 const int32_t f5_2 = 2 * f5;
151 const int32_t f7_2 = 2 * f7;
152 const int32_t f9_2 = 2 * f9;
153
154 const int64_t f0g0 = f0 * static_cast<int64_t>(g0);
155 const int64_t f0g1 = f0 * static_cast<int64_t>(g1);
156 const int64_t f0g2 = f0 * static_cast<int64_t>(g2);
157 const int64_t f0g3 = f0 * static_cast<int64_t>(g3);
158 const int64_t f0g4 = f0 * static_cast<int64_t>(g4);
159 const int64_t f0g5 = f0 * static_cast<int64_t>(g5);
160 const int64_t f0g6 = f0 * static_cast<int64_t>(g6);
161 const int64_t f0g7 = f0 * static_cast<int64_t>(g7);
162 const int64_t f0g8 = f0 * static_cast<int64_t>(g8);
163 const int64_t f0g9 = f0 * static_cast<int64_t>(g9);
164 const int64_t f1g0 = f1 * static_cast<int64_t>(g0);
165 const int64_t f1g1_2 = f1_2 * static_cast<int64_t>(g1);
166 const int64_t f1g2 = f1 * static_cast<int64_t>(g2);
167 const int64_t f1g3_2 = f1_2 * static_cast<int64_t>(g3);
168 const int64_t f1g4 = f1 * static_cast<int64_t>(g4);
169 const int64_t f1g5_2 = f1_2 * static_cast<int64_t>(g5);
170 const int64_t f1g6 = f1 * static_cast<int64_t>(g6);
171 const int64_t f1g7_2 = f1_2 * static_cast<int64_t>(g7);
172 const int64_t f1g8 = f1 * static_cast<int64_t>(g8);
173 const int64_t f1g9_38 = f1_2 * static_cast<int64_t>(g9_19);
174 const int64_t f2g0 = f2 * static_cast<int64_t>(g0);
175 const int64_t f2g1 = f2 * static_cast<int64_t>(g1);
176 const int64_t f2g2 = f2 * static_cast<int64_t>(g2);
177 const int64_t f2g3 = f2 * static_cast<int64_t>(g3);
178 const int64_t f2g4 = f2 * static_cast<int64_t>(g4);
179 const int64_t f2g5 = f2 * static_cast<int64_t>(g5);
180 const int64_t f2g6 = f2 * static_cast<int64_t>(g6);
181 const int64_t f2g7 = f2 * static_cast<int64_t>(g7);
182 const int64_t f2g8_19 = f2 * static_cast<int64_t>(g8_19);
183 const int64_t f2g9_19 = f2 * static_cast<int64_t>(g9_19);
184 const int64_t f3g0 = f3 * static_cast<int64_t>(g0);
185 const int64_t f3g1_2 = f3_2 * static_cast<int64_t>(g1);
186 const int64_t f3g2 = f3 * static_cast<int64_t>(g2);
187 const int64_t f3g3_2 = f3_2 * static_cast<int64_t>(g3);
188 const int64_t f3g4 = f3 * static_cast<int64_t>(g4);
189 const int64_t f3g5_2 = f3_2 * static_cast<int64_t>(g5);
190 const int64_t f3g6 = f3 * static_cast<int64_t>(g6);
191 const int64_t f3g7_38 = f3_2 * static_cast<int64_t>(g7_19);
192 const int64_t f3g8_19 = f3 * static_cast<int64_t>(g8_19);
193 const int64_t f3g9_38 = f3_2 * static_cast<int64_t>(g9_19);
194 const int64_t f4g0 = f4 * static_cast<int64_t>(g0);
195 const int64_t f4g1 = f4 * static_cast<int64_t>(g1);
196 const int64_t f4g2 = f4 * static_cast<int64_t>(g2);
197 const int64_t f4g3 = f4 * static_cast<int64_t>(g3);
198 const int64_t f4g4 = f4 * static_cast<int64_t>(g4);
199 const int64_t f4g5 = f4 * static_cast<int64_t>(g5);
200 const int64_t f4g6_19 = f4 * static_cast<int64_t>(g6_19);
201 const int64_t f4g7_19 = f4 * static_cast<int64_t>(g7_19);
202 const int64_t f4g8_19 = f4 * static_cast<int64_t>(g8_19);
203 const int64_t f4g9_19 = f4 * static_cast<int64_t>(g9_19);
204 const int64_t f5g0 = f5 * static_cast<int64_t>(g0);
205 const int64_t f5g1_2 = f5_2 * static_cast<int64_t>(g1);
206 const int64_t f5g2 = f5 * static_cast<int64_t>(g2);
207 const int64_t f5g3_2 = f5_2 * static_cast<int64_t>(g3);
208 const int64_t f5g4 = f5 * static_cast<int64_t>(g4);
209 const int64_t f5g5_38 = f5_2 * static_cast<int64_t>(g5_19);
210 const int64_t f5g6_19 = f5 * static_cast<int64_t>(g6_19);
211 const int64_t f5g7_38 = f5_2 * static_cast<int64_t>(g7_19);
212 const int64_t f5g8_19 = f5 * static_cast<int64_t>(g8_19);
213 const int64_t f5g9_38 = f5_2 * static_cast<int64_t>(g9_19);
214 const int64_t f6g0 = f6 * static_cast<int64_t>(g0);
215 const int64_t f6g1 = f6 * static_cast<int64_t>(g1);
216 const int64_t f6g2 = f6 * static_cast<int64_t>(g2);
217 const int64_t f6g3 = f6 * static_cast<int64_t>(g3);
218 const int64_t f6g4_19 = f6 * static_cast<int64_t>(g4_19);
219 const int64_t f6g5_19 = f6 * static_cast<int64_t>(g5_19);
220 const int64_t f6g6_19 = f6 * static_cast<int64_t>(g6_19);
221 const int64_t f6g7_19 = f6 * static_cast<int64_t>(g7_19);
222 const int64_t f6g8_19 = f6 * static_cast<int64_t>(g8_19);
223 const int64_t f6g9_19 = f6 * static_cast<int64_t>(g9_19);
224 const int64_t f7g0 = f7 * static_cast<int64_t>(g0);
225 const int64_t f7g1_2 = f7_2 * static_cast<int64_t>(g1);
226 const int64_t f7g2 = f7 * static_cast<int64_t>(g2);
227 const int64_t f7g3_38 = f7_2 * static_cast<int64_t>(g3_19);
228 const int64_t f7g4_19 = f7 * static_cast<int64_t>(g4_19);
229 const int64_t f7g5_38 = f7_2 * static_cast<int64_t>(g5_19);
230 const int64_t f7g6_19 = f7 * static_cast<int64_t>(g6_19);
231 const int64_t f7g7_38 = f7_2 * static_cast<int64_t>(g7_19);
232 const int64_t f7g8_19 = f7 * static_cast<int64_t>(g8_19);
233 const int64_t f7g9_38 = f7_2 * static_cast<int64_t>(g9_19);
234 const int64_t f8g0 = f8 * static_cast<int64_t>(g0);
235 const int64_t f8g1 = f8 * static_cast<int64_t>(g1);
236 const int64_t f8g2_19 = f8 * static_cast<int64_t>(g2_19);
237 const int64_t f8g3_19 = f8 * static_cast<int64_t>(g3_19);
238 const int64_t f8g4_19 = f8 * static_cast<int64_t>(g4_19);
239 const int64_t f8g5_19 = f8 * static_cast<int64_t>(g5_19);
240 const int64_t f8g6_19 = f8 * static_cast<int64_t>(g6_19);
241 const int64_t f8g7_19 = f8 * static_cast<int64_t>(g7_19);
242 const int64_t f8g8_19 = f8 * static_cast<int64_t>(g8_19);
243 const int64_t f8g9_19 = f8 * static_cast<int64_t>(g9_19);
244 const int64_t f9g0 = f9 * static_cast<int64_t>(g0);
245 const int64_t f9g1_38 = f9_2 * static_cast<int64_t>(g1_19);
246 const int64_t f9g2_19 = f9 * static_cast<int64_t>(g2_19);
247 const int64_t f9g3_38 = f9_2 * static_cast<int64_t>(g3_19);
248 const int64_t f9g4_19 = f9 * static_cast<int64_t>(g4_19);
249 const int64_t f9g5_38 = f9_2 * static_cast<int64_t>(g5_19);
250 const int64_t f9g6_19 = f9 * static_cast<int64_t>(g6_19);
251 const int64_t f9g7_38 = f9_2 * static_cast<int64_t>(g7_19);
252 const int64_t f9g8_19 = f9 * static_cast<int64_t>(g8_19);
253 const int64_t f9g9_38 = f9_2 * static_cast<int64_t>(g9_19);
254
255 int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
256 int64_t h1 = f0g1+f1g0 +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
257 int64_t h2 = f0g2+f1g1_2 +f2g0 +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
258 int64_t h3 = f0g3+f1g2 +f2g1 +f3g0 +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
259 int64_t h4 = f0g4+f1g3_2 +f2g2 +f3g1_2 +f4g0 +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
260 int64_t h5 = f0g5+f1g4 +f2g3 +f3g2 +f4g1 +f5g0 +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
261 int64_t h6 = f0g6+f1g5_2 +f2g4 +f3g3_2 +f4g2 +f5g1_2 +f6g0 +f7g9_38+f8g8_19+f9g7_38;
262 int64_t h7 = f0g7+f1g6 +f2g5 +f3g4 +f4g3 +f5g2 +f6g1 +f7g0 +f8g9_19+f9g8_19;
263 int64_t h8 = f0g8+f1g7_2 +f2g6 +f3g5_2 +f4g4 +f5g3_2 +f6g2 +f7g1_2 +f8g0 +f9g9_38;
264 int64_t h9 = f0g9+f1g8 +f2g7 +f3g6 +f4g5 +f5g4 +f6g3 +f7g2 +f8g1 +f9g0 ;
265
266 /*
267 |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
268 i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
269 |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
270 i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
271 */
272 carry<26>(h0, h1);
273 carry<26>(h4, h5);
274
275 /* |h0| <= 2^25 */
276 /* |h4| <= 2^25 */
277 /* |h1| <= 1.71*2^59 */
278 /* |h5| <= 1.71*2^59 */
279
280 carry<25>(h1, h2);
281 carry<25>(h5, h6);
282
283 /* |h1| <= 2^24; from now on fits into int32 */
284 /* |h5| <= 2^24; from now on fits into int32 */
285 /* |h2| <= 1.41*2^60 */
286 /* |h6| <= 1.41*2^60 */
287
288 carry<26>(h2, h3);
289 carry<26>(h6, h7);
290 /* |h2| <= 2^25; from now on fits into int32 unchanged */
291 /* |h6| <= 2^25; from now on fits into int32 unchanged */
292 /* |h3| <= 1.71*2^59 */
293 /* |h7| <= 1.71*2^59 */
294
295 carry<25>(h3, h4);
296 carry<25>(h7, h8);
297 /* |h3| <= 2^24; from now on fits into int32 unchanged */
298 /* |h7| <= 2^24; from now on fits into int32 unchanged */
299 /* |h4| <= 1.72*2^34 */
300 /* |h8| <= 1.41*2^60 */
301
302 carry<26>(h4, h5);
303 carry<26>(h8, h9);
304 /* |h4| <= 2^25; from now on fits into int32 unchanged */
305 /* |h8| <= 2^25; from now on fits into int32 unchanged */
306 /* |h5| <= 1.01*2^24 */
307 /* |h9| <= 1.71*2^59 */
308
309 carry<25, 19>(h9, h0);
310
311 /* |h9| <= 2^24; from now on fits into int32 unchanged */
312 /* |h0| <= 1.1*2^39 */
313
314 carry<26>(h0, h1);
315 /* |h0| <= 2^25; from now on fits into int32 unchanged */
316 /* |h1| <= 1.01*2^24 */
317
318 return FE_25519(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9);
319 }
320
321 /*
322 h = f * f
323 Can overlap h with f.
324
325 Preconditions:
326 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
327
328 Postconditions:
329 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
330 */
331
332 /*
333 See fe_mul.c for discussion of implementation strategy.
334 */
335
336 //static
sqr_iter(const FE_25519 & f,size_t iter)337 FE_25519 FE_25519::sqr_iter(const FE_25519& f, size_t iter)
338 {
339 int32_t f0 = f[0];
340 int32_t f1 = f[1];
341 int32_t f2 = f[2];
342 int32_t f3 = f[3];
343 int32_t f4 = f[4];
344 int32_t f5 = f[5];
345 int32_t f6 = f[6];
346 int32_t f7 = f[7];
347 int32_t f8 = f[8];
348 int32_t f9 = f[9];
349
350 for(size_t i = 0; i != iter; ++i)
351 {
352 const int32_t f0_2 = 2 * f0;
353 const int32_t f1_2 = 2 * f1;
354 const int32_t f2_2 = 2 * f2;
355 const int32_t f3_2 = 2 * f3;
356 const int32_t f4_2 = 2 * f4;
357 const int32_t f5_2 = 2 * f5;
358 const int32_t f6_2 = 2 * f6;
359 const int32_t f7_2 = 2 * f7;
360 const int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
361 const int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
362 const int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
363 const int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
364 const int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
365
366 const int64_t f0f0 = f0 * static_cast<int64_t>(f0);
367 const int64_t f0f1_2 = f0_2 * static_cast<int64_t>(f1);
368 const int64_t f0f2_2 = f0_2 * static_cast<int64_t>(f2);
369 const int64_t f0f3_2 = f0_2 * static_cast<int64_t>(f3);
370 const int64_t f0f4_2 = f0_2 * static_cast<int64_t>(f4);
371 const int64_t f0f5_2 = f0_2 * static_cast<int64_t>(f5);
372 const int64_t f0f6_2 = f0_2 * static_cast<int64_t>(f6);
373 const int64_t f0f7_2 = f0_2 * static_cast<int64_t>(f7);
374 const int64_t f0f8_2 = f0_2 * static_cast<int64_t>(f8);
375 const int64_t f0f9_2 = f0_2 * static_cast<int64_t>(f9);
376 const int64_t f1f1_2 = f1_2 * static_cast<int64_t>(f1);
377 const int64_t f1f2_2 = f1_2 * static_cast<int64_t>(f2);
378 const int64_t f1f3_4 = f1_2 * static_cast<int64_t>(f3_2);
379 const int64_t f1f4_2 = f1_2 * static_cast<int64_t>(f4);
380 const int64_t f1f5_4 = f1_2 * static_cast<int64_t>(f5_2);
381 const int64_t f1f6_2 = f1_2 * static_cast<int64_t>(f6);
382 const int64_t f1f7_4 = f1_2 * static_cast<int64_t>(f7_2);
383 const int64_t f1f8_2 = f1_2 * static_cast<int64_t>(f8);
384 const int64_t f1f9_76 = f1_2 * static_cast<int64_t>(f9_38);
385 const int64_t f2f2 = f2 * static_cast<int64_t>(f2);
386 const int64_t f2f3_2 = f2_2 * static_cast<int64_t>(f3);
387 const int64_t f2f4_2 = f2_2 * static_cast<int64_t>(f4);
388 const int64_t f2f5_2 = f2_2 * static_cast<int64_t>(f5);
389 const int64_t f2f6_2 = f2_2 * static_cast<int64_t>(f6);
390 const int64_t f2f7_2 = f2_2 * static_cast<int64_t>(f7);
391 const int64_t f2f8_38 = f2_2 * static_cast<int64_t>(f8_19);
392 const int64_t f2f9_38 = f2 * static_cast<int64_t>(f9_38);
393 const int64_t f3f3_2 = f3_2 * static_cast<int64_t>(f3);
394 const int64_t f3f4_2 = f3_2 * static_cast<int64_t>(f4);
395 const int64_t f3f5_4 = f3_2 * static_cast<int64_t>(f5_2);
396 const int64_t f3f6_2 = f3_2 * static_cast<int64_t>(f6);
397 const int64_t f3f7_76 = f3_2 * static_cast<int64_t>(f7_38);
398 const int64_t f3f8_38 = f3_2 * static_cast<int64_t>(f8_19);
399 const int64_t f3f9_76 = f3_2 * static_cast<int64_t>(f9_38);
400 const int64_t f4f4 = f4 * static_cast<int64_t>(f4);
401 const int64_t f4f5_2 = f4_2 * static_cast<int64_t>(f5);
402 const int64_t f4f6_38 = f4_2 * static_cast<int64_t>(f6_19);
403 const int64_t f4f7_38 = f4 * static_cast<int64_t>(f7_38);
404 const int64_t f4f8_38 = f4_2 * static_cast<int64_t>(f8_19);
405 const int64_t f4f9_38 = f4 * static_cast<int64_t>(f9_38);
406 const int64_t f5f5_38 = f5 * static_cast<int64_t>(f5_38);
407 const int64_t f5f6_38 = f5_2 * static_cast<int64_t>(f6_19);
408 const int64_t f5f7_76 = f5_2 * static_cast<int64_t>(f7_38);
409 const int64_t f5f8_38 = f5_2 * static_cast<int64_t>(f8_19);
410 const int64_t f5f9_76 = f5_2 * static_cast<int64_t>(f9_38);
411 const int64_t f6f6_19 = f6 * static_cast<int64_t>(f6_19);
412 const int64_t f6f7_38 = f6 * static_cast<int64_t>(f7_38);
413 const int64_t f6f8_38 = f6_2 * static_cast<int64_t>(f8_19);
414 const int64_t f6f9_38 = f6 * static_cast<int64_t>(f9_38);
415 const int64_t f7f7_38 = f7 * static_cast<int64_t>(f7_38);
416 const int64_t f7f8_38 = f7_2 * static_cast<int64_t>(f8_19);
417 const int64_t f7f9_76 = f7_2 * static_cast<int64_t>(f9_38);
418 const int64_t f8f8_19 = f8 * static_cast<int64_t>(f8_19);
419 const int64_t f8f9_38 = f8 * static_cast<int64_t>(f9_38);
420 const int64_t f9f9_38 = f9 * static_cast<int64_t>(f9_38);
421
422 int64_t h0 = f0f0 +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
423 int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
424 int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
425 int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
426 int64_t h4 = f0f4_2+f1f3_4 +f2f2 +f5f9_76+f6f8_38+f7f7_38;
427 int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
428 int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
429 int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
430 int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4 +f9f9_38;
431 int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
432
433 carry<26>(h0, h1);
434 carry<26>(h4, h5);
435 carry<25>(h1, h2);
436 carry<25>(h5, h6);
437 carry<26>(h2, h3);
438 carry<26>(h6, h7);
439
440 carry<25>(h3, h4);
441 carry<25>(h7, h8);
442
443 carry<26>(h4, h5);
444 carry<26>(h8, h9);
445 carry<25,19>(h9, h0);
446 carry<26>(h0, h1);
447
448 f0 = static_cast<int32_t>(h0);
449 f1 = static_cast<int32_t>(h1);
450 f2 = static_cast<int32_t>(h2);
451 f3 = static_cast<int32_t>(h3);
452 f4 = static_cast<int32_t>(h4);
453 f5 = static_cast<int32_t>(h5);
454 f6 = static_cast<int32_t>(h6);
455 f7 = static_cast<int32_t>(h7);
456 f8 = static_cast<int32_t>(h8);
457 f9 = static_cast<int32_t>(h9);
458 }
459
460 return FE_25519(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
461 }
462
463 /*
464 h = 2 * f * f
465 Can overlap h with f.
466
467 Preconditions:
468 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
469
470 Postconditions:
471 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
472 */
473
474 /*
475 See fe_mul.c for discussion of implementation strategy.
476 */
477
478 //static
sqr2(const FE_25519 & f)479 FE_25519 FE_25519::sqr2(const FE_25519& f)
480 {
481 const int32_t f0 = f[0];
482 const int32_t f1 = f[1];
483 const int32_t f2 = f[2];
484 const int32_t f3 = f[3];
485 const int32_t f4 = f[4];
486 const int32_t f5 = f[5];
487 const int32_t f6 = f[6];
488 const int32_t f7 = f[7];
489 const int32_t f8 = f[8];
490 const int32_t f9 = f[9];
491 const int32_t f0_2 = 2 * f0;
492 const int32_t f1_2 = 2 * f1;
493 const int32_t f2_2 = 2 * f2;
494 const int32_t f3_2 = 2 * f3;
495 const int32_t f4_2 = 2 * f4;
496 const int32_t f5_2 = 2 * f5;
497 const int32_t f6_2 = 2 * f6;
498 const int32_t f7_2 = 2 * f7;
499 const int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
500 const int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
501 const int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
502 const int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
503 const int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
504 const int64_t f0f0 = f0 * static_cast<int64_t>(f0);
505 const int64_t f0f1_2 = f0_2 * static_cast<int64_t>(f1);
506 const int64_t f0f2_2 = f0_2 * static_cast<int64_t>(f2);
507 const int64_t f0f3_2 = f0_2 * static_cast<int64_t>(f3);
508 const int64_t f0f4_2 = f0_2 * static_cast<int64_t>(f4);
509 const int64_t f0f5_2 = f0_2 * static_cast<int64_t>(f5);
510 const int64_t f0f6_2 = f0_2 * static_cast<int64_t>(f6);
511 const int64_t f0f7_2 = f0_2 * static_cast<int64_t>(f7);
512 const int64_t f0f8_2 = f0_2 * static_cast<int64_t>(f8);
513 const int64_t f0f9_2 = f0_2 * static_cast<int64_t>(f9);
514 const int64_t f1f1_2 = f1_2 * static_cast<int64_t>(f1);
515 const int64_t f1f2_2 = f1_2 * static_cast<int64_t>(f2);
516 const int64_t f1f3_4 = f1_2 * static_cast<int64_t>(f3_2);
517 const int64_t f1f4_2 = f1_2 * static_cast<int64_t>(f4);
518 const int64_t f1f5_4 = f1_2 * static_cast<int64_t>(f5_2);
519 const int64_t f1f6_2 = f1_2 * static_cast<int64_t>(f6);
520 const int64_t f1f7_4 = f1_2 * static_cast<int64_t>(f7_2);
521 const int64_t f1f8_2 = f1_2 * static_cast<int64_t>(f8);
522 const int64_t f1f9_76 = f1_2 * static_cast<int64_t>(f9_38);
523 const int64_t f2f2 = f2 * static_cast<int64_t>(f2);
524 const int64_t f2f3_2 = f2_2 * static_cast<int64_t>(f3);
525 const int64_t f2f4_2 = f2_2 * static_cast<int64_t>(f4);
526 const int64_t f2f5_2 = f2_2 * static_cast<int64_t>(f5);
527 const int64_t f2f6_2 = f2_2 * static_cast<int64_t>(f6);
528 const int64_t f2f7_2 = f2_2 * static_cast<int64_t>(f7);
529 const int64_t f2f8_38 = f2_2 * static_cast<int64_t>(f8_19);
530 const int64_t f2f9_38 = f2 * static_cast<int64_t>(f9_38);
531 const int64_t f3f3_2 = f3_2 * static_cast<int64_t>(f3);
532 const int64_t f3f4_2 = f3_2 * static_cast<int64_t>(f4);
533 const int64_t f3f5_4 = f3_2 * static_cast<int64_t>(f5_2);
534 const int64_t f3f6_2 = f3_2 * static_cast<int64_t>(f6);
535 const int64_t f3f7_76 = f3_2 * static_cast<int64_t>(f7_38);
536 const int64_t f3f8_38 = f3_2 * static_cast<int64_t>(f8_19);
537 const int64_t f3f9_76 = f3_2 * static_cast<int64_t>(f9_38);
538 const int64_t f4f4 = f4 * static_cast<int64_t>(f4);
539 const int64_t f4f5_2 = f4_2 * static_cast<int64_t>(f5);
540 const int64_t f4f6_38 = f4_2 * static_cast<int64_t>(f6_19);
541 const int64_t f4f7_38 = f4 * static_cast<int64_t>(f7_38);
542 const int64_t f4f8_38 = f4_2 * static_cast<int64_t>(f8_19);
543 const int64_t f4f9_38 = f4 * static_cast<int64_t>(f9_38);
544 const int64_t f5f5_38 = f5 * static_cast<int64_t>(f5_38);
545 const int64_t f5f6_38 = f5_2 * static_cast<int64_t>(f6_19);
546 const int64_t f5f7_76 = f5_2 * static_cast<int64_t>(f7_38);
547 const int64_t f5f8_38 = f5_2 * static_cast<int64_t>(f8_19);
548 const int64_t f5f9_76 = f5_2 * static_cast<int64_t>(f9_38);
549 const int64_t f6f6_19 = f6 * static_cast<int64_t>(f6_19);
550 const int64_t f6f7_38 = f6 * static_cast<int64_t>(f7_38);
551 const int64_t f6f8_38 = f6_2 * static_cast<int64_t>(f8_19);
552 const int64_t f6f9_38 = f6 * static_cast<int64_t>(f9_38);
553 const int64_t f7f7_38 = f7 * static_cast<int64_t>(f7_38);
554 const int64_t f7f8_38 = f7_2 * static_cast<int64_t>(f8_19);
555 const int64_t f7f9_76 = f7_2 * static_cast<int64_t>(f9_38);
556 const int64_t f8f8_19 = f8 * static_cast<int64_t>(f8_19);
557 const int64_t f8f9_38 = f8 * static_cast<int64_t>(f9_38);
558 const int64_t f9f9_38 = f9 * static_cast<int64_t>(f9_38);
559
560 int64_t h0 = f0f0 +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
561 int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
562 int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
563 int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
564 int64_t h4 = f0f4_2+f1f3_4 +f2f2 +f5f9_76+f6f8_38+f7f7_38;
565 int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
566 int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
567 int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
568 int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4 +f9f9_38;
569 int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
570
571 h0 += h0;
572 h1 += h1;
573 h2 += h2;
574 h3 += h3;
575 h4 += h4;
576 h5 += h5;
577 h6 += h6;
578 h7 += h7;
579 h8 += h8;
580 h9 += h9;
581
582 carry<26>(h0, h1);
583 carry<26>(h4, h5);
584
585 carry<25>(h1, h2);
586 carry<25>(h5, h6);
587
588 carry<26>(h2, h3);
589 carry<26>(h6, h7);
590
591 carry<25>(h3, h4);
592 carry<25>(h7, h8);
593 carry<26>(h4, h5);
594 carry<26>(h8, h9);
595 carry<25,19>(h9, h0);
596 carry<26>(h0, h1);
597
598 return FE_25519(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9);
599 }
600
601 /*
602 Ignores top bit of h.
603 */
604
from_bytes(const uint8_t s[32])605 void FE_25519::from_bytes(const uint8_t s[32])
606 {
607 int64_t h0 = load_4(s);
608 int64_t h1 = load_3(s + 4) << 6;
609 int64_t h2 = load_3(s + 7) << 5;
610 int64_t h3 = load_3(s + 10) << 3;
611 int64_t h4 = load_3(s + 13) << 2;
612 int64_t h5 = load_4(s + 16);
613 int64_t h6 = load_3(s + 20) << 7;
614 int64_t h7 = load_3(s + 23) << 5;
615 int64_t h8 = load_3(s + 26) << 4;
616 int64_t h9 = (load_3(s + 29) & 0x7fffff) << 2;
617
618 carry<25,19>(h9, h0);
619 carry<25>(h1, h2);
620 carry<25>(h3, h4);
621 carry<25>(h5, h6);
622 carry<25>(h7, h8);
623
624 carry<26>(h0, h1);
625 carry<26>(h2, h3);
626 carry<26>(h4, h5);
627 carry<26>(h6, h7);
628 carry<26>(h8, h9);
629
630 m_fe[0] = static_cast<int32_t>(h0);
631 m_fe[1] = static_cast<int32_t>(h1);
632 m_fe[2] = static_cast<int32_t>(h2);
633 m_fe[3] = static_cast<int32_t>(h3);
634 m_fe[4] = static_cast<int32_t>(h4);
635 m_fe[5] = static_cast<int32_t>(h5);
636 m_fe[6] = static_cast<int32_t>(h6);
637 m_fe[7] = static_cast<int32_t>(h7);
638 m_fe[8] = static_cast<int32_t>(h8);
639 m_fe[9] = static_cast<int32_t>(h9);
640 }
641
642 /*
643 Preconditions:
644 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
645
646 Write p=2^255-19; q=floor(h/p).
647 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
648
649 Proof:
650 Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
651 Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
652
653 Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
654 Then 0<y<1.
655
656 Write r=h-pq.
657 Have 0<=r<=p-1=2^255-20.
658 Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
659
660 Write x=r+19(2^-255)r+y.
661 Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
662
663 Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
664 so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
665 */
666
to_bytes(uint8_t s[32]) const667 void FE_25519::to_bytes(uint8_t s[32]) const
668 {
669 const int64_t X25 = (1 << 25);
670
671 int32_t h0 = m_fe[0];
672 int32_t h1 = m_fe[1];
673 int32_t h2 = m_fe[2];
674 int32_t h3 = m_fe[3];
675 int32_t h4 = m_fe[4];
676 int32_t h5 = m_fe[5];
677 int32_t h6 = m_fe[6];
678 int32_t h7 = m_fe[7];
679 int32_t h8 = m_fe[8];
680 int32_t h9 = m_fe[9];
681 int32_t q;
682
683 q = (19 * h9 + ((static_cast<int32_t>(1) << 24))) >> 25;
684 q = (h0 + q) >> 26;
685 q = (h1 + q) >> 25;
686 q = (h2 + q) >> 26;
687 q = (h3 + q) >> 25;
688 q = (h4 + q) >> 26;
689 q = (h5 + q) >> 25;
690 q = (h6 + q) >> 26;
691 q = (h7 + q) >> 25;
692 q = (h8 + q) >> 26;
693 q = (h9 + q) >> 25;
694
695 /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
696 h0 += 19 * q;
697 /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
698
699 carry0<26>(h0, h1);
700 carry0<25>(h1, h2);
701 carry0<26>(h2, h3);
702 carry0<25>(h3, h4);
703 carry0<26>(h4, h5);
704 carry0<25>(h5, h6);
705 carry0<26>(h6, h7);
706 carry0<25>(h7, h8);
707 carry0<26>(h8, h9);
708
709 int32_t carry9 = h9 >> 25;
710 h9 -= carry9 * X25;
711 /* h10 = carry9 */
712
713 /*
714 Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
715 Have h0+...+2^230 h9 between 0 and 2^255-1;
716 evidently 2^255 h10-2^255 q = 0.
717 Goal: Output h0+...+2^230 h9.
718 */
719
720 s[0] = static_cast<uint8_t>(h0 >> 0);
721 s[1] = static_cast<uint8_t>(h0 >> 8);
722 s[2] = static_cast<uint8_t>(h0 >> 16);
723 s[3] = static_cast<uint8_t>((h0 >> 24) | (h1 << 2));
724 s[4] = static_cast<uint8_t>(h1 >> 6);
725 s[5] = static_cast<uint8_t>(h1 >> 14);
726 s[6] = static_cast<uint8_t>((h1 >> 22) | (h2 << 3));
727 s[7] = static_cast<uint8_t>(h2 >> 5);
728 s[8] = static_cast<uint8_t>(h2 >> 13);
729 s[9] = static_cast<uint8_t>((h2 >> 21) | (h3 << 5));
730 s[10] = static_cast<uint8_t>(h3 >> 3);
731 s[11] = static_cast<uint8_t>(h3 >> 11);
732 s[12] = static_cast<uint8_t>((h3 >> 19) | (h4 << 6));
733 s[13] = static_cast<uint8_t>(h4 >> 2);
734 s[14] = static_cast<uint8_t>(h4 >> 10);
735 s[15] = static_cast<uint8_t>(h4 >> 18);
736 s[16] = static_cast<uint8_t>(h5 >> 0);
737 s[17] = static_cast<uint8_t>(h5 >> 8);
738 s[18] = static_cast<uint8_t>(h5 >> 16);
739 s[19] = static_cast<uint8_t>((h5 >> 24) | (h6 << 1));
740 s[20] = static_cast<uint8_t>(h6 >> 7);
741 s[21] = static_cast<uint8_t>(h6 >> 15);
742 s[22] = static_cast<uint8_t>((h6 >> 23) | (h7 << 3));
743 s[23] = static_cast<uint8_t>(h7 >> 5);
744 s[24] = static_cast<uint8_t>(h7 >> 13);
745 s[25] = static_cast<uint8_t>((h7 >> 21) | (h8 << 4));
746 s[26] = static_cast<uint8_t>(h8 >> 4);
747 s[27] = static_cast<uint8_t>(h8 >> 12);
748 s[28] = static_cast<uint8_t>((h8 >> 20) | (h9 << 6));
749 s[29] = static_cast<uint8_t>(h9 >> 2);
750 s[30] = static_cast<uint8_t>(h9 >> 10);
751 s[31] = static_cast<uint8_t>(h9 >> 18);
752 }
753
754 }
755