1 /*
2 * Ed25519 field element
3 * (C) 2017 Ribose Inc
4 *
5 * Based on the public domain code from SUPERCOP ref10 by
6 * Peter Schwabe, Daniel J. Bernstein, Niels Duif, Tanja Lange, Bo-Yin Yang
7 *
8 * Botan is released under the Simplified BSD License (see license.txt)
9 */
10 
11 #include <botan/internal/ed25519_fe.h>
12 #include <botan/internal/ed25519_internal.h>
13 
14 namespace Botan {
15 
16 //static
invert(const FE_25519 & z)17 FE_25519 FE_25519::invert(const FE_25519& z)
18    {
19    fe t0;
20    fe t1;
21    fe t2;
22    fe t3;
23 
24    fe_sq(t0, z);
25    fe_sq_iter(t1, t0, 2);
26    fe_mul(t1, z, t1);
27    fe_mul(t0, t0, t1);
28    fe_sq(t2, t0);
29    fe_mul(t1, t1, t2);
30    fe_sq_iter(t2, t1, 5);
31    fe_mul(t1, t2, t1);
32    fe_sq_iter(t2, t1, 10);
33    fe_mul(t2, t2, t1);
34    fe_sq_iter(t3, t2, 20);
35    fe_mul(t2, t3, t2);
36    fe_sq_iter(t2, t2, 10);
37    fe_mul(t1, t2, t1);
38    fe_sq_iter(t2, t1, 50);
39    fe_mul(t2, t2, t1);
40    fe_sq_iter(t3, t2, 100);
41    fe_mul(t2, t3, t2);
42    fe_sq_iter(t2, t2, 50);
43    fe_mul(t1, t2, t1);
44    fe_sq_iter(t1, t1, 5);
45 
46    fe_mul(t0, t1, t0);
47    return t0;
48    }
49 
pow_22523(const fe & z)50 FE_25519 FE_25519::pow_22523(const fe& z)
51    {
52    fe t0;
53    fe t1;
54    fe t2;
55 
56    fe_sq(t0, z);
57    fe_sq_iter(t1, t0, 2);
58    fe_mul(t1, z, t1);
59    fe_mul(t0, t0, t1);
60    fe_sq(t0, t0);
61    fe_mul(t0, t1, t0);
62    fe_sq_iter(t1, t0, 5);
63    fe_mul(t0, t1, t0);
64    fe_sq_iter(t1, t0, 10);
65    fe_mul(t1, t1, t0);
66    fe_sq_iter(t2, t1, 20);
67    fe_mul(t1, t2, t1);
68    fe_sq_iter(t1, t1, 10);
69    fe_mul(t0, t1, t0);
70    fe_sq_iter(t1, t0, 50);
71    fe_mul(t1, t1, t0);
72    fe_sq_iter(t2, t1, 100);
73    fe_mul(t1, t2, t1);
74    fe_sq_iter(t1, t1, 50);
75    fe_mul(t0, t1, t0);
76    fe_sq_iter(t0, t0, 2);
77 
78    fe_mul(t0, t0, z);
79    return t0;
80    }
81 
82 /*
83 h = f * g
84 Can overlap h with f or g.
85 
86 Preconditions:
87 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
88 |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
89 
90 Postconditions:
91 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
92 */
93 
94 /*
95 Notes on implementation strategy:
96 
97 Using schoolbook multiplication.
98 Karatsuba would save a little in some cost models.
99 
100 Most multiplications by 2 and 19 are 32-bit precomputations;
101 cheaper than 64-bit postcomputations.
102 
103 There is one remaining multiplication by 19 in the carry chain;
104 one *19 precomputation can be merged into this,
105 but the resulting data flow is considerably less clean.
106 
107 There are 12 carries below.
108 10 of them are 2-way parallelizable and vectorizable.
109 Can get away with 11 carries, but then data flow is much deeper.
110 
111 With tighter constraints on inputs can squeeze carries into int32.
112 */
113 
114 //static
mul(const FE_25519 & f,const FE_25519 & g)115 FE_25519 FE_25519::mul(const FE_25519& f, const FE_25519& g)
116    {
117    const int32_t f0 = f[0];
118    const int32_t f1 = f[1];
119    const int32_t f2 = f[2];
120    const int32_t f3 = f[3];
121    const int32_t f4 = f[4];
122    const int32_t f5 = f[5];
123    const int32_t f6 = f[6];
124    const int32_t f7 = f[7];
125    const int32_t f8 = f[8];
126    const int32_t f9 = f[9];
127 
128    const int32_t g0 = g[0];
129    const int32_t g1 = g[1];
130    const int32_t g2 = g[2];
131    const int32_t g3 = g[3];
132    const int32_t g4 = g[4];
133    const int32_t g5 = g[5];
134    const int32_t g6 = g[6];
135    const int32_t g7 = g[7];
136    const int32_t g8 = g[8];
137    const int32_t g9 = g[9];
138 
139    const int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
140    const int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
141    const int32_t g3_19 = 19 * g3;
142    const int32_t g4_19 = 19 * g4;
143    const int32_t g5_19 = 19 * g5;
144    const int32_t g6_19 = 19 * g6;
145    const int32_t g7_19 = 19 * g7;
146    const int32_t g8_19 = 19 * g8;
147    const int32_t g9_19 = 19 * g9;
148    const int32_t f1_2 = 2 * f1;
149    const int32_t f3_2 = 2 * f3;
150    const int32_t f5_2 = 2 * f5;
151    const int32_t f7_2 = 2 * f7;
152    const int32_t f9_2 = 2 * f9;
153 
154    const int64_t f0g0    = f0   * static_cast<int64_t>(g0);
155    const int64_t f0g1    = f0   * static_cast<int64_t>(g1);
156    const int64_t f0g2    = f0   * static_cast<int64_t>(g2);
157    const int64_t f0g3    = f0   * static_cast<int64_t>(g3);
158    const int64_t f0g4    = f0   * static_cast<int64_t>(g4);
159    const int64_t f0g5    = f0   * static_cast<int64_t>(g5);
160    const int64_t f0g6    = f0   * static_cast<int64_t>(g6);
161    const int64_t f0g7    = f0   * static_cast<int64_t>(g7);
162    const int64_t f0g8    = f0   * static_cast<int64_t>(g8);
163    const int64_t f0g9    = f0   * static_cast<int64_t>(g9);
164    const int64_t f1g0    = f1   * static_cast<int64_t>(g0);
165    const int64_t f1g1_2  = f1_2 * static_cast<int64_t>(g1);
166    const int64_t f1g2    = f1   * static_cast<int64_t>(g2);
167    const int64_t f1g3_2  = f1_2 * static_cast<int64_t>(g3);
168    const int64_t f1g4    = f1   * static_cast<int64_t>(g4);
169    const int64_t f1g5_2  = f1_2 * static_cast<int64_t>(g5);
170    const int64_t f1g6    = f1   * static_cast<int64_t>(g6);
171    const int64_t f1g7_2  = f1_2 * static_cast<int64_t>(g7);
172    const int64_t f1g8    = f1   * static_cast<int64_t>(g8);
173    const int64_t f1g9_38 = f1_2 * static_cast<int64_t>(g9_19);
174    const int64_t f2g0    = f2   * static_cast<int64_t>(g0);
175    const int64_t f2g1    = f2   * static_cast<int64_t>(g1);
176    const int64_t f2g2    = f2   * static_cast<int64_t>(g2);
177    const int64_t f2g3    = f2   * static_cast<int64_t>(g3);
178    const int64_t f2g4    = f2   * static_cast<int64_t>(g4);
179    const int64_t f2g5    = f2   * static_cast<int64_t>(g5);
180    const int64_t f2g6    = f2   * static_cast<int64_t>(g6);
181    const int64_t f2g7    = f2   * static_cast<int64_t>(g7);
182    const int64_t f2g8_19 = f2   * static_cast<int64_t>(g8_19);
183    const int64_t f2g9_19 = f2   * static_cast<int64_t>(g9_19);
184    const int64_t f3g0    = f3   * static_cast<int64_t>(g0);
185    const int64_t f3g1_2  = f3_2 * static_cast<int64_t>(g1);
186    const int64_t f3g2    = f3   * static_cast<int64_t>(g2);
187    const int64_t f3g3_2  = f3_2 * static_cast<int64_t>(g3);
188    const int64_t f3g4    = f3   * static_cast<int64_t>(g4);
189    const int64_t f3g5_2  = f3_2 * static_cast<int64_t>(g5);
190    const int64_t f3g6    = f3   * static_cast<int64_t>(g6);
191    const int64_t f3g7_38 = f3_2 * static_cast<int64_t>(g7_19);
192    const int64_t f3g8_19 = f3   * static_cast<int64_t>(g8_19);
193    const int64_t f3g9_38 = f3_2 * static_cast<int64_t>(g9_19);
194    const int64_t f4g0    = f4   * static_cast<int64_t>(g0);
195    const int64_t f4g1    = f4   * static_cast<int64_t>(g1);
196    const int64_t f4g2    = f4   * static_cast<int64_t>(g2);
197    const int64_t f4g3    = f4   * static_cast<int64_t>(g3);
198    const int64_t f4g4    = f4   * static_cast<int64_t>(g4);
199    const int64_t f4g5    = f4   * static_cast<int64_t>(g5);
200    const int64_t f4g6_19 = f4   * static_cast<int64_t>(g6_19);
201    const int64_t f4g7_19 = f4   * static_cast<int64_t>(g7_19);
202    const int64_t f4g8_19 = f4   * static_cast<int64_t>(g8_19);
203    const int64_t f4g9_19 = f4   * static_cast<int64_t>(g9_19);
204    const int64_t f5g0    = f5   * static_cast<int64_t>(g0);
205    const int64_t f5g1_2  = f5_2 * static_cast<int64_t>(g1);
206    const int64_t f5g2    = f5   * static_cast<int64_t>(g2);
207    const int64_t f5g3_2  = f5_2 * static_cast<int64_t>(g3);
208    const int64_t f5g4    = f5   * static_cast<int64_t>(g4);
209    const int64_t f5g5_38 = f5_2 * static_cast<int64_t>(g5_19);
210    const int64_t f5g6_19 = f5   * static_cast<int64_t>(g6_19);
211    const int64_t f5g7_38 = f5_2 * static_cast<int64_t>(g7_19);
212    const int64_t f5g8_19 = f5   * static_cast<int64_t>(g8_19);
213    const int64_t f5g9_38 = f5_2 * static_cast<int64_t>(g9_19);
214    const int64_t f6g0    = f6   * static_cast<int64_t>(g0);
215    const int64_t f6g1    = f6   * static_cast<int64_t>(g1);
216    const int64_t f6g2    = f6   * static_cast<int64_t>(g2);
217    const int64_t f6g3    = f6   * static_cast<int64_t>(g3);
218    const int64_t f6g4_19 = f6   * static_cast<int64_t>(g4_19);
219    const int64_t f6g5_19 = f6   * static_cast<int64_t>(g5_19);
220    const int64_t f6g6_19 = f6   * static_cast<int64_t>(g6_19);
221    const int64_t f6g7_19 = f6   * static_cast<int64_t>(g7_19);
222    const int64_t f6g8_19 = f6   * static_cast<int64_t>(g8_19);
223    const int64_t f6g9_19 = f6   * static_cast<int64_t>(g9_19);
224    const int64_t f7g0    = f7   * static_cast<int64_t>(g0);
225    const int64_t f7g1_2  = f7_2 * static_cast<int64_t>(g1);
226    const int64_t f7g2    = f7   * static_cast<int64_t>(g2);
227    const int64_t f7g3_38 = f7_2 * static_cast<int64_t>(g3_19);
228    const int64_t f7g4_19 = f7   * static_cast<int64_t>(g4_19);
229    const int64_t f7g5_38 = f7_2 * static_cast<int64_t>(g5_19);
230    const int64_t f7g6_19 = f7   * static_cast<int64_t>(g6_19);
231    const int64_t f7g7_38 = f7_2 * static_cast<int64_t>(g7_19);
232    const int64_t f7g8_19 = f7   * static_cast<int64_t>(g8_19);
233    const int64_t f7g9_38 = f7_2 * static_cast<int64_t>(g9_19);
234    const int64_t f8g0    = f8   * static_cast<int64_t>(g0);
235    const int64_t f8g1    = f8   * static_cast<int64_t>(g1);
236    const int64_t f8g2_19 = f8   * static_cast<int64_t>(g2_19);
237    const int64_t f8g3_19 = f8   * static_cast<int64_t>(g3_19);
238    const int64_t f8g4_19 = f8   * static_cast<int64_t>(g4_19);
239    const int64_t f8g5_19 = f8   * static_cast<int64_t>(g5_19);
240    const int64_t f8g6_19 = f8   * static_cast<int64_t>(g6_19);
241    const int64_t f8g7_19 = f8   * static_cast<int64_t>(g7_19);
242    const int64_t f8g8_19 = f8   * static_cast<int64_t>(g8_19);
243    const int64_t f8g9_19 = f8   * static_cast<int64_t>(g9_19);
244    const int64_t f9g0    = f9   * static_cast<int64_t>(g0);
245    const int64_t f9g1_38 = f9_2 * static_cast<int64_t>(g1_19);
246    const int64_t f9g2_19 = f9   * static_cast<int64_t>(g2_19);
247    const int64_t f9g3_38 = f9_2 * static_cast<int64_t>(g3_19);
248    const int64_t f9g4_19 = f9   * static_cast<int64_t>(g4_19);
249    const int64_t f9g5_38 = f9_2 * static_cast<int64_t>(g5_19);
250    const int64_t f9g6_19 = f9   * static_cast<int64_t>(g6_19);
251    const int64_t f9g7_38 = f9_2 * static_cast<int64_t>(g7_19);
252    const int64_t f9g8_19 = f9   * static_cast<int64_t>(g8_19);
253    const int64_t f9g9_38 = f9_2 * static_cast<int64_t>(g9_19);
254 
255    int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
256    int64_t h1 = f0g1+f1g0   +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
257    int64_t h2 = f0g2+f1g1_2 +f2g0   +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
258    int64_t h3 = f0g3+f1g2   +f2g1   +f3g0   +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
259    int64_t h4 = f0g4+f1g3_2 +f2g2   +f3g1_2 +f4g0   +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
260    int64_t h5 = f0g5+f1g4   +f2g3   +f3g2   +f4g1   +f5g0   +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
261    int64_t h6 = f0g6+f1g5_2 +f2g4   +f3g3_2 +f4g2   +f5g1_2 +f6g0   +f7g9_38+f8g8_19+f9g7_38;
262    int64_t h7 = f0g7+f1g6   +f2g5   +f3g4   +f4g3   +f5g2   +f6g1   +f7g0   +f8g9_19+f9g8_19;
263    int64_t h8 = f0g8+f1g7_2 +f2g6   +f3g5_2 +f4g4   +f5g3_2 +f6g2   +f7g1_2 +f8g0   +f9g9_38;
264    int64_t h9 = f0g9+f1g8   +f2g7   +f3g6   +f4g5   +f5g4   +f6g3   +f7g2   +f8g1   +f9g0   ;
265 
266    /*
267    |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
268    i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
269    |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
270    i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
271    */
272    carry<26>(h0, h1);
273    carry<26>(h4, h5);
274 
275    /* |h0| <= 2^25 */
276    /* |h4| <= 2^25 */
277    /* |h1| <= 1.71*2^59 */
278    /* |h5| <= 1.71*2^59 */
279 
280    carry<25>(h1, h2);
281    carry<25>(h5, h6);
282 
283    /* |h1| <= 2^24; from now on fits into int32 */
284    /* |h5| <= 2^24; from now on fits into int32 */
285    /* |h2| <= 1.41*2^60 */
286    /* |h6| <= 1.41*2^60 */
287 
288    carry<26>(h2, h3);
289    carry<26>(h6, h7);
290    /* |h2| <= 2^25; from now on fits into int32 unchanged */
291    /* |h6| <= 2^25; from now on fits into int32 unchanged */
292    /* |h3| <= 1.71*2^59 */
293    /* |h7| <= 1.71*2^59 */
294 
295    carry<25>(h3, h4);
296    carry<25>(h7, h8);
297    /* |h3| <= 2^24; from now on fits into int32 unchanged */
298    /* |h7| <= 2^24; from now on fits into int32 unchanged */
299    /* |h4| <= 1.72*2^34 */
300    /* |h8| <= 1.41*2^60 */
301 
302    carry<26>(h4, h5);
303    carry<26>(h8, h9);
304    /* |h4| <= 2^25; from now on fits into int32 unchanged */
305    /* |h8| <= 2^25; from now on fits into int32 unchanged */
306    /* |h5| <= 1.01*2^24 */
307    /* |h9| <= 1.71*2^59 */
308 
309    carry<25, 19>(h9, h0);
310 
311    /* |h9| <= 2^24; from now on fits into int32 unchanged */
312    /* |h0| <= 1.1*2^39 */
313 
314    carry<26>(h0, h1);
315    /* |h0| <= 2^25; from now on fits into int32 unchanged */
316    /* |h1| <= 1.01*2^24 */
317 
318    return FE_25519(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9);
319    }
320 
321 /*
322 h = f * f
323 Can overlap h with f.
324 
325 Preconditions:
326 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
327 
328 Postconditions:
329 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
330 */
331 
332 /*
333 See fe_mul.c for discussion of implementation strategy.
334 */
335 
336 //static
sqr_iter(const FE_25519 & f,size_t iter)337 FE_25519 FE_25519::sqr_iter(const FE_25519& f, size_t iter)
338    {
339    int32_t f0 = f[0];
340    int32_t f1 = f[1];
341    int32_t f2 = f[2];
342    int32_t f3 = f[3];
343    int32_t f4 = f[4];
344    int32_t f5 = f[5];
345    int32_t f6 = f[6];
346    int32_t f7 = f[7];
347    int32_t f8 = f[8];
348    int32_t f9 = f[9];
349 
350    for(size_t i = 0; i != iter; ++i)
351       {
352       const int32_t f0_2 = 2 * f0;
353       const int32_t f1_2 = 2 * f1;
354       const int32_t f2_2 = 2 * f2;
355       const int32_t f3_2 = 2 * f3;
356       const int32_t f4_2 = 2 * f4;
357       const int32_t f5_2 = 2 * f5;
358       const int32_t f6_2 = 2 * f6;
359       const int32_t f7_2 = 2 * f7;
360       const int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
361       const int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
362       const int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
363       const int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
364       const int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
365 
366       const int64_t f0f0    = f0   * static_cast<int64_t>(f0);
367       const int64_t f0f1_2  = f0_2 * static_cast<int64_t>(f1);
368       const int64_t f0f2_2  = f0_2 * static_cast<int64_t>(f2);
369       const int64_t f0f3_2  = f0_2 * static_cast<int64_t>(f3);
370       const int64_t f0f4_2  = f0_2 * static_cast<int64_t>(f4);
371       const int64_t f0f5_2  = f0_2 * static_cast<int64_t>(f5);
372       const int64_t f0f6_2  = f0_2 * static_cast<int64_t>(f6);
373       const int64_t f0f7_2  = f0_2 * static_cast<int64_t>(f7);
374       const int64_t f0f8_2  = f0_2 * static_cast<int64_t>(f8);
375       const int64_t f0f9_2  = f0_2 * static_cast<int64_t>(f9);
376       const int64_t f1f1_2  = f1_2 * static_cast<int64_t>(f1);
377       const int64_t f1f2_2  = f1_2 * static_cast<int64_t>(f2);
378       const int64_t f1f3_4  = f1_2 * static_cast<int64_t>(f3_2);
379       const int64_t f1f4_2  = f1_2 * static_cast<int64_t>(f4);
380       const int64_t f1f5_4  = f1_2 * static_cast<int64_t>(f5_2);
381       const int64_t f1f6_2  = f1_2 * static_cast<int64_t>(f6);
382       const int64_t f1f7_4  = f1_2 * static_cast<int64_t>(f7_2);
383       const int64_t f1f8_2  = f1_2 * static_cast<int64_t>(f8);
384       const int64_t f1f9_76 = f1_2 * static_cast<int64_t>(f9_38);
385       const int64_t f2f2    = f2   * static_cast<int64_t>(f2);
386       const int64_t f2f3_2  = f2_2 * static_cast<int64_t>(f3);
387       const int64_t f2f4_2  = f2_2 * static_cast<int64_t>(f4);
388       const int64_t f2f5_2  = f2_2 * static_cast<int64_t>(f5);
389       const int64_t f2f6_2  = f2_2 * static_cast<int64_t>(f6);
390       const int64_t f2f7_2  = f2_2 * static_cast<int64_t>(f7);
391       const int64_t f2f8_38 = f2_2 * static_cast<int64_t>(f8_19);
392       const int64_t f2f9_38 = f2   * static_cast<int64_t>(f9_38);
393       const int64_t f3f3_2  = f3_2 * static_cast<int64_t>(f3);
394       const int64_t f3f4_2  = f3_2 * static_cast<int64_t>(f4);
395       const int64_t f3f5_4  = f3_2 * static_cast<int64_t>(f5_2);
396       const int64_t f3f6_2  = f3_2 * static_cast<int64_t>(f6);
397       const int64_t f3f7_76 = f3_2 * static_cast<int64_t>(f7_38);
398       const int64_t f3f8_38 = f3_2 * static_cast<int64_t>(f8_19);
399       const int64_t f3f9_76 = f3_2 * static_cast<int64_t>(f9_38);
400       const int64_t f4f4    = f4   * static_cast<int64_t>(f4);
401       const int64_t f4f5_2  = f4_2 * static_cast<int64_t>(f5);
402       const int64_t f4f6_38 = f4_2 * static_cast<int64_t>(f6_19);
403       const int64_t f4f7_38 = f4   * static_cast<int64_t>(f7_38);
404       const int64_t f4f8_38 = f4_2 * static_cast<int64_t>(f8_19);
405       const int64_t f4f9_38 = f4   * static_cast<int64_t>(f9_38);
406       const int64_t f5f5_38 = f5   * static_cast<int64_t>(f5_38);
407       const int64_t f5f6_38 = f5_2 * static_cast<int64_t>(f6_19);
408       const int64_t f5f7_76 = f5_2 * static_cast<int64_t>(f7_38);
409       const int64_t f5f8_38 = f5_2 * static_cast<int64_t>(f8_19);
410       const int64_t f5f9_76 = f5_2 * static_cast<int64_t>(f9_38);
411       const int64_t f6f6_19 = f6   * static_cast<int64_t>(f6_19);
412       const int64_t f6f7_38 = f6   * static_cast<int64_t>(f7_38);
413       const int64_t f6f8_38 = f6_2 * static_cast<int64_t>(f8_19);
414       const int64_t f6f9_38 = f6   * static_cast<int64_t>(f9_38);
415       const int64_t f7f7_38 = f7   * static_cast<int64_t>(f7_38);
416       const int64_t f7f8_38 = f7_2 * static_cast<int64_t>(f8_19);
417       const int64_t f7f9_76 = f7_2 * static_cast<int64_t>(f9_38);
418       const int64_t f8f8_19 = f8   * static_cast<int64_t>(f8_19);
419       const int64_t f8f9_38 = f8   * static_cast<int64_t>(f9_38);
420       const int64_t f9f9_38 = f9   * static_cast<int64_t>(f9_38);
421 
422       int64_t h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
423       int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
424       int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
425       int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
426       int64_t h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
427       int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
428       int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
429       int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
430       int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
431       int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
432 
433       carry<26>(h0, h1);
434       carry<26>(h4, h5);
435       carry<25>(h1, h2);
436       carry<25>(h5, h6);
437       carry<26>(h2, h3);
438       carry<26>(h6, h7);
439 
440       carry<25>(h3, h4);
441       carry<25>(h7, h8);
442 
443       carry<26>(h4, h5);
444       carry<26>(h8, h9);
445       carry<25,19>(h9, h0);
446       carry<26>(h0, h1);
447 
448       f0 = static_cast<int32_t>(h0);
449       f1 = static_cast<int32_t>(h1);
450       f2 = static_cast<int32_t>(h2);
451       f3 = static_cast<int32_t>(h3);
452       f4 = static_cast<int32_t>(h4);
453       f5 = static_cast<int32_t>(h5);
454       f6 = static_cast<int32_t>(h6);
455       f7 = static_cast<int32_t>(h7);
456       f8 = static_cast<int32_t>(h8);
457       f9 = static_cast<int32_t>(h9);
458       }
459 
460    return FE_25519(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
461    }
462 
463 /*
464 h = 2 * f * f
465 Can overlap h with f.
466 
467 Preconditions:
468 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
469 
470 Postconditions:
471 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
472 */
473 
474 /*
475 See fe_mul.c for discussion of implementation strategy.
476 */
477 
478 //static
sqr2(const FE_25519 & f)479 FE_25519 FE_25519::sqr2(const FE_25519& f)
480    {
481    const int32_t f0 = f[0];
482    const int32_t f1 = f[1];
483    const int32_t f2 = f[2];
484    const int32_t f3 = f[3];
485    const int32_t f4 = f[4];
486    const int32_t f5 = f[5];
487    const int32_t f6 = f[6];
488    const int32_t f7 = f[7];
489    const int32_t f8 = f[8];
490    const int32_t f9 = f[9];
491    const int32_t f0_2 = 2 * f0;
492    const int32_t f1_2 = 2 * f1;
493    const int32_t f2_2 = 2 * f2;
494    const int32_t f3_2 = 2 * f3;
495    const int32_t f4_2 = 2 * f4;
496    const int32_t f5_2 = 2 * f5;
497    const int32_t f6_2 = 2 * f6;
498    const int32_t f7_2 = 2 * f7;
499    const int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
500    const int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
501    const int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
502    const int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
503    const int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
504    const int64_t f0f0    = f0   * static_cast<int64_t>(f0);
505    const int64_t f0f1_2  = f0_2 * static_cast<int64_t>(f1);
506    const int64_t f0f2_2  = f0_2 * static_cast<int64_t>(f2);
507    const int64_t f0f3_2  = f0_2 * static_cast<int64_t>(f3);
508    const int64_t f0f4_2  = f0_2 * static_cast<int64_t>(f4);
509    const int64_t f0f5_2  = f0_2 * static_cast<int64_t>(f5);
510    const int64_t f0f6_2  = f0_2 * static_cast<int64_t>(f6);
511    const int64_t f0f7_2  = f0_2 * static_cast<int64_t>(f7);
512    const int64_t f0f8_2  = f0_2 * static_cast<int64_t>(f8);
513    const int64_t f0f9_2  = f0_2 * static_cast<int64_t>(f9);
514    const int64_t f1f1_2  = f1_2 * static_cast<int64_t>(f1);
515    const int64_t f1f2_2  = f1_2 * static_cast<int64_t>(f2);
516    const int64_t f1f3_4  = f1_2 * static_cast<int64_t>(f3_2);
517    const int64_t f1f4_2  = f1_2 * static_cast<int64_t>(f4);
518    const int64_t f1f5_4  = f1_2 * static_cast<int64_t>(f5_2);
519    const int64_t f1f6_2  = f1_2 * static_cast<int64_t>(f6);
520    const int64_t f1f7_4  = f1_2 * static_cast<int64_t>(f7_2);
521    const int64_t f1f8_2  = f1_2 * static_cast<int64_t>(f8);
522    const int64_t f1f9_76 = f1_2 * static_cast<int64_t>(f9_38);
523    const int64_t f2f2    = f2   * static_cast<int64_t>(f2);
524    const int64_t f2f3_2  = f2_2 * static_cast<int64_t>(f3);
525    const int64_t f2f4_2  = f2_2 * static_cast<int64_t>(f4);
526    const int64_t f2f5_2  = f2_2 * static_cast<int64_t>(f5);
527    const int64_t f2f6_2  = f2_2 * static_cast<int64_t>(f6);
528    const int64_t f2f7_2  = f2_2 * static_cast<int64_t>(f7);
529    const int64_t f2f8_38 = f2_2 * static_cast<int64_t>(f8_19);
530    const int64_t f2f9_38 = f2   * static_cast<int64_t>(f9_38);
531    const int64_t f3f3_2  = f3_2 * static_cast<int64_t>(f3);
532    const int64_t f3f4_2  = f3_2 * static_cast<int64_t>(f4);
533    const int64_t f3f5_4  = f3_2 * static_cast<int64_t>(f5_2);
534    const int64_t f3f6_2  = f3_2 * static_cast<int64_t>(f6);
535    const int64_t f3f7_76 = f3_2 * static_cast<int64_t>(f7_38);
536    const int64_t f3f8_38 = f3_2 * static_cast<int64_t>(f8_19);
537    const int64_t f3f9_76 = f3_2 * static_cast<int64_t>(f9_38);
538    const int64_t f4f4    = f4   * static_cast<int64_t>(f4);
539    const int64_t f4f5_2  = f4_2 * static_cast<int64_t>(f5);
540    const int64_t f4f6_38 = f4_2 * static_cast<int64_t>(f6_19);
541    const int64_t f4f7_38 = f4   * static_cast<int64_t>(f7_38);
542    const int64_t f4f8_38 = f4_2 * static_cast<int64_t>(f8_19);
543    const int64_t f4f9_38 = f4   * static_cast<int64_t>(f9_38);
544    const int64_t f5f5_38 = f5   * static_cast<int64_t>(f5_38);
545    const int64_t f5f6_38 = f5_2 * static_cast<int64_t>(f6_19);
546    const int64_t f5f7_76 = f5_2 * static_cast<int64_t>(f7_38);
547    const int64_t f5f8_38 = f5_2 * static_cast<int64_t>(f8_19);
548    const int64_t f5f9_76 = f5_2 * static_cast<int64_t>(f9_38);
549    const int64_t f6f6_19 = f6   * static_cast<int64_t>(f6_19);
550    const int64_t f6f7_38 = f6   * static_cast<int64_t>(f7_38);
551    const int64_t f6f8_38 = f6_2 * static_cast<int64_t>(f8_19);
552    const int64_t f6f9_38 = f6   * static_cast<int64_t>(f9_38);
553    const int64_t f7f7_38 = f7   * static_cast<int64_t>(f7_38);
554    const int64_t f7f8_38 = f7_2 * static_cast<int64_t>(f8_19);
555    const int64_t f7f9_76 = f7_2 * static_cast<int64_t>(f9_38);
556    const int64_t f8f8_19 = f8   * static_cast<int64_t>(f8_19);
557    const int64_t f8f9_38 = f8   * static_cast<int64_t>(f9_38);
558    const int64_t f9f9_38 = f9   * static_cast<int64_t>(f9_38);
559 
560    int64_t h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
561    int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
562    int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
563    int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
564    int64_t h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
565    int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
566    int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
567    int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
568    int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
569    int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
570 
571    h0 += h0;
572    h1 += h1;
573    h2 += h2;
574    h3 += h3;
575    h4 += h4;
576    h5 += h5;
577    h6 += h6;
578    h7 += h7;
579    h8 += h8;
580    h9 += h9;
581 
582    carry<26>(h0, h1);
583    carry<26>(h4, h5);
584 
585    carry<25>(h1, h2);
586    carry<25>(h5, h6);
587 
588    carry<26>(h2, h3);
589    carry<26>(h6, h7);
590 
591    carry<25>(h3, h4);
592    carry<25>(h7, h8);
593    carry<26>(h4, h5);
594    carry<26>(h8, h9);
595    carry<25,19>(h9, h0);
596    carry<26>(h0, h1);
597 
598    return FE_25519(h0, h1, h2, h3, h4, h5, h6, h7, h8, h9);
599    }
600 
601 /*
602 Ignores top bit of h.
603 */
604 
from_bytes(const uint8_t s[32])605 void FE_25519::from_bytes(const uint8_t s[32])
606    {
607    int64_t h0 = load_4(s);
608    int64_t h1 = load_3(s + 4) << 6;
609    int64_t h2 = load_3(s + 7) << 5;
610    int64_t h3 = load_3(s + 10) << 3;
611    int64_t h4 = load_3(s + 13) << 2;
612    int64_t h5 = load_4(s + 16);
613    int64_t h6 = load_3(s + 20) << 7;
614    int64_t h7 = load_3(s + 23) << 5;
615    int64_t h8 = load_3(s + 26) << 4;
616    int64_t h9 = (load_3(s + 29) & 0x7fffff) << 2;
617 
618    carry<25,19>(h9, h0);
619    carry<25>(h1, h2);
620    carry<25>(h3, h4);
621    carry<25>(h5, h6);
622    carry<25>(h7, h8);
623 
624    carry<26>(h0, h1);
625    carry<26>(h2, h3);
626    carry<26>(h4, h5);
627    carry<26>(h6, h7);
628    carry<26>(h8, h9);
629 
630    m_fe[0] = static_cast<int32_t>(h0);
631    m_fe[1] = static_cast<int32_t>(h1);
632    m_fe[2] = static_cast<int32_t>(h2);
633    m_fe[3] = static_cast<int32_t>(h3);
634    m_fe[4] = static_cast<int32_t>(h4);
635    m_fe[5] = static_cast<int32_t>(h5);
636    m_fe[6] = static_cast<int32_t>(h6);
637    m_fe[7] = static_cast<int32_t>(h7);
638    m_fe[8] = static_cast<int32_t>(h8);
639    m_fe[9] = static_cast<int32_t>(h9);
640    }
641 
642 /*
643 Preconditions:
644 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
645 
646 Write p=2^255-19; q=floor(h/p).
647 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
648 
649 Proof:
650 Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
651 Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
652 
653 Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
654 Then 0<y<1.
655 
656 Write r=h-pq.
657 Have 0<=r<=p-1=2^255-20.
658 Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
659 
660 Write x=r+19(2^-255)r+y.
661 Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
662 
663 Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
664 so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
665 */
666 
to_bytes(uint8_t s[32]) const667 void FE_25519::to_bytes(uint8_t s[32]) const
668    {
669    const int64_t X25 = (1 << 25);
670 
671    int32_t h0 = m_fe[0];
672    int32_t h1 = m_fe[1];
673    int32_t h2 = m_fe[2];
674    int32_t h3 = m_fe[3];
675    int32_t h4 = m_fe[4];
676    int32_t h5 = m_fe[5];
677    int32_t h6 = m_fe[6];
678    int32_t h7 = m_fe[7];
679    int32_t h8 = m_fe[8];
680    int32_t h9 = m_fe[9];
681    int32_t q;
682 
683    q = (19 * h9 + ((static_cast<int32_t>(1) << 24))) >> 25;
684    q = (h0 + q) >> 26;
685    q = (h1 + q) >> 25;
686    q = (h2 + q) >> 26;
687    q = (h3 + q) >> 25;
688    q = (h4 + q) >> 26;
689    q = (h5 + q) >> 25;
690    q = (h6 + q) >> 26;
691    q = (h7 + q) >> 25;
692    q = (h8 + q) >> 26;
693    q = (h9 + q) >> 25;
694 
695    /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
696    h0 += 19 * q;
697    /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
698 
699    carry0<26>(h0, h1);
700    carry0<25>(h1, h2);
701    carry0<26>(h2, h3);
702    carry0<25>(h3, h4);
703    carry0<26>(h4, h5);
704    carry0<25>(h5, h6);
705    carry0<26>(h6, h7);
706    carry0<25>(h7, h8);
707    carry0<26>(h8, h9);
708 
709    int32_t carry9 = h9 >> 25;
710    h9 -= carry9 * X25;
711    /* h10 = carry9 */
712 
713    /*
714    Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
715    Have h0+...+2^230 h9 between 0 and 2^255-1;
716    evidently 2^255 h10-2^255 q = 0.
717    Goal: Output h0+...+2^230 h9.
718    */
719 
720    s[0] = static_cast<uint8_t>(h0 >> 0);
721    s[1] = static_cast<uint8_t>(h0 >> 8);
722    s[2] = static_cast<uint8_t>(h0 >> 16);
723    s[3] = static_cast<uint8_t>((h0 >> 24) | (h1 << 2));
724    s[4] = static_cast<uint8_t>(h1 >> 6);
725    s[5] = static_cast<uint8_t>(h1 >> 14);
726    s[6] = static_cast<uint8_t>((h1 >> 22) | (h2 << 3));
727    s[7] = static_cast<uint8_t>(h2 >> 5);
728    s[8] = static_cast<uint8_t>(h2 >> 13);
729    s[9] = static_cast<uint8_t>((h2 >> 21) | (h3 << 5));
730    s[10] = static_cast<uint8_t>(h3 >> 3);
731    s[11] = static_cast<uint8_t>(h3 >> 11);
732    s[12] = static_cast<uint8_t>((h3 >> 19) | (h4 << 6));
733    s[13] = static_cast<uint8_t>(h4 >> 2);
734    s[14] = static_cast<uint8_t>(h4 >> 10);
735    s[15] = static_cast<uint8_t>(h4 >> 18);
736    s[16] = static_cast<uint8_t>(h5 >> 0);
737    s[17] = static_cast<uint8_t>(h5 >> 8);
738    s[18] = static_cast<uint8_t>(h5 >> 16);
739    s[19] = static_cast<uint8_t>((h5 >> 24) | (h6 << 1));
740    s[20] = static_cast<uint8_t>(h6 >> 7);
741    s[21] = static_cast<uint8_t>(h6 >> 15);
742    s[22] = static_cast<uint8_t>((h6 >> 23) | (h7 << 3));
743    s[23] = static_cast<uint8_t>(h7 >> 5);
744    s[24] = static_cast<uint8_t>(h7 >> 13);
745    s[25] = static_cast<uint8_t>((h7 >> 21) | (h8 << 4));
746    s[26] = static_cast<uint8_t>(h8 >> 4);
747    s[27] = static_cast<uint8_t>(h8 >> 12);
748    s[28] = static_cast<uint8_t>((h8 >> 20) | (h9 << 6));
749    s[29] = static_cast<uint8_t>(h9 >> 2);
750    s[30] = static_cast<uint8_t>(h9 >> 10);
751    s[31] = static_cast<uint8_t>(h9 >> 18);
752    }
753 
754 }
755