1 #include <stddef.h>
2 #include <stdint.h>
3 #include <stdlib.h>
4 #include <string.h>
5 
6 #include "crypto_verify_32.h"
7 #include "private/common.h"
8 #include "private/ed25519_ref10.h"
9 #include "utils.h"
10 
11 static inline uint64_t
12 load_3(const unsigned char *in)
13 {
14     uint64_t result;
15 
16     result = (uint64_t) in[0];
17     result |= ((uint64_t) in[1]) << 8;
18     result |= ((uint64_t) in[2]) << 16;
19 
20     return result;
21 }
22 
23 static inline uint64_t
24 load_4(const unsigned char *in)
25 {
26     uint64_t result;
27 
28     result = (uint64_t) in[0];
29     result |= ((uint64_t) in[1]) << 8;
30     result |= ((uint64_t) in[2]) << 16;
31     result |= ((uint64_t) in[3]) << 24;
32 
33     return result;
34 }
35 
36 /*
37  * Field arithmetic:
38  * Use 5*51 bit limbs on 64-bit systems with support for 128 bit arithmetic,
39  * and 10*25.5 bit limbs elsewhere.
40  *
41  * Functions used elsewhere that are candidates for inlining are defined
42  * via "private/curve25519_ref10.h".
43  */
44 
45 #ifdef HAVE_TI_MODE
46 # include "fe_51/constants.h"
47 # include "fe_51/fe.h"
48 #else
49 # include "fe_25_5/constants.h"
50 # include "fe_25_5/fe.h"
51 #endif
52 
53 void
54 fe25519_invert(fe25519 out, const fe25519 z)
55 {
56     fe25519 t0;
57     fe25519 t1;
58     fe25519 t2;
59     fe25519 t3;
60     int     i;
61 
62     fe25519_sq(t0, z);
63     fe25519_sq(t1, t0);
64     fe25519_sq(t1, t1);
65     fe25519_mul(t1, z, t1);
66     fe25519_mul(t0, t0, t1);
67     fe25519_sq(t2, t0);
68     fe25519_mul(t1, t1, t2);
69     fe25519_sq(t2, t1);
70     for (i = 1; i < 5; ++i) {
71         fe25519_sq(t2, t2);
72     }
73     fe25519_mul(t1, t2, t1);
74     fe25519_sq(t2, t1);
75     for (i = 1; i < 10; ++i) {
76         fe25519_sq(t2, t2);
77     }
78     fe25519_mul(t2, t2, t1);
79     fe25519_sq(t3, t2);
80     for (i = 1; i < 20; ++i) {
81         fe25519_sq(t3, t3);
82     }
83     fe25519_mul(t2, t3, t2);
84     fe25519_sq(t2, t2);
85     for (i = 1; i < 10; ++i) {
86         fe25519_sq(t2, t2);
87     }
88     fe25519_mul(t1, t2, t1);
89     fe25519_sq(t2, t1);
90     for (i = 1; i < 50; ++i) {
91         fe25519_sq(t2, t2);
92     }
93     fe25519_mul(t2, t2, t1);
94     fe25519_sq(t3, t2);
95     for (i = 1; i < 100; ++i) {
96         fe25519_sq(t3, t3);
97     }
98     fe25519_mul(t2, t3, t2);
99     fe25519_sq(t2, t2);
100     for (i = 1; i < 50; ++i) {
101         fe25519_sq(t2, t2);
102     }
103     fe25519_mul(t1, t2, t1);
104     fe25519_sq(t1, t1);
105     for (i = 1; i < 5; ++i) {
106         fe25519_sq(t1, t1);
107     }
108     fe25519_mul(out, t1, t0);
109 }
110 
111 static void
112 fe25519_pow22523(fe25519 out, const fe25519 z)
113 {
114     fe25519 t0;
115     fe25519 t1;
116     fe25519 t2;
117     int     i;
118 
119     fe25519_sq(t0, z);
120     fe25519_sq(t1, t0);
121     fe25519_sq(t1, t1);
122     fe25519_mul(t1, z, t1);
123     fe25519_mul(t0, t0, t1);
124     fe25519_sq(t0, t0);
125     fe25519_mul(t0, t1, t0);
126     fe25519_sq(t1, t0);
127     for (i = 1; i < 5; ++i) {
128         fe25519_sq(t1, t1);
129     }
130     fe25519_mul(t0, t1, t0);
131     fe25519_sq(t1, t0);
132     for (i = 1; i < 10; ++i) {
133         fe25519_sq(t1, t1);
134     }
135     fe25519_mul(t1, t1, t0);
136     fe25519_sq(t2, t1);
137     for (i = 1; i < 20; ++i) {
138         fe25519_sq(t2, t2);
139     }
140     fe25519_mul(t1, t2, t1);
141     fe25519_sq(t1, t1);
142     for (i = 1; i < 10; ++i) {
143         fe25519_sq(t1, t1);
144     }
145     fe25519_mul(t0, t1, t0);
146     fe25519_sq(t1, t0);
147     for (i = 1; i < 50; ++i) {
148         fe25519_sq(t1, t1);
149     }
150     fe25519_mul(t1, t1, t0);
151     fe25519_sq(t2, t1);
152     for (i = 1; i < 100; ++i) {
153         fe25519_sq(t2, t2);
154     }
155     fe25519_mul(t1, t2, t1);
156     fe25519_sq(t1, t1);
157     for (i = 1; i < 50; ++i) {
158         fe25519_sq(t1, t1);
159     }
160     fe25519_mul(t0, t1, t0);
161     fe25519_sq(t0, t0);
162     fe25519_sq(t0, t0);
163     fe25519_mul(out, t0, z);
164 }
165 
166 /*
167  r = p + q
168  */
169 
170 void
171 ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q)
172 {
173     fe25519 t0;
174 
175     fe25519_add(r->X, p->Y, p->X);
176     fe25519_sub(r->Y, p->Y, p->X);
177     fe25519_mul(r->Z, r->X, q->YplusX);
178     fe25519_mul(r->Y, r->Y, q->YminusX);
179     fe25519_mul(r->T, q->T2d, p->T);
180     fe25519_mul(r->X, p->Z, q->Z);
181     fe25519_add(t0, r->X, r->X);
182     fe25519_sub(r->X, r->Z, r->Y);
183     fe25519_add(r->Y, r->Z, r->Y);
184     fe25519_add(r->Z, t0, r->T);
185     fe25519_sub(r->T, t0, r->T);
186 }
187 
188 static void
189 slide_vartime(signed char *r, const unsigned char *a)
190 {
191     int i;
192     int b;
193     int k;
194     int ribs;
195     int cmp;
196 
197     for (i = 0; i < 256; ++i) {
198         r[i] = 1 & (a[i >> 3] >> (i & 7));
199     }
200     for (i = 0; i < 256; ++i) {
201         if (! r[i]) {
202             continue;
203         }
204         for (b = 1; b <= 6 && i + b < 256; ++b) {
205             if (! r[i + b]) {
206                 continue;
207             }
208             ribs = r[i + b] << b;
209             cmp = r[i] + ribs;
210             if (cmp <= 15) {
211                 r[i] = cmp;
212                 r[i + b] = 0;
213             } else {
214                 cmp = r[i] - ribs;
215                 if (cmp < -15) {
216                     break;
217                 }
218                 r[i] = cmp;
219                 for (k = i + b; k < 256; ++k) {
220                     if (! r[k]) {
221                         r[k] = 1;
222                         break;
223                     }
224                     r[k] = 0;
225                 }
226             }
227         }
228     }
229 }
230 
231 int
232 ge25519_frombytes(ge25519_p3 *h, const unsigned char *s)
233 {
234     fe25519 u;
235     fe25519 v;
236     fe25519 v3;
237     fe25519 vxx;
238     fe25519 m_root_check, p_root_check;
239     fe25519 negx;
240     fe25519 x_sqrtm1;
241     int     has_m_root, has_p_root;
242 
243     fe25519_frombytes(h->Y, s);
244     fe25519_1(h->Z);
245     fe25519_sq(u, h->Y);
246     fe25519_mul(v, u, d);
247     fe25519_sub(u, u, h->Z); /* u = y^2-1 */
248     fe25519_add(v, v, h->Z); /* v = dy^2+1 */
249 
250     fe25519_sq(v3, v);
251     fe25519_mul(v3, v3, v); /* v3 = v^3 */
252     fe25519_sq(h->X, v3);
253     fe25519_mul(h->X, h->X, v);
254     fe25519_mul(h->X, h->X, u); /* x = uv^7 */
255 
256     fe25519_pow22523(h->X, h->X); /* x = (uv^7)^((q-5)/8) */
257     fe25519_mul(h->X, h->X, v3);
258     fe25519_mul(h->X, h->X, u); /* x = uv^3(uv^7)^((q-5)/8) */
259 
260     fe25519_sq(vxx, h->X);
261     fe25519_mul(vxx, vxx, v);
262     fe25519_sub(m_root_check, vxx, u); /* vx^2-u */
263     fe25519_add(p_root_check, vxx, u); /* vx^2+u */
264     has_m_root = fe25519_iszero(m_root_check);
265     has_p_root = fe25519_iszero(p_root_check);
266     fe25519_mul(x_sqrtm1, h->X, sqrtm1); /* x*sqrt(-1) */
267     fe25519_cmov(h->X, x_sqrtm1, 1 - has_m_root);
268 
269     fe25519_neg(negx, h->X);
270     fe25519_cmov(h->X, negx, fe25519_isnegative(h->X) ^ (s[31] >> 7));
271     fe25519_mul(h->T, h->X, h->Y);
272 
273     return (has_m_root | has_p_root) - 1;
274 }
275 
276 int
277 ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s)
278 {
279     fe25519 u;
280     fe25519 v;
281     fe25519 v3;
282     fe25519 vxx;
283     fe25519 m_root_check, p_root_check;
284 
285     fe25519_frombytes(h->Y, s);
286     fe25519_1(h->Z);
287     fe25519_sq(u, h->Y);
288     fe25519_mul(v, u, d);
289     fe25519_sub(u, u, h->Z); /* u = y^2-1 */
290     fe25519_add(v, v, h->Z); /* v = dy^2+1 */
291 
292     fe25519_sq(v3, v);
293     fe25519_mul(v3, v3, v); /* v3 = v^3 */
294     fe25519_sq(h->X, v3);
295     fe25519_mul(h->X, h->X, v);
296     fe25519_mul(h->X, h->X, u); /* x = uv^7 */
297 
298     fe25519_pow22523(h->X, h->X); /* x = (uv^7)^((q-5)/8) */
299     fe25519_mul(h->X, h->X, v3);
300     fe25519_mul(h->X, h->X, u); /* x = uv^3(uv^7)^((q-5)/8) */
301 
302     fe25519_sq(vxx, h->X);
303     fe25519_mul(vxx, vxx, v);
304     fe25519_sub(m_root_check, vxx, u); /* vx^2-u */
305     if (fe25519_iszero(m_root_check) == 0) {
306         fe25519_add(p_root_check, vxx, u); /* vx^2+u */
307         if (fe25519_iszero(p_root_check) == 0) {
308             return -1;
309         }
310         fe25519_mul(h->X, h->X, sqrtm1);
311     }
312 
313     if (fe25519_isnegative(h->X) == (s[31] >> 7)) {
314         fe25519_neg(h->X, h->X);
315     }
316     fe25519_mul(h->T, h->X, h->Y);
317 
318     return 0;
319 }
320 
321 /*
322  r = p + q
323  */
324 
325 static void
326 ge25519_madd(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_precomp *q)
327 {
328     fe25519 t0;
329 
330     fe25519_add(r->X, p->Y, p->X);
331     fe25519_sub(r->Y, p->Y, p->X);
332     fe25519_mul(r->Z, r->X, q->yplusx);
333     fe25519_mul(r->Y, r->Y, q->yminusx);
334     fe25519_mul(r->T, q->xy2d, p->T);
335     fe25519_add(t0, p->Z, p->Z);
336     fe25519_sub(r->X, r->Z, r->Y);
337     fe25519_add(r->Y, r->Z, r->Y);
338     fe25519_add(r->Z, t0, r->T);
339     fe25519_sub(r->T, t0, r->T);
340 }
341 
342 /*
343  r = p - q
344  */
345 
346 static void
347 ge25519_msub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_precomp *q)
348 {
349     fe25519 t0;
350 
351     fe25519_add(r->X, p->Y, p->X);
352     fe25519_sub(r->Y, p->Y, p->X);
353     fe25519_mul(r->Z, r->X, q->yminusx);
354     fe25519_mul(r->Y, r->Y, q->yplusx);
355     fe25519_mul(r->T, q->xy2d, p->T);
356     fe25519_add(t0, p->Z, p->Z);
357     fe25519_sub(r->X, r->Z, r->Y);
358     fe25519_add(r->Y, r->Z, r->Y);
359     fe25519_sub(r->Z, t0, r->T);
360     fe25519_add(r->T, t0, r->T);
361 }
362 
363 /*
364  r = p
365  */
366 
367 void
368 ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p)
369 {
370     fe25519_mul(r->X, p->X, p->T);
371     fe25519_mul(r->Y, p->Y, p->Z);
372     fe25519_mul(r->Z, p->Z, p->T);
373 }
374 
375 /*
376  r = p
377  */
378 
379 void
380 ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p)
381 {
382     fe25519_mul(r->X, p->X, p->T);
383     fe25519_mul(r->Y, p->Y, p->Z);
384     fe25519_mul(r->Z, p->Z, p->T);
385     fe25519_mul(r->T, p->X, p->Y);
386 }
387 
388 static void
389 ge25519_p2_0(ge25519_p2 *h)
390 {
391     fe25519_0(h->X);
392     fe25519_1(h->Y);
393     fe25519_1(h->Z);
394 }
395 
396 /*
397  r = 2 * p
398  */
399 
400 static void
401 ge25519_p2_dbl(ge25519_p1p1 *r, const ge25519_p2 *p)
402 {
403     fe25519 t0;
404 
405     fe25519_sq(r->X, p->X);
406     fe25519_sq(r->Z, p->Y);
407     fe25519_sq2(r->T, p->Z);
408     fe25519_add(r->Y, p->X, p->Y);
409     fe25519_sq(t0, r->Y);
410     fe25519_add(r->Y, r->Z, r->X);
411     fe25519_sub(r->Z, r->Z, r->X);
412     fe25519_sub(r->X, t0, r->Y);
413     fe25519_sub(r->T, r->T, r->Z);
414 }
415 
416 static void
417 ge25519_p3_0(ge25519_p3 *h)
418 {
419     fe25519_0(h->X);
420     fe25519_1(h->Y);
421     fe25519_1(h->Z);
422     fe25519_0(h->T);
423 }
424 
425 static void
426 ge25519_cached_0(ge25519_cached *h)
427 {
428     fe25519_1(h->YplusX);
429     fe25519_1(h->YminusX);
430     fe25519_1(h->Z);
431     fe25519_0(h->T2d);
432 }
433 
434 /*
435  r = p
436  */
437 
438 void
439 ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p)
440 {
441     fe25519_add(r->YplusX, p->Y, p->X);
442     fe25519_sub(r->YminusX, p->Y, p->X);
443     fe25519_copy(r->Z, p->Z);
444     fe25519_mul(r->T2d, p->T, d2);
445 }
446 
447 static void
448 ge25519_p3_to_precomp(ge25519_precomp *pi, const ge25519_p3 *p)
449 {
450     fe25519 recip;
451     fe25519 x;
452     fe25519 y;
453     fe25519 xy;
454 
455     fe25519_invert(recip, p->Z);
456     fe25519_mul(x, p->X, recip);
457     fe25519_mul(y, p->Y, recip);
458     fe25519_add(pi->yplusx, y, x);
459     fe25519_sub(pi->yminusx, y, x);
460     fe25519_mul(xy, x, y);
461     fe25519_mul(pi->xy2d, xy, d2);
462 }
463 
464 /*
465  r = p
466  */
467 
468 static void
469 ge25519_p3_to_p2(ge25519_p2 *r, const ge25519_p3 *p)
470 {
471     fe25519_copy(r->X, p->X);
472     fe25519_copy(r->Y, p->Y);
473     fe25519_copy(r->Z, p->Z);
474 }
475 
476 void
477 ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h)
478 {
479     fe25519 recip;
480     fe25519 x;
481     fe25519 y;
482 
483     fe25519_invert(recip, h->Z);
484     fe25519_mul(x, h->X, recip);
485     fe25519_mul(y, h->Y, recip);
486     fe25519_tobytes(s, y);
487     s[31] ^= fe25519_isnegative(x) << 7;
488 }
489 
490 /*
491  r = 2 * p
492  */
493 
494 static void
495 ge25519_p3_dbl(ge25519_p1p1 *r, const ge25519_p3 *p)
496 {
497     ge25519_p2 q;
498     ge25519_p3_to_p2(&q, p);
499     ge25519_p2_dbl(r, &q);
500 }
501 
502 static void
503 ge25519_precomp_0(ge25519_precomp *h)
504 {
505     fe25519_1(h->yplusx);
506     fe25519_1(h->yminusx);
507     fe25519_0(h->xy2d);
508 }
509 
510 static unsigned char
511 equal(signed char b, signed char c)
512 {
513     unsigned char ub = b;
514     unsigned char uc = c;
515     unsigned char x  = ub ^ uc; /* 0: yes; 1..255: no */
516     uint32_t      y  = x;       /* 0: yes; 1..255: no */
517 
518     y -= 1;   /* 4294967295: yes; 0..254: no */
519     y >>= 31; /* 1: yes; 0: no */
520 
521     return y;
522 }
523 
524 static unsigned char
525 negative(signed char b)
526 {
527     /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
528     uint64_t x = b;
529 
530     x >>= 63; /* 1: yes; 0: no */
531 
532     return x;
533 }
534 
535 static void
536 ge25519_cmov(ge25519_precomp *t, const ge25519_precomp *u, unsigned char b)
537 {
538     fe25519_cmov(t->yplusx, u->yplusx, b);
539     fe25519_cmov(t->yminusx, u->yminusx, b);
540     fe25519_cmov(t->xy2d, u->xy2d, b);
541 }
542 
543 static void
544 ge25519_cmov_cached(ge25519_cached *t, const ge25519_cached *u, unsigned char b)
545 {
546     fe25519_cmov(t->YplusX, u->YplusX, b);
547     fe25519_cmov(t->YminusX, u->YminusX, b);
548     fe25519_cmov(t->Z, u->Z, b);
549     fe25519_cmov(t->T2d, u->T2d, b);
550 }
551 
552 static void
553 ge25519_select(ge25519_precomp *t, const ge25519_precomp precomp[8], const signed char b)
554 {
555     ge25519_precomp     minust;
556     const unsigned char bnegative = negative(b);
557     const unsigned char babs      = b - (((-bnegative) & b) * ((signed char) 1 << 1));
558 
559     ge25519_precomp_0(t);
560     ge25519_cmov(t, &precomp[0], equal(babs, 1));
561     ge25519_cmov(t, &precomp[1], equal(babs, 2));
562     ge25519_cmov(t, &precomp[2], equal(babs, 3));
563     ge25519_cmov(t, &precomp[3], equal(babs, 4));
564     ge25519_cmov(t, &precomp[4], equal(babs, 5));
565     ge25519_cmov(t, &precomp[5], equal(babs, 6));
566     ge25519_cmov(t, &precomp[6], equal(babs, 7));
567     ge25519_cmov(t, &precomp[7], equal(babs, 8));
568     fe25519_copy(minust.yplusx, t->yminusx);
569     fe25519_copy(minust.yminusx, t->yplusx);
570     fe25519_neg(minust.xy2d, t->xy2d);
571     ge25519_cmov(t, &minust, bnegative);
572 }
573 
574 static void
575 ge25519_select_base(ge25519_precomp *t, const int pos, const signed char b)
576 {
577     static const ge25519_precomp base[32][8] = { /* base[i][j] = (j+1)*256^i*B */
578 #ifdef HAVE_TI_MODE
579 # include "fe_51/base.h"
580 #else
581 # include "fe_25_5/base.h"
582 #endif
583     };
584     ge25519_select(t, base[pos], b);
585 }
586 
587 static void
588 ge25519_select_cached(ge25519_cached *t, const ge25519_cached cached[8], const signed char b)
589 {
590     ge25519_cached      minust;
591     const unsigned char bnegative = negative(b);
592     const unsigned char babs      = b - (((-bnegative) & b) * ((signed char) 1 << 1));
593 
594     ge25519_cached_0(t);
595     ge25519_cmov_cached(t, &cached[0], equal(babs, 1));
596     ge25519_cmov_cached(t, &cached[1], equal(babs, 2));
597     ge25519_cmov_cached(t, &cached[2], equal(babs, 3));
598     ge25519_cmov_cached(t, &cached[3], equal(babs, 4));
599     ge25519_cmov_cached(t, &cached[4], equal(babs, 5));
600     ge25519_cmov_cached(t, &cached[5], equal(babs, 6));
601     ge25519_cmov_cached(t, &cached[6], equal(babs, 7));
602     ge25519_cmov_cached(t, &cached[7], equal(babs, 8));
603     fe25519_copy(minust.YplusX, t->YminusX);
604     fe25519_copy(minust.YminusX, t->YplusX);
605     fe25519_copy(minust.Z, t->Z);
606     fe25519_neg(minust.T2d, t->T2d);
607     ge25519_cmov_cached(t, &minust, bnegative);
608 }
609 
610 /*
611  r = p - q
612  */
613 
614 void
615 ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q)
616 {
617     fe25519 t0;
618 
619     fe25519_add(r->X, p->Y, p->X);
620     fe25519_sub(r->Y, p->Y, p->X);
621     fe25519_mul(r->Z, r->X, q->YminusX);
622     fe25519_mul(r->Y, r->Y, q->YplusX);
623     fe25519_mul(r->T, q->T2d, p->T);
624     fe25519_mul(r->X, p->Z, q->Z);
625     fe25519_add(t0, r->X, r->X);
626     fe25519_sub(r->X, r->Z, r->Y);
627     fe25519_add(r->Y, r->Z, r->Y);
628     fe25519_sub(r->Z, t0, r->T);
629     fe25519_add(r->T, t0, r->T);
630 }
631 
632 void
633 ge25519_tobytes(unsigned char *s, const ge25519_p2 *h)
634 {
635     fe25519 recip;
636     fe25519 x;
637     fe25519 y;
638 
639     fe25519_invert(recip, h->Z);
640     fe25519_mul(x, h->X, recip);
641     fe25519_mul(y, h->Y, recip);
642     fe25519_tobytes(s, y);
643     s[31] ^= fe25519_isnegative(x) << 7;
644 }
645 
646 /*
647  r = a * A + b * B
648  where a = a[0]+256*a[1]+...+256^31 a[31].
649  and b = b[0]+256*b[1]+...+256^31 b[31].
650  B is the Ed25519 base point (x,4/5) with x positive.
651 
652  Only used for signatures verification.
653  */
654 
655 void
656 ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
657                                   const ge25519_p3 *A, const unsigned char *b)
658 {
659     static const ge25519_precomp Bi[8] = {
660 #ifdef HAVE_TI_MODE
661 # include "fe_51/base2.h"
662 #else
663 # include "fe_25_5/base2.h"
664 #endif
665     };
666     signed char    aslide[256];
667     signed char    bslide[256];
668     ge25519_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */
669     ge25519_p1p1   t;
670     ge25519_p3     u;
671     ge25519_p3     A2;
672     int            i;
673 
674     slide_vartime(aslide, a);
675     slide_vartime(bslide, b);
676 
677     ge25519_p3_to_cached(&Ai[0], A);
678 
679     ge25519_p3_dbl(&t, A);
680     ge25519_p1p1_to_p3(&A2, &t);
681 
682     ge25519_add(&t, &A2, &Ai[0]);
683     ge25519_p1p1_to_p3(&u, &t);
684     ge25519_p3_to_cached(&Ai[1], &u);
685 
686     ge25519_add(&t, &A2, &Ai[1]);
687     ge25519_p1p1_to_p3(&u, &t);
688     ge25519_p3_to_cached(&Ai[2], &u);
689 
690     ge25519_add(&t, &A2, &Ai[2]);
691     ge25519_p1p1_to_p3(&u, &t);
692     ge25519_p3_to_cached(&Ai[3], &u);
693 
694     ge25519_add(&t, &A2, &Ai[3]);
695     ge25519_p1p1_to_p3(&u, &t);
696     ge25519_p3_to_cached(&Ai[4], &u);
697 
698     ge25519_add(&t, &A2, &Ai[4]);
699     ge25519_p1p1_to_p3(&u, &t);
700     ge25519_p3_to_cached(&Ai[5], &u);
701 
702     ge25519_add(&t, &A2, &Ai[5]);
703     ge25519_p1p1_to_p3(&u, &t);
704     ge25519_p3_to_cached(&Ai[6], &u);
705 
706     ge25519_add(&t, &A2, &Ai[6]);
707     ge25519_p1p1_to_p3(&u, &t);
708     ge25519_p3_to_cached(&Ai[7], &u);
709 
710     ge25519_p2_0(r);
711 
712     for (i = 255; i >= 0; --i) {
713         if (aslide[i] || bslide[i]) {
714             break;
715         }
716     }
717 
718     for (; i >= 0; --i) {
719         ge25519_p2_dbl(&t, r);
720 
721         if (aslide[i] > 0) {
722             ge25519_p1p1_to_p3(&u, &t);
723             ge25519_add(&t, &u, &Ai[aslide[i] / 2]);
724         } else if (aslide[i] < 0) {
725             ge25519_p1p1_to_p3(&u, &t);
726             ge25519_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
727         }
728 
729         if (bslide[i] > 0) {
730             ge25519_p1p1_to_p3(&u, &t);
731             ge25519_madd(&t, &u, &Bi[bslide[i] / 2]);
732         } else if (bslide[i] < 0) {
733             ge25519_p1p1_to_p3(&u, &t);
734             ge25519_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
735         }
736 
737         ge25519_p1p1_to_p2(r, &t);
738     }
739 }
740 
741 /*
742  h = a * p
743  where a = a[0]+256*a[1]+...+256^31 a[31]
744 
745  Preconditions:
746  a[31] <= 127
747 
748  p is public
749  */
750 
751 void
752 ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a, const ge25519_p3 *p)
753 {
754     signed char     e[64];
755     signed char     carry;
756     ge25519_p1p1    r;
757     ge25519_p2      s;
758     ge25519_p1p1    t2, t3, t4, t5, t6, t7, t8;
759     ge25519_p3      p2, p3, p4, p5, p6, p7, p8;
760     ge25519_cached  pi[8];
761     ge25519_cached  t;
762     int             i;
763 
764     ge25519_p3_to_cached(&pi[1 - 1], p);   /* p */
765 
766     ge25519_p3_dbl(&t2, p);
767     ge25519_p1p1_to_p3(&p2, &t2);
768     ge25519_p3_to_cached(&pi[2 - 1], &p2); /* 2p = 2*p */
769 
770     ge25519_add(&t3, p, &pi[2 - 1]);
771     ge25519_p1p1_to_p3(&p3, &t3);
772     ge25519_p3_to_cached(&pi[3 - 1], &p3); /* 3p = 2p+p */
773 
774     ge25519_p3_dbl(&t4, &p2);
775     ge25519_p1p1_to_p3(&p4, &t4);
776     ge25519_p3_to_cached(&pi[4 - 1], &p4); /* 4p = 2*2p */
777 
778     ge25519_add(&t5, p, &pi[4 - 1]);
779     ge25519_p1p1_to_p3(&p5, &t5);
780     ge25519_p3_to_cached(&pi[5 - 1], &p5); /* 5p = 4p+p */
781 
782     ge25519_p3_dbl(&t6, &p3);
783     ge25519_p1p1_to_p3(&p6, &t6);
784     ge25519_p3_to_cached(&pi[6 - 1], &p6); /* 6p = 2*3p */
785 
786     ge25519_add(&t7, p, &pi[6 - 1]);
787     ge25519_p1p1_to_p3(&p7, &t7);
788     ge25519_p3_to_cached(&pi[7 - 1], &p7); /* 7p = 6p+p */
789 
790     ge25519_p3_dbl(&t8, &p4);
791     ge25519_p1p1_to_p3(&p8, &t8);
792     ge25519_p3_to_cached(&pi[8 - 1], &p8); /* 8p = 2*4p */
793 
794     for (i = 0; i < 32; ++i) {
795         e[2 * i + 0] = (a[i] >> 0) & 15;
796         e[2 * i + 1] = (a[i] >> 4) & 15;
797     }
798     /* each e[i] is between 0 and 15 */
799     /* e[63] is between 0 and 7 */
800 
801     carry = 0;
802     for (i = 0; i < 63; ++i) {
803         e[i] += carry;
804         carry = e[i] + 8;
805         carry >>= 4;
806         e[i] -= carry * ((signed char) 1 << 4);
807     }
808     e[63] += carry;
809     /* each e[i] is between -8 and 8 */
810 
811     ge25519_p3_0(h);
812 
813     for (i = 63; i != 0; i--) {
814         ge25519_select_cached(&t, pi, e[i]);
815         ge25519_add(&r, h, &t);
816 
817         ge25519_p1p1_to_p2(&s, &r);
818         ge25519_p2_dbl(&r, &s);
819         ge25519_p1p1_to_p2(&s, &r);
820         ge25519_p2_dbl(&r, &s);
821         ge25519_p1p1_to_p2(&s, &r);
822         ge25519_p2_dbl(&r, &s);
823         ge25519_p1p1_to_p2(&s, &r);
824         ge25519_p2_dbl(&r, &s);
825 
826         ge25519_p1p1_to_p3(h, &r);  /* *16 */
827     }
828     ge25519_select_cached(&t, pi, e[i]);
829     ge25519_add(&r, h, &t);
830 
831     ge25519_p1p1_to_p3(h, &r);
832 }
833 
834 /*
835  h = a * B (with precomputation)
836  where a = a[0]+256*a[1]+...+256^31 a[31]
837  B is the Ed25519 base point (x,4/5) with x positive
838  (as bytes: 0x5866666666666666666666666666666666666666666666666666666666666666)
839 
840  Preconditions:
841  a[31] <= 127
842  */
843 
844 void
845 ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a)
846 {
847     signed char     e[64];
848     signed char     carry;
849     ge25519_p1p1    r;
850     ge25519_p2      s;
851     ge25519_precomp t;
852     int             i;
853 
854     for (i = 0; i < 32; ++i) {
855         e[2 * i + 0] = (a[i] >> 0) & 15;
856         e[2 * i + 1] = (a[i] >> 4) & 15;
857     }
858     /* each e[i] is between 0 and 15 */
859     /* e[63] is between 0 and 7 */
860 
861     carry = 0;
862     for (i = 0; i < 63; ++i) {
863         e[i] += carry;
864         carry = e[i] + 8;
865         carry >>= 4;
866         e[i] -= carry * ((signed char) 1 << 4);
867     }
868     e[63] += carry;
869     /* each e[i] is between -8 and 8 */
870 
871     ge25519_p3_0(h);
872 
873     for (i = 1; i < 64; i += 2) {
874         ge25519_select_base(&t, i / 2, e[i]);
875         ge25519_madd(&r, h, &t);
876         ge25519_p1p1_to_p3(h, &r);
877     }
878 
879     ge25519_p3_dbl(&r, h);
880     ge25519_p1p1_to_p2(&s, &r);
881     ge25519_p2_dbl(&r, &s);
882     ge25519_p1p1_to_p2(&s, &r);
883     ge25519_p2_dbl(&r, &s);
884     ge25519_p1p1_to_p2(&s, &r);
885     ge25519_p2_dbl(&r, &s);
886     ge25519_p1p1_to_p3(h, &r);
887 
888     for (i = 0; i < 64; i += 2) {
889         ge25519_select_base(&t, i / 2, e[i]);
890         ge25519_madd(&r, h, &t);
891         ge25519_p1p1_to_p3(h, &r);
892     }
893 }
894 
895 /* multiply by the order of the main subgroup l = 2^252+27742317777372353535851937790883648493 */
896 static void
897 ge25519_mul_l(ge25519_p3 *r, const ge25519_p3 *A)
898 {
899     static const signed char aslide[253] = {
900         13, 0, 0, 0, 0, -1, 0, 0, 0, 0, -11, 0, 0, 0, 0, 0, 0, -5, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 0, -13, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, -13, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, -13, 0, 0, 0, 0, 0, 0, -3, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 3, 0, 0, 0, 0, -11, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 7, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
901     };
902     ge25519_cached Ai[8];
903     ge25519_p1p1   t;
904     ge25519_p3     u;
905     ge25519_p3     A2;
906     int            i;
907 
908     ge25519_p3_to_cached(&Ai[0], A);
909     ge25519_p3_dbl(&t, A);
910     ge25519_p1p1_to_p3(&A2, &t);
911     ge25519_add(&t, &A2, &Ai[0]);
912     ge25519_p1p1_to_p3(&u, &t);
913     ge25519_p3_to_cached(&Ai[1], &u);
914     ge25519_add(&t, &A2, &Ai[1]);
915     ge25519_p1p1_to_p3(&u, &t);
916     ge25519_p3_to_cached(&Ai[2], &u);
917     ge25519_add(&t, &A2, &Ai[2]);
918     ge25519_p1p1_to_p3(&u, &t);
919     ge25519_p3_to_cached(&Ai[3], &u);
920     ge25519_add(&t, &A2, &Ai[3]);
921     ge25519_p1p1_to_p3(&u, &t);
922     ge25519_p3_to_cached(&Ai[4], &u);
923     ge25519_add(&t, &A2, &Ai[4]);
924     ge25519_p1p1_to_p3(&u, &t);
925     ge25519_p3_to_cached(&Ai[5], &u);
926     ge25519_add(&t, &A2, &Ai[5]);
927     ge25519_p1p1_to_p3(&u, &t);
928     ge25519_p3_to_cached(&Ai[6], &u);
929     ge25519_add(&t, &A2, &Ai[6]);
930     ge25519_p1p1_to_p3(&u, &t);
931     ge25519_p3_to_cached(&Ai[7], &u);
932 
933     ge25519_p3_0(r);
934 
935     for (i = 252; i >= 0; --i) {
936         ge25519_p3_dbl(&t, r);
937 
938         if (aslide[i] > 0) {
939             ge25519_p1p1_to_p3(&u, &t);
940             ge25519_add(&t, &u, &Ai[aslide[i] / 2]);
941         } else if (aslide[i] < 0) {
942             ge25519_p1p1_to_p3(&u, &t);
943             ge25519_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
944         }
945 
946         ge25519_p1p1_to_p3(r, &t);
947     }
948 }
949 
950 int
951 ge25519_is_on_curve(const ge25519_p3 *p)
952 {
953     fe25519 x2;
954     fe25519 y2;
955     fe25519 z2;
956     fe25519 z4;
957     fe25519 t0;
958     fe25519 t1;
959 
960     fe25519_sq(x2, p->X);
961     fe25519_sq(y2, p->Y);
962     fe25519_sq(z2, p->Z);
963     fe25519_sub(t0, y2, x2);
964     fe25519_mul(t0, t0, z2);
965 
966     fe25519_mul(t1, x2, y2);
967     fe25519_mul(t1, t1, d);
968     fe25519_sq(z4, z2);
969     fe25519_add(t1, t1, z4);
970     fe25519_sub(t0, t0, t1);
971 
972     return fe25519_iszero(t0);
973 }
974 
975 int
976 ge25519_is_on_main_subgroup(const ge25519_p3 *p)
977 {
978     ge25519_p3 pl;
979 
980     ge25519_mul_l(&pl, p);
981 
982     return fe25519_iszero(pl.X);
983 }
984 
985 int
986 ge25519_is_canonical(const unsigned char *s)
987 {
988     unsigned char c;
989     unsigned char d;
990     unsigned int  i;
991 
992     c = (s[31] & 0x7f) ^ 0x7f;
993     for (i = 30; i > 0; i--) {
994         c |= s[i] ^ 0xff;
995     }
996     c = (((unsigned int) c) - 1U) >> 8;
997     d = (0xed - 1U - (unsigned int) s[0]) >> 8;
998 
999     return 1 - (c & d & 1);
1000 }
1001 
1002 int
1003 ge25519_has_small_order(const unsigned char s[32])
1004 {
1005     CRYPTO_ALIGN(16)
1006     static const unsigned char blacklist[][32] = {
1007         /* 0 (order 4) */
1008         { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1009           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1010           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1011         /* 1 (order 1) */
1012         { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1013           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1014           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
1015         /* 2707385501144840649318225287225658788936804267575313519463743609750303402022
1016            (order 8) */
1017         { 0x26, 0xe8, 0x95, 0x8f, 0xc2, 0xb2, 0x27, 0xb0, 0x45, 0xc3, 0xf4,
1018           0x89, 0xf2, 0xef, 0x98, 0xf0, 0xd5, 0xdf, 0xac, 0x05, 0xd3, 0xc6,
1019           0x33, 0x39, 0xb1, 0x38, 0x02, 0x88, 0x6d, 0x53, 0xfc, 0x05 },
1020         /* 55188659117513257062467267217118295137698188065244968500265048394206261417927
1021            (order 8) */
1022         { 0xc7, 0x17, 0x6a, 0x70, 0x3d, 0x4d, 0xd8, 0x4f, 0xba, 0x3c, 0x0b,
1023           0x76, 0x0d, 0x10, 0x67, 0x0f, 0x2a, 0x20, 0x53, 0xfa, 0x2c, 0x39,
1024           0xcc, 0xc6, 0x4e, 0xc7, 0xfd, 0x77, 0x92, 0xac, 0x03, 0x7a },
1025         /* p-1 (order 2) */
1026         { 0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1027           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1028           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
1029         /* p (=0, order 4) */
1030         { 0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1031           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1032           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
1033         /* p+1 (=1, order 1) */
1034         { 0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1035           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1036           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }
1037     };
1038     unsigned char c[7] = { 0 };
1039     unsigned int  k;
1040     size_t        i, j;
1041 
1042     COMPILER_ASSERT(7 == sizeof blacklist / sizeof blacklist[0]);
1043     for (j = 0; j < 31; j++) {
1044         for (i = 0; i < sizeof blacklist / sizeof blacklist[0]; i++) {
1045             c[i] |= s[j] ^ blacklist[i][j];
1046         }
1047     }
1048     for (i = 0; i < sizeof blacklist / sizeof blacklist[0]; i++) {
1049         c[i] |= (s[j] & 0x7f) ^ blacklist[i][j];
1050     }
1051     k = 0;
1052     for (i = 0; i < sizeof blacklist / sizeof blacklist[0]; i++) {
1053         k |= (c[i] - 1);
1054     }
1055     return (int) ((k >> 8) & 1);
1056 }
1057 
1058 /*
1059  Input:
1060  a[0]+256*a[1]+...+256^31*a[31] = a
1061  b[0]+256*b[1]+...+256^31*b[31] = b
1062  c[0]+256*c[1]+...+256^31*c[31] = c
1063  *
1064  Output:
1065  s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1066  where l = 2^252 + 27742317777372353535851937790883648493.
1067  */
1068 
1069 void
1070 sc25519_muladd(unsigned char *s, const unsigned char *a,
1071                const unsigned char *b, const unsigned char *c)
1072 {
1073     int64_t a0  = 2097151 & load_3(a);
1074     int64_t a1  = 2097151 & (load_4(a + 2) >> 5);
1075     int64_t a2  = 2097151 & (load_3(a + 5) >> 2);
1076     int64_t a3  = 2097151 & (load_4(a + 7) >> 7);
1077     int64_t a4  = 2097151 & (load_4(a + 10) >> 4);
1078     int64_t a5  = 2097151 & (load_3(a + 13) >> 1);
1079     int64_t a6  = 2097151 & (load_4(a + 15) >> 6);
1080     int64_t a7  = 2097151 & (load_3(a + 18) >> 3);
1081     int64_t a8  = 2097151 & load_3(a + 21);
1082     int64_t a9  = 2097151 & (load_4(a + 23) >> 5);
1083     int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
1084     int64_t a11 = (load_4(a + 28) >> 7);
1085 
1086     int64_t b0  = 2097151 & load_3(b);
1087     int64_t b1  = 2097151 & (load_4(b + 2) >> 5);
1088     int64_t b2  = 2097151 & (load_3(b + 5) >> 2);
1089     int64_t b3  = 2097151 & (load_4(b + 7) >> 7);
1090     int64_t b4  = 2097151 & (load_4(b + 10) >> 4);
1091     int64_t b5  = 2097151 & (load_3(b + 13) >> 1);
1092     int64_t b6  = 2097151 & (load_4(b + 15) >> 6);
1093     int64_t b7  = 2097151 & (load_3(b + 18) >> 3);
1094     int64_t b8  = 2097151 & load_3(b + 21);
1095     int64_t b9  = 2097151 & (load_4(b + 23) >> 5);
1096     int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
1097     int64_t b11 = (load_4(b + 28) >> 7);
1098 
1099     int64_t c0  = 2097151 & load_3(c);
1100     int64_t c1  = 2097151 & (load_4(c + 2) >> 5);
1101     int64_t c2  = 2097151 & (load_3(c + 5) >> 2);
1102     int64_t c3  = 2097151 & (load_4(c + 7) >> 7);
1103     int64_t c4  = 2097151 & (load_4(c + 10) >> 4);
1104     int64_t c5  = 2097151 & (load_3(c + 13) >> 1);
1105     int64_t c6  = 2097151 & (load_4(c + 15) >> 6);
1106     int64_t c7  = 2097151 & (load_3(c + 18) >> 3);
1107     int64_t c8  = 2097151 & load_3(c + 21);
1108     int64_t c9  = 2097151 & (load_4(c + 23) >> 5);
1109     int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
1110     int64_t c11 = (load_4(c + 28) >> 7);
1111 
1112     int64_t s0;
1113     int64_t s1;
1114     int64_t s2;
1115     int64_t s3;
1116     int64_t s4;
1117     int64_t s5;
1118     int64_t s6;
1119     int64_t s7;
1120     int64_t s8;
1121     int64_t s9;
1122     int64_t s10;
1123     int64_t s11;
1124     int64_t s12;
1125     int64_t s13;
1126     int64_t s14;
1127     int64_t s15;
1128     int64_t s16;
1129     int64_t s17;
1130     int64_t s18;
1131     int64_t s19;
1132     int64_t s20;
1133     int64_t s21;
1134     int64_t s22;
1135     int64_t s23;
1136 
1137     int64_t carry0;
1138     int64_t carry1;
1139     int64_t carry2;
1140     int64_t carry3;
1141     int64_t carry4;
1142     int64_t carry5;
1143     int64_t carry6;
1144     int64_t carry7;
1145     int64_t carry8;
1146     int64_t carry9;
1147     int64_t carry10;
1148     int64_t carry11;
1149     int64_t carry12;
1150     int64_t carry13;
1151     int64_t carry14;
1152     int64_t carry15;
1153     int64_t carry16;
1154     int64_t carry17;
1155     int64_t carry18;
1156     int64_t carry19;
1157     int64_t carry20;
1158     int64_t carry21;
1159     int64_t carry22;
1160 
1161     s0 = c0 + a0 * b0;
1162     s1 = c1 + a0 * b1 + a1 * b0;
1163     s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
1164     s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
1165     s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
1166     s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
1167     s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 +
1168          a6 * b0;
1169     s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
1170          a6 * b1 + a7 * b0;
1171     s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
1172          a6 * b2 + a7 * b1 + a8 * b0;
1173     s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
1174          a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
1175     s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
1176           a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
1177     s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
1178           a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
1179     s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 +
1180           a7 * b5 + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
1181     s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 +
1182           a8 * b5 + a9 * b4 + a10 * b3 + a11 * b2;
1183     s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 +
1184           a9 * b5 + a10 * b4 + a11 * b3;
1185     s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 +
1186           a10 * b5 + a11 * b4;
1187     s16 =
1188         a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
1189     s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
1190     s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
1191     s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
1192     s20 = a9 * b11 + a10 * b10 + a11 * b9;
1193     s21 = a10 * b11 + a11 * b10;
1194     s22 = a11 * b11;
1195     s23 = 0;
1196 
1197     carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
1198     s1 += carry0;
1199     s0 -= carry0 * ((uint64_t) 1L << 21);
1200     carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
1201     s3 += carry2;
1202     s2 -= carry2 * ((uint64_t) 1L << 21);
1203     carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
1204     s5 += carry4;
1205     s4 -= carry4 * ((uint64_t) 1L << 21);
1206     carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
1207     s7 += carry6;
1208     s6 -= carry6 * ((uint64_t) 1L << 21);
1209     carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
1210     s9 += carry8;
1211     s8 -= carry8 * ((uint64_t) 1L << 21);
1212     carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
1213     s11 += carry10;
1214     s10 -= carry10 * ((uint64_t) 1L << 21);
1215     carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
1216     s13 += carry12;
1217     s12 -= carry12 * ((uint64_t) 1L << 21);
1218     carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
1219     s15 += carry14;
1220     s14 -= carry14 * ((uint64_t) 1L << 21);
1221     carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
1222     s17 += carry16;
1223     s16 -= carry16 * ((uint64_t) 1L << 21);
1224     carry18 = (s18 + (int64_t) (1L << 20)) >> 21;
1225     s19 += carry18;
1226     s18 -= carry18 * ((uint64_t) 1L << 21);
1227     carry20 = (s20 + (int64_t) (1L << 20)) >> 21;
1228     s21 += carry20;
1229     s20 -= carry20 * ((uint64_t) 1L << 21);
1230     carry22 = (s22 + (int64_t) (1L << 20)) >> 21;
1231     s23 += carry22;
1232     s22 -= carry22 * ((uint64_t) 1L << 21);
1233 
1234     carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
1235     s2 += carry1;
1236     s1 -= carry1 * ((uint64_t) 1L << 21);
1237     carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
1238     s4 += carry3;
1239     s3 -= carry3 * ((uint64_t) 1L << 21);
1240     carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
1241     s6 += carry5;
1242     s5 -= carry5 * ((uint64_t) 1L << 21);
1243     carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
1244     s8 += carry7;
1245     s7 -= carry7 * ((uint64_t) 1L << 21);
1246     carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
1247     s10 += carry9;
1248     s9 -= carry9 * ((uint64_t) 1L << 21);
1249     carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
1250     s12 += carry11;
1251     s11 -= carry11 * ((uint64_t) 1L << 21);
1252     carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
1253     s14 += carry13;
1254     s13 -= carry13 * ((uint64_t) 1L << 21);
1255     carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
1256     s16 += carry15;
1257     s15 -= carry15 * ((uint64_t) 1L << 21);
1258     carry17 = (s17 + (int64_t) (1L << 20)) >> 21;
1259     s18 += carry17;
1260     s17 -= carry17 * ((uint64_t) 1L << 21);
1261     carry19 = (s19 + (int64_t) (1L << 20)) >> 21;
1262     s20 += carry19;
1263     s19 -= carry19 * ((uint64_t) 1L << 21);
1264     carry21 = (s21 + (int64_t) (1L << 20)) >> 21;
1265     s22 += carry21;
1266     s21 -= carry21 * ((uint64_t) 1L << 21);
1267 
1268     s11 += s23 * 666643;
1269     s12 += s23 * 470296;
1270     s13 += s23 * 654183;
1271     s14 -= s23 * 997805;
1272     s15 += s23 * 136657;
1273     s16 -= s23 * 683901;
1274 
1275     s10 += s22 * 666643;
1276     s11 += s22 * 470296;
1277     s12 += s22 * 654183;
1278     s13 -= s22 * 997805;
1279     s14 += s22 * 136657;
1280     s15 -= s22 * 683901;
1281 
1282     s9 += s21 * 666643;
1283     s10 += s21 * 470296;
1284     s11 += s21 * 654183;
1285     s12 -= s21 * 997805;
1286     s13 += s21 * 136657;
1287     s14 -= s21 * 683901;
1288 
1289     s8 += s20 * 666643;
1290     s9 += s20 * 470296;
1291     s10 += s20 * 654183;
1292     s11 -= s20 * 997805;
1293     s12 += s20 * 136657;
1294     s13 -= s20 * 683901;
1295 
1296     s7 += s19 * 666643;
1297     s8 += s19 * 470296;
1298     s9 += s19 * 654183;
1299     s10 -= s19 * 997805;
1300     s11 += s19 * 136657;
1301     s12 -= s19 * 683901;
1302 
1303     s6 += s18 * 666643;
1304     s7 += s18 * 470296;
1305     s8 += s18 * 654183;
1306     s9 -= s18 * 997805;
1307     s10 += s18 * 136657;
1308     s11 -= s18 * 683901;
1309 
1310     carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
1311     s7 += carry6;
1312     s6 -= carry6 * ((uint64_t) 1L << 21);
1313     carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
1314     s9 += carry8;
1315     s8 -= carry8 * ((uint64_t) 1L << 21);
1316     carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
1317     s11 += carry10;
1318     s10 -= carry10 * ((uint64_t) 1L << 21);
1319     carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
1320     s13 += carry12;
1321     s12 -= carry12 * ((uint64_t) 1L << 21);
1322     carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
1323     s15 += carry14;
1324     s14 -= carry14 * ((uint64_t) 1L << 21);
1325     carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
1326     s17 += carry16;
1327     s16 -= carry16 * ((uint64_t) 1L << 21);
1328 
1329     carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
1330     s8 += carry7;
1331     s7 -= carry7 * ((uint64_t) 1L << 21);
1332     carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
1333     s10 += carry9;
1334     s9 -= carry9 * ((uint64_t) 1L << 21);
1335     carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
1336     s12 += carry11;
1337     s11 -= carry11 * ((uint64_t) 1L << 21);
1338     carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
1339     s14 += carry13;
1340     s13 -= carry13 * ((uint64_t) 1L << 21);
1341     carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
1342     s16 += carry15;
1343     s15 -= carry15 * ((uint64_t) 1L << 21);
1344 
1345     s5 += s17 * 666643;
1346     s6 += s17 * 470296;
1347     s7 += s17 * 654183;
1348     s8 -= s17 * 997805;
1349     s9 += s17 * 136657;
1350     s10 -= s17 * 683901;
1351 
1352     s4 += s16 * 666643;
1353     s5 += s16 * 470296;
1354     s6 += s16 * 654183;
1355     s7 -= s16 * 997805;
1356     s8 += s16 * 136657;
1357     s9 -= s16 * 683901;
1358 
1359     s3 += s15 * 666643;
1360     s4 += s15 * 470296;
1361     s5 += s15 * 654183;
1362     s6 -= s15 * 997805;
1363     s7 += s15 * 136657;
1364     s8 -= s15 * 683901;
1365 
1366     s2 += s14 * 666643;
1367     s3 += s14 * 470296;
1368     s4 += s14 * 654183;
1369     s5 -= s14 * 997805;
1370     s6 += s14 * 136657;
1371     s7 -= s14 * 683901;
1372 
1373     s1 += s13 * 666643;
1374     s2 += s13 * 470296;
1375     s3 += s13 * 654183;
1376     s4 -= s13 * 997805;
1377     s5 += s13 * 136657;
1378     s6 -= s13 * 683901;
1379 
1380     s0 += s12 * 666643;
1381     s1 += s12 * 470296;
1382     s2 += s12 * 654183;
1383     s3 -= s12 * 997805;
1384     s4 += s12 * 136657;
1385     s5 -= s12 * 683901;
1386     s12 = 0;
1387 
1388     carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
1389     s1 += carry0;
1390     s0 -= carry0 * ((uint64_t) 1L << 21);
1391     carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
1392     s3 += carry2;
1393     s2 -= carry2 * ((uint64_t) 1L << 21);
1394     carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
1395     s5 += carry4;
1396     s4 -= carry4 * ((uint64_t) 1L << 21);
1397     carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
1398     s7 += carry6;
1399     s6 -= carry6 * ((uint64_t) 1L << 21);
1400     carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
1401     s9 += carry8;
1402     s8 -= carry8 * ((uint64_t) 1L << 21);
1403     carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
1404     s11 += carry10;
1405     s10 -= carry10 * ((uint64_t) 1L << 21);
1406 
1407     carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
1408     s2 += carry1;
1409     s1 -= carry1 * ((uint64_t) 1L << 21);
1410     carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
1411     s4 += carry3;
1412     s3 -= carry3 * ((uint64_t) 1L << 21);
1413     carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
1414     s6 += carry5;
1415     s5 -= carry5 * ((uint64_t) 1L << 21);
1416     carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
1417     s8 += carry7;
1418     s7 -= carry7 * ((uint64_t) 1L << 21);
1419     carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
1420     s10 += carry9;
1421     s9 -= carry9 * ((uint64_t) 1L << 21);
1422     carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
1423     s12 += carry11;
1424     s11 -= carry11 * ((uint64_t) 1L << 21);
1425 
1426     s0 += s12 * 666643;
1427     s1 += s12 * 470296;
1428     s2 += s12 * 654183;
1429     s3 -= s12 * 997805;
1430     s4 += s12 * 136657;
1431     s5 -= s12 * 683901;
1432     s12 = 0;
1433 
1434     carry0 = s0 >> 21;
1435     s1 += carry0;
1436     s0 -= carry0 * ((uint64_t) 1L << 21);
1437     carry1 = s1 >> 21;
1438     s2 += carry1;
1439     s1 -= carry1 * ((uint64_t) 1L << 21);
1440     carry2 = s2 >> 21;
1441     s3 += carry2;
1442     s2 -= carry2 * ((uint64_t) 1L << 21);
1443     carry3 = s3 >> 21;
1444     s4 += carry3;
1445     s3 -= carry3 * ((uint64_t) 1L << 21);
1446     carry4 = s4 >> 21;
1447     s5 += carry4;
1448     s4 -= carry4 * ((uint64_t) 1L << 21);
1449     carry5 = s5 >> 21;
1450     s6 += carry5;
1451     s5 -= carry5 * ((uint64_t) 1L << 21);
1452     carry6 = s6 >> 21;
1453     s7 += carry6;
1454     s6 -= carry6 * ((uint64_t) 1L << 21);
1455     carry7 = s7 >> 21;
1456     s8 += carry7;
1457     s7 -= carry7 * ((uint64_t) 1L << 21);
1458     carry8 = s8 >> 21;
1459     s9 += carry8;
1460     s8 -= carry8 * ((uint64_t) 1L << 21);
1461     carry9 = s9 >> 21;
1462     s10 += carry9;
1463     s9 -= carry9 * ((uint64_t) 1L << 21);
1464     carry10 = s10 >> 21;
1465     s11 += carry10;
1466     s10 -= carry10 * ((uint64_t) 1L << 21);
1467     carry11 = s11 >> 21;
1468     s12 += carry11;
1469     s11 -= carry11 * ((uint64_t) 1L << 21);
1470 
1471     s0 += s12 * 666643;
1472     s1 += s12 * 470296;
1473     s2 += s12 * 654183;
1474     s3 -= s12 * 997805;
1475     s4 += s12 * 136657;
1476     s5 -= s12 * 683901;
1477 
1478     carry0 = s0 >> 21;
1479     s1 += carry0;
1480     s0 -= carry0 * ((uint64_t) 1L << 21);
1481     carry1 = s1 >> 21;
1482     s2 += carry1;
1483     s1 -= carry1 * ((uint64_t) 1L << 21);
1484     carry2 = s2 >> 21;
1485     s3 += carry2;
1486     s2 -= carry2 * ((uint64_t) 1L << 21);
1487     carry3 = s3 >> 21;
1488     s4 += carry3;
1489     s3 -= carry3 * ((uint64_t) 1L << 21);
1490     carry4 = s4 >> 21;
1491     s5 += carry4;
1492     s4 -= carry4 * ((uint64_t) 1L << 21);
1493     carry5 = s5 >> 21;
1494     s6 += carry5;
1495     s5 -= carry5 * ((uint64_t) 1L << 21);
1496     carry6 = s6 >> 21;
1497     s7 += carry6;
1498     s6 -= carry6 * ((uint64_t) 1L << 21);
1499     carry7 = s7 >> 21;
1500     s8 += carry7;
1501     s7 -= carry7 * ((uint64_t) 1L << 21);
1502     carry8 = s8 >> 21;
1503     s9 += carry8;
1504     s8 -= carry8 * ((uint64_t) 1L << 21);
1505     carry9 = s9 >> 21;
1506     s10 += carry9;
1507     s9 -= carry9 * ((uint64_t) 1L << 21);
1508     carry10 = s10 >> 21;
1509     s11 += carry10;
1510     s10 -= carry10 * ((uint64_t) 1L << 21);
1511 
1512     s[0]  = s0 >> 0;
1513     s[1]  = s0 >> 8;
1514     s[2]  = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
1515     s[3]  = s1 >> 3;
1516     s[4]  = s1 >> 11;
1517     s[5]  = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
1518     s[6]  = s2 >> 6;
1519     s[7]  = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
1520     s[8]  = s3 >> 1;
1521     s[9]  = s3 >> 9;
1522     s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
1523     s[11] = s4 >> 4;
1524     s[12] = s4 >> 12;
1525     s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
1526     s[14] = s5 >> 7;
1527     s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
1528     s[16] = s6 >> 2;
1529     s[17] = s6 >> 10;
1530     s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
1531     s[19] = s7 >> 5;
1532     s[20] = s7 >> 13;
1533     s[21] = s8 >> 0;
1534     s[22] = s8 >> 8;
1535     s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
1536     s[24] = s9 >> 3;
1537     s[25] = s9 >> 11;
1538     s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
1539     s[27] = s10 >> 6;
1540     s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
1541     s[29] = s11 >> 1;
1542     s[30] = s11 >> 9;
1543     s[31] = s11 >> 17;
1544 }
1545 
1546 /*
1547  Input:
1548  s[0]+256*s[1]+...+256^63*s[63] = s
1549  *
1550  Output:
1551  s[0]+256*s[1]+...+256^31*s[31] = s mod l
1552  where l = 2^252 + 27742317777372353535851937790883648493.
1553  Overwrites s in place.
1554  */
1555 
1556 void
1557 sc25519_reduce(unsigned char *s)
1558 {
1559     int64_t s0  = 2097151 & load_3(s);
1560     int64_t s1  = 2097151 & (load_4(s + 2) >> 5);
1561     int64_t s2  = 2097151 & (load_3(s + 5) >> 2);
1562     int64_t s3  = 2097151 & (load_4(s + 7) >> 7);
1563     int64_t s4  = 2097151 & (load_4(s + 10) >> 4);
1564     int64_t s5  = 2097151 & (load_3(s + 13) >> 1);
1565     int64_t s6  = 2097151 & (load_4(s + 15) >> 6);
1566     int64_t s7  = 2097151 & (load_3(s + 18) >> 3);
1567     int64_t s8  = 2097151 & load_3(s + 21);
1568     int64_t s9  = 2097151 & (load_4(s + 23) >> 5);
1569     int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
1570     int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
1571     int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
1572     int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
1573     int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
1574     int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
1575     int64_t s16 = 2097151 & load_3(s + 42);
1576     int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
1577     int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
1578     int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
1579     int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
1580     int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
1581     int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
1582     int64_t s23 = (load_4(s + 60) >> 3);
1583 
1584     int64_t carry0;
1585     int64_t carry1;
1586     int64_t carry2;
1587     int64_t carry3;
1588     int64_t carry4;
1589     int64_t carry5;
1590     int64_t carry6;
1591     int64_t carry7;
1592     int64_t carry8;
1593     int64_t carry9;
1594     int64_t carry10;
1595     int64_t carry11;
1596     int64_t carry12;
1597     int64_t carry13;
1598     int64_t carry14;
1599     int64_t carry15;
1600     int64_t carry16;
1601 
1602     s11 += s23 * 666643;
1603     s12 += s23 * 470296;
1604     s13 += s23 * 654183;
1605     s14 -= s23 * 997805;
1606     s15 += s23 * 136657;
1607     s16 -= s23 * 683901;
1608 
1609     s10 += s22 * 666643;
1610     s11 += s22 * 470296;
1611     s12 += s22 * 654183;
1612     s13 -= s22 * 997805;
1613     s14 += s22 * 136657;
1614     s15 -= s22 * 683901;
1615 
1616     s9 += s21 * 666643;
1617     s10 += s21 * 470296;
1618     s11 += s21 * 654183;
1619     s12 -= s21 * 997805;
1620     s13 += s21 * 136657;
1621     s14 -= s21 * 683901;
1622 
1623     s8 += s20 * 666643;
1624     s9 += s20 * 470296;
1625     s10 += s20 * 654183;
1626     s11 -= s20 * 997805;
1627     s12 += s20 * 136657;
1628     s13 -= s20 * 683901;
1629 
1630     s7 += s19 * 666643;
1631     s8 += s19 * 470296;
1632     s9 += s19 * 654183;
1633     s10 -= s19 * 997805;
1634     s11 += s19 * 136657;
1635     s12 -= s19 * 683901;
1636 
1637     s6 += s18 * 666643;
1638     s7 += s18 * 470296;
1639     s8 += s18 * 654183;
1640     s9 -= s18 * 997805;
1641     s10 += s18 * 136657;
1642     s11 -= s18 * 683901;
1643 
1644     carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
1645     s7 += carry6;
1646     s6 -= carry6 * ((uint64_t) 1L << 21);
1647     carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
1648     s9 += carry8;
1649     s8 -= carry8 * ((uint64_t) 1L << 21);
1650     carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
1651     s11 += carry10;
1652     s10 -= carry10 * ((uint64_t) 1L << 21);
1653     carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
1654     s13 += carry12;
1655     s12 -= carry12 * ((uint64_t) 1L << 21);
1656     carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
1657     s15 += carry14;
1658     s14 -= carry14 * ((uint64_t) 1L << 21);
1659     carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
1660     s17 += carry16;
1661     s16 -= carry16 * ((uint64_t) 1L << 21);
1662 
1663     carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
1664     s8 += carry7;
1665     s7 -= carry7 * ((uint64_t) 1L << 21);
1666     carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
1667     s10 += carry9;
1668     s9 -= carry9 * ((uint64_t) 1L << 21);
1669     carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
1670     s12 += carry11;
1671     s11 -= carry11 * ((uint64_t) 1L << 21);
1672     carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
1673     s14 += carry13;
1674     s13 -= carry13 * ((uint64_t) 1L << 21);
1675     carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
1676     s16 += carry15;
1677     s15 -= carry15 * ((uint64_t) 1L << 21);
1678 
1679     s5 += s17 * 666643;
1680     s6 += s17 * 470296;
1681     s7 += s17 * 654183;
1682     s8 -= s17 * 997805;
1683     s9 += s17 * 136657;
1684     s10 -= s17 * 683901;
1685 
1686     s4 += s16 * 666643;
1687     s5 += s16 * 470296;
1688     s6 += s16 * 654183;
1689     s7 -= s16 * 997805;
1690     s8 += s16 * 136657;
1691     s9 -= s16 * 683901;
1692 
1693     s3 += s15 * 666643;
1694     s4 += s15 * 470296;
1695     s5 += s15 * 654183;
1696     s6 -= s15 * 997805;
1697     s7 += s15 * 136657;
1698     s8 -= s15 * 683901;
1699 
1700     s2 += s14 * 666643;
1701     s3 += s14 * 470296;
1702     s4 += s14 * 654183;
1703     s5 -= s14 * 997805;
1704     s6 += s14 * 136657;
1705     s7 -= s14 * 683901;
1706 
1707     s1 += s13 * 666643;
1708     s2 += s13 * 470296;
1709     s3 += s13 * 654183;
1710     s4 -= s13 * 997805;
1711     s5 += s13 * 136657;
1712     s6 -= s13 * 683901;
1713 
1714     s0 += s12 * 666643;
1715     s1 += s12 * 470296;
1716     s2 += s12 * 654183;
1717     s3 -= s12 * 997805;
1718     s4 += s12 * 136657;
1719     s5 -= s12 * 683901;
1720     s12 = 0;
1721 
1722     carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
1723     s1 += carry0;
1724     s0 -= carry0 * ((uint64_t) 1L << 21);
1725     carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
1726     s3 += carry2;
1727     s2 -= carry2 * ((uint64_t) 1L << 21);
1728     carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
1729     s5 += carry4;
1730     s4 -= carry4 * ((uint64_t) 1L << 21);
1731     carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
1732     s7 += carry6;
1733     s6 -= carry6 * ((uint64_t) 1L << 21);
1734     carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
1735     s9 += carry8;
1736     s8 -= carry8 * ((uint64_t) 1L << 21);
1737     carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
1738     s11 += carry10;
1739     s10 -= carry10 * ((uint64_t) 1L << 21);
1740 
1741     carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
1742     s2 += carry1;
1743     s1 -= carry1 * ((uint64_t) 1L << 21);
1744     carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
1745     s4 += carry3;
1746     s3 -= carry3 * ((uint64_t) 1L << 21);
1747     carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
1748     s6 += carry5;
1749     s5 -= carry5 * ((uint64_t) 1L << 21);
1750     carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
1751     s8 += carry7;
1752     s7 -= carry7 * ((uint64_t) 1L << 21);
1753     carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
1754     s10 += carry9;
1755     s9 -= carry9 * ((uint64_t) 1L << 21);
1756     carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
1757     s12 += carry11;
1758     s11 -= carry11 * ((uint64_t) 1L << 21);
1759 
1760     s0 += s12 * 666643;
1761     s1 += s12 * 470296;
1762     s2 += s12 * 654183;
1763     s3 -= s12 * 997805;
1764     s4 += s12 * 136657;
1765     s5 -= s12 * 683901;
1766     s12 = 0;
1767 
1768     carry0 = s0 >> 21;
1769     s1 += carry0;
1770     s0 -= carry0 * ((uint64_t) 1L << 21);
1771     carry1 = s1 >> 21;
1772     s2 += carry1;
1773     s1 -= carry1 * ((uint64_t) 1L << 21);
1774     carry2 = s2 >> 21;
1775     s3 += carry2;
1776     s2 -= carry2 * ((uint64_t) 1L << 21);
1777     carry3 = s3 >> 21;
1778     s4 += carry3;
1779     s3 -= carry3 * ((uint64_t) 1L << 21);
1780     carry4 = s4 >> 21;
1781     s5 += carry4;
1782     s4 -= carry4 * ((uint64_t) 1L << 21);
1783     carry5 = s5 >> 21;
1784     s6 += carry5;
1785     s5 -= carry5 * ((uint64_t) 1L << 21);
1786     carry6 = s6 >> 21;
1787     s7 += carry6;
1788     s6 -= carry6 * ((uint64_t) 1L << 21);
1789     carry7 = s7 >> 21;
1790     s8 += carry7;
1791     s7 -= carry7 * ((uint64_t) 1L << 21);
1792     carry8 = s8 >> 21;
1793     s9 += carry8;
1794     s8 -= carry8 * ((uint64_t) 1L << 21);
1795     carry9 = s9 >> 21;
1796     s10 += carry9;
1797     s9 -= carry9 * ((uint64_t) 1L << 21);
1798     carry10 = s10 >> 21;
1799     s11 += carry10;
1800     s10 -= carry10 * ((uint64_t) 1L << 21);
1801     carry11 = s11 >> 21;
1802     s12 += carry11;
1803     s11 -= carry11 * ((uint64_t) 1L << 21);
1804 
1805     s0 += s12 * 666643;
1806     s1 += s12 * 470296;
1807     s2 += s12 * 654183;
1808     s3 -= s12 * 997805;
1809     s4 += s12 * 136657;
1810     s5 -= s12 * 683901;
1811 
1812     carry0 = s0 >> 21;
1813     s1 += carry0;
1814     s0 -= carry0 * ((uint64_t) 1L << 21);
1815     carry1 = s1 >> 21;
1816     s2 += carry1;
1817     s1 -= carry1 * ((uint64_t) 1L << 21);
1818     carry2 = s2 >> 21;
1819     s3 += carry2;
1820     s2 -= carry2 * ((uint64_t) 1L << 21);
1821     carry3 = s3 >> 21;
1822     s4 += carry3;
1823     s3 -= carry3 * ((uint64_t) 1L << 21);
1824     carry4 = s4 >> 21;
1825     s5 += carry4;
1826     s4 -= carry4 * ((uint64_t) 1L << 21);
1827     carry5 = s5 >> 21;
1828     s6 += carry5;
1829     s5 -= carry5 * ((uint64_t) 1L << 21);
1830     carry6 = s6 >> 21;
1831     s7 += carry6;
1832     s6 -= carry6 * ((uint64_t) 1L << 21);
1833     carry7 = s7 >> 21;
1834     s8 += carry7;
1835     s7 -= carry7 * ((uint64_t) 1L << 21);
1836     carry8 = s8 >> 21;
1837     s9 += carry8;
1838     s8 -= carry8 * ((uint64_t) 1L << 21);
1839     carry9 = s9 >> 21;
1840     s10 += carry9;
1841     s9 -= carry9 * ((uint64_t) 1L << 21);
1842     carry10 = s10 >> 21;
1843     s11 += carry10;
1844     s10 -= carry10 * ((uint64_t) 1L << 21);
1845 
1846     s[0]  = s0 >> 0;
1847     s[1]  = s0 >> 8;
1848     s[2]  = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
1849     s[3]  = s1 >> 3;
1850     s[4]  = s1 >> 11;
1851     s[5]  = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
1852     s[6]  = s2 >> 6;
1853     s[7]  = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
1854     s[8]  = s3 >> 1;
1855     s[9]  = s3 >> 9;
1856     s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
1857     s[11] = s4 >> 4;
1858     s[12] = s4 >> 12;
1859     s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
1860     s[14] = s5 >> 7;
1861     s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
1862     s[16] = s6 >> 2;
1863     s[17] = s6 >> 10;
1864     s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
1865     s[19] = s7 >> 5;
1866     s[20] = s7 >> 13;
1867     s[21] = s8 >> 0;
1868     s[22] = s8 >> 8;
1869     s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
1870     s[24] = s9 >> 3;
1871     s[25] = s9 >> 11;
1872     s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
1873     s[27] = s10 >> 6;
1874     s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
1875     s[29] = s11 >> 1;
1876     s[30] = s11 >> 9;
1877     s[31] = s11 >> 17;
1878 }
1879 
1880 int
1881 sc25519_is_canonical(const unsigned char *s)
1882 {
1883     /* 2^252+27742317777372353535851937790883648493 */
1884     static const unsigned char L[32] = {
1885         0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7,
1886         0xa2, 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1887         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
1888     };
1889     unsigned char c = 0;
1890     unsigned char n = 1;
1891     unsigned int  i = 32;
1892 
1893     do {
1894         i--;
1895         c |= ((s[i] - L[i]) >> 8) & n;
1896         n &= ((s[i] ^ L[i]) - 1) >> 8;
1897     } while (i != 0);
1898 
1899     return (c != 0);
1900 }
1901 
1902 static void
1903 chi25519(fe25519 out, const fe25519 z)
1904 {
1905     fe25519 t0, t1, t2, t3;
1906     int     i;
1907 
1908     fe25519_sq(t0, z);
1909     fe25519_mul(t1, t0, z);
1910     fe25519_sq(t0, t1);
1911     fe25519_sq(t2, t0);
1912     fe25519_sq(t2, t2);
1913     fe25519_mul(t2, t2, t0);
1914     fe25519_mul(t1, t2, z);
1915     fe25519_sq(t2, t1);
1916 
1917     for (i = 1; i < 5; i++) {
1918         fe25519_sq(t2, t2);
1919     }
1920     fe25519_mul(t1, t2, t1);
1921     fe25519_sq(t2, t1);
1922     for (i = 1; i < 10; i++) {
1923         fe25519_sq(t2, t2);
1924     }
1925     fe25519_mul(t2, t2, t1);
1926     fe25519_sq(t3, t2);
1927     for (i = 1; i < 20; i++) {
1928         fe25519_sq(t3, t3);
1929     }
1930     fe25519_mul(t2, t3, t2);
1931     fe25519_sq(t2, t2);
1932     for (i = 1; i < 10; i++) {
1933         fe25519_sq(t2, t2);
1934     }
1935     fe25519_mul(t1, t2, t1);
1936     fe25519_sq(t2, t1);
1937     for (i = 1; i < 50; i++) {
1938         fe25519_sq(t2, t2);
1939     }
1940     fe25519_mul(t2, t2, t1);
1941     fe25519_sq(t3, t2);
1942     for (i = 1; i < 100; i++) {
1943         fe25519_sq(t3, t3);
1944     }
1945     fe25519_mul(t2, t3, t2);
1946     fe25519_sq(t2, t2);
1947     for (i = 1; i < 50; i++) {
1948         fe25519_sq(t2, t2);
1949     }
1950     fe25519_mul(t1, t2, t1);
1951     fe25519_sq(t1, t1);
1952     for (i = 1; i < 4; i++) {
1953         fe25519_sq(t1, t1);
1954     }
1955     fe25519_mul(out, t1, t0);
1956 }
1957 
1958 void
1959 ge25519_from_uniform(unsigned char s[32], const unsigned char r[32])
1960 {
1961     fe25519       e;
1962     fe25519       negx;
1963     fe25519       rr2;
1964     fe25519       x, x2, x3;
1965     ge25519_p3    p3;
1966     ge25519_p1p1  p1;
1967     ge25519_p2    p2;
1968     unsigned int  e_is_minus_1;
1969     unsigned char x_sign;
1970 
1971     memcpy(s, r, 32);
1972     x_sign = s[31] & 0x80;
1973     s[31] &= 0x7f;
1974 
1975     fe25519_frombytes(rr2, s);
1976 
1977     /* elligator */
1978     fe25519_sq2(rr2, rr2);
1979     rr2[0]++;
1980     fe25519_invert(rr2, rr2);
1981     fe25519_mul(x, curve25519_A, rr2);
1982     fe25519_neg(x, x);
1983 
1984     fe25519_sq(x2, x);
1985     fe25519_mul(x3, x, x2);
1986     fe25519_add(e, x3, x);
1987     fe25519_mul(x2, x2, curve25519_A);
1988     fe25519_add(e, x2, e);
1989 
1990     chi25519(e, e);
1991 
1992     fe25519_tobytes(s, e);
1993     e_is_minus_1 = s[1] & 1;
1994     fe25519_neg(negx, x);
1995     fe25519_cmov(x, negx, e_is_minus_1);
1996     fe25519_0(x2);
1997     fe25519_cmov(x2, curve25519_A, e_is_minus_1);
1998     fe25519_sub(x, x, x2);
1999 
2000     /* yed = (x-1)/(x+1) */
2001     {
2002         fe25519 one;
2003         fe25519 x_plus_one;
2004         fe25519 x_plus_one_inv;
2005         fe25519 x_minus_one;
2006         fe25519 yed;
2007 
2008         fe25519_1(one);
2009         fe25519_add(x_plus_one, x, one);
2010         fe25519_sub(x_minus_one, x, one);
2011         fe25519_invert(x_plus_one_inv, x_plus_one);
2012         fe25519_mul(yed, x_minus_one, x_plus_one_inv);
2013         fe25519_tobytes(s, yed);
2014     }
2015 
2016     /* recover x */
2017     s[31] |= x_sign;
2018     if (ge25519_frombytes(&p3, s) != 0) {
2019         abort(); /* LCOV_EXCL_LINE */
2020     }
2021 
2022     /* multiply by the cofactor */
2023     ge25519_p3_dbl(&p1, &p3);
2024     ge25519_p1p1_to_p2(&p2, &p1);
2025     ge25519_p2_dbl(&p1, &p2);
2026     ge25519_p1p1_to_p2(&p2, &p1);
2027     ge25519_p2_dbl(&p1, &p2);
2028     ge25519_p1p1_to_p3(&p3, &p1);
2029 
2030     ge25519_p3_tobytes(s, &p3);
2031 }
2032