1 /**
2  * @file ed448goldilocks/decaf.c
3  * @author Mike Hamburg
4  *
5  * @copyright
6  *   Copyright (c) 2015-2016 Cryptography Research, Inc.  \n
7  *   Released under the MIT License.  See LICENSE.txt for license information.
8  *
9  * @brief Decaf high-level functions.
10  *
11  * @warning This file was automatically generated in Python.
12  * Please do not edit it.
13  */
14 #define _XOPEN_SOURCE 600 /* for posix_memalign */
15 #include "word.h"
16 #include "field.h"
17 
18 #include <decaf.h>
19 #include <decaf/ed448.h>
20 
21 /* MSVC has no builtint ctz, this is a fix as in
22 https://stackoverflow.com/questions/355967/how-to-use-msvc-intrinsics-to-get-the-equivalent-of-this-gcc-code/5468852#5468852
23 */
24 #ifdef _MSC_VER
25 #include <intrin.h>
26 
ctz(uint32_t value)27 uint32_t __inline ctz(uint32_t value)
28 {
29     DWORD trailing_zero = 0;
30     if ( _BitScanForward( &trailing_zero, value ) )
31         return trailing_zero;
32     else
33         return 32;  // This is undefined, I better choose 32 than 0
34 }
35 #define __builtin_ctz(x) ctz(x)
36 #endif
37 
38 /* Template stuff */
39 #define API_NS(_id) decaf_448_##_id
40 #define SCALAR_BITS DECAF_448_SCALAR_BITS
41 #define SCALAR_SER_BYTES DECAF_448_SCALAR_BYTES
42 #define SCALAR_LIMBS DECAF_448_SCALAR_LIMBS
43 #define scalar_t API_NS(scalar_t)
44 #define point_t API_NS(point_t)
45 #define precomputed_s API_NS(precomputed_s)
46 #define IMAGINE_TWIST 0
47 #define COFACTOR 4
48 
49 /* Comb config: number of combs, n, t, s. */
50 #define COMBS_N 5
51 #define COMBS_T 5
52 #define COMBS_S 18
53 #define DECAF_WINDOW_BITS 5
54 #define DECAF_WNAF_FIXED_TABLE_BITS 5
55 #define DECAF_WNAF_VAR_TABLE_BITS 3
56 
57 #define EDDSA_USE_SIGMA_ISOGENY 0
58 
59 static const int EDWARDS_D = -39081;
60 static const scalar_t point_scalarmul_adjustment = {{{
61     SC_LIMB(0xc873d6d54a7bb0cf), SC_LIMB(0xe933d8d723a70aad), SC_LIMB(0xbb124b65129c96fd), SC_LIMB(0x00000008335dc163)
62 }}}, precomputed_scalarmul_adjustment = {{{
63     SC_LIMB(0xc873d6d54a7bb0cf), SC_LIMB(0xe933d8d723a70aad), SC_LIMB(0xbb124b65129c96fd), SC_LIMB(0x00000008335dc163)
64 }}};
65 
66 const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES] = { 0x05 };
67 
68 #define RISTRETTO_FACTOR DECAF_448_RISTRETTO_FACTOR
69 const gf RISTRETTO_FACTOR = {FIELD_LITERAL(
70     0x42ef0f45572736, 0x7bf6aa20ce5296, 0xf4fd6eded26033, 0x968c14ba839a66, 0xb8d54b64a2d780, 0x6aa0a1f1a7b8a5, 0x683bf68d722fa2, 0x22d962fbeb24f7
71 )};
72 
73 #if IMAGINE_TWIST
74 #define TWISTED_D (-(EDWARDS_D))
75 #else
76 #define TWISTED_D ((EDWARDS_D)-1)
77 #endif
78 
79 #if TWISTED_D < 0
80 #define EFF_D (-(TWISTED_D))
81 #define NEG_D 1
82 #else
83 #define EFF_D TWISTED_D
84 #define NEG_D 0
85 #endif
86 
87 /* End of template stuff */
88 
89 /* Sanity */
90 #if (COFACTOR == 8) && !IMAGINE_TWIST && !UNSAFE_CURVE_HAS_POINTS_AT_INFINITY
91 /* FUTURE MAGIC: Curve41417 doesn't have these properties. */
92 #error "Currently require IMAGINE_TWIST (and thus p=5 mod 8) for cofactor 8"
93         /* OK, but why?
94          * Two reasons: #1: There are bugs when COFACTOR == && IMAGINE_TWIST
95          # #2:
96          */
97 #endif
98 
99 #if IMAGINE_TWIST && (P_MOD_8 != 5)
100     #error "Cannot use IMAGINE_TWIST except for p == 5 mod 8"
101 #endif
102 
103 #if (COFACTOR != 8) && (COFACTOR != 4)
104     #error "COFACTOR must be 4 or 8"
105 #endif
106 
107 #if IMAGINE_TWIST
108     extern const gf SQRT_MINUS_ONE;
109 #endif
110 
111 #define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
112 
113 extern const point_t API_NS(point_base);
114 
115 /* Projective Niels coordinates */
116 typedef struct { gf a, b, c; } niels_s, niels_t[1];
117 typedef struct { niels_t n; gf z; } VECTOR_ALIGNED pniels_s, pniels_t[1];
118 
119 /* Precomputed base */
120 struct precomputed_s { niels_t table [COMBS_N<<(COMBS_T-1)]; };
121 
122 extern const gf API_NS(precomputed_base_as_fe)[];
123 const precomputed_s *API_NS(precomputed_base) =
124     (const precomputed_s *) &API_NS(precomputed_base_as_fe);
125 
126 const size_t API_NS(sizeof_precomputed_s) = sizeof(precomputed_s);
127 const size_t API_NS(alignof_precomputed_s) = sizeof(big_register_t);
128 
129 /** Inverse. */
130 static void
gf_invert(gf y,const gf x,int assert_nonzero)131 gf_invert(gf y, const gf x, int assert_nonzero) {
132     gf t1, t2;
133     gf_sqr(t1, x); // o^2
134     mask_t ret = gf_isr(t2, t1); // +-1/sqrt(o^2) = +-1/o
135     (void)ret;
136     if (assert_nonzero) assert(ret);
137     gf_sqr(t1, t2);
138     gf_mul(t2, t1, x); // not direct to y in case of alias.
139     gf_copy(y, t2);
140 }
141 
142 /** identity = (0,1) */
143 const point_t API_NS(point_identity) = {{{{{0}}},{{{1}}},{{{1}}},{{{0}}}}};
144 
145 /* Predeclare because not static: called by elligator */
146 void API_NS(deisogenize) (
147     gf_s *__restrict__ s,
148     gf_s *__restrict__ inv_el_sum,
149     gf_s *__restrict__ inv_el_m1,
150     const point_t p,
151     mask_t toggle_s,
152     mask_t toggle_altx,
153     mask_t toggle_rotation
154 );
155 
API_NS(deisogenize)156 void API_NS(deisogenize) (
157     gf_s *__restrict__ s,
158     gf_s *__restrict__ inv_el_sum,
159     gf_s *__restrict__ inv_el_m1,
160     const point_t p,
161     mask_t toggle_s,
162     mask_t toggle_altx,
163     mask_t toggle_rotation
164 ) {
165 #if COFACTOR == 4 && !IMAGINE_TWIST
166     (void)toggle_rotation; /* Only applies to cofactor 8 */
167     gf t1;
168     gf_s *t2 = s, *t3=inv_el_sum, *t4=inv_el_m1;
169 
170     gf_add(t1,p->x,p->t);
171     gf_sub(t2,p->x,p->t);
172     gf_mul(t3,t1,t2); /* t3 = num */
173     gf_sqr(t2,p->x);
174     gf_mul(t1,t2,t3);
175     gf_mulw(t2,t1,-1-TWISTED_D); /* -x^2 * (a-d) * num */
176     gf_isr(t1,t2);    /* t1 = isr */
177     gf_mul(t2,t1,t3); /* t2 = ratio */
178     gf_mul(t4,t2,RISTRETTO_FACTOR);
179     mask_t negx = gf_lobit(t4) ^ toggle_altx;
180     gf_cond_neg(t2, negx);
181     gf_mul(t3,t2,p->z);
182     gf_sub(t3,t3,p->t);
183     gf_mul(t2,t3,p->x);
184     gf_mulw(t4,t2,-1-TWISTED_D);
185     gf_mul(s,t4,t1);
186     mask_t lobs = gf_lobit(s);
187     gf_cond_neg(s,lobs);
188     gf_copy(inv_el_m1,p->x);
189     gf_cond_neg(inv_el_m1,~lobs^negx^toggle_s);
190     gf_add(inv_el_m1,inv_el_m1,p->t);
191 
192 #elif COFACTOR == 8 && IMAGINE_TWIST
193     /* More complicated because of rotation */
194     gf t1,t2,t3,t4,t5;
195     gf_add(t1,p->z,p->y);
196     gf_sub(t2,p->z,p->y);
197     gf_mul(t3,t1,t2);      /* t3 = num */
198     gf_mul(t2,p->x,p->y);  /* t2 = den */
199     gf_sqr(t1,t2);
200     gf_mul(t4,t1,t3);
201     gf_mulw(t1,t4,-1-TWISTED_D);
202     gf_isr(t4,t1);         /* isqrt(num*(a-d)*den^2) */
203     gf_mul(t1,t2,t4);
204     gf_mul(t2,t1,RISTRETTO_FACTOR); /* t2 = "iden" in ristretto.sage */
205     gf_mul(t1,t3,t4);                 /* t1 = "inum" in ristretto.sage */
206 
207     /* Calculate altxy = iden*inum*i*t^2*(d-a) */
208     gf_mul(t3,t1,t2);
209     gf_mul_i(t4,t3);
210     gf_mul(t3,t4,p->t);
211     gf_mul(t4,t3,p->t);
212     gf_mulw(t3,t4,TWISTED_D+1);      /* iden*inum*i*t^2*(d-1) */
213     mask_t rotate = toggle_rotation ^ gf_lobit(t3);
214 
215     /* Rotate if altxy is negative */
216     gf_cond_swap(t1,t2,rotate);
217     gf_mul_i(t4,p->x);
218     gf_cond_sel(t4,p->y,t4,rotate);  /* t4 = "fac" = ix if rotate, else y */
219 
220     gf_mul_i(t5,RISTRETTO_FACTOR); /* t5 = imi */
221     gf_mul(t3,t5,t2);                /* iden * imi */
222     gf_mul(t2,t5,t1);
223     gf_mul(t5,t2,p->t);              /* "altx" = iden*imi*t */
224     mask_t negx = gf_lobit(t5) ^ toggle_altx;
225 
226     gf_cond_neg(t1,negx^rotate);
227     gf_mul(t2,t1,p->z);
228     gf_add(t2,t2,ONE);
229     gf_mul(inv_el_sum,t2,t4);
230     gf_mul(s,inv_el_sum,t3);
231 
232     mask_t negs = gf_lobit(s);
233     gf_cond_neg(s,negs);
234 
235     mask_t negz = ~negs ^ toggle_s ^ negx;
236     gf_copy(inv_el_m1,p->z);
237     gf_cond_neg(inv_el_m1,negz);
238     gf_sub(inv_el_m1,inv_el_m1,t4);
239 #else
240 #error "Cofactor must be 4 (with no IMAGINE_TWIST) or 8 (with IMAGINE_TWIST)"
241 #endif
242 }
243 
API_NS(point_encode)244 void API_NS(point_encode)( unsigned char ser[SER_BYTES], const point_t p ) {
245     gf s,ie1,ie2;
246     API_NS(deisogenize)(s,ie1,ie2,p,0,0,0);
247     gf_serialize(ser,s,1);
248 }
249 
API_NS(point_decode)250 decaf_error_t API_NS(point_decode) (
251     point_t p,
252     const unsigned char ser[SER_BYTES],
253     decaf_bool_t allow_identity
254 ) {
255     gf s, s2, num, tmp;
256     gf_s *tmp2=s2, *ynum=p->z, *isr=p->x, *den=p->t;
257 
258     mask_t succ = gf_deserialize(s, ser, 1, 0);
259     succ &= bool_to_mask(allow_identity) | ~gf_eq(s, ZERO);
260     succ &= ~gf_lobit(s);
261 
262     gf_sqr(s2,s);                  /* s^2 = -as^2 */
263 #if IMAGINE_TWIST
264     gf_sub(s2,ZERO,s2);            /* -as^2 */
265 #endif
266     gf_sub(den,ONE,s2);            /* 1+as^2 */
267     gf_add(ynum,ONE,s2);           /* 1-as^2 */
268     gf_mulw(num,s2,-4*TWISTED_D);
269     gf_sqr(tmp,den);               /* tmp = den^2 */
270     gf_add(num,tmp,num);           /* num = den^2 - 4*d*s^2 */
271     gf_mul(tmp2,num,tmp);          /* tmp2 = num*den^2 */
272     succ &= gf_isr(isr,tmp2);      /* isr = 1/sqrt(num*den^2) */
273     gf_mul(tmp,isr,den);           /* isr*den */
274     gf_mul(p->y,tmp,ynum);         /* isr*den*(1-as^2) */
275     gf_mul(tmp2,tmp,s);            /* s*isr*den */
276     gf_add(tmp2,tmp2,tmp2);        /* 2*s*isr*den */
277     gf_mul(tmp,tmp2,isr);          /* 2*s*isr^2*den */
278     gf_mul(p->x,tmp,num);          /* 2*s*isr^2*den*num */
279     gf_mul(tmp,tmp2,RISTRETTO_FACTOR); /* 2*s*isr*den*magic */
280     gf_cond_neg(p->x,gf_lobit(tmp)); /* flip x */
281 
282 #if COFACTOR==8
283     /* Additionally check y != 0 and x*y*isomagic nonegative */
284     succ &= ~gf_eq(p->y,ZERO);
285     gf_mul(tmp,p->x,p->y);
286     gf_mul(tmp2,tmp,RISTRETTO_FACTOR);
287     succ &= ~gf_lobit(tmp2);
288 #endif
289 
290 #if IMAGINE_TWIST
291     gf_copy(tmp,p->x);
292     gf_mul_i(p->x,tmp);
293 #endif
294 
295     /* Fill in z and t */
296     gf_copy(p->z,ONE);
297     gf_mul(p->t,p->x,p->y);
298 
299     assert(API_NS(point_valid)(p) | ~succ);
300     return decaf_succeed_if(mask_to_bool(succ));
301 }
302 
API_NS(point_sub)303 void API_NS(point_sub) (
304     point_t p,
305     const point_t q,
306     const point_t r
307 ) {
308     gf a, b, c, d;
309     gf_sub_nr ( b, q->y, q->x ); /* 3+e */
310     gf_sub_nr ( d, r->y, r->x ); /* 3+e */
311     gf_add_nr ( c, r->y, r->x ); /* 2+e */
312     gf_mul ( a, c, b );
313     gf_add_nr ( b, q->y, q->x ); /* 2+e */
314     gf_mul ( p->y, d, b );
315     gf_mul ( b, r->t, q->t );
316     gf_mulw ( p->x, b, 2*EFF_D );
317     gf_add_nr ( b, a, p->y );    /* 2+e */
318     gf_sub_nr ( c, p->y, a );    /* 3+e */
319     gf_mul ( a, q->z, r->z );
320     gf_add_nr ( a, a, a );       /* 2+e */
321     if (GF_HEADROOM <= 3) gf_weak_reduce(a); /* or 1+e */
322 #if NEG_D
323     gf_sub_nr ( p->y, a, p->x ); /* 4+e or 3+e */
324     gf_add_nr ( a, a, p->x );    /* 3+e or 2+e */
325 #else
326     gf_add_nr ( p->y, a, p->x ); /* 3+e or 2+e */
327     gf_sub_nr ( a, a, p->x );    /* 4+e or 3+e */
328 #endif
329     gf_mul ( p->z, a, p->y );
330     gf_mul ( p->x, p->y, c );
331     gf_mul ( p->y, a, b );
332     gf_mul ( p->t, b, c );
333 }
334 
API_NS(point_add)335 void API_NS(point_add) (
336     point_t p,
337     const point_t q,
338     const point_t r
339 ) {
340     gf a, b, c, d;
341     gf_sub_nr ( b, q->y, q->x ); /* 3+e */
342     gf_sub_nr ( c, r->y, r->x ); /* 3+e */
343     gf_add_nr ( d, r->y, r->x ); /* 2+e */
344     gf_mul ( a, c, b );
345     gf_add_nr ( b, q->y, q->x ); /* 2+e */
346     gf_mul ( p->y, d, b );
347     gf_mul ( b, r->t, q->t );
348     gf_mulw ( p->x, b, 2*EFF_D );
349     gf_add_nr ( b, a, p->y );    /* 2+e */
350     gf_sub_nr ( c, p->y, a );    /* 3+e */
351     gf_mul ( a, q->z, r->z );
352     gf_add_nr ( a, a, a );       /* 2+e */
353     if (GF_HEADROOM <= 3) gf_weak_reduce(a); /* or 1+e */
354 #if NEG_D
355     gf_add_nr ( p->y, a, p->x ); /* 3+e or 2+e */
356     gf_sub_nr ( a, a, p->x );    /* 4+e or 3+e */
357 #else
358     gf_sub_nr ( p->y, a, p->x ); /* 4+e or 3+e */
359     gf_add_nr ( a, a, p->x );    /* 3+e or 2+e */
360 #endif
361     gf_mul ( p->z, a, p->y );
362     gf_mul ( p->x, p->y, c );
363     gf_mul ( p->y, a, b );
364     gf_mul ( p->t, b, c );
365 }
366 
367 static DECAF_NOINLINE void
point_double_internal(point_t p,const point_t q,int before_double)368 point_double_internal (
369     point_t p,
370     const point_t q,
371     int before_double
372 ) {
373     gf a, b, c, d;
374     gf_sqr ( c, q->x );
375     gf_sqr ( a, q->y );
376     gf_add_nr ( d, c, a );             /* 2+e */
377     gf_add_nr ( p->t, q->y, q->x );    /* 2+e */
378     gf_sqr ( b, p->t );
379     gf_subx_nr ( b, b, d, 3 );         /* 4+e */
380     gf_sub_nr ( p->t, a, c );          /* 3+e */
381     gf_sqr ( p->x, q->z );
382     gf_add_nr ( p->z, p->x, p->x );    /* 2+e */
383     gf_subx_nr ( a, p->z, p->t, 4 );   /* 6+e */
384     if (GF_HEADROOM == 5) gf_weak_reduce(a); /* or 1+e */
385     gf_mul ( p->x, a, b );
386     gf_mul ( p->z, p->t, a );
387     gf_mul ( p->y, p->t, d );
388     if (!before_double) gf_mul ( p->t, b, d );
389 }
390 
API_NS(point_double)391 void API_NS(point_double)(point_t p, const point_t q) {
392     point_double_internal(p,q,0);
393 }
394 
API_NS(point_negate)395 void API_NS(point_negate) (
396    point_t nega,
397    const point_t a
398 ) {
399     gf_sub(nega->x, ZERO, a->x);
400     gf_copy(nega->y, a->y);
401     gf_copy(nega->z, a->z);
402     gf_sub(nega->t, ZERO, a->t);
403 }
404 
405 /* Operations on [p]niels */
406 static DECAF_INLINE void
cond_neg_niels(niels_t n,mask_t neg)407 cond_neg_niels (
408     niels_t n,
409     mask_t neg
410 ) {
411     gf_cond_swap(n->a, n->b, neg);
412     gf_cond_neg(n->c, neg);
413 }
414 
pt_to_pniels(pniels_t b,const point_t a)415 static DECAF_NOINLINE void pt_to_pniels (
416     pniels_t b,
417     const point_t a
418 ) {
419     gf_sub ( b->n->a, a->y, a->x );
420     gf_add ( b->n->b, a->x, a->y );
421     gf_mulw ( b->n->c, a->t, 2*TWISTED_D );
422     gf_add ( b->z, a->z, a->z );
423 }
424 
pniels_to_pt(point_t e,const pniels_t d)425 static DECAF_NOINLINE void pniels_to_pt (
426     point_t e,
427     const pniels_t d
428 ) {
429     gf eu;
430     gf_add ( eu, d->n->b, d->n->a );
431     gf_sub ( e->y, d->n->b, d->n->a );
432     gf_mul ( e->t, e->y, eu);
433     gf_mul ( e->x, d->z, e->y );
434     gf_mul ( e->y, d->z, eu );
435     gf_sqr ( e->z, d->z );
436 }
437 
438 static DECAF_NOINLINE void
niels_to_pt(point_t e,const niels_t n)439 niels_to_pt (
440     point_t e,
441     const niels_t n
442 ) {
443     gf_add ( e->y, n->b, n->a );
444     gf_sub ( e->x, n->b, n->a );
445     gf_mul ( e->t, e->y, e->x );
446     gf_copy ( e->z, ONE );
447 }
448 
449 static DECAF_NOINLINE void
add_niels_to_pt(point_t d,const niels_t e,int before_double)450 add_niels_to_pt (
451     point_t d,
452     const niels_t e,
453     int before_double
454 ) {
455     gf a, b, c;
456     gf_sub_nr ( b, d->y, d->x ); /* 3+e */
457     gf_mul ( a, e->a, b );
458     gf_add_nr ( b, d->x, d->y ); /* 2+e */
459     gf_mul ( d->y, e->b, b );
460     gf_mul ( d->x, e->c, d->t );
461     gf_add_nr ( c, a, d->y );    /* 2+e */
462     gf_sub_nr ( b, d->y, a );    /* 3+e */
463     gf_sub_nr ( d->y, d->z, d->x ); /* 3+e */
464     gf_add_nr ( a, d->x, d->z ); /* 2+e */
465     gf_mul ( d->z, a, d->y );
466     gf_mul ( d->x, d->y, b );
467     gf_mul ( d->y, a, c );
468     if (!before_double) gf_mul ( d->t, b, c );
469 }
470 
471 static DECAF_NOINLINE void
sub_niels_from_pt(point_t d,const niels_t e,int before_double)472 sub_niels_from_pt (
473     point_t d,
474     const niels_t e,
475     int before_double
476 ) {
477     gf a, b, c;
478     gf_sub_nr ( b, d->y, d->x ); /* 3+e */
479     gf_mul ( a, e->b, b );
480     gf_add_nr ( b, d->x, d->y ); /* 2+e */
481     gf_mul ( d->y, e->a, b );
482     gf_mul ( d->x, e->c, d->t );
483     gf_add_nr ( c, a, d->y );    /* 2+e */
484     gf_sub_nr ( b, d->y, a );    /* 3+e */
485     gf_add_nr ( d->y, d->z, d->x ); /* 2+e */
486     gf_sub_nr ( a, d->z, d->x ); /* 3+e */
487     gf_mul ( d->z, a, d->y );
488     gf_mul ( d->x, d->y, b );
489     gf_mul ( d->y, a, c );
490     if (!before_double) gf_mul ( d->t, b, c );
491 }
492 
493 static void
add_pniels_to_pt(point_t p,const pniels_t pn,int before_double)494 add_pniels_to_pt (
495     point_t p,
496     const pniels_t pn,
497     int before_double
498 ) {
499     gf L0;
500     gf_mul ( L0, p->z, pn->z );
501     gf_copy ( p->z, L0 );
502     add_niels_to_pt( p, pn->n, before_double );
503 }
504 
505 static void
sub_pniels_from_pt(point_t p,const pniels_t pn,int before_double)506 sub_pniels_from_pt (
507     point_t p,
508     const pniels_t pn,
509     int before_double
510 ) {
511     gf L0;
512     gf_mul ( L0, p->z, pn->z );
513     gf_copy ( p->z, L0 );
514     sub_niels_from_pt( p, pn->n, before_double );
515 }
516 
517 static DECAF_NOINLINE void
prepare_fixed_window(pniels_t * multiples,const point_t b,int ntable)518 prepare_fixed_window(
519     pniels_t *multiples,
520     const point_t b,
521     int ntable
522 ) {
523     point_t tmp;
524     pniels_t pn;
525     int i;
526 
527     point_double_internal(tmp, b, 0);
528     pt_to_pniels(pn, tmp);
529     pt_to_pniels(multiples[0], b);
530     API_NS(point_copy)(tmp, b);
531     for (i=1; i<ntable; i++) {
532         add_pniels_to_pt(tmp, pn, 0);
533         pt_to_pniels(multiples[i], tmp);
534     }
535 
536     decaf_bzero(pn,sizeof(pn));
537     decaf_bzero(tmp,sizeof(tmp));
538 }
539 
API_NS(point_scalarmul)540 void API_NS(point_scalarmul) (
541     point_t a,
542     const point_t b,
543     const scalar_t scalar
544 ) {
545 
546     const int WINDOW = DECAF_WINDOW_BITS,
547         WINDOW_MASK = (1<<WINDOW)-1,
548         WINDOW_T_MASK = WINDOW_MASK >> 1,
549         NTABLE = 1<<(WINDOW-1);
550 
551     scalar_t scalar1x;
552     API_NS(scalar_add)(scalar1x, scalar, point_scalarmul_adjustment);
553     API_NS(scalar_halve)(scalar1x,scalar1x);
554 
555     /* Set up a precomputed table with odd multiples of b. */
556     pniels_t pn, multiples[1<<((int)(DECAF_WINDOW_BITS)-1)];  // == NTABLE (MSVC compatibility issue)
557     point_t tmp;
558     prepare_fixed_window(multiples, b, NTABLE);
559 
560     /* Initialize. */
561     int i,j,first=1;
562     i = SCALAR_BITS - ((SCALAR_BITS-1) % WINDOW) - 1;
563 
564     for (; i>=0; i-=WINDOW) {
565         /* Fetch another block of bits */
566         word_t bits = scalar1x->limb[i/WBITS] >> (i%WBITS);
567         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
568             bits ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
569         }
570         bits &= WINDOW_MASK;
571         mask_t inv = (bits>>(WINDOW-1))-1;
572         bits ^= inv;
573 
574         /* Add in from table.  Compute t only on last iteration. */
575         constant_time_lookup(pn, multiples, sizeof(pn), NTABLE, bits & WINDOW_T_MASK);
576         cond_neg_niels(pn->n, inv);
577         if (first) {
578             pniels_to_pt(tmp, pn);
579             first = 0;
580         } else {
581            /* Using Hisil et al's lookahead method instead of extensible here
582             * for no particular reason.  Double WINDOW times, but only compute t on
583             * the last one.
584             */
585             for (j=0; j<WINDOW-1; j++)
586                 point_double_internal(tmp, tmp, -1);
587             point_double_internal(tmp, tmp, 0);
588             add_pniels_to_pt(tmp, pn, i ? -1 : 0);
589         }
590     }
591 
592     /* Write out the answer */
593     API_NS(point_copy)(a,tmp);
594 
595     decaf_bzero(scalar1x,sizeof(scalar1x));
596     decaf_bzero(pn,sizeof(pn));
597     decaf_bzero(multiples,sizeof(multiples));
598     decaf_bzero(tmp,sizeof(tmp));
599 }
600 
API_NS(point_double_scalarmul)601 void API_NS(point_double_scalarmul) (
602     point_t a,
603     const point_t b,
604     const scalar_t scalarb,
605     const point_t c,
606     const scalar_t scalarc
607 ) {
608 
609     const int WINDOW = DECAF_WINDOW_BITS,
610         WINDOW_MASK = (1<<WINDOW)-1,
611         WINDOW_T_MASK = WINDOW_MASK >> 1,
612         NTABLE = 1<<(WINDOW-1);
613 
614     scalar_t scalar1x, scalar2x;
615     API_NS(scalar_add)(scalar1x, scalarb, point_scalarmul_adjustment);
616     API_NS(scalar_halve)(scalar1x,scalar1x);
617     API_NS(scalar_add)(scalar2x, scalarc, point_scalarmul_adjustment);
618     API_NS(scalar_halve)(scalar2x,scalar2x);
619 
620     /* Set up a precomputed table with odd multiples of b. */
621     pniels_t pn, multiples1[1<<((int)(DECAF_WINDOW_BITS)-1)], multiples2[1<<((int)(DECAF_WINDOW_BITS)-1)];
622     // Array size above equal NTABLE (MSVC compatibility issue)
623     point_t tmp;
624     prepare_fixed_window(multiples1, b, NTABLE);
625     prepare_fixed_window(multiples2, c, NTABLE);
626 
627     /* Initialize. */
628     int i,j,first=1;
629     i = SCALAR_BITS - ((SCALAR_BITS-1) % WINDOW) - 1;
630 
631     for (; i>=0; i-=WINDOW) {
632         /* Fetch another block of bits */
633         word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
634                      bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
635         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
636             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
637             bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
638         }
639         bits1 &= WINDOW_MASK;
640         bits2 &= WINDOW_MASK;
641         mask_t inv1 = (bits1>>(WINDOW-1))-1;
642         mask_t inv2 = (bits2>>(WINDOW-1))-1;
643         bits1 ^= inv1;
644         bits2 ^= inv2;
645 
646         /* Add in from table.  Compute t only on last iteration. */
647         constant_time_lookup(pn, multiples1, sizeof(pn), NTABLE, bits1 & WINDOW_T_MASK);
648         cond_neg_niels(pn->n, inv1);
649         if (first) {
650             pniels_to_pt(tmp, pn);
651             first = 0;
652         } else {
653            /* Using Hisil et al's lookahead method instead of extensible here
654             * for no particular reason.  Double WINDOW times, but only compute t on
655             * the last one.
656             */
657             for (j=0; j<WINDOW-1; j++)
658                 point_double_internal(tmp, tmp, -1);
659             point_double_internal(tmp, tmp, 0);
660             add_pniels_to_pt(tmp, pn, 0);
661         }
662         constant_time_lookup(pn, multiples2, sizeof(pn), NTABLE, bits2 & WINDOW_T_MASK);
663         cond_neg_niels(pn->n, inv2);
664         add_pniels_to_pt(tmp, pn, i?-1:0);
665     }
666 
667     /* Write out the answer */
668     API_NS(point_copy)(a,tmp);
669 
670 
671     decaf_bzero(scalar1x,sizeof(scalar1x));
672     decaf_bzero(scalar2x,sizeof(scalar2x));
673     decaf_bzero(pn,sizeof(pn));
674     decaf_bzero(multiples1,sizeof(multiples1));
675     decaf_bzero(multiples2,sizeof(multiples2));
676     decaf_bzero(tmp,sizeof(tmp));
677 }
678 
API_NS(point_dual_scalarmul)679 void API_NS(point_dual_scalarmul) (
680     point_t a1,
681     point_t a2,
682     const point_t b,
683     const scalar_t scalar1,
684     const scalar_t scalar2
685 ) {
686 
687     const int WINDOW = DECAF_WINDOW_BITS,
688         WINDOW_MASK = (1<<WINDOW)-1,
689         WINDOW_T_MASK = WINDOW_MASK >> 1,
690         NTABLE = 1<<(WINDOW-1);
691 
692 
693     scalar_t scalar1x, scalar2x;
694     API_NS(scalar_add)(scalar1x, scalar1, point_scalarmul_adjustment);
695     API_NS(scalar_halve)(scalar1x,scalar1x);
696     API_NS(scalar_add)(scalar2x, scalar2, point_scalarmul_adjustment);
697     API_NS(scalar_halve)(scalar2x,scalar2x);
698 
699     /* Set up a precomputed table with odd multiples of b. */
700     point_t multiples1[1<<((int)(DECAF_WINDOW_BITS)-1)], multiples2[1<<((int)(DECAF_WINDOW_BITS)-1)], working, tmp;
701     // Array sizes above equal NTABLE (MSVC compatibility issue)
702 
703     pniels_t pn;
704 
705     API_NS(point_copy)(working, b);
706 
707     /* Initialize. */
708     int i,j;
709 
710     for (i=0; i<NTABLE; i++) {
711         API_NS(point_copy)(multiples1[i], API_NS(point_identity));
712         API_NS(point_copy)(multiples2[i], API_NS(point_identity));
713     }
714 
715     for (i=0; i<SCALAR_BITS; i+=WINDOW) {
716         if (i) {
717             for (j=0; j<WINDOW-1; j++)
718                 point_double_internal(working, working, -1);
719             point_double_internal(working, working, 0);
720         }
721 
722         /* Fetch another block of bits */
723         word_t bits1 = scalar1x->limb[i/WBITS] >> (i%WBITS),
724                bits2 = scalar2x->limb[i/WBITS] >> (i%WBITS);
725         if (i%WBITS >= WBITS-WINDOW && i/WBITS<SCALAR_LIMBS-1) {
726             bits1 ^= scalar1x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
727             bits2 ^= scalar2x->limb[i/WBITS+1] << (WBITS - (i%WBITS));
728         }
729         bits1 &= WINDOW_MASK;
730         bits2 &= WINDOW_MASK;
731         mask_t inv1 = (bits1>>(WINDOW-1))-1;
732         mask_t inv2 = (bits2>>(WINDOW-1))-1;
733         bits1 ^= inv1;
734         bits2 ^= inv2;
735 
736         pt_to_pniels(pn, working);
737 
738         constant_time_lookup(tmp, multiples1, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
739         cond_neg_niels(pn->n, inv1);
740         /* add_pniels_to_pt(multiples1[bits1 & WINDOW_T_MASK], pn, 0); */
741         add_pniels_to_pt(tmp, pn, 0);
742         constant_time_insert(multiples1, tmp, sizeof(tmp), NTABLE, bits1 & WINDOW_T_MASK);
743 
744 
745         constant_time_lookup(tmp, multiples2, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
746         cond_neg_niels(pn->n, inv1^inv2);
747         /* add_pniels_to_pt(multiples2[bits2 & WINDOW_T_MASK], pn, 0); */
748         add_pniels_to_pt(tmp, pn, 0);
749         constant_time_insert(multiples2, tmp, sizeof(tmp), NTABLE, bits2 & WINDOW_T_MASK);
750     }
751 
752     if (NTABLE > 1) {
753         API_NS(point_copy)(working, multiples1[NTABLE-1]);
754         API_NS(point_copy)(tmp    , multiples2[NTABLE-1]);
755 
756         for (i=NTABLE-1; i>1; i--) {
757             API_NS(point_add)(multiples1[i-1], multiples1[i-1], multiples1[i]);
758             API_NS(point_add)(multiples2[i-1], multiples2[i-1], multiples2[i]);
759             API_NS(point_add)(working, working, multiples1[i-1]);
760             API_NS(point_add)(tmp,     tmp,     multiples2[i-1]);
761         }
762 
763         API_NS(point_add)(multiples1[0], multiples1[0], multiples1[1]);
764         API_NS(point_add)(multiples2[0], multiples2[0], multiples2[1]);
765         point_double_internal(working, working, 0);
766         point_double_internal(tmp,         tmp, 0);
767         API_NS(point_add)(a1, working, multiples1[0]);
768         API_NS(point_add)(a2, tmp,     multiples2[0]);
769     } else {
770         API_NS(point_copy)(a1, multiples1[0]);
771         API_NS(point_copy)(a2, multiples2[0]);
772     }
773 
774     decaf_bzero(scalar1x,sizeof(scalar1x));
775     decaf_bzero(scalar2x,sizeof(scalar2x));
776     decaf_bzero(pn,sizeof(pn));
777     decaf_bzero(multiples1,sizeof(multiples1));
778     decaf_bzero(multiples2,sizeof(multiples2));
779     decaf_bzero(tmp,sizeof(tmp));
780     decaf_bzero(working,sizeof(working));
781 }
782 
API_NS(point_eq)783 decaf_bool_t API_NS(point_eq) ( const point_t p, const point_t q ) {
784     /* equality mod 2-torsion compares x/y */
785     gf a, b;
786     gf_mul ( a, p->y, q->x );
787     gf_mul ( b, q->y, p->x );
788     mask_t succ = gf_eq(a,b);
789 
790     #if (COFACTOR == 8)
791         gf_mul ( a, p->y, q->y );
792         gf_mul ( b, q->x, p->x );
793         #if !(IMAGINE_TWIST)
794             gf_sub ( a, ZERO, a );
795         #else
796            /* Interesting note: the 4tor would normally be rotation.
797             * But because of the *i twist, it's actually
798             * (x,y) <-> (iy,ix)
799             */
800 
801            /* No code, just a comment. */
802         #endif
803         succ |= gf_eq(a,b);
804     #endif
805 
806     return mask_to_bool(succ);
807 }
808 
API_NS(point_valid)809 decaf_bool_t API_NS(point_valid) (
810     const point_t p
811 ) {
812     gf a,b,c;
813     gf_mul(a,p->x,p->y);
814     gf_mul(b,p->z,p->t);
815     mask_t out = gf_eq(a,b);
816     gf_sqr(a,p->x);
817     gf_sqr(b,p->y);
818     gf_sub(a,b,a);
819     gf_sqr(b,p->t);
820     gf_mulw(c,b,TWISTED_D);
821     gf_sqr(b,p->z);
822     gf_add(b,b,c);
823     out &= gf_eq(a,b);
824     out &= ~gf_eq(p->z,ZERO);
825     return mask_to_bool(out);
826 }
827 
API_NS(point_debugging_torque)828 void API_NS(point_debugging_torque) (
829     point_t q,
830     const point_t p
831 ) {
832 #if COFACTOR == 8 && IMAGINE_TWIST
833     gf tmp;
834     gf_mul(tmp,p->x,SQRT_MINUS_ONE);
835     gf_mul(q->x,p->y,SQRT_MINUS_ONE);
836     gf_copy(q->y,tmp);
837     gf_copy(q->z,p->z);
838     gf_sub(q->t,ZERO,p->t);
839 #else
840     gf_sub(q->x,ZERO,p->x);
841     gf_sub(q->y,ZERO,p->y);
842     gf_copy(q->z,p->z);
843     gf_copy(q->t,p->t);
844 #endif
845 }
846 
API_NS(point_debugging_pscale)847 void API_NS(point_debugging_pscale) (
848     point_t q,
849     const point_t p,
850     const uint8_t factor[SER_BYTES]
851 ) {
852     gf gfac,tmp;
853     /* NB this means you'll never pscale by negative numbers for p521 */
854     ignore_result(gf_deserialize(gfac,factor,0,0));
855     gf_cond_sel(gfac,gfac,ONE,gf_eq(gfac,ZERO));
856     gf_mul(tmp,p->x,gfac);
857     gf_copy(q->x,tmp);
858     gf_mul(tmp,p->y,gfac);
859     gf_copy(q->y,tmp);
860     gf_mul(tmp,p->z,gfac);
861     gf_copy(q->z,tmp);
862     gf_mul(tmp,p->t,gfac);
863     gf_copy(q->t,tmp);
864 }
865 
gf_batch_invert(gf * __restrict__ out,const gf * in,unsigned int n)866 static void gf_batch_invert (
867     gf *__restrict__ out,
868     const gf *in,
869     unsigned int n
870 ) {
871     gf t1;
872     assert(n>1);
873 
874     gf_copy(out[1], in[0]);
875     int i;
876     for (i=1; i<(int) (n-1); i++) {
877         gf_mul(out[i+1], out[i], in[i]);
878     }
879     gf_mul(out[0], out[n-1], in[n-1]);
880 
881     gf_invert(out[0], out[0], 1);
882 
883     for (i=n-1; i>0; i--) {
884         gf_mul(t1, out[i], out[0]);
885         gf_copy(out[i], t1);
886         gf_mul(t1, out[0], in[i]);
887         gf_copy(out[0], t1);
888     }
889 }
890 
batch_normalize_niels(niels_t * table,const gf * zs,gf * __restrict__ zis,int n)891 static void batch_normalize_niels (
892     niels_t *table,
893     const gf *zs,
894     gf *__restrict__ zis,
895     int n
896 ) {
897     int i;
898     gf product;
899     gf_batch_invert(zis, zs, n);
900 
901     for (i=0; i<n; i++) {
902         gf_mul(product, table[i]->a, zis[i]);
903         gf_strong_reduce(product);
904         gf_copy(table[i]->a, product);
905 
906         gf_mul(product, table[i]->b, zis[i]);
907         gf_strong_reduce(product);
908         gf_copy(table[i]->b, product);
909 
910         gf_mul(product, table[i]->c, zis[i]);
911         gf_strong_reduce(product);
912         gf_copy(table[i]->c, product);
913     }
914 
915     decaf_bzero(product,sizeof(product));
916 }
917 
API_NS(precompute)918 void API_NS(precompute) (
919     precomputed_s *table,
920     const point_t base
921 ) {
922     const unsigned int n = COMBS_N, t = COMBS_T, s = COMBS_S;
923     assert(n*t*s >= SCALAR_BITS);
924 
925     point_t working, start, doubles[COMBS_T-1];
926     API_NS(point_copy)(working, base);
927     pniels_t pn_tmp;
928 
929     gf zs[(unsigned int)(COMBS_N)<<(unsigned int)(COMBS_T-1)], zis[(unsigned int)(COMBS_N)<<(unsigned int)(COMBS_T-1)];
930 
931     unsigned int i,j,k;
932 
933     /* Compute n tables */
934     for (i=0; i<n; i++) {
935 
936         /* Doubling phase */
937         for (j=0; j<t; j++) {
938             if (j) API_NS(point_add)(start, start, working);
939             else API_NS(point_copy)(start, working);
940 
941             if (j==t-1 && i==n-1) break;
942 
943             point_double_internal(working, working,0);
944             if (j<t-1) API_NS(point_copy)(doubles[j], working);
945 
946             for (k=0; k<s-1; k++)
947                 point_double_internal(working, working, k<s-2);
948         }
949 
950         /* Gray-code phase */
951         for (j=0;; j++) {
952             int gray = j ^ (j>>1);
953             int idx = (((i+1)<<(t-1))-1) ^ gray;
954 
955             pt_to_pniels(pn_tmp, start);
956             memcpy(table->table[idx], pn_tmp->n, sizeof(pn_tmp->n));
957             gf_copy(zs[idx], pn_tmp->z);
958 
959             if (j >= (1u<<(t-1)) - 1) break;
960             int delta = (j+1) ^ ((j+1)>>1) ^ gray;
961 
962             for (k=0; delta>1; k++)
963                 delta >>=1;
964 
965             if (gray & (1<<k)) {
966                 API_NS(point_add)(start, start, doubles[k]);
967             } else {
968                 API_NS(point_sub)(start, start, doubles[k]);
969             }
970         }
971     }
972 
973     batch_normalize_niels(table->table,(const gf *)zs,zis,n<<(t-1));
974 
975     decaf_bzero(zs,sizeof(zs));
976     decaf_bzero(zis,sizeof(zis));
977     decaf_bzero(pn_tmp,sizeof(pn_tmp));
978     decaf_bzero(working,sizeof(working));
979     decaf_bzero(start,sizeof(start));
980     decaf_bzero(doubles,sizeof(doubles));
981 }
982 
983 static DECAF_INLINE void
constant_time_lookup_niels(niels_s * __restrict__ ni,const niels_t * table,int nelts,int idx)984 constant_time_lookup_niels (
985     niels_s *__restrict__ ni,
986     const niels_t *table,
987     int nelts,
988     int idx
989 ) {
990     constant_time_lookup(ni, table, sizeof(niels_s), nelts, idx);
991 }
992 
API_NS(precomputed_scalarmul)993 void API_NS(precomputed_scalarmul) (
994     point_t out,
995     const precomputed_s *table,
996     const scalar_t scalar
997 ) {
998     int i;
999     unsigned j,k;
1000     const unsigned int n = COMBS_N, t = COMBS_T, s = COMBS_S;
1001 
1002     scalar_t scalar1x;
1003     API_NS(scalar_add)(scalar1x, scalar, precomputed_scalarmul_adjustment);
1004     API_NS(scalar_halve)(scalar1x,scalar1x);
1005 
1006     niels_t ni;
1007 
1008     for (i=s-1; i>=0; i--) {
1009         if (i != (int)s-1) point_double_internal(out,out,0);
1010 
1011         for (j=0; j<n; j++) {
1012             int tab = 0;
1013 
1014             for (k=0; k<t; k++) {
1015                 unsigned int bit = i + s*(k + j*t);
1016                 if (bit < SCALAR_BITS) {
1017                     tab |= (scalar1x->limb[bit/WBITS] >> (bit%WBITS) & 1) << k;
1018                 }
1019             }
1020 
1021             mask_t invert = (tab>>(t-1))-1;
1022             tab ^= invert;
1023             tab &= (1<<(t-1)) - 1;
1024 
1025             constant_time_lookup_niels(ni, &table->table[j<<(t-1)], 1<<(t-1), tab);
1026 
1027             cond_neg_niels(ni, invert);
1028             if ((i!=(int)s-1)||j) {
1029                 add_niels_to_pt(out, ni, j==n-1 && i);
1030             } else {
1031                 niels_to_pt(out, ni);
1032             }
1033         }
1034     }
1035 
1036     decaf_bzero(ni,sizeof(ni));
1037     decaf_bzero(scalar1x,sizeof(scalar1x));
1038 }
1039 
API_NS(point_cond_sel)1040 void API_NS(point_cond_sel) (
1041     point_t out,
1042     const point_t a,
1043     const point_t b,
1044     decaf_bool_t pick_b
1045 ) {
1046     constant_time_select(out,a,b,sizeof(point_t),bool_to_mask(pick_b),0);
1047 }
1048 
1049 /* FUTURE: restore Curve25519 Montgomery ladder? */
API_NS(direct_scalarmul)1050 decaf_error_t API_NS(direct_scalarmul) (
1051     uint8_t scaled[SER_BYTES],
1052     const uint8_t base[SER_BYTES],
1053     const scalar_t scalar,
1054     decaf_bool_t allow_identity,
1055     decaf_bool_t short_circuit
1056 ) {
1057     point_t basep;
1058     decaf_error_t succ = API_NS(point_decode)(basep, base, allow_identity);
1059     if (short_circuit && succ != DECAF_SUCCESS) return succ;
1060     API_NS(point_cond_sel)(basep, API_NS(point_base), basep, succ);
1061     API_NS(point_scalarmul)(basep, basep, scalar);
1062     API_NS(point_encode)(scaled, basep);
1063     API_NS(point_destroy)(basep);
1064     return succ;
1065 }
1066 
API_NS(point_mul_by_ratio_and_encode_like_eddsa)1067 void API_NS(point_mul_by_ratio_and_encode_like_eddsa) (
1068     uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES],
1069     const point_t p
1070 ) {
1071 
1072     /* The point is now on the twisted curve.  Move it to untwisted. */
1073     gf x, y, z, t;
1074     point_t q;
1075 #if COFACTOR == 8
1076     API_NS(point_double)(q,p);
1077 #else
1078     API_NS(point_copy)(q,p);
1079 #endif
1080 
1081 #if EDDSA_USE_SIGMA_ISOGENY
1082     {
1083         /* Use 4-isogeny like ed25519:
1084          *   2*x*y*sqrt(d/a-1)/(ax^2 + y^2 - 2)
1085          *   (y^2 - ax^2)/(y^2 + ax^2)
1086          * with a = -1, d = -EDWARDS_D:
1087          *   -2xysqrt(EDWARDS_D-1)/(2z^2-y^2+x^2)
1088          *   (y^2+x^2)/(y^2-x^2)
1089          */
1090         gf u;
1091         gf_sqr ( x, q->x ); // x^2
1092         gf_sqr ( t, q->y ); // y^2
1093         gf_add( u, x, t ); // x^2 + y^2
1094         gf_add( z, q->y, q->x );
1095         gf_sqr ( y, z);
1096         gf_sub ( y, u, y ); // -2xy
1097         gf_sub ( z, t, x ); // y^2 - x^2
1098         gf_sqr ( x, q->z );
1099         gf_add ( t, x, x);
1100         gf_sub ( t, t, z);  // 2z^2 - y^2 + x^2
1101         gf_mul ( x, y, z ); // 2xy(y^2-x^2)
1102         gf_mul ( y, u, t ); // (x^2+y^2)(2z^2-y^2+x^2)
1103         gf_mul ( u, z, t );
1104         gf_copy( z, u );
1105         gf_mul ( u, x, RISTRETTO_FACTOR );
1106 #if IMAGINE_TWIST
1107         gf_mul_i( x, u );
1108 #else
1109 #error "... probably wrong"
1110         gf_copy( x, u );
1111 #endif
1112         decaf_bzero(u,sizeof(u));
1113     }
1114 #elif IMAGINE_TWIST
1115     {
1116         API_NS(point_double)(q,q);
1117         API_NS(point_double)(q,q);
1118         gf_mul_i(x, q->x);
1119         gf_copy(y, q->y);
1120         gf_copy(z, q->z);
1121     }
1122 #else
1123     {
1124         /* 4-isogeny: 2xy/(y^+x^2), (y^2-x^2)/(2z^2-y^2+x^2) */
1125         gf u;
1126         gf_sqr ( x, q->x );
1127         gf_sqr ( t, q->y );
1128         gf_add( u, x, t );
1129         gf_add( z, q->y, q->x );
1130         gf_sqr ( y, z);
1131         gf_sub ( y, y, u );
1132         gf_sub ( z, t, x );
1133         gf_sqr ( x, q->z );
1134         gf_add ( t, x, x);
1135         gf_sub ( t, t, z);
1136         gf_mul ( x, t, y );
1137         gf_mul ( y, z, u );
1138         gf_mul ( z, u, t );
1139         decaf_bzero(u,sizeof(u));
1140     }
1141 #endif
1142     /* Affinize */
1143     gf_invert(z,z,1);
1144     gf_mul(t,x,z);
1145     gf_mul(x,y,z);
1146 
1147     /* Encode */
1148     enc[DECAF_EDDSA_448_PRIVATE_BYTES-1] = 0;
1149     gf_serialize(enc, x, 1);
1150     enc[DECAF_EDDSA_448_PRIVATE_BYTES-1] |= 0x80 & gf_lobit(t);
1151 
1152     decaf_bzero(x,sizeof(x));
1153     decaf_bzero(y,sizeof(y));
1154     decaf_bzero(z,sizeof(z));
1155     decaf_bzero(t,sizeof(t));
1156     API_NS(point_destroy)(q);
1157 }
1158 
1159 
API_NS(point_decode_like_eddsa_and_mul_by_ratio)1160 decaf_error_t API_NS(point_decode_like_eddsa_and_mul_by_ratio) (
1161     point_t p,
1162     const uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES]
1163 ) {
1164     uint8_t enc2[DECAF_EDDSA_448_PUBLIC_BYTES];
1165     memcpy(enc2,enc,sizeof(enc2));
1166 
1167     mask_t low = ~word_is_zero(enc2[DECAF_EDDSA_448_PRIVATE_BYTES-1] & 0x80);
1168     enc2[DECAF_EDDSA_448_PRIVATE_BYTES-1] &= ~0x80;
1169 
1170     mask_t succ = gf_deserialize(p->y, enc2, 1, 0);
1171 #if 0 == 0
1172     succ &= word_is_zero(enc2[DECAF_EDDSA_448_PRIVATE_BYTES-1]);
1173 #endif
1174 
1175     gf_sqr(p->x,p->y);
1176     gf_sub(p->z,ONE,p->x); /* num = 1-y^2 */
1177     #if EDDSA_USE_SIGMA_ISOGENY
1178         gf_mulw(p->t,p->z,EDWARDS_D); /* d-dy^2 */
1179         gf_mulw(p->x,p->z,EDWARDS_D-1); /* num = (1-y^2)(d-1) */
1180         gf_copy(p->z,p->x);
1181     #else
1182         gf_mulw(p->t,p->x,EDWARDS_D); /* dy^2 */
1183     #endif
1184     gf_sub(p->t,ONE,p->t); /* denom = 1-dy^2 or 1-d + dy^2 */
1185 
1186     gf_mul(p->x,p->z,p->t);
1187     succ &= gf_isr(p->t,p->x); /* 1/sqrt(num * denom) */
1188 
1189     gf_mul(p->x,p->t,p->z); /* sqrt(num / denom) */
1190     gf_cond_neg(p->x,gf_lobit(p->x)^low);
1191     gf_copy(p->z,ONE);
1192 
1193     #if EDDSA_USE_SIGMA_ISOGENY
1194     {
1195        /* Use 4-isogeny like ed25519:
1196         *   2*x*y/sqrt(1-d/a)/(ax^2 + y^2 - 2)
1197         *   (y^2 - ax^2)/(y^2 + ax^2)
1198         * (MAGIC: above formula may be off by a factor of -a
1199         * or something somewhere; check it for other a)
1200         *
1201         * with a = -1, d = -EDWARDS_D:
1202         *   -2xy/sqrt(1-EDWARDS_D)/(2z^2-y^2+x^2)
1203         *   (y^2+x^2)/(y^2-x^2)
1204         */
1205         gf a, b, c, d;
1206         gf_sqr ( c, p->x );
1207         gf_sqr ( a, p->y );
1208         gf_add ( d, c, a ); // x^2 + y^2
1209         gf_add ( p->t, p->y, p->x );
1210         gf_sqr ( b, p->t );
1211         gf_sub ( b, b, d ); // 2xy
1212         gf_sub ( p->t, a, c ); // y^2 - x^2
1213         gf_sqr ( p->x, p->z );
1214         gf_add ( p->z, p->x, p->x );
1215         gf_sub ( c, p->z, p->t ); // 2z^2 - y^2 + x^2
1216         gf_div_i ( a, c );
1217         gf_mul ( c, a, RISTRETTO_FACTOR );
1218         gf_mul ( p->x, b, p->t); // (2xy)(y^2-x^2)
1219         gf_mul ( p->z, p->t, c ); // (y^2-x^2)sd(2z^2 - y^2 + x^2)
1220         gf_mul ( p->y, d, c ); // (y^2+x^2)sd(2z^2 - y^2 + x^2)
1221         gf_mul ( p->t, d, b );
1222         decaf_bzero(a,sizeof(a));
1223         decaf_bzero(b,sizeof(b));
1224         decaf_bzero(c,sizeof(c));
1225         decaf_bzero(d,sizeof(d));
1226     }
1227     #elif IMAGINE_TWIST
1228     {
1229         gf_mul(p->t,p->x,SQRT_MINUS_ONE);
1230         gf_copy(p->x,p->t);
1231         gf_mul(p->t,p->x,p->y);
1232     }
1233     #else
1234     {
1235         /* 4-isogeny 2xy/(y^2-ax^2), (y^2+ax^2)/(2-y^2-ax^2) */
1236         gf a, b, c, d;
1237         gf_sqr ( c, p->x );
1238         gf_sqr ( a, p->y );
1239         gf_add ( d, c, a );
1240         gf_add ( p->t, p->y, p->x );
1241         gf_sqr ( b, p->t );
1242         gf_sub ( b, b, d );
1243         gf_sub ( p->t, a, c );
1244         gf_sqr ( p->x, p->z );
1245         gf_add ( p->z, p->x, p->x );
1246         gf_sub ( a, p->z, d );
1247         gf_mul ( p->x, a, b );
1248         gf_mul ( p->z, p->t, a );
1249         gf_mul ( p->y, p->t, d );
1250         gf_mul ( p->t, b, d );
1251         decaf_bzero(a,sizeof(a));
1252         decaf_bzero(b,sizeof(b));
1253         decaf_bzero(c,sizeof(c));
1254         decaf_bzero(d,sizeof(d));
1255     }
1256     #endif
1257 
1258     decaf_bzero(enc2,sizeof(enc2));
1259     assert(API_NS(point_valid)(p) || ~succ);
1260 
1261     return decaf_succeed_if(mask_to_bool(succ));
1262 }
1263 
decaf_x448(uint8_t out[X_PUBLIC_BYTES],const uint8_t base[X_PUBLIC_BYTES],const uint8_t scalar[X_PRIVATE_BYTES])1264 decaf_error_t decaf_x448 (
1265     uint8_t out[X_PUBLIC_BYTES],
1266     const uint8_t base[X_PUBLIC_BYTES],
1267     const uint8_t scalar[X_PRIVATE_BYTES]
1268 ) {
1269     gf x1, x2, z2, x3, z3, t1, t2;
1270     ignore_result(gf_deserialize(x1,base,1,0));
1271     gf_copy(x2,ONE);
1272     gf_copy(z2,ZERO);
1273     gf_copy(x3,x1);
1274     gf_copy(z3,ONE);
1275 
1276     int t;
1277     mask_t swap = 0;
1278 
1279     for (t = X_PRIVATE_BITS-1; t>=0; t--) {
1280         uint8_t sb = scalar[t/8];
1281 
1282         /* Scalar conditioning */
1283         if (t/8==0) sb &= -(uint8_t)COFACTOR;
1284         else if (t == X_PRIVATE_BITS-1) sb = -1;
1285 
1286         mask_t k_t = (sb>>(t%8)) & 1;
1287         k_t = -k_t; /* set to all 0s or all 1s */
1288 
1289         swap ^= k_t;
1290         gf_cond_swap(x2,x3,swap);
1291         gf_cond_swap(z2,z3,swap);
1292         swap = k_t;
1293 
1294         gf_add_nr(t1,x2,z2); /* A = x2 + z2 */        /* 2+e */
1295         gf_sub_nr(t2,x2,z2); /* B = x2 - z2 */        /* 3+e */
1296         gf_sub_nr(z2,x3,z3); /* D = x3 - z3 */        /* 3+e */
1297         gf_mul(x2,t1,z2);    /* DA */
1298         gf_add_nr(z2,z3,x3); /* C = x3 + z3 */        /* 2+e */
1299         gf_mul(x3,t2,z2);    /* CB */
1300         gf_sub_nr(z3,x2,x3); /* DA-CB */              /* 3+e */
1301         gf_sqr(z2,z3);       /* (DA-CB)^2 */
1302         gf_mul(z3,x1,z2);    /* z3 = x1(DA-CB)^2 */
1303         gf_add_nr(z2,x2,x3); /* (DA+CB) */            /* 2+e */
1304         gf_sqr(x3,z2);       /* x3 = (DA+CB)^2 */
1305 
1306         gf_sqr(z2,t1);       /* AA = A^2 */
1307         gf_sqr(t1,t2);       /* BB = B^2 */
1308         gf_mul(x2,z2,t1);    /* x2 = AA*BB */
1309         gf_sub_nr(t2,z2,t1); /* E = AA-BB */          /* 3+e */
1310 
1311         gf_mulw(t1,t2,-EDWARDS_D); /* E*-d = a24*E */
1312         gf_add_nr(t1,t1,z2); /* AA + a24*E */         /* 2+e */
1313         gf_mul(z2,t2,t1); /* z2 = E(AA+a24*E) */
1314     }
1315 
1316     /* Finish */
1317     gf_cond_swap(x2,x3,swap);
1318     gf_cond_swap(z2,z3,swap);
1319     gf_invert(z2,z2,0);
1320     gf_mul(x1,x2,z2);
1321     gf_serialize(out,x1,1);
1322     mask_t nz = ~gf_eq(x1,ZERO);
1323 
1324     decaf_bzero(x1,sizeof(x1));
1325     decaf_bzero(x2,sizeof(x2));
1326     decaf_bzero(z2,sizeof(z2));
1327     decaf_bzero(x3,sizeof(x3));
1328     decaf_bzero(z3,sizeof(z3));
1329     decaf_bzero(t1,sizeof(t1));
1330     decaf_bzero(t2,sizeof(t2));
1331 
1332     return decaf_succeed_if(mask_to_bool(nz));
1333 }
1334 
1335 /* Thanks Johan Pascal */
decaf_ed448_convert_public_key_to_x448(uint8_t x[DECAF_X448_PUBLIC_BYTES],const uint8_t ed[DECAF_EDDSA_448_PUBLIC_BYTES])1336 void decaf_ed448_convert_public_key_to_x448 (
1337     uint8_t x[DECAF_X448_PUBLIC_BYTES],
1338     const uint8_t ed[DECAF_EDDSA_448_PUBLIC_BYTES]
1339 ) {
1340     gf y;
1341     const uint8_t mask = (uint8_t)(0xFE<<(7));
1342     ignore_result(gf_deserialize(y, ed, 1, mask));
1343 
1344     {
1345         gf n,d;
1346 
1347 #if EDDSA_USE_SIGMA_ISOGENY
1348         /* u = (1+y)/(1-y)*/
1349         gf_add(n, y, ONE); /* n = y+1 */
1350         gf_sub(d, ONE, y); /* d = 1-y */
1351         gf_invert(d, d, 0); /* d = 1/(1-y) */
1352         gf_mul(y, n, d); /* u = (y+1)/(1-y) */
1353         gf_serialize(x,y,1);
1354 #else /* EDDSA_USE_SIGMA_ISOGENY */
1355         /* u = y^2 * (1-dy^2) / (1-y^2) */
1356         gf_sqr(n,y); /* y^2*/
1357         gf_sub(d,ONE,n); /* 1-y^2*/
1358         gf_invert(d,d,0); /* 1/(1-y^2)*/
1359         gf_mul(y,n,d); /* y^2 / (1-y^2) */
1360         gf_mulw(d,n,EDWARDS_D); /* dy^2*/
1361         gf_sub(d, ONE, d); /* 1-dy^2*/
1362         gf_mul(n, y, d); /* y^2 * (1-dy^2) / (1-y^2) */
1363         gf_serialize(x,n,1);
1364 #endif /* EDDSA_USE_SIGMA_ISOGENY */
1365 
1366         decaf_bzero(y,sizeof(y));
1367         decaf_bzero(n,sizeof(n));
1368         decaf_bzero(d,sizeof(d));
1369     }
1370 }
1371 
decaf_x448_generate_key(uint8_t out[X_PUBLIC_BYTES],const uint8_t scalar[X_PRIVATE_BYTES])1372 void decaf_x448_generate_key (
1373     uint8_t out[X_PUBLIC_BYTES],
1374     const uint8_t scalar[X_PRIVATE_BYTES]
1375 ) {
1376     decaf_x448_derive_public_key(out,scalar);
1377 }
1378 
API_NS(point_mul_by_ratio_and_encode_like_x448)1379 void API_NS(point_mul_by_ratio_and_encode_like_x448) (
1380     uint8_t out[X_PUBLIC_BYTES],
1381     const point_t p
1382 ) {
1383     point_t q;
1384 #if COFACTOR == 8
1385     point_double_internal(q,p,1);
1386 #else
1387     API_NS(point_copy)(q,p);
1388 #endif
1389     gf_invert(q->t,q->x,0); /* 1/x */
1390     gf_mul(q->z,q->t,q->y); /* y/x */
1391     gf_sqr(q->y,q->z); /* (y/x)^2 */
1392 #if IMAGINE_TWIST
1393     gf_sub(q->y,ZERO,q->y);
1394 #endif
1395     gf_serialize(out,q->y,1);
1396     API_NS(point_destroy(q));
1397 }
1398 
decaf_x448_derive_public_key(uint8_t out[X_PUBLIC_BYTES],const uint8_t scalar[X_PRIVATE_BYTES])1399 void decaf_x448_derive_public_key (
1400     uint8_t out[X_PUBLIC_BYTES],
1401     const uint8_t scalar[X_PRIVATE_BYTES]
1402 ) {
1403     /* Scalar conditioning */
1404     uint8_t scalar2[X_PRIVATE_BYTES];
1405     memcpy(scalar2,scalar,sizeof(scalar2));
1406     scalar2[0] &= -(uint8_t)COFACTOR;
1407 
1408     scalar2[X_PRIVATE_BYTES-1] &= ~(-1u<<((X_PRIVATE_BITS+7)%8));
1409     scalar2[X_PRIVATE_BYTES-1] |= 1<<((X_PRIVATE_BITS+7)%8);
1410 
1411     scalar_t the_scalar;
1412     API_NS(scalar_decode_long)(the_scalar,scalar2,sizeof(scalar2));
1413 
1414     /* Compensate for the encoding ratio */
1415     for (unsigned i=1; i<DECAF_X448_ENCODE_RATIO; i<<=1) {
1416         API_NS(scalar_halve)(the_scalar,the_scalar);
1417     }
1418     point_t p;
1419     API_NS(precomputed_scalarmul)(p,API_NS(precomputed_base),the_scalar);
1420     API_NS(point_mul_by_ratio_and_encode_like_x448)(out,p);
1421     API_NS(point_destroy)(p);
1422 }
1423 
1424 /**
1425  * @cond internal
1426  * Control for variable-time scalar multiply algorithms.
1427  */
1428 struct smvt_control {
1429   int power, addend;
1430 };
1431 
recode_wnaf(struct smvt_control * control,const scalar_t scalar,unsigned int table_bits)1432 static int recode_wnaf (
1433     struct smvt_control *control, /* [nbits/(table_bits+1) + 3] */
1434     const scalar_t scalar,
1435     unsigned int table_bits
1436 ) {
1437     unsigned int table_size = SCALAR_BITS/(table_bits+1) + 3;
1438     int position = table_size - 1; /* at the end */
1439 
1440     /* place the end marker */
1441     control[position].power = -1;
1442     control[position].addend = 0;
1443     position--;
1444 
1445     /* PERF: Could negate scalar if it's large.  But then would need more cases
1446      * in the actual code that uses it, all for an expected reduction of like 1/5 op.
1447      * Probably not worth it.
1448      */
1449 
1450     uint64_t current = scalar->limb[0] & 0xFFFF;
1451     uint32_t mask = (1<<(table_bits+1))-1;
1452 
1453     unsigned int w;
1454     const unsigned int B_OVER_16 = sizeof(scalar->limb[0]) / 2;
1455     for (w = 1; w<(SCALAR_BITS-1)/16+3; w++) {
1456         if (w < (SCALAR_BITS-1)/16+1) {
1457             /* Refill the 16 high bits of current */
1458             current += (uint32_t)((scalar->limb[w/B_OVER_16]>>(16*(w%B_OVER_16)))<<16);
1459         }
1460 
1461         while (current & 0xFFFF) {
1462             assert(position >= 0);
1463             uint32_t pos = __builtin_ctz((uint32_t)current), odd = (uint32_t)current >> pos;
1464             int32_t delta = odd & mask;
1465             if (odd & 1<<(table_bits+1)) delta -= (1<<(table_bits+1));
1466             current -= delta << pos;
1467             control[position].power = pos + 16*(w-1);
1468             control[position].addend = delta;
1469             position--;
1470         }
1471         current >>= 16;
1472     }
1473     assert(current==0);
1474 
1475     position++;
1476     unsigned int n = table_size - position;
1477     unsigned int i;
1478     for (i=0; i<n; i++) {
1479         control[i] = control[i+position];
1480     }
1481     return n-1;
1482 }
1483 
1484 static void
prepare_wnaf_table(pniels_t * output,const point_t working,unsigned int tbits)1485 prepare_wnaf_table(
1486     pniels_t *output,
1487     const point_t working,
1488     unsigned int tbits
1489 ) {
1490     point_t tmp;
1491     int i;
1492     pt_to_pniels(output[0], working);
1493 
1494     if (tbits == 0) return;
1495 
1496     API_NS(point_double)(tmp,working);
1497     pniels_t twop;
1498     pt_to_pniels(twop, tmp);
1499 
1500     add_pniels_to_pt(tmp, output[0],0);
1501     pt_to_pniels(output[1], tmp);
1502 
1503     for (i=2; i < 1<<tbits; i++) {
1504         add_pniels_to_pt(tmp, twop,0);
1505         pt_to_pniels(output[i], tmp);
1506     }
1507 
1508     API_NS(point_destroy)(tmp);
1509     decaf_bzero(twop,sizeof(twop));
1510 }
1511 
1512 extern const gf API_NS(precomputed_wnaf_as_fe)[];
1513 static const niels_t *API_NS(wnaf_base) = (const niels_t *)API_NS(precomputed_wnaf_as_fe);
1514 const size_t API_NS(sizeof_precomputed_wnafs) __attribute((visibility("hidden")))
1515     = sizeof(niels_t)<<DECAF_WNAF_FIXED_TABLE_BITS;
1516 
1517 void API_NS(precompute_wnafs) (
1518     niels_t out[1<<DECAF_WNAF_FIXED_TABLE_BITS],
1519     const point_t base
1520 ) __attribute__ ((visibility ("hidden")));
1521 
API_NS(precompute_wnafs)1522 void API_NS(precompute_wnafs) (
1523     niels_t out[1<<DECAF_WNAF_FIXED_TABLE_BITS],
1524     const point_t base
1525 ) {
1526     pniels_t tmp[1<<DECAF_WNAF_FIXED_TABLE_BITS];
1527     gf zs[1<<DECAF_WNAF_FIXED_TABLE_BITS], zis[1<<DECAF_WNAF_FIXED_TABLE_BITS];
1528     int i;
1529     prepare_wnaf_table(tmp,base,DECAF_WNAF_FIXED_TABLE_BITS);
1530     for (i=0; i<1<<DECAF_WNAF_FIXED_TABLE_BITS; i++) {
1531         memcpy(out[i], tmp[i]->n, sizeof(niels_t));
1532         gf_copy(zs[i], tmp[i]->z);
1533     }
1534     batch_normalize_niels(out, (const gf *)zs, zis, 1<<DECAF_WNAF_FIXED_TABLE_BITS);
1535 
1536     decaf_bzero(tmp,sizeof(tmp));
1537     decaf_bzero(zs,sizeof(zs));
1538     decaf_bzero(zis,sizeof(zis));
1539 }
1540 
API_NS(base_double_scalarmul_non_secret)1541 void API_NS(base_double_scalarmul_non_secret) (
1542     point_t combo,
1543     const scalar_t scalar1,
1544     const point_t base2,
1545     const scalar_t scalar2
1546 ) {
1547     const int table_bits_var = DECAF_WNAF_VAR_TABLE_BITS,
1548         table_bits_pre = DECAF_WNAF_FIXED_TABLE_BITS;
1549     struct smvt_control control_var[SCALAR_BITS/((int)(DECAF_WNAF_VAR_TABLE_BITS)+1)+3];
1550     struct smvt_control control_pre[SCALAR_BITS/((int)(DECAF_WNAF_FIXED_TABLE_BITS)+1)+3];
1551 
1552     int ncb_pre = recode_wnaf(control_pre, scalar1, table_bits_pre);
1553     int ncb_var = recode_wnaf(control_var, scalar2, table_bits_var);
1554 
1555     pniels_t precmp_var[1<<(int)(DECAF_WNAF_VAR_TABLE_BITS)];
1556     prepare_wnaf_table(precmp_var, base2, table_bits_var);
1557 
1558     int contp=0, contv=0, i = control_var[0].power;
1559 
1560     if (i < 0) {
1561         API_NS(point_copy)(combo, API_NS(point_identity));
1562         return;
1563     } else if (i > control_pre[0].power) {
1564         pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]);
1565         contv++;
1566     } else if (i == control_pre[0].power && i >=0 ) {
1567         pniels_to_pt(combo, precmp_var[control_var[0].addend >> 1]);
1568         add_niels_to_pt(combo, API_NS(wnaf_base)[control_pre[0].addend >> 1], i);
1569         contv++; contp++;
1570     } else {
1571         i = control_pre[0].power;
1572         niels_to_pt(combo, API_NS(wnaf_base)[control_pre[0].addend >> 1]);
1573         contp++;
1574     }
1575 
1576     for (i--; i >= 0; i--) {
1577         int cv = (i==control_var[contv].power), cp = (i==control_pre[contp].power);
1578         point_double_internal(combo,combo,i && !(cv||cp));
1579 
1580         if (cv) {
1581             assert(control_var[contv].addend);
1582 
1583             if (control_var[contv].addend > 0) {
1584                 add_pniels_to_pt(combo, precmp_var[control_var[contv].addend >> 1], i&&!cp);
1585             } else {
1586                 sub_pniels_from_pt(combo, precmp_var[(-control_var[contv].addend) >> 1], i&&!cp);
1587             }
1588             contv++;
1589         }
1590 
1591         if (cp) {
1592             assert(control_pre[contp].addend);
1593 
1594             if (control_pre[contp].addend > 0) {
1595                 add_niels_to_pt(combo, API_NS(wnaf_base)[control_pre[contp].addend >> 1], i);
1596             } else {
1597                 sub_niels_from_pt(combo, API_NS(wnaf_base)[(-control_pre[contp].addend) >> 1], i);
1598             }
1599             contp++;
1600         }
1601     }
1602 
1603     /* This function is non-secret, but whatever this is cheap. */
1604     decaf_bzero(control_var,sizeof(control_var));
1605     decaf_bzero(control_pre,sizeof(control_pre));
1606     decaf_bzero(precmp_var,sizeof(precmp_var));
1607 
1608     assert(contv == ncb_var); (void)ncb_var;
1609     assert(contp == ncb_pre); (void)ncb_pre;
1610 }
1611 
API_NS(point_destroy)1612 void API_NS(point_destroy) (
1613     point_t point
1614 ) {
1615     decaf_bzero(point, sizeof(point_t));
1616 }
1617 
API_NS(precomputed_destroy)1618 void API_NS(precomputed_destroy) (
1619     precomputed_s *pre
1620 ) {
1621     decaf_bzero(pre, API_NS(sizeof_precomputed_s));
1622 }
1623