1 /* ===================================================================
2  *
3  * Copyright (c) 2018, Helder Eijs <helderijs@gmail.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  * ===================================================================
30  */
31 
32 #include <assert.h>
33 
34 #include "common.h"
35 #include "endianess.h"
36 #include "multiply.h"
37 #include "modexp_utils.h"
38 #include "mont.h"
39 
40 #if SYS_BITS == 32
41 #include "multiply_32.c"
42 #else
43 #if SYS_BITS == 64
44 #include "multiply_64.c"
45 #else
46 #error You must define the macro SYS_BITS
47 #endif
48 #endif
49 
50 #if defined(USE_SSE2)
51 #if defined(HAVE_INTRIN_H)
52 #include <intrin.h>
53 #elif defined(HAVE_X86INTRIN_H)
54 #include <x86intrin.h>
55 #elif defined(HAVE_EMMINTRIN_H)
56 #include <xmmintrin.h>
57 #include <emmintrin.h>
58 #endif
59 #endif
60 
is_odd(uint64_t x)61 static inline unsigned is_odd(uint64_t x)
62 {
63     return 1 == (x & 1);
64 }
65 
is_even(uint64_t x)66 static inline unsigned is_even(uint64_t x)
67 {
68     return !is_odd(x);
69 }
70 
71 /**
72  * Compute the inverse modulo 2⁶⁴ of a 64-bit odd integer.
73  *
74  * See https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
75  */
inverse64(uint64_t a)76 STATIC uint64_t inverse64(uint64_t a)
77 {
78     uint64_t x;
79 
80     assert(1 & a);
81     x = ((a << 1 ^ a) & 4) << 1 ^ a;
82     x += x - a*x*x;
83     x += x - a*x*x;
84     x += x - a*x*x;
85     x += x - a*x*x;
86     assert((x*a & 0xFFFFFFFFFFFFFFFFULL) == 1);
87 
88     return x;
89 }
90 
91 /**
92  * Check if a multi-word integer x is greater than or equal to y.
93  *
94  * @param x     The first term
95  * @param y     The second term
96  * @param nw    The number of words that make up x and y
97  * @return      1 if x>=y, 0 if x<y
98  */
ge(const uint64_t * x,const uint64_t * y,size_t nw)99 STATIC int ge(const uint64_t *x, const uint64_t *y, size_t nw)
100 {
101     unsigned mask = (unsigned)-1;
102     unsigned result = 0;
103     size_t i, j;
104 
105     i = nw - 1;
106     for (j=0; j<nw; j++, i--) {
107         unsigned greater, lower;
108 
109         greater = x[i] > y[i];
110         lower = x[i] < y[i];
111         result |= mask & (greater | (lower << 1));
112         mask &= (greater ^ lower) - 1;
113     }
114 
115     return result<2;
116 }
117 
118 /*
119  * Subtract a multi-word integer b from a.
120  *
121  * @param out   The location where the multi-word result is stored
122  * @param a     Number to subtract from
123  * @param b     Number to subtract
124  * @param nw    The number of words of both a and b
125  * @result      0 if there is no borrow, 1 otherwise
126  */
sub(uint64_t * out,const uint64_t * a,const uint64_t * b,size_t nw)127 STATIC unsigned sub(uint64_t *out, const uint64_t *a, const uint64_t *b, size_t nw)
128 {
129     size_t i;
130     unsigned borrow1 , borrow2;
131 
132     borrow2 = 0;
133     for (i=0; i<nw; i++) {
134         borrow1 = b[i] > a[i];
135         out[i] = a[i] - b[i];
136 
137         borrow1 |= borrow2 > out[i];
138         out[i] -= borrow2;
139 
140         borrow2 = borrow1;
141     }
142 
143     return borrow2;
144 }
145 
146 /*
147  * Compute R² mod N, where R is the smallest power of 2⁶⁴ larger than N.
148  *
149  * @param r2_mod_n  The location where the result is stored at
150  * @param n         The modulus N
151  * @param nw        The number of 64-bit words of both r2_mod_n and n
152  */
rsquare(uint64_t * r2_mod_n,uint64_t * n,size_t nw)153 STATIC void rsquare(uint64_t *r2_mod_n, uint64_t *n, size_t nw)
154 {
155     size_t i;
156     size_t R_bits;
157 
158     memset(r2_mod_n, 0, sizeof(uint64_t)*nw);
159 
160     /**
161      * Start with R2=1, double 2*bitlen(R) times,
162      * and reduce it as soon as it exceeds n
163      */
164     r2_mod_n[0] = 1;
165     R_bits = nw * sizeof(uint64_t) * 8;
166     for (i=0; i<R_bits*2; i++) {
167         unsigned overflow;
168         size_t j;
169 
170         /** Double, by shifting left by one bit **/
171         overflow = (unsigned)(r2_mod_n[nw-1] >> 63);
172         for (j=nw-1; j>0; j--) {
173             r2_mod_n[j] = (r2_mod_n[j] << 1) + (r2_mod_n[j-1] >> 63);
174         }
175         /** Fill-in with zeroes **/
176         r2_mod_n[0] <<= 1;
177 
178         /** Subtract n if the result exceeds it **/
179         while (overflow || ge(r2_mod_n, n, nw)) {
180             sub(r2_mod_n, r2_mod_n, n, nw);
181             overflow = 0;
182         }
183     }
184 }
185 
186 /*
187  * Multiply a multi-word integer a by a 64-bit scalar k and
188  * then add the result to the multi-word integer t.
189  *
190  * @param t     The multi-word integer accumulator
191  * @param tw    The number of words of t
192  * @param a     The multi-word integer to multiply with the scalar
193  * @param aw    The number of words of a
194  * @param k     The 64-bit scalar multiplier
195  */
addmul(uint64_t * t,size_t tw,const uint64_t * a,size_t aw,uint64_t k)196 STATIC void addmul(uint64_t *t, size_t tw, const uint64_t *a, size_t aw, uint64_t k)
197 {
198     size_t i;
199     uint64_t carry;
200 
201     carry = 0;
202     for (i=0; i<aw; i++) {
203         uint64_t prod_lo, prod_hi;
204 
205         DP_MULT(a[i], k, prod_lo, prod_hi);
206 
207         prod_lo += carry;
208         prod_hi += prod_lo < carry;
209 
210         t[i] += prod_lo;
211         prod_hi += t[i] < prod_lo;
212 
213         carry = prod_hi;
214     }
215 
216     for (; carry; i++) {
217         t[i] += carry;
218         carry = t[i] < carry;
219     }
220 
221     assert(i <= tw);
222 }
223 
224 /**
225  * Multiply two multi-word integers.
226  *
227  * @param t          The location where the result is stored. It is twice as big as
228  *                   either a (or b). It is an array of  2*nw words).
229  * @param scratchpad Temporary area. It is an array of 3*nw words.
230  * @param a          The first term, array of nw words.
231  * @param b          The second term, array of nw words.
232  * @param nw         The number of words of both a and b.
233  *
234  */
product(uint64_t * t,uint64_t * scratchpad,const uint64_t * a,const uint64_t * b,size_t nw)235 STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const uint64_t *b, size_t nw)
236 {
237     size_t i;
238 
239     memset(t, 0, 2*sizeof(uint64_t)*nw);
240 
241     for (i=0; i<(nw ^ (nw & 1)); i+=2) {
242         addmul128(&t[i], scratchpad, a, b[i], b[i+1], 2*nw-i, nw);
243     }
244 
245     if (is_odd(nw)) {
246         addmul(&t[nw-1], nw+2, a, nw, b[nw-1]);
247     }
248 }
249 
250 /*
251  * Select a number out of two, in constant time.
252  *
253  * @param out   The location where the multi-word result is stored
254  * @param a     The first choice, selected if cond is true (non-zero)
255  * @param b     The second choice, selected if cond is false (zero)
256  * @param cond  The flag that drives the selection
257  * @param words The number of words of a, b, and out
258  * @return      0 for success, the appropriate code otherwise.
259  */
mont_select(uint64_t * out,const uint64_t * a,const uint64_t * b,unsigned cond,size_t words)260 STATIC FUNC_SSE2 int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
261 {
262     uint64_t mask;
263 #if defined(USE_SSE2)
264     unsigned pairs, i;
265     __m128i r0, r1, r2, r3, r4, r5;
266 
267     pairs = (unsigned)words / 2;
268     mask = (uint64_t)((cond != 0) - 1); /* 0 for a, 1s for b */
269 
270 #if SYSBITS == 64
271     r0 = _mm_set1_epi64x(mask);
272 #else
273     r0 = _mm_loadl_epi64((__m128i*)&mask);
274     r0 = _mm_unpacklo_epi64(r0, r0);
275 #endif
276     for (i=0; i<pairs; i++, a+=2, b+=2, out+=2) {
277         r1 = _mm_loadu_si128((__m128i const*)b);
278         r2 = _mm_loadu_si128((__m128i const*)a);
279         r3 = _mm_and_si128(r0, r1);
280         r4 = _mm_andnot_si128(r0, r2);
281         r5 = _mm_or_si128(r3, r4);
282         _mm_storeu_si128((__m128i*)out, r5);
283     }
284 
285     if (words & 1) {
286         *out = (*b & mask) ^ (*a & ~mask);
287     }
288 #else
289     unsigned i;
290 
291     mask = (uint64_t)((cond != 0) - 1);
292     for (i=0; i<words; i++) {
293         *out++ = (*b++ & mask) ^ (*a++ & ~mask);
294     }
295 #endif
296 
297     return 0;
298 }
299 
300 /*
301  * Add two multi-word numbers with modulo arithmetic.
302  *
303  * @param out       The locaton where the multi-word result (nw words) is stored
304  * @param a         The first term (nw words)
305  * @param b         The second term (nw words)
306  * @param modulus   The modulus (nw words)
307  * @param tmp1      A temporary area (nw words)
308  * @param tmp2      A temporary area (nw words)
309  * @param nw        The number of 64-bit words in all parameters
310  */
add_mod(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * modulus,uint64_t * tmp1,uint64_t * tmp2,size_t nw)311 void add_mod(uint64_t* out, const uint64_t* a, const uint64_t* b, const uint64_t *modulus, uint64_t *tmp1, uint64_t *tmp2, size_t nw)
312 {
313     unsigned i;
314     unsigned carry, borrow1, borrow2;
315 
316     /*
317      * Compute sum in tmp1[], and subtract modulus[]
318      * from tmp1[] into tmp2[].
319      */
320     borrow2 = 0;
321     for (i=0, carry=0; i<nw; i++) {
322         tmp1[i] = a[i] + carry;
323         carry = tmp1[i] < carry;
324         tmp1[i] += b[i];
325         carry += tmp1[i] < b[i];
326 
327         borrow1 = modulus[i] > tmp1[i];
328         tmp2[i] = tmp1[i] - modulus[i];
329         borrow1 |= borrow2 > tmp2[i];
330         tmp2[i] -= borrow2;
331         borrow2 = borrow1;
332     }
333 
334     /*
335      * If there is no borrow or if there is carry,
336      * tmp1[] is larger than modulus, so we must return tmp2[].
337      */
338     mont_select(out, tmp2, tmp1, carry | (borrow2 ^ 1), nw);
339 }
340 
341 /*
342  * Montgomery modular multiplication, that is a*b*R mod N.
343  *
344  * @param out   The location where the result is stored
345  * @param a     The first term (already in Montgomery form, a*R mod N)
346  * @param b     The second term (already in Montgomery form, b*R mod N)
347  * @param n     The modulus (in normal form), such that R>N
348  * @param m0    Least-significant word of the opposite of the inverse of n modulo R, that is, -n[0]⁻¹ mod R
349  * @param t     Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
350  * @param nw    Number of words making up the 3 integers: out, a, and b.
351  *              It also defines R as 2^(64*nw).
352  *
353  * Useful read: https://alicebob.cryptoland.net/understanding-the-montgomery-reduction-algorithm/
354  */
355 #if SCRATCHPAD_NR < 7
356 #error Scratchpad is too small
357 #endif
mont_mult_generic(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)358 STATIC void mont_mult_generic(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
359 {
360     size_t i;
361     uint64_t *t, *scratchpad, *t2;
362     unsigned cond;
363 
364     /*
365      * tmp is an array of SCRATCHPAD*nw words
366      * We carve out 3 values in it:
367      * - 3*nw words, the value a*b + m*n (we only use 2*nw+1 words)
368      * - 3*nw words, temporary area for computing the product
369      * - nw words, the reduced value with a final subtraction by n
370      */
371     t = tmp;
372     scratchpad = tmp + 3*nw;
373     t2 = scratchpad + 3*nw;
374 
375     if (a == b) {
376         square(t, scratchpad, a, nw);
377     } else {
378         product(t, scratchpad, a, b, nw);
379     }
380 
381     t[2*nw] = 0; /** MSW **/
382 
383     /** Clear lower words (two at a time) **/
384     for (i=0; i<(nw ^ (nw & 1)); i+=2) {
385         uint64_t k0, k1, ti1, prod_lo, prod_hi;
386 
387         /** Multiplier for n that will make t[i+0] go 0 **/
388         k0 = t[i] * m0;
389 
390         /** Simulate Muladd for digit 0 **/
391         DP_MULT(k0, n[0], prod_lo, prod_hi);
392         prod_lo += t[i];
393         prod_hi += prod_lo < t[i];
394 
395         /** Expected digit 1 **/
396         ti1 = t[i+1] + n[1]*k0 + prod_hi;
397 
398         /** Multiplier for n that will make t[i+1] go 0 **/
399         k1 = ti1 * m0;
400 
401         addmul128(&t[i], scratchpad, n, k0, k1, 2*nw+1-i, nw);
402     }
403 
404     /** One left for odd number of words **/
405     if (is_odd(nw)) {
406         addmul(&t[nw-1], nw+2, n, nw, t[nw-1]*m0);
407     }
408 
409     assert(t[2*nw] <= 1); /** MSW **/
410 
411     /** t[0..nw-1] == 0 **/
412 
413     /** Divide by R and possibly subtract n **/
414     sub(t2, &t[nw], n, nw);
415     cond = (unsigned)(t[2*nw] | (uint64_t)ge(&t[nw], n, nw));
416     mont_select(out, t2, &t[nw], cond, (unsigned)nw);
417 }
418 
mont_mult_p256(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)419 STATIC void mont_mult_p256(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
420 {
421     unsigned i;
422     uint64_t *t, *scratchpad, *t2;
423     unsigned cond;
424 #define WORDS_64        4U
425 #define PREDIV_WORDS_64 (2*WORDS_64+1)      /** Size of the number to divide by R **/
426 #define WORDS_32        (WORDS_64*2)
427 #define PREDIV_WORDS_32 (2*PREDIV_WORDS_64)
428 
429 #if SYS_BITS == 32
430     uint32_t t32[18];
431 #endif
432 
433     assert(nw == 4);
434     assert(m0 == 1);
435 
436     t = tmp;
437     scratchpad = tmp + 3*nw;
438     t2 = scratchpad + 3*nw;
439 
440     if (a == b) {
441         square(t, scratchpad, a, WORDS_64);
442     } else {
443         product(t, scratchpad, a, b, WORDS_64);
444     }
445 
446     t[PREDIV_WORDS_64-1] = 0; /** MSW **/
447 
448 #if SYS_BITS == 32
449     for (i=0; i<PREDIV_WORDS_64; i++) {
450         t32[2*i] = (uint32_t)t[i];
451         t32[2*i+1] = (uint32_t)(t[i] >> 32);
452     }
453 
454     for (i=0; i<WORDS_32; i++) {
455         uint32_t k, carry;
456         uint64_t prod, k2;
457         unsigned j;
458 
459         k = t32[i];
460         k2 = ((uint64_t)k<<32) - k;
461 
462         /* p[0] = 2³²-1 */
463         prod = k2 + t32[i+0];
464         t32[i+0] = (uint32_t)prod;
465         carry = (uint32_t)(prod >> 32);
466         /* p[1] = 2³²-1 */
467         prod = k2 + t32[i+1] + carry;
468         t32[i+1] = (uint32_t)prod;
469         carry = (uint32_t)(prod >> 32);
470         /* p[2] = 2³²-1 */
471         prod = k2 + t32[i+2] + carry;
472         t32[i+2] = (uint32_t)prod;
473         carry = (uint32_t)(prod >> 32);
474         /* p[3] = 0 */
475         t32[i+3] += carry;
476         carry = t32[i+3] < carry;
477         /* p[4] = 0 */
478         t32[i+4] += carry;
479         carry = t32[i+4] < carry;
480         /* p[5] = 0 */
481         t32[i+5] += carry;
482         carry = t32[i+5] < carry;
483         /* p[6] = 1 */
484         t32[i+6] += carry;
485         carry = t32[i+6] < carry;
486         t32[i+6] += k;
487         carry |= t32[i+6] < k;
488         /* p[7] = 2³²-1 */
489         prod = k2 + t32[i+7] + carry;
490         t32[i+7] = (uint32_t)prod;
491         carry = (uint32_t)(prod >> 32);
492 
493         for (j=WORDS_32; carry; j++) {
494             t32[i+j] += carry;
495             carry = t32[i+j] < carry;
496         }
497     }
498 
499     for (i=0; i<PREDIV_WORDS_64; i++) {
500         t[i] = ((uint64_t)t32[2*i+1]<<32) + t32[2*i];
501     }
502 
503 #elif SYS_BITS == 64
504 
505     for (i=0; i<WORDS_64; i++) {
506         unsigned j;
507         uint64_t carry, k;
508         uint64_t prod_lo, prod_hi;
509 
510         k = t[i];
511 
512         /* n[0] = 2⁶⁴ - 1 */
513         prod_lo = -k;
514         prod_hi = k - (k!=0);
515         t[i+0] += prod_lo;
516         prod_hi += t[i+0] < prod_lo;
517         carry = prod_hi;
518 
519         /* n[1] = 2³² - 1 */
520         DP_MULT(n[1], k, prod_lo, prod_hi);
521         prod_lo += carry;
522         prod_hi += prod_lo < carry;
523         t[i+1] += prod_lo;
524         prod_hi += t[i+1] < prod_lo;
525         carry = prod_hi;
526 
527         /* n[2] = 0 */
528         t[i+2] += carry;
529         carry = t[i+2] < carry;
530 
531         /* n[3] = 2⁶⁴ - 2³² + 1 */
532         DP_MULT(n[3], k, prod_lo, prod_hi);
533         prod_lo += carry;
534         prod_hi += prod_lo < carry;
535         t[i+3] += prod_lo;
536         prod_hi += t[i+3] < prod_lo;
537         carry = prod_hi;
538 
539         for (j=WORDS_64; carry; j++) {
540             t[i+j] += carry;
541             carry = t[i+j] < carry;
542         }
543     }
544 #else
545 #error You must define the SYS_BITS macro
546 #endif
547 
548     assert(t[PREDIV_WORDS_64-1] <= 1); /** MSW **/
549 
550     /** t[0..nw-1] == 0 **/
551 
552     /** Divide by R and possibly subtract n **/
553     sub(t2, &t[nw], n, WORDS_64);
554     cond = (unsigned)(t[PREDIV_WORDS_64-1] | (uint64_t)ge(&t[WORDS_64], n, WORDS_64));
555     mont_select(out, t2, &t[WORDS_64], cond, WORDS_64);
556 
557 #undef WORDS_64
558 #undef PREDIV_WORDS_64
559 #undef WORDS_32
560 #undef PREDIV_WORDS_32
561 }
562 
mont_mult_p384(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)563 STATIC void mont_mult_p384(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
564 {
565     size_t i;
566     uint64_t *t, *scratchpad, *t2;
567     unsigned cond;
568 #define WORDS_64        6U
569 #define PREDIV_WORDS_64 (2*WORDS_64+1)      /** Size of the number to divide by R **/
570 #define WORDS_32        (WORDS_64*2)
571 #define PREDIV_WORDS_32 (2*PREDIV_WORDS_64)
572 
573 #if SYS_BITS == 32
574     uint32_t t32[PREDIV_WORDS_32];
575 #endif
576 
577     assert(nw == WORDS_64);
578     assert(m0 == 0x0000000100000001ULL);
579 
580     t = tmp;
581     scratchpad = tmp + 3*nw;
582     t2 = scratchpad + 3*nw;
583 
584     if (a == b) {
585         square(t, scratchpad, a, WORDS_64);
586     } else {
587         product(t, scratchpad, a, b, WORDS_64);
588     }
589 
590     t[PREDIV_WORDS_64-1] = 0; /** MSW **/
591 
592 #if SYS_BITS == 32
593     for (i=0; i<PREDIV_WORDS_64; i++) {
594         t32[2*i] = (uint32_t)t[i];
595         t32[2*i+1] = (uint32_t)(t[i] >> 32);
596     }
597 
598     for (i=0; i<WORDS_32; i++) {
599         uint32_t k, carry;
600         uint64_t prod, k2, k3;
601         unsigned j;
602 
603         k = t32[i];
604         k2 = ((uint64_t)k<<32) - k;
605         k3 = k2 - k;
606 
607         /* n32[0] = 2³² - 1 */
608         prod = k2 + t32[i+0];
609         t32[i+0] = (uint32_t)prod;
610         carry = (uint32_t)(prod >> 32);
611         /* n32[1] = 0 */
612         prod = (uint64_t)t32[i+1] + carry;
613         t32[i+1] = (uint32_t)prod;
614         carry = (uint32_t)(prod >> 32);
615         /* n32[2] = 0 */
616         prod = (uint64_t)t32[i+2] + carry;
617         t32[i+2] = (uint32_t)prod;
618         carry = (uint32_t)(prod >> 32);
619         /* n32[3] = 2³² - 1 */
620         prod = k2 + t32[i+3] + carry;
621         t32[i+3] = (uint32_t)prod;
622         carry = (uint32_t)(prod >> 32);
623         /* n32[4] = 2³² - 2 */
624         prod = k3 + t32[i+4] + carry;
625         t32[i+4] = (uint32_t)prod;
626         carry = (uint32_t)(prod >> 32);
627         /* n32[5] = 2³² - 1 */
628         prod = k2 + t32[i+5] + carry;
629         t32[i+5] = (uint32_t)prod;
630         carry = (uint32_t)(prod >> 32);
631         /* n32[6] = 2³² - 1 */
632         prod = k2 + t32[i+6] + carry;
633         t32[i+6] = (uint32_t)prod;
634         carry = (uint32_t)(prod >> 32);
635         /* n32[7] = 2³² - 1 */
636         prod = k2 + t32[i+7] + carry;
637         t32[i+7] = (uint32_t)prod;
638         carry = (uint32_t)(prod >> 32);
639         /* n32[8] = 2³² - 1 */
640         prod = k2 + t32[i+8] + carry;
641         t32[i+8] = (uint32_t)prod;
642         carry = (uint32_t)(prod >> 32);
643         /* n32[9] = 2³² - 1 */
644         prod = k2 + t32[i+9] + carry;
645         t32[i+9] = (uint32_t)prod;
646         carry = (uint32_t)(prod >> 32);
647         /* n32[10] = 2³² - 1 */
648         prod = k2 + t32[i+10] + carry;
649         t32[i+10] = (uint32_t)prod;
650         carry = (uint32_t)(prod >> 32);
651         /* n32[11] = 2³² - 1 */
652         prod = k2 + t32[i+11] + carry;
653         t32[i+11] = (uint32_t)prod;
654         carry = (uint32_t)(prod >> 32);
655 
656         for (j=WORDS_32; carry; j++) {
657             t32[i+j] += carry;
658             carry = t32[i+j] < carry;
659         }
660     }
661 
662     for (i=0; i<PREDIV_WORDS_64; i++) {
663         t[i] = ((uint64_t)t32[2*i+1]<<32) + t32[2*i];
664     }
665 
666 #elif SYS_BITS == 64
667 
668     for (i=0; i<WORDS_64; i++) {
669         unsigned j;
670         uint64_t carry;
671         uint64_t k, k2_lo, k2_hi;
672         uint64_t prod_lo, prod_hi;
673 
674         k = t[i] + (t[i] << 32);
675         k2_lo = -k;
676         k2_hi = k - (k!=0);
677 
678         /* n[0] = 2³² - 1 */
679         DP_MULT(n[0], k, prod_lo, prod_hi);
680         t[i+0] += prod_lo;
681         prod_hi += t[i+0] < prod_lo;
682         carry = prod_hi;
683         /* n[1] = 2⁶⁴ - 2³² */
684         DP_MULT(n[1], k, prod_lo, prod_hi);
685         prod_lo += carry;
686         prod_hi += prod_lo < carry;
687         t[i+1] += prod_lo;
688         prod_hi += t[i+1] < prod_lo;
689         carry = prod_hi;
690         /* n[2] = 2⁶⁴ - 2 */
691         DP_MULT(n[2], k, prod_lo, prod_hi);
692         prod_lo += carry;
693         prod_hi += prod_lo < carry;
694         t[i+2] += prod_lo;
695         prod_hi += t[i+2] < prod_lo;
696         carry = prod_hi;
697         /* n[3] = 2⁶⁴ - 1 */
698         prod_lo = k2_lo;
699         prod_hi = k2_hi;
700         prod_lo += carry;
701         prod_hi += prod_lo < carry;
702         t[i+3] += prod_lo;
703         prod_hi += t[i+3] < prod_lo;
704         carry = prod_hi;
705         /* n[4] = 2⁶⁴ - 1 */
706         prod_lo = k2_lo;
707         prod_hi = k2_hi;
708         prod_lo += carry;
709         prod_hi += prod_lo < carry;
710         t[i+4] += prod_lo;
711         prod_hi += t[i+4] < prod_lo;
712         carry = prod_hi;
713         /* n[5] = 2⁶⁴ - 1 */
714         prod_lo = k2_lo;
715         prod_hi = k2_hi;
716         prod_lo += carry;
717         prod_hi += prod_lo < carry;
718         t[i+5] += prod_lo;
719         prod_hi += t[i+5] < prod_lo;
720         carry = prod_hi;
721 
722         for (j=WORDS_64; carry; j++) {
723             t[i+j] += carry;
724             carry = t[i+j] < carry;
725         }
726     }
727 #else
728 #error You must define the SYS_BITS macro
729 #endif
730 
731     assert(t[PREDIV_WORDS_64-1] <= 1); /** MSW **/
732 
733     /** Words t[0..WORDS_64-1] have all been set to zero **/
734 
735     /** Divide by R and possibly subtract n **/
736     sub(t2, &t[WORDS_64], n, WORDS_64);
737     cond = (unsigned)(t[PREDIV_WORDS_64-1] | (uint64_t)ge(&t[WORDS_64], n, WORDS_64));
738     mont_select(out, t2, &t[WORDS_64], cond, WORDS_64);
739 
740 #undef WORDS_64
741 #undef PREDIV_WORDS_64
742 #undef WORDS_32
743 #undef PREDIV_WORDS_32
744 }
745 
mont_mult_p521(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)746 STATIC void mont_mult_p521(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
747 {
748     uint64_t *t, *scratchpad, *s, *tmp1, *tmp2;
749 
750     assert(nw == 9);
751     assert(m0 == 1);
752 
753     /*
754      * A number in the form:
755      *      x*2⁵²¹ + y
756      * is congruent modulo 2⁵²¹-1 to:
757      *      x + y
758      */
759 
760     /*
761      * tmp is an array of SCRATCHPAD*nw words
762      * We carve out 3 values in it:
763      * - 2*nw words, the value a*b
764      * - 3*nw words, temporary area for computing the product
765      * - nw words, the second term of the addition
766      */
767 
768     t = tmp;
769     scratchpad = t + 2*nw;
770     s = scratchpad + 3*nw;
771     tmp1 = scratchpad;
772     tmp2 = scratchpad + nw;
773 
774     if (a == b) {
775         square(t, scratchpad, a, 9);
776     } else {
777         product(t, scratchpad, a, b, 9);
778     }
779 
780     /* t is a 1042-bit number, occupying 17 words (of the total 18); the MSW (t[16]) only has 18 bits */
781     s[0] = (t[8] >> 9)  | (t[9] << 55);     t[8] &= 0x1FF;
782     s[1] = (t[9] >> 9)  | (t[10] << 55);
783     s[2] = (t[10] >> 9) | (t[11] << 55);
784     s[3] = (t[11] >> 9) | (t[12] << 55);
785     s[4] = (t[12] >> 9) | (t[13] << 55);
786     s[5] = (t[13] >> 9) | (t[14] << 55);
787     s[6] = (t[14] >> 9) | (t[15] << 55);
788     s[7] = (t[15] >> 9) | (t[16] << 55);
789     s[8] = t[16] >> 9;
790 
791     add_mod(out, t, s, n, tmp1, tmp2, nw);
792 }
793 
794 /* ---- PUBLIC FUNCTIONS ---- */
795 
mont_context_free(MontContext * ctx)796 void mont_context_free(MontContext *ctx)
797 {
798     if (NULL == ctx)
799         return;
800     free(ctx->one);
801     free(ctx->r2_mod_n);
802     free(ctx->r_mod_n);
803     free(ctx->modulus);
804     free(ctx->modulus_min_2);
805     free(ctx);
806 }
807 
808 /*
809  * Return how many bytes a big endian multi-word number takes in memory.
810  */
mont_bytes(const MontContext * ctx)811 size_t mont_bytes(const MontContext *ctx)
812 {
813     if (NULL == ctx)
814         return 0;
815     return ctx->bytes;
816 }
817 
818 /*
819  * Allocate memory for an array of numbers in Montgomery form
820  * and initialize it to 0.
821  *
822  * @param out   The location where the address of the newly allocated
823  *              array will be placed in.
824  *              The caller is responsible for deallocating the memory
825  *              using free().
826  * @param count How many numbers the array contains.
827  * @param ctx   The Montgomery context.
828  * @return      0 if successful, the relevant error code otherwise.
829  *
830  */
mont_number(uint64_t ** out,unsigned count,const MontContext * ctx)831 int mont_number(uint64_t **out, unsigned count, const MontContext *ctx)
832 {
833     if (NULL == out || NULL == ctx)
834         return ERR_NULL;
835 
836     *out = (uint64_t*)calloc(count * ctx->words, sizeof(uint64_t));
837     if (NULL == *out)
838         return ERR_MEMORY;
839 
840     return 0;
841 }
842 
mont_random_number(uint64_t ** out,unsigned count,uint64_t seed,const MontContext * ctx)843 int mont_random_number(uint64_t **out, unsigned count, uint64_t seed, const MontContext *ctx)
844 {
845     int res;
846     unsigned i;
847     uint64_t *number;
848 
849     res = mont_number(out, count, ctx);
850     if (res)
851         return res;
852 
853     number = *out;
854     expand_seed(seed, (uint8_t*)number, count * ctx->bytes);
855     for (i=0; i<count; i++, number += ctx->words) {
856         number[ctx->words-1] = 0;
857     }
858     return 0;
859 }
860 
861 /*
862  * Transform a big endian-encoded number into Montgomery form, by performing memory allocation.
863  *
864  * @param out       The location where the pointer to the newly allocated memory will be put in.
865  *                  The memory will contain the number encoded in Montgomery form.
866  *                  The caller is responsible for deallocating the memory.
867  * @param ctx       Montgomery context, as created by mont_context_init().
868  * @param number    The big endian-encoded number to transform, strictly smaller than the modulus.
869  * @param len       The length of the big-endian number in bytes (this may be
870  *                  smaller than the output of mont_bytes(ctx)).
871  * @return          0 in case of success, the relevant error code otherwise.
872  */
mont_from_bytes(uint64_t ** out,const uint8_t * number,size_t len,const MontContext * ctx)873 int mont_from_bytes(uint64_t **out, const uint8_t *number, size_t len, const MontContext *ctx)
874 {
875     uint64_t *encoded = NULL;
876     uint64_t *tmp1 = NULL;
877     uint64_t *scratchpad = NULL;
878     int res = 0;
879 
880     if (NULL == out || NULL == ctx || NULL == number)
881         return ERR_NULL;
882 
883     *out = NULL;
884 
885     /** Removing leading zeroes but avoid a zero-length string **/
886     if (0 == len)
887         return ERR_NOT_ENOUGH_DATA;
888     while (len>1 && *number==0) {
889         len--;
890         number++;
891     }
892 
893     if (ctx->bytes < len)
894         return ERR_VALUE;
895 
896     /** The caller will deallocate this memory **/
897     *out = encoded = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
898     if (NULL == encoded)
899         return ERR_MEMORY;
900 
901     /** Input number, loaded in words **/
902     tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
903     if (NULL == tmp1) {
904         res = ERR_MEMORY;
905         goto cleanup;
906     }
907     bytes_to_words(tmp1, ctx->words, number, len);
908 
909     /** Make sure number<modulus **/
910     if (ge(tmp1, ctx->modulus, ctx->words)) {
911         res = ERR_VALUE;
912         goto cleanup;
913     }
914 
915     /** Scratchpad **/
916     scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
917     if (NULL == scratchpad) {
918         res = ERR_MEMORY;
919         goto cleanup;
920     }
921 
922     if (ctx->modulus_type != ModulusP521)
923         mont_mult_generic(encoded, tmp1, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
924     else
925         mont_copy(encoded, tmp1, ctx);
926     res = 0;
927 
928 cleanup:
929     free(scratchpad);
930     free(tmp1);
931     if (res != 0) {
932         free(encoded);
933         *out = NULL;
934     }
935     return res;
936 }
937 
938 /*
939  * Transform a number from Montgomery representation to big endian-encoding.
940  *
941  * @param number        The location where the number will be put in, encoded
942  *                      in big-endian form and with zero padding on the left.
943  * @param len           Space allocate at number, at least ctx->modulus_len bytes.
944  * @param ctx           The address of the Montgomery context.
945  * @param mont_number   The number in Montgomery form to transform.
946  * @return              0 if successful, the relevant error code otherwise.
947  */
mont_to_bytes(uint8_t * number,size_t len,const uint64_t * mont_number,const MontContext * ctx)948 int mont_to_bytes(uint8_t *number, size_t len, const uint64_t* mont_number, const MontContext *ctx)
949 {
950     uint64_t *tmp1 = NULL;
951     uint64_t *scratchpad = NULL;
952     int res;
953 
954     if (NULL == number || NULL == ctx || NULL == mont_number)
955         return ERR_NULL;
956 
957     if (len < ctx->modulus_len)
958         return ERR_NOT_ENOUGH_DATA;
959 
960     /** Number in normal form, but still in words **/
961     tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
962     if (NULL == tmp1)
963         return ERR_MEMORY;
964 
965     /** Scratchpad **/
966     scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
967     if (NULL == scratchpad) {
968         free(tmp1);
969         return ERR_MEMORY;
970     }
971 
972     if (ctx->modulus_type != ModulusP521)
973         mont_mult_generic(tmp1, mont_number, ctx->one, ctx->modulus, ctx->m0, scratchpad, ctx->words);
974     else
975         mont_copy(tmp1, mont_number, ctx);
976     res = words_to_bytes(number, len, tmp1, ctx->words);
977 
978     free(scratchpad);
979     free(tmp1);
980     return res;
981 }
982 
983 /*
984  * Add two numbers in Montgomery representation.
985  *
986  * @param out   The location where the result will be stored; it must have been created with mont_number(&p,1,ctx).
987  * @param a     The first term.
988  * @param b     The second term.
989  * @param tmp   Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
990  * @param ctx   The Montgomery context.
991  * @return      0 for success, the relevant error code otherwise.
992  */
mont_add(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)993 int mont_add(uint64_t* out, const uint64_t* a, const uint64_t* b, uint64_t *tmp, const MontContext *ctx)
994 {
995     if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
996         return ERR_NULL;
997     add_mod(out, a, b, ctx->modulus, tmp, tmp + ctx->words, ctx->words);
998     return 0;
999 }
1000 
1001 /*
1002  * Multiply two numbers in Montgomery representation.
1003  *
1004  * @param out   The location where the result will be stored at; it must have been created with mont_number(&p,1,ctx)
1005  * @param a     The first term.
1006  * @param b     The second term.
1007  * @param tmp   Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
1008  * @param ctx   The Montgomery context.
1009  * @return      0 for success, the relevant error code otherwise.
1010  */
mont_mult(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)1011 int mont_mult(uint64_t* out, const uint64_t* a, const uint64_t *b, uint64_t *tmp, const MontContext *ctx)
1012 {
1013     if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
1014         return ERR_NULL;
1015 
1016     switch (ctx->modulus_type) {
1017         case ModulusP256:
1018             mont_mult_p256(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1019             break;
1020         case ModulusP384:
1021             mont_mult_p384(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1022             break;
1023         case ModulusP521:
1024             mont_mult_p521(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1025             break;
1026         case ModulusGeneric:
1027             mont_mult_generic(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1028             break;
1029     }
1030 
1031     return 0;
1032 }
1033 
1034 /*
1035  * Subtract integer b from a.
1036  *
1037  * @param out   The location where the result is stored at; it must have been created with mont_number(&p,1,ctx).
1038  *              It can be the same as either a or b.
1039  * @param a     The number it will be subtracted from.
1040  * @param b     The number to subtract.
1041  * @param tmp   Temporary, internal result; it must have been created with mont_number(&p,2,ctx).
1042  * @param ctx   The Montgomery context.
1043  * @return      0 for success, the relevant error code otherwise.
1044  */
mont_sub(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)1045 int mont_sub(uint64_t *out, const uint64_t *a, const uint64_t *b, uint64_t *tmp, const MontContext *ctx)
1046 {
1047     unsigned i;
1048     unsigned carry, borrow1 , borrow2;
1049     uint64_t *scratchpad;
1050 
1051     if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
1052         return ERR_NULL;
1053 
1054     scratchpad = tmp + ctx->words;
1055 
1056     /*
1057      * Compute difference in tmp[], and add modulus[]
1058      * to tmp[] into scratchpad[].
1059      */
1060     borrow2 = 0;
1061     carry = 0;
1062     for (i=0; i<ctx->words; i++) {
1063         borrow1 = b[i] > a[i];
1064         tmp[i] = a[i] - b[i];
1065         borrow1 |= borrow2 > tmp[i];
1066         tmp[i] -= borrow2;
1067         borrow2 = borrow1;
1068 
1069         scratchpad[i] = tmp[i] + carry;
1070         carry = scratchpad[i] < carry;
1071         scratchpad[i] += ctx->modulus[i];
1072         carry += scratchpad[i] < ctx->modulus[i];
1073     }
1074 
1075     /*
1076      * If there is no borrow, tmp[] is smaller than modulus.
1077      */
1078     mont_select(out, scratchpad, tmp, borrow2, ctx->words);
1079 
1080     return 0;
1081 }
1082 
1083 /*
1084  * Compute the modular inverse of an integer in Montgomery form.
1085  *
1086  * Condition: the modulus defining the Montgomery context MUST BE a non-secret prime number.
1087  *
1088  * @param out   The location where the result will be stored at; it must have
1089  *              been allocated with mont_number(&p, 1, ctx).
1090  * @param a     The number to compute the modular inverse of, already in Montgomery form.
1091  * @param ctx   The Montgomery context.
1092  * @return      0 for success, the relevant error code otherwise.
1093  */
mont_inv_prime(uint64_t * out,uint64_t * a,const MontContext * ctx)1094 int mont_inv_prime(uint64_t *out, uint64_t *a, const MontContext *ctx)
1095 {
1096     unsigned idx_word;
1097     uint64_t bit;
1098     uint64_t *tmp1 = NULL;
1099     uint64_t *scratchpad = NULL;
1100     uint64_t *exponent = NULL;
1101     int res;
1102 
1103     if (NULL == out || NULL == a || NULL == ctx)
1104         return ERR_NULL;
1105 
1106     tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1107     if (NULL == tmp1)
1108         return ERR_MEMORY;
1109 
1110     scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1111     if (NULL == scratchpad) {
1112         res = ERR_MEMORY;
1113         goto cleanup;
1114     }
1115 
1116     /** Exponent is guaranteed to be >0 **/
1117     exponent = ctx->modulus_min_2;
1118 
1119     /* Find most significant bit */
1120     idx_word = ctx->words-1;
1121     for (;;) {
1122         if (exponent[idx_word] != 0)
1123             break;
1124         if (idx_word-- == 0)
1125             break;
1126     }
1127     for (bit = (uint64_t)1U << 63; 0 == (exponent[idx_word] & bit); bit>>=1);
1128 
1129     /* Start from 1 (in Montgomery form, which is R mod N) */
1130     memcpy(out, ctx->r_mod_n, ctx->bytes);
1131 
1132     /** Left-to-right exponentiation **/
1133     for (;;) {
1134         while (bit > 0) {
1135             mont_mult(tmp1, out, out, scratchpad, ctx);
1136             if (exponent[idx_word] & bit) {
1137                 mont_mult(out, tmp1, a, scratchpad, ctx);
1138             } else {
1139                 memcpy(out, tmp1, ctx->bytes);
1140             }
1141             bit >>= 1;
1142         }
1143         if (idx_word-- == 0)
1144             break;
1145         bit = (uint64_t)1 << 63;
1146     }
1147     res = 0;
1148 
1149 cleanup:
1150     free(tmp1);
1151     free(scratchpad);
1152     return res;
1153 }
1154 
1155 /*
1156  * Assign a value to a number in Montgomer form.
1157  *
1158  * @param out   The location where the result is stored at; it must have been created with mont_number(&p,1,ctx).
1159  * @param x     The value to set.
1160  * @param ctx   The Montgomery context.
1161  * @return      0 for success, the relevant error code otherwise.
1162  */
mont_set(uint64_t * out,uint64_t x,const MontContext * ctx)1163 int mont_set(uint64_t *out, uint64_t x, const MontContext *ctx)
1164 {
1165     uint64_t *tmp, *scratchpad;
1166 
1167     if (NULL == out || NULL == ctx)
1168         return ERR_NULL;
1169 
1170     if (x == 0) {
1171         memset(out, 0, ctx->bytes);
1172         return 0;
1173     }
1174     if (x == 1) {
1175         mont_copy(out, ctx->r_mod_n, ctx);
1176         return 0;
1177     }
1178 
1179     tmp = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1180     if (NULL == tmp)
1181         return ERR_MEMORY;
1182     tmp[0] = x;
1183 
1184     scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1185     if (NULL == scratchpad) {
1186         free(tmp);
1187         return ERR_MEMORY;
1188     }
1189 
1190     if (ctx->modulus_type != ModulusP521)
1191         mont_mult_generic(out, tmp, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
1192     else
1193         mont_copy(out, tmp, ctx);
1194 
1195     free(tmp);
1196     free(scratchpad);
1197 
1198     return 0;
1199 }
1200 
cmp_modulus(const uint8_t * mod1,size_t mod1_len,const uint8_t * mod2,size_t mod2_len)1201 static int cmp_modulus(const uint8_t *mod1, size_t mod1_len, const uint8_t *mod2, size_t mod2_len)
1202 {
1203     size_t diff;
1204 
1205     if (mod1_len > mod2_len) {
1206         diff = mod1_len - mod2_len;
1207         if (0 != memcmp(mod1+diff, mod2, mod2_len))
1208             return -1;
1209         if (NULL != memchr_not(mod1, 0, diff))
1210             return -1;
1211     } else {
1212         diff = mod2_len - mod1_len;
1213         if (0 != memcmp(mod2+diff, mod1, mod1_len))
1214             return -1;
1215         if (NULL != memchr_not(mod2, 0, diff))
1216             return -1;
1217     }
1218     return 0;
1219 }
1220 
1221 /*
1222  * Create a new context for the Montgomery and the given odd modulus.
1223  *
1224  * @param out       The locate where the pointer to the newly allocated data will be stored at.
1225  *                  The memory will contain the new Montgomery context.
1226  * @param modulus   The modulus encoded in big endian form.
1227  * @param mod_len   The length of the modulus in bytes.
1228  * @return          0 for success, the appropriate code otherwise.
1229  */
mont_context_init(MontContext ** out,const uint8_t * modulus,size_t mod_len)1230 int mont_context_init(MontContext **out, const uint8_t *modulus, size_t mod_len)
1231 {
1232     const uint8_t p256_mod[32] = "\xff\xff\xff\xff\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
1233     const uint8_t p384_mod[48] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff";
1234     const uint8_t p521_mod[66] = "\x01\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
1235     uint64_t *scratchpad = NULL;
1236     MontContext *ctx;
1237     int res;
1238 
1239     if (NULL == out || NULL == modulus)
1240         return ERR_NULL;
1241 
1242     /** Consume leading zeros **/
1243     while (mod_len>0 && *modulus==0) {
1244         modulus++;
1245         mod_len--;
1246     }
1247     if (0 == mod_len)
1248         return ERR_MODULUS;
1249 
1250     /** Ensure modulus is odd and at least 3, otherwise we can't compute its inverse over B **/
1251     if (is_even(modulus[mod_len-1]))
1252         return ERR_MODULUS;
1253     if (mod_len==1 && modulus[0]==1)
1254         return ERR_MODULUS;
1255 
1256     *out = ctx = (MontContext*)calloc(1, sizeof(MontContext));
1257     if (NULL == ctx)
1258         return ERR_MEMORY;
1259 
1260     /* Check if the modulus has a special form */
1261     /* For P-521, modulo reduction is very simple so the Montgomery
1262      * representation is not actually used.
1263      */
1264     ctx->modulus_type = ModulusGeneric;
1265     switch (mod_len) {
1266         case sizeof(p256_mod):
1267             if (0 == cmp_modulus(modulus, mod_len, p256_mod, sizeof(p256_mod))) {
1268                 ctx->modulus_type = ModulusP256;
1269             }
1270             break;
1271         case sizeof(p384_mod):
1272             if (0 == cmp_modulus(modulus, mod_len, p384_mod, sizeof(p384_mod))) {
1273                 ctx->modulus_type = ModulusP384;
1274             }
1275             break;
1276         case sizeof(p521_mod):
1277             if (0 == cmp_modulus(modulus, mod_len, p521_mod, sizeof(p521_mod))) {
1278                 ctx->modulus_type = ModulusP521;
1279             }
1280             break;
1281     }
1282 
1283     ctx->words = ((unsigned)mod_len + 7) / 8;
1284     ctx->bytes = (unsigned)(ctx->words * sizeof(uint64_t));
1285     ctx->modulus_len = (unsigned)mod_len;
1286 
1287     /** Load modulus N **/
1288     ctx->modulus = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1289     if (0 == ctx->modulus) {
1290         res = ERR_MEMORY;
1291         goto cleanup;
1292     }
1293     bytes_to_words(ctx->modulus, ctx->words, modulus, mod_len);
1294 
1295     /** Prepare 1 **/
1296     ctx->one = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1297     if (NULL == ctx->one) {
1298         res = ERR_MEMORY;
1299         goto cleanup;
1300     }
1301     ctx->one[0] = 1;
1302 
1303     /** Pre-compute R² mod N **/
1304     /** Pre-compute -n[0]⁻¹ mod R **/
1305     ctx->r2_mod_n = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1306     if (0 == ctx->r2_mod_n) {
1307         res = ERR_MEMORY;
1308         goto cleanup;
1309     }
1310     if (ctx->modulus_type != ModulusP521) {
1311         rsquare(ctx->r2_mod_n, ctx->modulus, ctx->words);
1312         ctx->m0 = inverse64(~ctx->modulus[0]+1);
1313     } else {
1314         memcpy(ctx->r2_mod_n, ctx->one, ctx->words * sizeof(uint64_t));
1315         ctx->m0 = 1U;
1316     }
1317 
1318     /** Pre-compute R mod N **/
1319     ctx->r_mod_n = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1320     if (NULL == ctx->r_mod_n) {
1321         res = ERR_MEMORY;
1322         goto cleanup;
1323     }
1324     scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1325     if (NULL == scratchpad) {
1326         res = ERR_MEMORY;
1327         goto cleanup;
1328     }
1329     if (ctx->modulus_type != ModulusP521)
1330         mont_mult_generic(ctx->r_mod_n, ctx->one, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
1331     else
1332         memcpy(ctx->r_mod_n, ctx->one, ctx->words * sizeof(uint64_t));
1333 
1334     /** Pre-compute modulus - 2 **/
1335     /** Modulus is guaranteed to be >= 3 **/
1336     ctx->modulus_min_2 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1337     if (NULL == ctx->modulus_min_2) {
1338         res = ERR_MEMORY;
1339         goto cleanup;
1340     }
1341     sub(ctx->modulus_min_2, ctx->modulus, ctx->one, ctx->words);
1342     sub(ctx->modulus_min_2, ctx->modulus_min_2, ctx->one, ctx->words);
1343 
1344     res = 0;
1345 
1346 cleanup:
1347     free(scratchpad);
1348     if (res != 0) {
1349         mont_context_free(ctx);
1350     }
1351     return res;
1352 }
1353 
mont_is_zero(const uint64_t * a,const MontContext * ctx)1354 int mont_is_zero(const uint64_t *a, const MontContext *ctx)
1355 {
1356     unsigned i;
1357     uint64_t sum = 0;
1358 
1359     if (NULL == a || NULL == ctx)
1360         return -1;
1361 
1362     for (i=0; i<ctx->words; i++) {
1363         sum |= *a++;
1364     }
1365 
1366     return (sum == 0);
1367 }
1368 
mont_is_one(const uint64_t * a,const MontContext * ctx)1369 int mont_is_one(const uint64_t *a, const MontContext *ctx)
1370 {
1371     unsigned i;
1372     uint64_t sum = 0;
1373 
1374     if (NULL == a || NULL == ctx)
1375         return -1;
1376 
1377     for (i=0; i<ctx->words; i++) {
1378         sum |= a[i] ^ ctx->r_mod_n[i];
1379     }
1380 
1381     return (sum == 0);
1382 }
1383 
mont_is_equal(const uint64_t * a,const uint64_t * b,const MontContext * ctx)1384 int mont_is_equal(const uint64_t *a, const uint64_t *b, const MontContext *ctx)
1385 {
1386     unsigned i;
1387     uint64_t result = 0;
1388 
1389     if (NULL == a || NULL == b || NULL == ctx)
1390         return -1;
1391 
1392     for (i=0; i<ctx->words; i++) {
1393         result |= *a++ ^ *b++;
1394     }
1395 
1396     return (result == 0);
1397 }
1398 
mont_copy(uint64_t * out,const uint64_t * a,const MontContext * ctx)1399 int mont_copy(uint64_t *out, const uint64_t *a, const MontContext *ctx)
1400 {
1401     unsigned i;
1402 
1403     if (NULL == out || NULL == a || NULL == ctx)
1404         return ERR_NULL;
1405 
1406     for (i=0; i<ctx->words; i++) {
1407         *out++ = *a++;
1408     }
1409 
1410     return 0;
1411 }
1412