1 /* ===================================================================
2 *
3 * Copyright (c) 2018, Helder Eijs <helderijs@gmail.com>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 * ===================================================================
30 */
31
32 #include <assert.h>
33
34 #include "common.h"
35 #include "endianess.h"
36 #include "multiply.h"
37 #include "modexp_utils.h"
38 #include "mont.h"
39
40 #if SYS_BITS == 32
41 #include "multiply_32.c"
42 #else
43 #if SYS_BITS == 64
44 #include "multiply_64.c"
45 #else
46 #error You must define the macro SYS_BITS
47 #endif
48 #endif
49
50 #if defined(USE_SSE2)
51 #if defined(HAVE_INTRIN_H)
52 #include <intrin.h>
53 #elif defined(HAVE_X86INTRIN_H)
54 #include <x86intrin.h>
55 #elif defined(HAVE_EMMINTRIN_H)
56 #include <xmmintrin.h>
57 #include <emmintrin.h>
58 #endif
59 #endif
60
is_odd(uint64_t x)61 static inline unsigned is_odd(uint64_t x)
62 {
63 return 1 == (x & 1);
64 }
65
is_even(uint64_t x)66 static inline unsigned is_even(uint64_t x)
67 {
68 return !is_odd(x);
69 }
70
71 /**
72 * Compute the inverse modulo 2⁶⁴ of a 64-bit odd integer.
73 *
74 * See https://crypto.stackexchange.com/questions/47493/how-to-determine-the-multiplicative-inverse-modulo-64-or-other-power-of-two
75 */
inverse64(uint64_t a)76 STATIC uint64_t inverse64(uint64_t a)
77 {
78 uint64_t x;
79
80 assert(1 & a);
81 x = ((a << 1 ^ a) & 4) << 1 ^ a;
82 x += x - a*x*x;
83 x += x - a*x*x;
84 x += x - a*x*x;
85 x += x - a*x*x;
86 assert((x*a & 0xFFFFFFFFFFFFFFFFULL) == 1);
87
88 return x;
89 }
90
91 /**
92 * Check if a multi-word integer x is greater than or equal to y.
93 *
94 * @param x The first term
95 * @param y The second term
96 * @param nw The number of words that make up x and y
97 * @return 1 if x>=y, 0 if x<y
98 */
ge(const uint64_t * x,const uint64_t * y,size_t nw)99 STATIC int ge(const uint64_t *x, const uint64_t *y, size_t nw)
100 {
101 unsigned mask = (unsigned)-1;
102 unsigned result = 0;
103 size_t i, j;
104
105 i = nw - 1;
106 for (j=0; j<nw; j++, i--) {
107 unsigned greater, lower;
108
109 greater = x[i] > y[i];
110 lower = x[i] < y[i];
111 result |= mask & (greater | (lower << 1));
112 mask &= (greater ^ lower) - 1;
113 }
114
115 return result<2;
116 }
117
118 /*
119 * Subtract a multi-word integer b from a.
120 *
121 * @param out The location where the multi-word result is stored
122 * @param a Number to subtract from
123 * @param b Number to subtract
124 * @param nw The number of words of both a and b
125 * @result 0 if there is no borrow, 1 otherwise
126 */
sub(uint64_t * out,const uint64_t * a,const uint64_t * b,size_t nw)127 STATIC unsigned sub(uint64_t *out, const uint64_t *a, const uint64_t *b, size_t nw)
128 {
129 size_t i;
130 unsigned borrow1 , borrow2;
131
132 borrow2 = 0;
133 for (i=0; i<nw; i++) {
134 borrow1 = b[i] > a[i];
135 out[i] = a[i] - b[i];
136
137 borrow1 |= borrow2 > out[i];
138 out[i] -= borrow2;
139
140 borrow2 = borrow1;
141 }
142
143 return borrow2;
144 }
145
146 /*
147 * Compute R² mod N, where R is the smallest power of 2⁶⁴ larger than N.
148 *
149 * @param r2_mod_n The location where the result is stored at
150 * @param n The modulus N
151 * @param nw The number of 64-bit words of both r2_mod_n and n
152 */
rsquare(uint64_t * r2_mod_n,uint64_t * n,size_t nw)153 STATIC void rsquare(uint64_t *r2_mod_n, uint64_t *n, size_t nw)
154 {
155 size_t i;
156 size_t R_bits;
157
158 memset(r2_mod_n, 0, sizeof(uint64_t)*nw);
159
160 /**
161 * Start with R2=1, double 2*bitlen(R) times,
162 * and reduce it as soon as it exceeds n
163 */
164 r2_mod_n[0] = 1;
165 R_bits = nw * sizeof(uint64_t) * 8;
166 for (i=0; i<R_bits*2; i++) {
167 unsigned overflow;
168 size_t j;
169
170 /** Double, by shifting left by one bit **/
171 overflow = (unsigned)(r2_mod_n[nw-1] >> 63);
172 for (j=nw-1; j>0; j--) {
173 r2_mod_n[j] = (r2_mod_n[j] << 1) + (r2_mod_n[j-1] >> 63);
174 }
175 /** Fill-in with zeroes **/
176 r2_mod_n[0] <<= 1;
177
178 /** Subtract n if the result exceeds it **/
179 while (overflow || ge(r2_mod_n, n, nw)) {
180 sub(r2_mod_n, r2_mod_n, n, nw);
181 overflow = 0;
182 }
183 }
184 }
185
186 /*
187 * Multiply a multi-word integer a by a 64-bit scalar k and
188 * then add the result to the multi-word integer t.
189 *
190 * @param t The multi-word integer accumulator
191 * @param tw The number of words of t
192 * @param a The multi-word integer to multiply with the scalar
193 * @param aw The number of words of a
194 * @param k The 64-bit scalar multiplier
195 */
addmul(uint64_t * t,size_t tw,const uint64_t * a,size_t aw,uint64_t k)196 STATIC void addmul(uint64_t *t, size_t tw, const uint64_t *a, size_t aw, uint64_t k)
197 {
198 size_t i;
199 uint64_t carry;
200
201 carry = 0;
202 for (i=0; i<aw; i++) {
203 uint64_t prod_lo, prod_hi;
204
205 DP_MULT(a[i], k, prod_lo, prod_hi);
206
207 prod_lo += carry;
208 prod_hi += prod_lo < carry;
209
210 t[i] += prod_lo;
211 prod_hi += t[i] < prod_lo;
212
213 carry = prod_hi;
214 }
215
216 for (; carry; i++) {
217 t[i] += carry;
218 carry = t[i] < carry;
219 }
220
221 assert(i <= tw);
222 }
223
224 /**
225 * Multiply two multi-word integers.
226 *
227 * @param t The location where the result is stored. It is twice as big as
228 * either a (or b). It is an array of 2*nw words).
229 * @param scratchpad Temporary area. It is an array of 3*nw words.
230 * @param a The first term, array of nw words.
231 * @param b The second term, array of nw words.
232 * @param nw The number of words of both a and b.
233 *
234 */
product(uint64_t * t,uint64_t * scratchpad,const uint64_t * a,const uint64_t * b,size_t nw)235 STATIC void product(uint64_t *t, uint64_t *scratchpad, const uint64_t *a, const uint64_t *b, size_t nw)
236 {
237 size_t i;
238
239 memset(t, 0, 2*sizeof(uint64_t)*nw);
240
241 for (i=0; i<(nw ^ (nw & 1)); i+=2) {
242 addmul128(&t[i], scratchpad, a, b[i], b[i+1], 2*nw-i, nw);
243 }
244
245 if (is_odd(nw)) {
246 addmul(&t[nw-1], nw+2, a, nw, b[nw-1]);
247 }
248 }
249
250 /*
251 * Select a number out of two, in constant time.
252 *
253 * @param out The location where the multi-word result is stored
254 * @param a The first choice, selected if cond is true (non-zero)
255 * @param b The second choice, selected if cond is false (zero)
256 * @param cond The flag that drives the selection
257 * @param words The number of words of a, b, and out
258 * @return 0 for success, the appropriate code otherwise.
259 */
mont_select(uint64_t * out,const uint64_t * a,const uint64_t * b,unsigned cond,size_t words)260 STATIC FUNC_SSE2 int mont_select(uint64_t *out, const uint64_t *a, const uint64_t *b, unsigned cond, size_t words)
261 {
262 uint64_t mask;
263 #if defined(USE_SSE2)
264 unsigned pairs, i;
265 __m128i r0, r1, r2, r3, r4, r5;
266
267 pairs = (unsigned)words / 2;
268 mask = (uint64_t)((cond != 0) - 1); /* 0 for a, 1s for b */
269
270 #if SYSBITS == 64
271 r0 = _mm_set1_epi64x(mask);
272 #else
273 r0 = _mm_loadl_epi64((__m128i*)&mask);
274 r0 = _mm_unpacklo_epi64(r0, r0);
275 #endif
276 for (i=0; i<pairs; i++, a+=2, b+=2, out+=2) {
277 r1 = _mm_loadu_si128((__m128i const*)b);
278 r2 = _mm_loadu_si128((__m128i const*)a);
279 r3 = _mm_and_si128(r0, r1);
280 r4 = _mm_andnot_si128(r0, r2);
281 r5 = _mm_or_si128(r3, r4);
282 _mm_storeu_si128((__m128i*)out, r5);
283 }
284
285 if (words & 1) {
286 *out = (*b & mask) ^ (*a & ~mask);
287 }
288 #else
289 unsigned i;
290
291 mask = (uint64_t)((cond != 0) - 1);
292 for (i=0; i<words; i++) {
293 *out++ = (*b++ & mask) ^ (*a++ & ~mask);
294 }
295 #endif
296
297 return 0;
298 }
299
300 /*
301 * Add two multi-word numbers with modulo arithmetic.
302 *
303 * @param out The locaton where the multi-word result (nw words) is stored
304 * @param a The first term (nw words)
305 * @param b The second term (nw words)
306 * @param modulus The modulus (nw words)
307 * @param tmp1 A temporary area (nw words)
308 * @param tmp2 A temporary area (nw words)
309 * @param nw The number of 64-bit words in all parameters
310 */
add_mod(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * modulus,uint64_t * tmp1,uint64_t * tmp2,size_t nw)311 void add_mod(uint64_t* out, const uint64_t* a, const uint64_t* b, const uint64_t *modulus, uint64_t *tmp1, uint64_t *tmp2, size_t nw)
312 {
313 unsigned i;
314 unsigned carry, borrow1, borrow2;
315
316 /*
317 * Compute sum in tmp1[], and subtract modulus[]
318 * from tmp1[] into tmp2[].
319 */
320 borrow2 = 0;
321 for (i=0, carry=0; i<nw; i++) {
322 tmp1[i] = a[i] + carry;
323 carry = tmp1[i] < carry;
324 tmp1[i] += b[i];
325 carry += tmp1[i] < b[i];
326
327 borrow1 = modulus[i] > tmp1[i];
328 tmp2[i] = tmp1[i] - modulus[i];
329 borrow1 |= borrow2 > tmp2[i];
330 tmp2[i] -= borrow2;
331 borrow2 = borrow1;
332 }
333
334 /*
335 * If there is no borrow or if there is carry,
336 * tmp1[] is larger than modulus, so we must return tmp2[].
337 */
338 mont_select(out, tmp2, tmp1, carry | (borrow2 ^ 1), nw);
339 }
340
341 /*
342 * Montgomery modular multiplication, that is a*b*R mod N.
343 *
344 * @param out The location where the result is stored
345 * @param a The first term (already in Montgomery form, a*R mod N)
346 * @param b The second term (already in Montgomery form, b*R mod N)
347 * @param n The modulus (in normal form), such that R>N
348 * @param m0 Least-significant word of the opposite of the inverse of n modulo R, that is, -n[0]⁻¹ mod R
349 * @param t Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
350 * @param nw Number of words making up the 3 integers: out, a, and b.
351 * It also defines R as 2^(64*nw).
352 *
353 * Useful read: https://alicebob.cryptoland.net/understanding-the-montgomery-reduction-algorithm/
354 */
355 #if SCRATCHPAD_NR < 7
356 #error Scratchpad is too small
357 #endif
mont_mult_generic(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)358 STATIC void mont_mult_generic(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
359 {
360 size_t i;
361 uint64_t *t, *scratchpad, *t2;
362 unsigned cond;
363
364 /*
365 * tmp is an array of SCRATCHPAD*nw words
366 * We carve out 3 values in it:
367 * - 3*nw words, the value a*b + m*n (we only use 2*nw+1 words)
368 * - 3*nw words, temporary area for computing the product
369 * - nw words, the reduced value with a final subtraction by n
370 */
371 t = tmp;
372 scratchpad = tmp + 3*nw;
373 t2 = scratchpad + 3*nw;
374
375 if (a == b) {
376 square(t, scratchpad, a, nw);
377 } else {
378 product(t, scratchpad, a, b, nw);
379 }
380
381 t[2*nw] = 0; /** MSW **/
382
383 /** Clear lower words (two at a time) **/
384 for (i=0; i<(nw ^ (nw & 1)); i+=2) {
385 uint64_t k0, k1, ti1, prod_lo, prod_hi;
386
387 /** Multiplier for n that will make t[i+0] go 0 **/
388 k0 = t[i] * m0;
389
390 /** Simulate Muladd for digit 0 **/
391 DP_MULT(k0, n[0], prod_lo, prod_hi);
392 prod_lo += t[i];
393 prod_hi += prod_lo < t[i];
394
395 /** Expected digit 1 **/
396 ti1 = t[i+1] + n[1]*k0 + prod_hi;
397
398 /** Multiplier for n that will make t[i+1] go 0 **/
399 k1 = ti1 * m0;
400
401 addmul128(&t[i], scratchpad, n, k0, k1, 2*nw+1-i, nw);
402 }
403
404 /** One left for odd number of words **/
405 if (is_odd(nw)) {
406 addmul(&t[nw-1], nw+2, n, nw, t[nw-1]*m0);
407 }
408
409 assert(t[2*nw] <= 1); /** MSW **/
410
411 /** t[0..nw-1] == 0 **/
412
413 /** Divide by R and possibly subtract n **/
414 sub(t2, &t[nw], n, nw);
415 cond = (unsigned)(t[2*nw] | (uint64_t)ge(&t[nw], n, nw));
416 mont_select(out, t2, &t[nw], cond, (unsigned)nw);
417 }
418
mont_mult_p256(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)419 STATIC void mont_mult_p256(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
420 {
421 unsigned i;
422 uint64_t *t, *scratchpad, *t2;
423 unsigned cond;
424 #define WORDS_64 4U
425 #define PREDIV_WORDS_64 (2*WORDS_64+1) /** Size of the number to divide by R **/
426 #define WORDS_32 (WORDS_64*2)
427 #define PREDIV_WORDS_32 (2*PREDIV_WORDS_64)
428
429 #if SYS_BITS == 32
430 uint32_t t32[18];
431 #endif
432
433 assert(nw == 4);
434 assert(m0 == 1);
435
436 t = tmp;
437 scratchpad = tmp + 3*nw;
438 t2 = scratchpad + 3*nw;
439
440 if (a == b) {
441 square(t, scratchpad, a, WORDS_64);
442 } else {
443 product(t, scratchpad, a, b, WORDS_64);
444 }
445
446 t[PREDIV_WORDS_64-1] = 0; /** MSW **/
447
448 #if SYS_BITS == 32
449 for (i=0; i<PREDIV_WORDS_64; i++) {
450 t32[2*i] = (uint32_t)t[i];
451 t32[2*i+1] = (uint32_t)(t[i] >> 32);
452 }
453
454 for (i=0; i<WORDS_32; i++) {
455 uint32_t k, carry;
456 uint64_t prod, k2;
457 unsigned j;
458
459 k = t32[i];
460 k2 = ((uint64_t)k<<32) - k;
461
462 /* p[0] = 2³²-1 */
463 prod = k2 + t32[i+0];
464 t32[i+0] = (uint32_t)prod;
465 carry = (uint32_t)(prod >> 32);
466 /* p[1] = 2³²-1 */
467 prod = k2 + t32[i+1] + carry;
468 t32[i+1] = (uint32_t)prod;
469 carry = (uint32_t)(prod >> 32);
470 /* p[2] = 2³²-1 */
471 prod = k2 + t32[i+2] + carry;
472 t32[i+2] = (uint32_t)prod;
473 carry = (uint32_t)(prod >> 32);
474 /* p[3] = 0 */
475 t32[i+3] += carry;
476 carry = t32[i+3] < carry;
477 /* p[4] = 0 */
478 t32[i+4] += carry;
479 carry = t32[i+4] < carry;
480 /* p[5] = 0 */
481 t32[i+5] += carry;
482 carry = t32[i+5] < carry;
483 /* p[6] = 1 */
484 t32[i+6] += carry;
485 carry = t32[i+6] < carry;
486 t32[i+6] += k;
487 carry |= t32[i+6] < k;
488 /* p[7] = 2³²-1 */
489 prod = k2 + t32[i+7] + carry;
490 t32[i+7] = (uint32_t)prod;
491 carry = (uint32_t)(prod >> 32);
492
493 for (j=WORDS_32; carry; j++) {
494 t32[i+j] += carry;
495 carry = t32[i+j] < carry;
496 }
497 }
498
499 for (i=0; i<PREDIV_WORDS_64; i++) {
500 t[i] = ((uint64_t)t32[2*i+1]<<32) + t32[2*i];
501 }
502
503 #elif SYS_BITS == 64
504
505 for (i=0; i<WORDS_64; i++) {
506 unsigned j;
507 uint64_t carry, k;
508 uint64_t prod_lo, prod_hi;
509
510 k = t[i];
511
512 /* n[0] = 2⁶⁴ - 1 */
513 prod_lo = -k;
514 prod_hi = k - (k!=0);
515 t[i+0] += prod_lo;
516 prod_hi += t[i+0] < prod_lo;
517 carry = prod_hi;
518
519 /* n[1] = 2³² - 1 */
520 DP_MULT(n[1], k, prod_lo, prod_hi);
521 prod_lo += carry;
522 prod_hi += prod_lo < carry;
523 t[i+1] += prod_lo;
524 prod_hi += t[i+1] < prod_lo;
525 carry = prod_hi;
526
527 /* n[2] = 0 */
528 t[i+2] += carry;
529 carry = t[i+2] < carry;
530
531 /* n[3] = 2⁶⁴ - 2³² + 1 */
532 DP_MULT(n[3], k, prod_lo, prod_hi);
533 prod_lo += carry;
534 prod_hi += prod_lo < carry;
535 t[i+3] += prod_lo;
536 prod_hi += t[i+3] < prod_lo;
537 carry = prod_hi;
538
539 for (j=WORDS_64; carry; j++) {
540 t[i+j] += carry;
541 carry = t[i+j] < carry;
542 }
543 }
544 #else
545 #error You must define the SYS_BITS macro
546 #endif
547
548 assert(t[PREDIV_WORDS_64-1] <= 1); /** MSW **/
549
550 /** t[0..nw-1] == 0 **/
551
552 /** Divide by R and possibly subtract n **/
553 sub(t2, &t[nw], n, WORDS_64);
554 cond = (unsigned)(t[PREDIV_WORDS_64-1] | (uint64_t)ge(&t[WORDS_64], n, WORDS_64));
555 mont_select(out, t2, &t[WORDS_64], cond, WORDS_64);
556
557 #undef WORDS_64
558 #undef PREDIV_WORDS_64
559 #undef WORDS_32
560 #undef PREDIV_WORDS_32
561 }
562
mont_mult_p384(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)563 STATIC void mont_mult_p384(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
564 {
565 size_t i;
566 uint64_t *t, *scratchpad, *t2;
567 unsigned cond;
568 #define WORDS_64 6U
569 #define PREDIV_WORDS_64 (2*WORDS_64+1) /** Size of the number to divide by R **/
570 #define WORDS_32 (WORDS_64*2)
571 #define PREDIV_WORDS_32 (2*PREDIV_WORDS_64)
572
573 #if SYS_BITS == 32
574 uint32_t t32[PREDIV_WORDS_32];
575 #endif
576
577 assert(nw == WORDS_64);
578 assert(m0 == 0x0000000100000001ULL);
579
580 t = tmp;
581 scratchpad = tmp + 3*nw;
582 t2 = scratchpad + 3*nw;
583
584 if (a == b) {
585 square(t, scratchpad, a, WORDS_64);
586 } else {
587 product(t, scratchpad, a, b, WORDS_64);
588 }
589
590 t[PREDIV_WORDS_64-1] = 0; /** MSW **/
591
592 #if SYS_BITS == 32
593 for (i=0; i<PREDIV_WORDS_64; i++) {
594 t32[2*i] = (uint32_t)t[i];
595 t32[2*i+1] = (uint32_t)(t[i] >> 32);
596 }
597
598 for (i=0; i<WORDS_32; i++) {
599 uint32_t k, carry;
600 uint64_t prod, k2, k3;
601 unsigned j;
602
603 k = t32[i];
604 k2 = ((uint64_t)k<<32) - k;
605 k3 = k2 - k;
606
607 /* n32[0] = 2³² - 1 */
608 prod = k2 + t32[i+0];
609 t32[i+0] = (uint32_t)prod;
610 carry = (uint32_t)(prod >> 32);
611 /* n32[1] = 0 */
612 prod = (uint64_t)t32[i+1] + carry;
613 t32[i+1] = (uint32_t)prod;
614 carry = (uint32_t)(prod >> 32);
615 /* n32[2] = 0 */
616 prod = (uint64_t)t32[i+2] + carry;
617 t32[i+2] = (uint32_t)prod;
618 carry = (uint32_t)(prod >> 32);
619 /* n32[3] = 2³² - 1 */
620 prod = k2 + t32[i+3] + carry;
621 t32[i+3] = (uint32_t)prod;
622 carry = (uint32_t)(prod >> 32);
623 /* n32[4] = 2³² - 2 */
624 prod = k3 + t32[i+4] + carry;
625 t32[i+4] = (uint32_t)prod;
626 carry = (uint32_t)(prod >> 32);
627 /* n32[5] = 2³² - 1 */
628 prod = k2 + t32[i+5] + carry;
629 t32[i+5] = (uint32_t)prod;
630 carry = (uint32_t)(prod >> 32);
631 /* n32[6] = 2³² - 1 */
632 prod = k2 + t32[i+6] + carry;
633 t32[i+6] = (uint32_t)prod;
634 carry = (uint32_t)(prod >> 32);
635 /* n32[7] = 2³² - 1 */
636 prod = k2 + t32[i+7] + carry;
637 t32[i+7] = (uint32_t)prod;
638 carry = (uint32_t)(prod >> 32);
639 /* n32[8] = 2³² - 1 */
640 prod = k2 + t32[i+8] + carry;
641 t32[i+8] = (uint32_t)prod;
642 carry = (uint32_t)(prod >> 32);
643 /* n32[9] = 2³² - 1 */
644 prod = k2 + t32[i+9] + carry;
645 t32[i+9] = (uint32_t)prod;
646 carry = (uint32_t)(prod >> 32);
647 /* n32[10] = 2³² - 1 */
648 prod = k2 + t32[i+10] + carry;
649 t32[i+10] = (uint32_t)prod;
650 carry = (uint32_t)(prod >> 32);
651 /* n32[11] = 2³² - 1 */
652 prod = k2 + t32[i+11] + carry;
653 t32[i+11] = (uint32_t)prod;
654 carry = (uint32_t)(prod >> 32);
655
656 for (j=WORDS_32; carry; j++) {
657 t32[i+j] += carry;
658 carry = t32[i+j] < carry;
659 }
660 }
661
662 for (i=0; i<PREDIV_WORDS_64; i++) {
663 t[i] = ((uint64_t)t32[2*i+1]<<32) + t32[2*i];
664 }
665
666 #elif SYS_BITS == 64
667
668 for (i=0; i<WORDS_64; i++) {
669 unsigned j;
670 uint64_t carry;
671 uint64_t k, k2_lo, k2_hi;
672 uint64_t prod_lo, prod_hi;
673
674 k = t[i] + (t[i] << 32);
675 k2_lo = -k;
676 k2_hi = k - (k!=0);
677
678 /* n[0] = 2³² - 1 */
679 DP_MULT(n[0], k, prod_lo, prod_hi);
680 t[i+0] += prod_lo;
681 prod_hi += t[i+0] < prod_lo;
682 carry = prod_hi;
683 /* n[1] = 2⁶⁴ - 2³² */
684 DP_MULT(n[1], k, prod_lo, prod_hi);
685 prod_lo += carry;
686 prod_hi += prod_lo < carry;
687 t[i+1] += prod_lo;
688 prod_hi += t[i+1] < prod_lo;
689 carry = prod_hi;
690 /* n[2] = 2⁶⁴ - 2 */
691 DP_MULT(n[2], k, prod_lo, prod_hi);
692 prod_lo += carry;
693 prod_hi += prod_lo < carry;
694 t[i+2] += prod_lo;
695 prod_hi += t[i+2] < prod_lo;
696 carry = prod_hi;
697 /* n[3] = 2⁶⁴ - 1 */
698 prod_lo = k2_lo;
699 prod_hi = k2_hi;
700 prod_lo += carry;
701 prod_hi += prod_lo < carry;
702 t[i+3] += prod_lo;
703 prod_hi += t[i+3] < prod_lo;
704 carry = prod_hi;
705 /* n[4] = 2⁶⁴ - 1 */
706 prod_lo = k2_lo;
707 prod_hi = k2_hi;
708 prod_lo += carry;
709 prod_hi += prod_lo < carry;
710 t[i+4] += prod_lo;
711 prod_hi += t[i+4] < prod_lo;
712 carry = prod_hi;
713 /* n[5] = 2⁶⁴ - 1 */
714 prod_lo = k2_lo;
715 prod_hi = k2_hi;
716 prod_lo += carry;
717 prod_hi += prod_lo < carry;
718 t[i+5] += prod_lo;
719 prod_hi += t[i+5] < prod_lo;
720 carry = prod_hi;
721
722 for (j=WORDS_64; carry; j++) {
723 t[i+j] += carry;
724 carry = t[i+j] < carry;
725 }
726 }
727 #else
728 #error You must define the SYS_BITS macro
729 #endif
730
731 assert(t[PREDIV_WORDS_64-1] <= 1); /** MSW **/
732
733 /** Words t[0..WORDS_64-1] have all been set to zero **/
734
735 /** Divide by R and possibly subtract n **/
736 sub(t2, &t[WORDS_64], n, WORDS_64);
737 cond = (unsigned)(t[PREDIV_WORDS_64-1] | (uint64_t)ge(&t[WORDS_64], n, WORDS_64));
738 mont_select(out, t2, &t[WORDS_64], cond, WORDS_64);
739
740 #undef WORDS_64
741 #undef PREDIV_WORDS_64
742 #undef WORDS_32
743 #undef PREDIV_WORDS_32
744 }
745
mont_mult_p521(uint64_t * out,const uint64_t * a,const uint64_t * b,const uint64_t * n,uint64_t m0,uint64_t * tmp,size_t nw)746 STATIC void mont_mult_p521(uint64_t *out, const uint64_t *a, const uint64_t *b, const uint64_t *n, uint64_t m0, uint64_t *tmp, size_t nw)
747 {
748 uint64_t *t, *scratchpad, *s, *tmp1, *tmp2;
749
750 assert(nw == 9);
751 assert(m0 == 1);
752
753 /*
754 * A number in the form:
755 * x*2⁵²¹ + y
756 * is congruent modulo 2⁵²¹-1 to:
757 * x + y
758 */
759
760 /*
761 * tmp is an array of SCRATCHPAD*nw words
762 * We carve out 3 values in it:
763 * - 2*nw words, the value a*b
764 * - 3*nw words, temporary area for computing the product
765 * - nw words, the second term of the addition
766 */
767
768 t = tmp;
769 scratchpad = t + 2*nw;
770 s = scratchpad + 3*nw;
771 tmp1 = scratchpad;
772 tmp2 = scratchpad + nw;
773
774 if (a == b) {
775 square(t, scratchpad, a, 9);
776 } else {
777 product(t, scratchpad, a, b, 9);
778 }
779
780 /* t is a 1042-bit number, occupying 17 words (of the total 18); the MSW (t[16]) only has 18 bits */
781 s[0] = (t[8] >> 9) | (t[9] << 55); t[8] &= 0x1FF;
782 s[1] = (t[9] >> 9) | (t[10] << 55);
783 s[2] = (t[10] >> 9) | (t[11] << 55);
784 s[3] = (t[11] >> 9) | (t[12] << 55);
785 s[4] = (t[12] >> 9) | (t[13] << 55);
786 s[5] = (t[13] >> 9) | (t[14] << 55);
787 s[6] = (t[14] >> 9) | (t[15] << 55);
788 s[7] = (t[15] >> 9) | (t[16] << 55);
789 s[8] = t[16] >> 9;
790
791 add_mod(out, t, s, n, tmp1, tmp2, nw);
792 }
793
794 /* ---- PUBLIC FUNCTIONS ---- */
795
mont_context_free(MontContext * ctx)796 void mont_context_free(MontContext *ctx)
797 {
798 if (NULL == ctx)
799 return;
800 free(ctx->one);
801 free(ctx->r2_mod_n);
802 free(ctx->r_mod_n);
803 free(ctx->modulus);
804 free(ctx->modulus_min_2);
805 free(ctx);
806 }
807
808 /*
809 * Return how many bytes a big endian multi-word number takes in memory.
810 */
mont_bytes(const MontContext * ctx)811 size_t mont_bytes(const MontContext *ctx)
812 {
813 if (NULL == ctx)
814 return 0;
815 return ctx->bytes;
816 }
817
818 /*
819 * Allocate memory for an array of numbers in Montgomery form
820 * and initialize it to 0.
821 *
822 * @param out The location where the address of the newly allocated
823 * array will be placed in.
824 * The caller is responsible for deallocating the memory
825 * using free().
826 * @param count How many numbers the array contains.
827 * @param ctx The Montgomery context.
828 * @return 0 if successful, the relevant error code otherwise.
829 *
830 */
mont_number(uint64_t ** out,unsigned count,const MontContext * ctx)831 int mont_number(uint64_t **out, unsigned count, const MontContext *ctx)
832 {
833 if (NULL == out || NULL == ctx)
834 return ERR_NULL;
835
836 *out = (uint64_t*)calloc(count * ctx->words, sizeof(uint64_t));
837 if (NULL == *out)
838 return ERR_MEMORY;
839
840 return 0;
841 }
842
mont_random_number(uint64_t ** out,unsigned count,uint64_t seed,const MontContext * ctx)843 int mont_random_number(uint64_t **out, unsigned count, uint64_t seed, const MontContext *ctx)
844 {
845 int res;
846 unsigned i;
847 uint64_t *number;
848
849 res = mont_number(out, count, ctx);
850 if (res)
851 return res;
852
853 number = *out;
854 expand_seed(seed, (uint8_t*)number, count * ctx->bytes);
855 for (i=0; i<count; i++, number += ctx->words) {
856 number[ctx->words-1] = 0;
857 }
858 return 0;
859 }
860
861 /*
862 * Transform a big endian-encoded number into Montgomery form, by performing memory allocation.
863 *
864 * @param out The location where the pointer to the newly allocated memory will be put in.
865 * The memory will contain the number encoded in Montgomery form.
866 * The caller is responsible for deallocating the memory.
867 * @param ctx Montgomery context, as created by mont_context_init().
868 * @param number The big endian-encoded number to transform, strictly smaller than the modulus.
869 * @param len The length of the big-endian number in bytes (this may be
870 * smaller than the output of mont_bytes(ctx)).
871 * @return 0 in case of success, the relevant error code otherwise.
872 */
mont_from_bytes(uint64_t ** out,const uint8_t * number,size_t len,const MontContext * ctx)873 int mont_from_bytes(uint64_t **out, const uint8_t *number, size_t len, const MontContext *ctx)
874 {
875 uint64_t *encoded = NULL;
876 uint64_t *tmp1 = NULL;
877 uint64_t *scratchpad = NULL;
878 int res = 0;
879
880 if (NULL == out || NULL == ctx || NULL == number)
881 return ERR_NULL;
882
883 *out = NULL;
884
885 /** Removing leading zeroes but avoid a zero-length string **/
886 if (0 == len)
887 return ERR_NOT_ENOUGH_DATA;
888 while (len>1 && *number==0) {
889 len--;
890 number++;
891 }
892
893 if (ctx->bytes < len)
894 return ERR_VALUE;
895
896 /** The caller will deallocate this memory **/
897 *out = encoded = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
898 if (NULL == encoded)
899 return ERR_MEMORY;
900
901 /** Input number, loaded in words **/
902 tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
903 if (NULL == tmp1) {
904 res = ERR_MEMORY;
905 goto cleanup;
906 }
907 bytes_to_words(tmp1, ctx->words, number, len);
908
909 /** Make sure number<modulus **/
910 if (ge(tmp1, ctx->modulus, ctx->words)) {
911 res = ERR_VALUE;
912 goto cleanup;
913 }
914
915 /** Scratchpad **/
916 scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
917 if (NULL == scratchpad) {
918 res = ERR_MEMORY;
919 goto cleanup;
920 }
921
922 if (ctx->modulus_type != ModulusP521)
923 mont_mult_generic(encoded, tmp1, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
924 else
925 mont_copy(encoded, tmp1, ctx);
926 res = 0;
927
928 cleanup:
929 free(scratchpad);
930 free(tmp1);
931 if (res != 0) {
932 free(encoded);
933 *out = NULL;
934 }
935 return res;
936 }
937
938 /*
939 * Transform a number from Montgomery representation to big endian-encoding.
940 *
941 * @param number The location where the number will be put in, encoded
942 * in big-endian form and with zero padding on the left.
943 * @param len Space allocate at number, at least ctx->modulus_len bytes.
944 * @param ctx The address of the Montgomery context.
945 * @param mont_number The number in Montgomery form to transform.
946 * @return 0 if successful, the relevant error code otherwise.
947 */
mont_to_bytes(uint8_t * number,size_t len,const uint64_t * mont_number,const MontContext * ctx)948 int mont_to_bytes(uint8_t *number, size_t len, const uint64_t* mont_number, const MontContext *ctx)
949 {
950 uint64_t *tmp1 = NULL;
951 uint64_t *scratchpad = NULL;
952 int res;
953
954 if (NULL == number || NULL == ctx || NULL == mont_number)
955 return ERR_NULL;
956
957 if (len < ctx->modulus_len)
958 return ERR_NOT_ENOUGH_DATA;
959
960 /** Number in normal form, but still in words **/
961 tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
962 if (NULL == tmp1)
963 return ERR_MEMORY;
964
965 /** Scratchpad **/
966 scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
967 if (NULL == scratchpad) {
968 free(tmp1);
969 return ERR_MEMORY;
970 }
971
972 if (ctx->modulus_type != ModulusP521)
973 mont_mult_generic(tmp1, mont_number, ctx->one, ctx->modulus, ctx->m0, scratchpad, ctx->words);
974 else
975 mont_copy(tmp1, mont_number, ctx);
976 res = words_to_bytes(number, len, tmp1, ctx->words);
977
978 free(scratchpad);
979 free(tmp1);
980 return res;
981 }
982
983 /*
984 * Add two numbers in Montgomery representation.
985 *
986 * @param out The location where the result will be stored; it must have been created with mont_number(&p,1,ctx).
987 * @param a The first term.
988 * @param b The second term.
989 * @param tmp Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
990 * @param ctx The Montgomery context.
991 * @return 0 for success, the relevant error code otherwise.
992 */
mont_add(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)993 int mont_add(uint64_t* out, const uint64_t* a, const uint64_t* b, uint64_t *tmp, const MontContext *ctx)
994 {
995 if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
996 return ERR_NULL;
997 add_mod(out, a, b, ctx->modulus, tmp, tmp + ctx->words, ctx->words);
998 return 0;
999 }
1000
1001 /*
1002 * Multiply two numbers in Montgomery representation.
1003 *
1004 * @param out The location where the result will be stored at; it must have been created with mont_number(&p,1,ctx)
1005 * @param a The first term.
1006 * @param b The second term.
1007 * @param tmp Temporary, internal result; it must have been created with mont_number(&p,SCRATCHPAD_NR,ctx).
1008 * @param ctx The Montgomery context.
1009 * @return 0 for success, the relevant error code otherwise.
1010 */
mont_mult(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)1011 int mont_mult(uint64_t* out, const uint64_t* a, const uint64_t *b, uint64_t *tmp, const MontContext *ctx)
1012 {
1013 if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
1014 return ERR_NULL;
1015
1016 switch (ctx->modulus_type) {
1017 case ModulusP256:
1018 mont_mult_p256(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1019 break;
1020 case ModulusP384:
1021 mont_mult_p384(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1022 break;
1023 case ModulusP521:
1024 mont_mult_p521(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1025 break;
1026 case ModulusGeneric:
1027 mont_mult_generic(out, a, b, ctx->modulus, ctx->m0, tmp, ctx->words);
1028 break;
1029 }
1030
1031 return 0;
1032 }
1033
1034 /*
1035 * Subtract integer b from a.
1036 *
1037 * @param out The location where the result is stored at; it must have been created with mont_number(&p,1,ctx).
1038 * It can be the same as either a or b.
1039 * @param a The number it will be subtracted from.
1040 * @param b The number to subtract.
1041 * @param tmp Temporary, internal result; it must have been created with mont_number(&p,2,ctx).
1042 * @param ctx The Montgomery context.
1043 * @return 0 for success, the relevant error code otherwise.
1044 */
mont_sub(uint64_t * out,const uint64_t * a,const uint64_t * b,uint64_t * tmp,const MontContext * ctx)1045 int mont_sub(uint64_t *out, const uint64_t *a, const uint64_t *b, uint64_t *tmp, const MontContext *ctx)
1046 {
1047 unsigned i;
1048 unsigned carry, borrow1 , borrow2;
1049 uint64_t *scratchpad;
1050
1051 if (NULL == out || NULL == a || NULL == b || NULL == tmp || NULL == ctx)
1052 return ERR_NULL;
1053
1054 scratchpad = tmp + ctx->words;
1055
1056 /*
1057 * Compute difference in tmp[], and add modulus[]
1058 * to tmp[] into scratchpad[].
1059 */
1060 borrow2 = 0;
1061 carry = 0;
1062 for (i=0; i<ctx->words; i++) {
1063 borrow1 = b[i] > a[i];
1064 tmp[i] = a[i] - b[i];
1065 borrow1 |= borrow2 > tmp[i];
1066 tmp[i] -= borrow2;
1067 borrow2 = borrow1;
1068
1069 scratchpad[i] = tmp[i] + carry;
1070 carry = scratchpad[i] < carry;
1071 scratchpad[i] += ctx->modulus[i];
1072 carry += scratchpad[i] < ctx->modulus[i];
1073 }
1074
1075 /*
1076 * If there is no borrow, tmp[] is smaller than modulus.
1077 */
1078 mont_select(out, scratchpad, tmp, borrow2, ctx->words);
1079
1080 return 0;
1081 }
1082
1083 /*
1084 * Compute the modular inverse of an integer in Montgomery form.
1085 *
1086 * Condition: the modulus defining the Montgomery context MUST BE a non-secret prime number.
1087 *
1088 * @param out The location where the result will be stored at; it must have
1089 * been allocated with mont_number(&p, 1, ctx).
1090 * @param a The number to compute the modular inverse of, already in Montgomery form.
1091 * @param ctx The Montgomery context.
1092 * @return 0 for success, the relevant error code otherwise.
1093 */
mont_inv_prime(uint64_t * out,uint64_t * a,const MontContext * ctx)1094 int mont_inv_prime(uint64_t *out, uint64_t *a, const MontContext *ctx)
1095 {
1096 unsigned idx_word;
1097 uint64_t bit;
1098 uint64_t *tmp1 = NULL;
1099 uint64_t *scratchpad = NULL;
1100 uint64_t *exponent = NULL;
1101 int res;
1102
1103 if (NULL == out || NULL == a || NULL == ctx)
1104 return ERR_NULL;
1105
1106 tmp1 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1107 if (NULL == tmp1)
1108 return ERR_MEMORY;
1109
1110 scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1111 if (NULL == scratchpad) {
1112 res = ERR_MEMORY;
1113 goto cleanup;
1114 }
1115
1116 /** Exponent is guaranteed to be >0 **/
1117 exponent = ctx->modulus_min_2;
1118
1119 /* Find most significant bit */
1120 idx_word = ctx->words-1;
1121 for (;;) {
1122 if (exponent[idx_word] != 0)
1123 break;
1124 if (idx_word-- == 0)
1125 break;
1126 }
1127 for (bit = (uint64_t)1U << 63; 0 == (exponent[idx_word] & bit); bit>>=1);
1128
1129 /* Start from 1 (in Montgomery form, which is R mod N) */
1130 memcpy(out, ctx->r_mod_n, ctx->bytes);
1131
1132 /** Left-to-right exponentiation **/
1133 for (;;) {
1134 while (bit > 0) {
1135 mont_mult(tmp1, out, out, scratchpad, ctx);
1136 if (exponent[idx_word] & bit) {
1137 mont_mult(out, tmp1, a, scratchpad, ctx);
1138 } else {
1139 memcpy(out, tmp1, ctx->bytes);
1140 }
1141 bit >>= 1;
1142 }
1143 if (idx_word-- == 0)
1144 break;
1145 bit = (uint64_t)1 << 63;
1146 }
1147 res = 0;
1148
1149 cleanup:
1150 free(tmp1);
1151 free(scratchpad);
1152 return res;
1153 }
1154
1155 /*
1156 * Assign a value to a number in Montgomer form.
1157 *
1158 * @param out The location where the result is stored at; it must have been created with mont_number(&p,1,ctx).
1159 * @param x The value to set.
1160 * @param ctx The Montgomery context.
1161 * @return 0 for success, the relevant error code otherwise.
1162 */
mont_set(uint64_t * out,uint64_t x,const MontContext * ctx)1163 int mont_set(uint64_t *out, uint64_t x, const MontContext *ctx)
1164 {
1165 uint64_t *tmp, *scratchpad;
1166
1167 if (NULL == out || NULL == ctx)
1168 return ERR_NULL;
1169
1170 if (x == 0) {
1171 memset(out, 0, ctx->bytes);
1172 return 0;
1173 }
1174 if (x == 1) {
1175 mont_copy(out, ctx->r_mod_n, ctx);
1176 return 0;
1177 }
1178
1179 tmp = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1180 if (NULL == tmp)
1181 return ERR_MEMORY;
1182 tmp[0] = x;
1183
1184 scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1185 if (NULL == scratchpad) {
1186 free(tmp);
1187 return ERR_MEMORY;
1188 }
1189
1190 if (ctx->modulus_type != ModulusP521)
1191 mont_mult_generic(out, tmp, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
1192 else
1193 mont_copy(out, tmp, ctx);
1194
1195 free(tmp);
1196 free(scratchpad);
1197
1198 return 0;
1199 }
1200
cmp_modulus(const uint8_t * mod1,size_t mod1_len,const uint8_t * mod2,size_t mod2_len)1201 static int cmp_modulus(const uint8_t *mod1, size_t mod1_len, const uint8_t *mod2, size_t mod2_len)
1202 {
1203 size_t diff;
1204
1205 if (mod1_len > mod2_len) {
1206 diff = mod1_len - mod2_len;
1207 if (0 != memcmp(mod1+diff, mod2, mod2_len))
1208 return -1;
1209 if (NULL != memchr_not(mod1, 0, diff))
1210 return -1;
1211 } else {
1212 diff = mod2_len - mod1_len;
1213 if (0 != memcmp(mod2+diff, mod1, mod1_len))
1214 return -1;
1215 if (NULL != memchr_not(mod2, 0, diff))
1216 return -1;
1217 }
1218 return 0;
1219 }
1220
1221 /*
1222 * Create a new context for the Montgomery and the given odd modulus.
1223 *
1224 * @param out The locate where the pointer to the newly allocated data will be stored at.
1225 * The memory will contain the new Montgomery context.
1226 * @param modulus The modulus encoded in big endian form.
1227 * @param mod_len The length of the modulus in bytes.
1228 * @return 0 for success, the appropriate code otherwise.
1229 */
mont_context_init(MontContext ** out,const uint8_t * modulus,size_t mod_len)1230 int mont_context_init(MontContext **out, const uint8_t *modulus, size_t mod_len)
1231 {
1232 const uint8_t p256_mod[32] = "\xff\xff\xff\xff\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
1233 const uint8_t p384_mod[48] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff";
1234 const uint8_t p521_mod[66] = "\x01\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
1235 uint64_t *scratchpad = NULL;
1236 MontContext *ctx;
1237 int res;
1238
1239 if (NULL == out || NULL == modulus)
1240 return ERR_NULL;
1241
1242 /** Consume leading zeros **/
1243 while (mod_len>0 && *modulus==0) {
1244 modulus++;
1245 mod_len--;
1246 }
1247 if (0 == mod_len)
1248 return ERR_MODULUS;
1249
1250 /** Ensure modulus is odd and at least 3, otherwise we can't compute its inverse over B **/
1251 if (is_even(modulus[mod_len-1]))
1252 return ERR_MODULUS;
1253 if (mod_len==1 && modulus[0]==1)
1254 return ERR_MODULUS;
1255
1256 *out = ctx = (MontContext*)calloc(1, sizeof(MontContext));
1257 if (NULL == ctx)
1258 return ERR_MEMORY;
1259
1260 /* Check if the modulus has a special form */
1261 /* For P-521, modulo reduction is very simple so the Montgomery
1262 * representation is not actually used.
1263 */
1264 ctx->modulus_type = ModulusGeneric;
1265 switch (mod_len) {
1266 case sizeof(p256_mod):
1267 if (0 == cmp_modulus(modulus, mod_len, p256_mod, sizeof(p256_mod))) {
1268 ctx->modulus_type = ModulusP256;
1269 }
1270 break;
1271 case sizeof(p384_mod):
1272 if (0 == cmp_modulus(modulus, mod_len, p384_mod, sizeof(p384_mod))) {
1273 ctx->modulus_type = ModulusP384;
1274 }
1275 break;
1276 case sizeof(p521_mod):
1277 if (0 == cmp_modulus(modulus, mod_len, p521_mod, sizeof(p521_mod))) {
1278 ctx->modulus_type = ModulusP521;
1279 }
1280 break;
1281 }
1282
1283 ctx->words = ((unsigned)mod_len + 7) / 8;
1284 ctx->bytes = (unsigned)(ctx->words * sizeof(uint64_t));
1285 ctx->modulus_len = (unsigned)mod_len;
1286
1287 /** Load modulus N **/
1288 ctx->modulus = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1289 if (0 == ctx->modulus) {
1290 res = ERR_MEMORY;
1291 goto cleanup;
1292 }
1293 bytes_to_words(ctx->modulus, ctx->words, modulus, mod_len);
1294
1295 /** Prepare 1 **/
1296 ctx->one = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1297 if (NULL == ctx->one) {
1298 res = ERR_MEMORY;
1299 goto cleanup;
1300 }
1301 ctx->one[0] = 1;
1302
1303 /** Pre-compute R² mod N **/
1304 /** Pre-compute -n[0]⁻¹ mod R **/
1305 ctx->r2_mod_n = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1306 if (0 == ctx->r2_mod_n) {
1307 res = ERR_MEMORY;
1308 goto cleanup;
1309 }
1310 if (ctx->modulus_type != ModulusP521) {
1311 rsquare(ctx->r2_mod_n, ctx->modulus, ctx->words);
1312 ctx->m0 = inverse64(~ctx->modulus[0]+1);
1313 } else {
1314 memcpy(ctx->r2_mod_n, ctx->one, ctx->words * sizeof(uint64_t));
1315 ctx->m0 = 1U;
1316 }
1317
1318 /** Pre-compute R mod N **/
1319 ctx->r_mod_n = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1320 if (NULL == ctx->r_mod_n) {
1321 res = ERR_MEMORY;
1322 goto cleanup;
1323 }
1324 scratchpad = (uint64_t*)calloc(SCRATCHPAD_NR, ctx->words*sizeof(uint64_t));
1325 if (NULL == scratchpad) {
1326 res = ERR_MEMORY;
1327 goto cleanup;
1328 }
1329 if (ctx->modulus_type != ModulusP521)
1330 mont_mult_generic(ctx->r_mod_n, ctx->one, ctx->r2_mod_n, ctx->modulus, ctx->m0, scratchpad, ctx->words);
1331 else
1332 memcpy(ctx->r_mod_n, ctx->one, ctx->words * sizeof(uint64_t));
1333
1334 /** Pre-compute modulus - 2 **/
1335 /** Modulus is guaranteed to be >= 3 **/
1336 ctx->modulus_min_2 = (uint64_t*)calloc(ctx->words, sizeof(uint64_t));
1337 if (NULL == ctx->modulus_min_2) {
1338 res = ERR_MEMORY;
1339 goto cleanup;
1340 }
1341 sub(ctx->modulus_min_2, ctx->modulus, ctx->one, ctx->words);
1342 sub(ctx->modulus_min_2, ctx->modulus_min_2, ctx->one, ctx->words);
1343
1344 res = 0;
1345
1346 cleanup:
1347 free(scratchpad);
1348 if (res != 0) {
1349 mont_context_free(ctx);
1350 }
1351 return res;
1352 }
1353
mont_is_zero(const uint64_t * a,const MontContext * ctx)1354 int mont_is_zero(const uint64_t *a, const MontContext *ctx)
1355 {
1356 unsigned i;
1357 uint64_t sum = 0;
1358
1359 if (NULL == a || NULL == ctx)
1360 return -1;
1361
1362 for (i=0; i<ctx->words; i++) {
1363 sum |= *a++;
1364 }
1365
1366 return (sum == 0);
1367 }
1368
mont_is_one(const uint64_t * a,const MontContext * ctx)1369 int mont_is_one(const uint64_t *a, const MontContext *ctx)
1370 {
1371 unsigned i;
1372 uint64_t sum = 0;
1373
1374 if (NULL == a || NULL == ctx)
1375 return -1;
1376
1377 for (i=0; i<ctx->words; i++) {
1378 sum |= a[i] ^ ctx->r_mod_n[i];
1379 }
1380
1381 return (sum == 0);
1382 }
1383
mont_is_equal(const uint64_t * a,const uint64_t * b,const MontContext * ctx)1384 int mont_is_equal(const uint64_t *a, const uint64_t *b, const MontContext *ctx)
1385 {
1386 unsigned i;
1387 uint64_t result = 0;
1388
1389 if (NULL == a || NULL == b || NULL == ctx)
1390 return -1;
1391
1392 for (i=0; i<ctx->words; i++) {
1393 result |= *a++ ^ *b++;
1394 }
1395
1396 return (result == 0);
1397 }
1398
mont_copy(uint64_t * out,const uint64_t * a,const MontContext * ctx)1399 int mont_copy(uint64_t *out, const uint64_t *a, const MontContext *ctx)
1400 {
1401 unsigned i;
1402
1403 if (NULL == out || NULL == a || NULL == ctx)
1404 return ERR_NULL;
1405
1406 for (i=0; i<ctx->words; i++) {
1407 *out++ = *a++;
1408 }
1409
1410 return 0;
1411 }
1412