1 /********************************************************************************************
2 * SIDH: an efficient supersingular isogeny cryptography library
3 *
4 * Abstract: modular arithmetic optimized for x64 platforms for P503
5 *********************************************************************************************/
6
7 #include "../P503_internal.h"
8 #include "../../internal.h"
9
10 /* OQS note: this file is #include'd with the defs of these consts; removed to avoid re-defs
11 // Global constants
12 extern const uint64_t p503[NWORDS_FIELD];
13 extern const uint64_t p503p1[NWORDS_FIELD];
14 extern const uint64_t p503x2[NWORDS_FIELD];
15 extern const uint64_t p503x4[NWORDS_FIELD];
16 */
17
mp_sub503_p2(const digit_t * a,const digit_t * b,digit_t * c)18 __inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
19 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.
20 #if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
21 unsigned int i, borrow = 0;
22
23 for (i = 0; i < NWORDS_FIELD; i++) {
24 SUBC(borrow, a[i], b[i], borrow, c[i]);
25 }
26
27 borrow = 0;
28 for (i = 0; i < NWORDS_FIELD; i++) {
29 ADDC(borrow, c[i], ((digit_t*)p503x2)[i], borrow, c[i]);
30 }
31
32 #elif (OS_TARGET == OS_NIX)
33
34 oqs_kem_sike_mp_sub503_p2_asm(a, b, c);
35
36 #endif
37 }
38
39
mp_sub503_p4(const digit_t * a,const digit_t * b,digit_t * c)40 __inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
41 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.
42 #if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
43 unsigned int i, borrow = 0;
44
45 for (i = 0; i < NWORDS_FIELD; i++) {
46 SUBC(borrow, a[i], b[i], borrow, c[i]);
47 }
48
49 borrow = 0;
50 for (i = 0; i < NWORDS_FIELD; i++) {
51 ADDC(borrow, c[i], ((digit_t*)p503x4)[i], borrow, c[i]);
52 }
53
54 #elif (OS_TARGET == OS_NIX)
55
56 oqs_kem_sike_mp_sub503_p4_asm(a, b, c);
57
58 #endif
59 }
60
fpadd503(const digit_t * a,const digit_t * b,digit_t * c)61 __inline void fpadd503(const digit_t *a, const digit_t *b, digit_t *c) { // Modular addition, c = a+b mod p503.
62 // Inputs: a, b in [0, 2*p503-1]
63 // Output: c in [0, 2*p503-1]
64
65 #if (OS_TARGET == OS_WIN)
66 unsigned int i, carry = 0;
67 digit_t mask;
68
69 for (i = 0; i < NWORDS_FIELD; i++) {
70 ADDC(carry, a[i], b[i], carry, c[i]);
71 }
72
73 carry = 0;
74 for (i = 0; i < NWORDS_FIELD; i++) {
75 SUBC(carry, c[i], ((digit_t *) p503x2)[i], carry, c[i]);
76 }
77 mask = 0 - (digit_t) carry;
78
79 carry = 0;
80 for (i = 0; i < NWORDS_FIELD; i++) {
81 ADDC(carry, c[i], ((digit_t *) p503x2)[i] & mask, carry, c[i]);
82 }
83
84 #elif (OS_TARGET == OS_NIX)
85
86 oqs_kem_sike_fpadd503_asm(a, b, c);
87
88 #endif
89 }
90
fpsub503(const digit_t * a,const digit_t * b,digit_t * c)91 __inline void fpsub503(const digit_t *a, const digit_t *b, digit_t *c) { // Modular subtraction, c = a-b mod p503.
92 // Inputs: a, b in [0, 2*p503-1]
93 // Output: c in [0, 2*p503-1]
94
95 #if (OS_TARGET == OS_WIN)
96 unsigned int i, borrow = 0;
97 digit_t mask;
98
99 for (i = 0; i < NWORDS_FIELD; i++) {
100 SUBC(borrow, a[i], b[i], borrow, c[i]);
101 }
102 mask = 0 - (digit_t) borrow;
103
104 borrow = 0;
105 for (i = 0; i < NWORDS_FIELD; i++) {
106 ADDC(borrow, c[i], ((digit_t *) p503x2)[i] & mask, borrow, c[i]);
107 }
108
109 #elif (OS_TARGET == OS_NIX)
110
111 oqs_kem_sike_fpsub503_asm(a, b, c);
112
113 #endif
114 }
115
fpneg503(digit_t * a)116 __inline void fpneg503(digit_t *a) { // Modular negation, a = -a mod p503.
117 // Input/output: a in [0, 2*p503-1]
118 unsigned int i, borrow = 0;
119
120 for (i = 0; i < NWORDS_FIELD; i++) {
121 SUBC(borrow, ((digit_t *) p503x2)[i], a[i], borrow, a[i]);
122 }
123 }
124
fpdiv2_503(const digit_t * a,digit_t * c)125 void fpdiv2_503(const digit_t *a, digit_t *c) { // Modular division by two, c = a/2 mod p503.
126 // Input : a in [0, 2*p503-1]
127 // Output: c in [0, 2*p503-1]
128 unsigned int i, carry = 0;
129 digit_t mask;
130
131 mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503
132 for (i = 0; i < NWORDS_FIELD; i++) {
133 ADDC(carry, a[i], ((digit_t *) p503)[i] & mask, carry, c[i]);
134 }
135
136 mp_shiftr1(c, NWORDS_FIELD);
137 }
138
fpcorrection503(digit_t * a)139 void fpcorrection503(digit_t *a) { // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
140 unsigned int i, borrow = 0;
141 digit_t mask;
142
143 for (i = 0; i < NWORDS_FIELD; i++) {
144 SUBC(borrow, a[i], ((digit_t *) p503)[i], borrow, a[i]);
145 }
146 mask = 0 - (digit_t) borrow;
147
148 borrow = 0;
149 for (i = 0; i < NWORDS_FIELD; i++) {
150 ADDC(borrow, a[i], ((digit_t *) p503)[i] & mask, borrow, a[i]);
151 }
152 }
153
mp_mul(const digit_t * a,const digit_t * b,digit_t * c,const unsigned int nwords)154 void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
155
156 UNREFERENCED_PARAMETER(nwords);
157
158 #if (OS_TARGET == OS_WIN)
159 digit_t t = 0;
160 uint128_t uv = {0};
161 unsigned int carry = 0;
162
163 MULADD128(a[0], b[0], uv, carry, uv);
164 t += carry;
165 c[0] = uv[0];
166 uv[0] = uv[1];
167 uv[1] = t;
168 t = 0;
169
170 MULADD128(a[0], b[1], uv, carry, uv);
171 t += carry;
172 MULADD128(a[1], b[0], uv, carry, uv);
173 t += carry;
174 c[1] = uv[0];
175 uv[0] = uv[1];
176 uv[1] = t;
177 t = 0;
178
179 MULADD128(a[0], b[2], uv, carry, uv);
180 t += carry;
181 MULADD128(a[1], b[1], uv, carry, uv);
182 t += carry;
183 MULADD128(a[2], b[0], uv, carry, uv);
184 t += carry;
185 c[2] = uv[0];
186 uv[0] = uv[1];
187 uv[1] = t;
188 t = 0;
189
190 MULADD128(a[0], b[3], uv, carry, uv);
191 t += carry;
192 MULADD128(a[2], b[1], uv, carry, uv);
193 t += carry;
194 MULADD128(a[1], b[2], uv, carry, uv);
195 t += carry;
196 MULADD128(a[3], b[0], uv, carry, uv);
197 t += carry;
198 c[3] = uv[0];
199 uv[0] = uv[1];
200 uv[1] = t;
201 t = 0;
202
203 MULADD128(a[0], b[4], uv, carry, uv);
204 t += carry;
205 MULADD128(a[3], b[1], uv, carry, uv);
206 t += carry;
207 MULADD128(a[2], b[2], uv, carry, uv);
208 t += carry;
209 MULADD128(a[1], b[3], uv, carry, uv);
210 t += carry;
211 MULADD128(a[4], b[0], uv, carry, uv);
212 t += carry;
213 c[4] = uv[0];
214 uv[0] = uv[1];
215 uv[1] = t;
216 t = 0;
217
218 MULADD128(a[0], b[5], uv, carry, uv);
219 t += carry;
220 MULADD128(a[4], b[1], uv, carry, uv);
221 t += carry;
222 MULADD128(a[3], b[2], uv, carry, uv);
223 t += carry;
224 MULADD128(a[2], b[3], uv, carry, uv);
225 t += carry;
226 MULADD128(a[1], b[4], uv, carry, uv);
227 t += carry;
228 MULADD128(a[5], b[0], uv, carry, uv);
229 t += carry;
230 c[5] = uv[0];
231 uv[0] = uv[1];
232 uv[1] = t;
233 t = 0;
234
235 MULADD128(a[0], b[6], uv, carry, uv);
236 t += carry;
237 MULADD128(a[5], b[1], uv, carry, uv);
238 t += carry;
239 MULADD128(a[4], b[2], uv, carry, uv);
240 t += carry;
241 MULADD128(a[3], b[3], uv, carry, uv);
242 t += carry;
243 MULADD128(a[2], b[4], uv, carry, uv);
244 t += carry;
245 MULADD128(a[1], b[5], uv, carry, uv);
246 t += carry;
247 MULADD128(a[6], b[0], uv, carry, uv);
248 t += carry;
249 c[6] = uv[0];
250 uv[0] = uv[1];
251 uv[1] = t;
252 t = 0;
253
254 MULADD128(a[0], b[7], uv, carry, uv);
255 t += carry;
256 MULADD128(a[6], b[1], uv, carry, uv);
257 t += carry;
258 MULADD128(a[5], b[2], uv, carry, uv);
259 t += carry;
260 MULADD128(a[4], b[3], uv, carry, uv);
261 t += carry;
262 MULADD128(a[3], b[4], uv, carry, uv);
263 t += carry;
264 MULADD128(a[2], b[5], uv, carry, uv);
265 t += carry;
266 MULADD128(a[1], b[6], uv, carry, uv);
267 t += carry;
268 MULADD128(a[7], b[0], uv, carry, uv);
269 t += carry;
270 c[7] = uv[0];
271 uv[0] = uv[1];
272 uv[1] = t;
273 t = 0;
274
275 MULADD128(a[7], b[1], uv, carry, uv);
276 t += carry;
277 MULADD128(a[6], b[2], uv, carry, uv);
278 t += carry;
279 MULADD128(a[5], b[3], uv, carry, uv);
280 t += carry;
281 MULADD128(a[4], b[4], uv, carry, uv);
282 t += carry;
283 MULADD128(a[3], b[5], uv, carry, uv);
284 t += carry;
285 MULADD128(a[2], b[6], uv, carry, uv);
286 t += carry;
287 MULADD128(a[1], b[7], uv, carry, uv);
288 t += carry;
289 c[8] = uv[0];
290 uv[0] = uv[1];
291 uv[1] = t;
292 t = 0;
293
294 MULADD128(a[7], b[2], uv, carry, uv);
295 t += carry;
296 MULADD128(a[6], b[3], uv, carry, uv);
297 t += carry;
298 MULADD128(a[5], b[4], uv, carry, uv);
299 t += carry;
300 MULADD128(a[4], b[5], uv, carry, uv);
301 t += carry;
302 MULADD128(a[3], b[6], uv, carry, uv);
303 t += carry;
304 MULADD128(a[2], b[7], uv, carry, uv);
305 t += carry;
306 c[9] = uv[0];
307 uv[0] = uv[1];
308 uv[1] = t;
309 t = 0;
310
311 MULADD128(a[7], b[3], uv, carry, uv);
312 t += carry;
313 MULADD128(a[6], b[4], uv, carry, uv);
314 t += carry;
315 MULADD128(a[5], b[5], uv, carry, uv);
316 t += carry;
317 MULADD128(a[4], b[6], uv, carry, uv);
318 t += carry;
319 MULADD128(a[3], b[7], uv, carry, uv);
320 t += carry;
321 c[10] = uv[0];
322 uv[0] = uv[1];
323 uv[1] = t;
324 t = 0;
325
326 MULADD128(a[7], b[4], uv, carry, uv);
327 t += carry;
328 MULADD128(a[6], b[5], uv, carry, uv);
329 t += carry;
330 MULADD128(a[5], b[6], uv, carry, uv);
331 t += carry;
332 MULADD128(a[4], b[7], uv, carry, uv);
333 t += carry;
334 c[11] = uv[0];
335 uv[0] = uv[1];
336 uv[1] = t;
337 t = 0;
338
339 MULADD128(a[7], b[5], uv, carry, uv);
340 t += carry;
341 MULADD128(a[6], b[6], uv, carry, uv);
342 t += carry;
343 MULADD128(a[5], b[7], uv, carry, uv);
344 t += carry;
345 c[12] = uv[0];
346 uv[0] = uv[1];
347 uv[1] = t;
348 t = 0;
349
350 MULADD128(a[7], b[6], uv, carry, uv);
351 t += carry;
352 MULADD128(a[6], b[7], uv, carry, uv);
353 t += carry;
354 c[13] = uv[0];
355 uv[0] = uv[1];
356 uv[1] = t;
357
358 MULADD128(a[7], b[7], uv, carry, uv);
359 c[14] = uv[0];
360 c[15] = uv[1];
361
362 #elif (OS_TARGET == OS_NIX)
363
364 oqs_kem_sike_mul503_asm(a, b, c);
365
366 #endif
367 }
368
rdc_mont(digit_t * ma,digit_t * mc)369 void rdc_mont(digit_t *ma, digit_t *mc) { // Montgomery reduction exploiting special form of the prime.
370 // mc = ma*R^-1 mod p503x2, where R = 2^512.
371 // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
372 // ma is assumed to be in Montgomery representation.
373
374 #if (OS_TARGET == OS_WIN)
375 unsigned int carry;
376 digit_t t = 0;
377 uint128_t uv = {0};
378
379 mc[0] = ma[0];
380 mc[1] = ma[1];
381 mc[2] = ma[2];
382 MUL128(mc[0], ((digit_t *) p503p1)[3], uv);
383 ADDC(0, uv[0], ma[3], carry, uv[0]);
384 ADDC(carry, uv[1], 0, carry, uv[1]);
385 mc[3] = uv[0];
386 uv[0] = uv[1];
387 uv[1] = 0;
388
389 MULADD128(mc[0], ((digit_t *) p503p1)[4], uv, carry, uv);
390 MULADD128(mc[1], ((digit_t *) p503p1)[3], uv, carry, uv);
391 t += carry;
392 ADDC(0, uv[0], ma[4], carry, uv[0]);
393 ADDC(carry, uv[1], 0, carry, uv[1]);
394 t += carry;
395 mc[4] = uv[0];
396 uv[0] = uv[1];
397 uv[1] = t;
398 t = 0;
399
400 MULADD128(mc[0], ((digit_t *) p503p1)[5], uv, carry, uv);
401 t += carry;
402 MULADD128(mc[1], ((digit_t *) p503p1)[4], uv, carry, uv);
403 t += carry;
404 MULADD128(mc[2], ((digit_t *) p503p1)[3], uv, carry, uv);
405 t += carry;
406 ADDC(0, uv[0], ma[5], carry, uv[0]);
407 ADDC(carry, uv[1], 0, carry, uv[1]);
408 t += carry;
409 mc[5] = uv[0];
410 uv[0] = uv[1];
411 uv[1] = t;
412 t = 0;
413
414 MULADD128(mc[0], ((digit_t *) p503p1)[6], uv, carry, uv);
415 t += carry;
416 MULADD128(mc[1], ((digit_t *) p503p1)[5], uv, carry, uv);
417 t += carry;
418 MULADD128(mc[2], ((digit_t *) p503p1)[4], uv, carry, uv);
419 t += carry;
420 MULADD128(mc[3], ((digit_t *) p503p1)[3], uv, carry, uv);
421 t += carry;
422 ADDC(0, uv[0], ma[6], carry, uv[0]);
423 ADDC(carry, uv[1], 0, carry, uv[1]);
424 t += carry;
425 mc[6] = uv[0];
426 uv[0] = uv[1];
427 uv[1] = t;
428 t = 0;
429
430 MULADD128(mc[0], ((digit_t *) p503p1)[7], uv, carry, uv);
431 t += carry;
432 MULADD128(mc[1], ((digit_t *) p503p1)[6], uv, carry, uv);
433 t += carry;
434 MULADD128(mc[2], ((digit_t *) p503p1)[5], uv, carry, uv);
435 t += carry;
436 MULADD128(mc[3], ((digit_t *) p503p1)[4], uv, carry, uv);
437 t += carry;
438 MULADD128(mc[4], ((digit_t *) p503p1)[3], uv, carry, uv);
439 t += carry;
440 ADDC(0, uv[0], ma[7], carry, uv[0]);
441 ADDC(carry, uv[1], 0, carry, uv[1]);
442 t += carry;
443 mc[7] = uv[0];
444 uv[0] = uv[1];
445 uv[1] = t;
446 t = 0;
447
448 MULADD128(mc[1], ((digit_t *) p503p1)[7], uv, carry, uv);
449 t += carry;
450 MULADD128(mc[2], ((digit_t *) p503p1)[6], uv, carry, uv);
451 t += carry;
452 MULADD128(mc[3], ((digit_t *) p503p1)[5], uv, carry, uv);
453 t += carry;
454 MULADD128(mc[4], ((digit_t *) p503p1)[4], uv, carry, uv);
455 t += carry;
456 MULADD128(mc[5], ((digit_t *) p503p1)[3], uv, carry, uv);
457 t += carry;
458 ADDC(0, uv[0], ma[8], carry, uv[0]);
459 ADDC(carry, uv[1], 0, carry, uv[1]);
460 t += carry;
461 mc[0] = uv[0];
462 uv[0] = uv[1];
463 uv[1] = t;
464 t = 0;
465
466 MULADD128(mc[2], ((digit_t *) p503p1)[7], uv, carry, uv);
467 t += carry;
468 MULADD128(mc[3], ((digit_t *) p503p1)[6], uv, carry, uv);
469 t += carry;
470 MULADD128(mc[4], ((digit_t *) p503p1)[5], uv, carry, uv);
471 t += carry;
472 MULADD128(mc[5], ((digit_t *) p503p1)[4], uv, carry, uv);
473 t += carry;
474 MULADD128(mc[6], ((digit_t *) p503p1)[3], uv, carry, uv);
475 t += carry;
476 ADDC(0, uv[0], ma[9], carry, uv[0]);
477 ADDC(carry, uv[1], 0, carry, uv[1]);
478 t += carry;
479 mc[1] = uv[0];
480 uv[0] = uv[1];
481 uv[1] = t;
482 t = 0;
483
484 MULADD128(mc[3], ((digit_t *) p503p1)[7], uv, carry, uv);
485 t += carry;
486 MULADD128(mc[4], ((digit_t *) p503p1)[6], uv, carry, uv);
487 t += carry;
488 MULADD128(mc[5], ((digit_t *) p503p1)[5], uv, carry, uv);
489 t += carry;
490 MULADD128(mc[6], ((digit_t *) p503p1)[4], uv, carry, uv);
491 t += carry;
492 MULADD128(mc[7], ((digit_t *) p503p1)[3], uv, carry, uv);
493 t += carry;
494 ADDC(0, uv[0], ma[10], carry, uv[0]);
495 ADDC(carry, uv[1], 0, carry, uv[1]);
496 t += carry;
497 mc[2] = uv[0];
498 uv[0] = uv[1];
499 uv[1] = t;
500 t = 0;
501
502 MULADD128(mc[4], ((digit_t *) p503p1)[7], uv, carry, uv);
503 t += carry;
504 MULADD128(mc[5], ((digit_t *) p503p1)[6], uv, carry, uv);
505 t += carry;
506 MULADD128(mc[6], ((digit_t *) p503p1)[5], uv, carry, uv);
507 t += carry;
508 MULADD128(mc[7], ((digit_t *) p503p1)[4], uv, carry, uv);
509 t += carry;
510 ADDC(0, uv[0], ma[11], carry, uv[0]);
511 ADDC(carry, uv[1], 0, carry, uv[1]);
512 t += carry;
513 mc[3] = uv[0];
514 uv[0] = uv[1];
515 uv[1] = t;
516 t = 0;
517
518 MULADD128(mc[5], ((digit_t *) p503p1)[7], uv, carry, uv);
519 t += carry;
520 MULADD128(mc[6], ((digit_t *) p503p1)[6], uv, carry, uv);
521 t += carry;
522 MULADD128(mc[7], ((digit_t *) p503p1)[5], uv, carry, uv);
523 t += carry;
524 ADDC(0, uv[0], ma[12], carry, uv[0]);
525 ADDC(carry, uv[1], 0, carry, uv[1]);
526 t += carry;
527 mc[4] = uv[0];
528 uv[0] = uv[1];
529 uv[1] = t;
530 t = 0;
531
532 MULADD128(mc[6], ((digit_t *) p503p1)[7], uv, carry, uv);
533 t += carry;
534 MULADD128(mc[7], ((digit_t *) p503p1)[6], uv, carry, uv);
535 t += carry;
536 ADDC(0, uv[0], ma[13], carry, uv[0]);
537 ADDC(carry, uv[1], 0, carry, uv[1]);
538 t += carry;
539 mc[5] = uv[0];
540 uv[0] = uv[1];
541 uv[1] = t;
542 t = 0;
543
544 MULADD128(mc[7], ((digit_t *) p503p1)[7], uv, carry, uv);
545 t += carry;
546 ADDC(0, uv[0], ma[14], carry, mc[6]);
547 ADDC(carry, uv[1], 0, carry, uv[1]);
548 ADDC(0, uv[1], ma[15], carry, mc[7]);
549
550 #elif (OS_TARGET == OS_NIX)
551
552 oqs_kem_sike_rdc503_asm(ma, mc);
553
554 #endif
555 }
556