1 /********************************************************************************************
2 * SIDH: an efficient supersingular isogeny cryptography library
3 *
4 * Abstract: modular arithmetic optimized for x64 platforms for P503
5 *********************************************************************************************/
6 
7 #include "../P503_internal.h"
8 #include "../../internal.h"
9 
10 /* OQS note: this file is #include'd with the defs of these consts; removed to avoid re-defs
11 // Global constants
12 extern const uint64_t p503[NWORDS_FIELD];
13 extern const uint64_t p503p1[NWORDS_FIELD];
14 extern const uint64_t p503x2[NWORDS_FIELD];
15 extern const uint64_t p503x4[NWORDS_FIELD];
16 */
17 
mp_sub503_p2(const digit_t * a,const digit_t * b,digit_t * c)18 __inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
19 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.
20 #if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
21     unsigned int i, borrow = 0;
22 
23     for (i = 0; i < NWORDS_FIELD; i++) {
24         SUBC(borrow, a[i], b[i], borrow, c[i]);
25     }
26 
27     borrow = 0;
28     for (i = 0; i < NWORDS_FIELD; i++) {
29         ADDC(borrow, c[i], ((digit_t*)p503x2)[i], borrow, c[i]);
30     }
31 
32 #elif (OS_TARGET == OS_NIX)
33 
34     oqs_kem_sike_mp_sub503_p2_asm(a, b, c);
35 
36 #endif
37 }
38 
39 
mp_sub503_p4(const digit_t * a,const digit_t * b,digit_t * c)40 __inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
41 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.
42 #if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
43     unsigned int i, borrow = 0;
44 
45     for (i = 0; i < NWORDS_FIELD; i++) {
46         SUBC(borrow, a[i], b[i], borrow, c[i]);
47     }
48 
49     borrow = 0;
50     for (i = 0; i < NWORDS_FIELD; i++) {
51         ADDC(borrow, c[i], ((digit_t*)p503x4)[i], borrow, c[i]);
52     }
53 
54 #elif (OS_TARGET == OS_NIX)
55 
56     oqs_kem_sike_mp_sub503_p4_asm(a, b, c);
57 
58 #endif
59 }
60 
fpadd503(const digit_t * a,const digit_t * b,digit_t * c)61 __inline void fpadd503(const digit_t *a, const digit_t *b, digit_t *c) { // Modular addition, c = a+b mod p503.
62 	// Inputs: a, b in [0, 2*p503-1]
63 	// Output: c in [0, 2*p503-1]
64 
65 #if (OS_TARGET == OS_WIN)
66 	unsigned int i, carry = 0;
67 	digit_t mask;
68 
69 	for (i = 0; i < NWORDS_FIELD; i++) {
70 		ADDC(carry, a[i], b[i], carry, c[i]);
71 	}
72 
73 	carry = 0;
74 	for (i = 0; i < NWORDS_FIELD; i++) {
75 		SUBC(carry, c[i], ((digit_t *) p503x2)[i], carry, c[i]);
76 	}
77 	mask = 0 - (digit_t) carry;
78 
79 	carry = 0;
80 	for (i = 0; i < NWORDS_FIELD; i++) {
81 		ADDC(carry, c[i], ((digit_t *) p503x2)[i] & mask, carry, c[i]);
82 	}
83 
84 #elif (OS_TARGET == OS_NIX)
85 
86 	oqs_kem_sike_fpadd503_asm(a, b, c);
87 
88 #endif
89 }
90 
fpsub503(const digit_t * a,const digit_t * b,digit_t * c)91 __inline void fpsub503(const digit_t *a, const digit_t *b, digit_t *c) { // Modular subtraction, c = a-b mod p503.
92 	// Inputs: a, b in [0, 2*p503-1]
93 	// Output: c in [0, 2*p503-1]
94 
95 #if (OS_TARGET == OS_WIN)
96 	unsigned int i, borrow = 0;
97 	digit_t mask;
98 
99 	for (i = 0; i < NWORDS_FIELD; i++) {
100 		SUBC(borrow, a[i], b[i], borrow, c[i]);
101 	}
102 	mask = 0 - (digit_t) borrow;
103 
104 	borrow = 0;
105 	for (i = 0; i < NWORDS_FIELD; i++) {
106 		ADDC(borrow, c[i], ((digit_t *) p503x2)[i] & mask, borrow, c[i]);
107 	}
108 
109 #elif (OS_TARGET == OS_NIX)
110 
111 	oqs_kem_sike_fpsub503_asm(a, b, c);
112 
113 #endif
114 }
115 
fpneg503(digit_t * a)116 __inline void fpneg503(digit_t *a) { // Modular negation, a = -a mod p503.
117 	// Input/output: a in [0, 2*p503-1]
118 	unsigned int i, borrow = 0;
119 
120 	for (i = 0; i < NWORDS_FIELD; i++) {
121 		SUBC(borrow, ((digit_t *) p503x2)[i], a[i], borrow, a[i]);
122 	}
123 }
124 
fpdiv2_503(const digit_t * a,digit_t * c)125 void fpdiv2_503(const digit_t *a, digit_t *c) { // Modular division by two, c = a/2 mod p503.
126 	// Input : a in [0, 2*p503-1]
127 	// Output: c in [0, 2*p503-1]
128 	unsigned int i, carry = 0;
129 	digit_t mask;
130 
131 	mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503
132 	for (i = 0; i < NWORDS_FIELD; i++) {
133 		ADDC(carry, a[i], ((digit_t *) p503)[i] & mask, carry, c[i]);
134 	}
135 
136 	mp_shiftr1(c, NWORDS_FIELD);
137 }
138 
fpcorrection503(digit_t * a)139 void fpcorrection503(digit_t *a) { // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
140 	unsigned int i, borrow = 0;
141 	digit_t mask;
142 
143 	for (i = 0; i < NWORDS_FIELD; i++) {
144 		SUBC(borrow, a[i], ((digit_t *) p503)[i], borrow, a[i]);
145 	}
146 	mask = 0 - (digit_t) borrow;
147 
148 	borrow = 0;
149 	for (i = 0; i < NWORDS_FIELD; i++) {
150 		ADDC(borrow, a[i], ((digit_t *) p503)[i] & mask, borrow, a[i]);
151 	}
152 }
153 
mp_mul(const digit_t * a,const digit_t * b,digit_t * c,const unsigned int nwords)154 void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
155 
156 	UNREFERENCED_PARAMETER(nwords);
157 
158 #if (OS_TARGET == OS_WIN)
159 	digit_t t = 0;
160 	uint128_t uv = {0};
161 	unsigned int carry = 0;
162 
163 	MULADD128(a[0], b[0], uv, carry, uv);
164 	t += carry;
165 	c[0] = uv[0];
166 	uv[0] = uv[1];
167 	uv[1] = t;
168 	t = 0;
169 
170 	MULADD128(a[0], b[1], uv, carry, uv);
171 	t += carry;
172 	MULADD128(a[1], b[0], uv, carry, uv);
173 	t += carry;
174 	c[1] = uv[0];
175 	uv[0] = uv[1];
176 	uv[1] = t;
177 	t = 0;
178 
179 	MULADD128(a[0], b[2], uv, carry, uv);
180 	t += carry;
181 	MULADD128(a[1], b[1], uv, carry, uv);
182 	t += carry;
183 	MULADD128(a[2], b[0], uv, carry, uv);
184 	t += carry;
185 	c[2] = uv[0];
186 	uv[0] = uv[1];
187 	uv[1] = t;
188 	t = 0;
189 
190 	MULADD128(a[0], b[3], uv, carry, uv);
191 	t += carry;
192 	MULADD128(a[2], b[1], uv, carry, uv);
193 	t += carry;
194 	MULADD128(a[1], b[2], uv, carry, uv);
195 	t += carry;
196 	MULADD128(a[3], b[0], uv, carry, uv);
197 	t += carry;
198 	c[3] = uv[0];
199 	uv[0] = uv[1];
200 	uv[1] = t;
201 	t = 0;
202 
203 	MULADD128(a[0], b[4], uv, carry, uv);
204 	t += carry;
205 	MULADD128(a[3], b[1], uv, carry, uv);
206 	t += carry;
207 	MULADD128(a[2], b[2], uv, carry, uv);
208 	t += carry;
209 	MULADD128(a[1], b[3], uv, carry, uv);
210 	t += carry;
211 	MULADD128(a[4], b[0], uv, carry, uv);
212 	t += carry;
213 	c[4] = uv[0];
214 	uv[0] = uv[1];
215 	uv[1] = t;
216 	t = 0;
217 
218 	MULADD128(a[0], b[5], uv, carry, uv);
219 	t += carry;
220 	MULADD128(a[4], b[1], uv, carry, uv);
221 	t += carry;
222 	MULADD128(a[3], b[2], uv, carry, uv);
223 	t += carry;
224 	MULADD128(a[2], b[3], uv, carry, uv);
225 	t += carry;
226 	MULADD128(a[1], b[4], uv, carry, uv);
227 	t += carry;
228 	MULADD128(a[5], b[0], uv, carry, uv);
229 	t += carry;
230 	c[5] = uv[0];
231 	uv[0] = uv[1];
232 	uv[1] = t;
233 	t = 0;
234 
235 	MULADD128(a[0], b[6], uv, carry, uv);
236 	t += carry;
237 	MULADD128(a[5], b[1], uv, carry, uv);
238 	t += carry;
239 	MULADD128(a[4], b[2], uv, carry, uv);
240 	t += carry;
241 	MULADD128(a[3], b[3], uv, carry, uv);
242 	t += carry;
243 	MULADD128(a[2], b[4], uv, carry, uv);
244 	t += carry;
245 	MULADD128(a[1], b[5], uv, carry, uv);
246 	t += carry;
247 	MULADD128(a[6], b[0], uv, carry, uv);
248 	t += carry;
249 	c[6] = uv[0];
250 	uv[0] = uv[1];
251 	uv[1] = t;
252 	t = 0;
253 
254 	MULADD128(a[0], b[7], uv, carry, uv);
255 	t += carry;
256 	MULADD128(a[6], b[1], uv, carry, uv);
257 	t += carry;
258 	MULADD128(a[5], b[2], uv, carry, uv);
259 	t += carry;
260 	MULADD128(a[4], b[3], uv, carry, uv);
261 	t += carry;
262 	MULADD128(a[3], b[4], uv, carry, uv);
263 	t += carry;
264 	MULADD128(a[2], b[5], uv, carry, uv);
265 	t += carry;
266 	MULADD128(a[1], b[6], uv, carry, uv);
267 	t += carry;
268 	MULADD128(a[7], b[0], uv, carry, uv);
269 	t += carry;
270 	c[7] = uv[0];
271 	uv[0] = uv[1];
272 	uv[1] = t;
273 	t = 0;
274 
275 	MULADD128(a[7], b[1], uv, carry, uv);
276 	t += carry;
277 	MULADD128(a[6], b[2], uv, carry, uv);
278 	t += carry;
279 	MULADD128(a[5], b[3], uv, carry, uv);
280 	t += carry;
281 	MULADD128(a[4], b[4], uv, carry, uv);
282 	t += carry;
283 	MULADD128(a[3], b[5], uv, carry, uv);
284 	t += carry;
285 	MULADD128(a[2], b[6], uv, carry, uv);
286 	t += carry;
287 	MULADD128(a[1], b[7], uv, carry, uv);
288 	t += carry;
289 	c[8] = uv[0];
290 	uv[0] = uv[1];
291 	uv[1] = t;
292 	t = 0;
293 
294 	MULADD128(a[7], b[2], uv, carry, uv);
295 	t += carry;
296 	MULADD128(a[6], b[3], uv, carry, uv);
297 	t += carry;
298 	MULADD128(a[5], b[4], uv, carry, uv);
299 	t += carry;
300 	MULADD128(a[4], b[5], uv, carry, uv);
301 	t += carry;
302 	MULADD128(a[3], b[6], uv, carry, uv);
303 	t += carry;
304 	MULADD128(a[2], b[7], uv, carry, uv);
305 	t += carry;
306 	c[9] = uv[0];
307 	uv[0] = uv[1];
308 	uv[1] = t;
309 	t = 0;
310 
311 	MULADD128(a[7], b[3], uv, carry, uv);
312 	t += carry;
313 	MULADD128(a[6], b[4], uv, carry, uv);
314 	t += carry;
315 	MULADD128(a[5], b[5], uv, carry, uv);
316 	t += carry;
317 	MULADD128(a[4], b[6], uv, carry, uv);
318 	t += carry;
319 	MULADD128(a[3], b[7], uv, carry, uv);
320 	t += carry;
321 	c[10] = uv[0];
322 	uv[0] = uv[1];
323 	uv[1] = t;
324 	t = 0;
325 
326 	MULADD128(a[7], b[4], uv, carry, uv);
327 	t += carry;
328 	MULADD128(a[6], b[5], uv, carry, uv);
329 	t += carry;
330 	MULADD128(a[5], b[6], uv, carry, uv);
331 	t += carry;
332 	MULADD128(a[4], b[7], uv, carry, uv);
333 	t += carry;
334 	c[11] = uv[0];
335 	uv[0] = uv[1];
336 	uv[1] = t;
337 	t = 0;
338 
339 	MULADD128(a[7], b[5], uv, carry, uv);
340 	t += carry;
341 	MULADD128(a[6], b[6], uv, carry, uv);
342 	t += carry;
343 	MULADD128(a[5], b[7], uv, carry, uv);
344 	t += carry;
345 	c[12] = uv[0];
346 	uv[0] = uv[1];
347 	uv[1] = t;
348 	t = 0;
349 
350 	MULADD128(a[7], b[6], uv, carry, uv);
351 	t += carry;
352 	MULADD128(a[6], b[7], uv, carry, uv);
353 	t += carry;
354 	c[13] = uv[0];
355 	uv[0] = uv[1];
356 	uv[1] = t;
357 
358 	MULADD128(a[7], b[7], uv, carry, uv);
359 	c[14] = uv[0];
360 	c[15] = uv[1];
361 
362 #elif (OS_TARGET == OS_NIX)
363 
364 	oqs_kem_sike_mul503_asm(a, b, c);
365 
366 #endif
367 }
368 
rdc_mont(digit_t * ma,digit_t * mc)369 void rdc_mont(digit_t *ma, digit_t *mc) { // Montgomery reduction exploiting special form of the prime.
370 	// mc = ma*R^-1 mod p503x2, where R = 2^512.
371 	// If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
372 	// ma is assumed to be in Montgomery representation.
373 
374 #if (OS_TARGET == OS_WIN)
375 	unsigned int carry;
376 	digit_t t = 0;
377 	uint128_t uv = {0};
378 
379 	mc[0] = ma[0];
380 	mc[1] = ma[1];
381 	mc[2] = ma[2];
382 	MUL128(mc[0], ((digit_t *) p503p1)[3], uv);
383 	ADDC(0, uv[0], ma[3], carry, uv[0]);
384 	ADDC(carry, uv[1], 0, carry, uv[1]);
385 	mc[3] = uv[0];
386 	uv[0] = uv[1];
387 	uv[1] = 0;
388 
389 	MULADD128(mc[0], ((digit_t *) p503p1)[4], uv, carry, uv);
390 	MULADD128(mc[1], ((digit_t *) p503p1)[3], uv, carry, uv);
391 	t += carry;
392 	ADDC(0, uv[0], ma[4], carry, uv[0]);
393 	ADDC(carry, uv[1], 0, carry, uv[1]);
394 	t += carry;
395 	mc[4] = uv[0];
396 	uv[0] = uv[1];
397 	uv[1] = t;
398 	t = 0;
399 
400 	MULADD128(mc[0], ((digit_t *) p503p1)[5], uv, carry, uv);
401 	t += carry;
402 	MULADD128(mc[1], ((digit_t *) p503p1)[4], uv, carry, uv);
403 	t += carry;
404 	MULADD128(mc[2], ((digit_t *) p503p1)[3], uv, carry, uv);
405 	t += carry;
406 	ADDC(0, uv[0], ma[5], carry, uv[0]);
407 	ADDC(carry, uv[1], 0, carry, uv[1]);
408 	t += carry;
409 	mc[5] = uv[0];
410 	uv[0] = uv[1];
411 	uv[1] = t;
412 	t = 0;
413 
414 	MULADD128(mc[0], ((digit_t *) p503p1)[6], uv, carry, uv);
415 	t += carry;
416 	MULADD128(mc[1], ((digit_t *) p503p1)[5], uv, carry, uv);
417 	t += carry;
418 	MULADD128(mc[2], ((digit_t *) p503p1)[4], uv, carry, uv);
419 	t += carry;
420 	MULADD128(mc[3], ((digit_t *) p503p1)[3], uv, carry, uv);
421 	t += carry;
422 	ADDC(0, uv[0], ma[6], carry, uv[0]);
423 	ADDC(carry, uv[1], 0, carry, uv[1]);
424 	t += carry;
425 	mc[6] = uv[0];
426 	uv[0] = uv[1];
427 	uv[1] = t;
428 	t = 0;
429 
430 	MULADD128(mc[0], ((digit_t *) p503p1)[7], uv, carry, uv);
431 	t += carry;
432 	MULADD128(mc[1], ((digit_t *) p503p1)[6], uv, carry, uv);
433 	t += carry;
434 	MULADD128(mc[2], ((digit_t *) p503p1)[5], uv, carry, uv);
435 	t += carry;
436 	MULADD128(mc[3], ((digit_t *) p503p1)[4], uv, carry, uv);
437 	t += carry;
438 	MULADD128(mc[4], ((digit_t *) p503p1)[3], uv, carry, uv);
439 	t += carry;
440 	ADDC(0, uv[0], ma[7], carry, uv[0]);
441 	ADDC(carry, uv[1], 0, carry, uv[1]);
442 	t += carry;
443 	mc[7] = uv[0];
444 	uv[0] = uv[1];
445 	uv[1] = t;
446 	t = 0;
447 
448 	MULADD128(mc[1], ((digit_t *) p503p1)[7], uv, carry, uv);
449 	t += carry;
450 	MULADD128(mc[2], ((digit_t *) p503p1)[6], uv, carry, uv);
451 	t += carry;
452 	MULADD128(mc[3], ((digit_t *) p503p1)[5], uv, carry, uv);
453 	t += carry;
454 	MULADD128(mc[4], ((digit_t *) p503p1)[4], uv, carry, uv);
455 	t += carry;
456 	MULADD128(mc[5], ((digit_t *) p503p1)[3], uv, carry, uv);
457 	t += carry;
458 	ADDC(0, uv[0], ma[8], carry, uv[0]);
459 	ADDC(carry, uv[1], 0, carry, uv[1]);
460 	t += carry;
461 	mc[0] = uv[0];
462 	uv[0] = uv[1];
463 	uv[1] = t;
464 	t = 0;
465 
466 	MULADD128(mc[2], ((digit_t *) p503p1)[7], uv, carry, uv);
467 	t += carry;
468 	MULADD128(mc[3], ((digit_t *) p503p1)[6], uv, carry, uv);
469 	t += carry;
470 	MULADD128(mc[4], ((digit_t *) p503p1)[5], uv, carry, uv);
471 	t += carry;
472 	MULADD128(mc[5], ((digit_t *) p503p1)[4], uv, carry, uv);
473 	t += carry;
474 	MULADD128(mc[6], ((digit_t *) p503p1)[3], uv, carry, uv);
475 	t += carry;
476 	ADDC(0, uv[0], ma[9], carry, uv[0]);
477 	ADDC(carry, uv[1], 0, carry, uv[1]);
478 	t += carry;
479 	mc[1] = uv[0];
480 	uv[0] = uv[1];
481 	uv[1] = t;
482 	t = 0;
483 
484 	MULADD128(mc[3], ((digit_t *) p503p1)[7], uv, carry, uv);
485 	t += carry;
486 	MULADD128(mc[4], ((digit_t *) p503p1)[6], uv, carry, uv);
487 	t += carry;
488 	MULADD128(mc[5], ((digit_t *) p503p1)[5], uv, carry, uv);
489 	t += carry;
490 	MULADD128(mc[6], ((digit_t *) p503p1)[4], uv, carry, uv);
491 	t += carry;
492 	MULADD128(mc[7], ((digit_t *) p503p1)[3], uv, carry, uv);
493 	t += carry;
494 	ADDC(0, uv[0], ma[10], carry, uv[0]);
495 	ADDC(carry, uv[1], 0, carry, uv[1]);
496 	t += carry;
497 	mc[2] = uv[0];
498 	uv[0] = uv[1];
499 	uv[1] = t;
500 	t = 0;
501 
502 	MULADD128(mc[4], ((digit_t *) p503p1)[7], uv, carry, uv);
503 	t += carry;
504 	MULADD128(mc[5], ((digit_t *) p503p1)[6], uv, carry, uv);
505 	t += carry;
506 	MULADD128(mc[6], ((digit_t *) p503p1)[5], uv, carry, uv);
507 	t += carry;
508 	MULADD128(mc[7], ((digit_t *) p503p1)[4], uv, carry, uv);
509 	t += carry;
510 	ADDC(0, uv[0], ma[11], carry, uv[0]);
511 	ADDC(carry, uv[1], 0, carry, uv[1]);
512 	t += carry;
513 	mc[3] = uv[0];
514 	uv[0] = uv[1];
515 	uv[1] = t;
516 	t = 0;
517 
518 	MULADD128(mc[5], ((digit_t *) p503p1)[7], uv, carry, uv);
519 	t += carry;
520 	MULADD128(mc[6], ((digit_t *) p503p1)[6], uv, carry, uv);
521 	t += carry;
522 	MULADD128(mc[7], ((digit_t *) p503p1)[5], uv, carry, uv);
523 	t += carry;
524 	ADDC(0, uv[0], ma[12], carry, uv[0]);
525 	ADDC(carry, uv[1], 0, carry, uv[1]);
526 	t += carry;
527 	mc[4] = uv[0];
528 	uv[0] = uv[1];
529 	uv[1] = t;
530 	t = 0;
531 
532 	MULADD128(mc[6], ((digit_t *) p503p1)[7], uv, carry, uv);
533 	t += carry;
534 	MULADD128(mc[7], ((digit_t *) p503p1)[6], uv, carry, uv);
535 	t += carry;
536 	ADDC(0, uv[0], ma[13], carry, uv[0]);
537 	ADDC(carry, uv[1], 0, carry, uv[1]);
538 	t += carry;
539 	mc[5] = uv[0];
540 	uv[0] = uv[1];
541 	uv[1] = t;
542 	t = 0;
543 
544 	MULADD128(mc[7], ((digit_t *) p503p1)[7], uv, carry, uv);
545 	t += carry;
546 	ADDC(0, uv[0], ma[14], carry, mc[6]);
547 	ADDC(carry, uv[1], 0, carry, uv[1]);
548 	ADDC(0, uv[1], ma[15], carry, mc[7]);
549 
550 #elif (OS_TARGET == OS_NIX)
551 
552 	oqs_kem_sike_rdc503_asm(ma, mc);
553 
554 #endif
555 }
556