1//*******************************************************************************************
2// Supersingular Isogeny Key Encapsulation Library
3//
4// Abstract: field arithmetic in x64 assembly for P434 on Linux
5//*******************************************************************************************
6
7/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */
// Everything below uses Intel operand order (destination first) and bare
// register names.
8.intel_syntax noprefix
9
// Builds a symbol name by pasting the s2n_sike_p434_r3_ prefix in front of s.
10#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s
11
12// Registers that are used for parameter passing:
// rdi, rsi and rdx hold the first three integer/pointer arguments in the
// System V AMD64 calling convention. Several routines below overwrite rdi and
// rsi once their inputs have been read, and mulx implicitly reads rdx.
13#define reg_p1  rdi
14#define reg_p2  rsi
15#define reg_p3  rdx
16
17// Define addition instructions
// ADD1/ADC1 and ADD2/ADC2 name two carry chains. With S2N_ADX they map to
// adox (carry kept in OF) and adcx (carry kept in CF), so the two chains can
// be interleaved without disturbing each other. Without S2N_ADX both families
// map to add/adc and therefore share CF.
18#ifdef S2N_ADX
19
20#define ADD1    adox
21#define ADC1    adox
22#define ADD2    adcx
23#define ADC2    adcx
24
25#else
26
27#define ADD1    add
28#define ADC1    adc
29#define ADD2    add
30#define ADC2    adc
31
32#endif
33
// The constant tables and all routines that follow are emitted into .text.
34.text
35
// p434 stored as seven 64-bit limbs, least-significant limb first (56 bytes).
// Limbs are written as signed decimals: -1 is an all-ones limb, and the
// negative fourth value is simply a limb whose top bit is set.
36#define asm_p434 S2N_SIKE_P434_R3_NAMESPACE(asm_p434)
37.align 32
38.type   asm_p434, @object
39.size   asm_p434, 56
40asm_p434:
41.quad   -1
42.quad   -1
43.quad   -1
44.quad   -161717841442111489
45.quad   8918917783347572387
46.quad   7853257225132122198
47.quad   620258357900100
48
49
// 2 * p434 as seven 64-bit limbs, least-significant limb first (56 bytes).
// Limbs 1 and 2 are both -1, which is why the routines that use this table
// load offset +8 once and reuse that register for the limb at offset +16.
50#define asm_p434x2 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x2)
51.align 32
52.type   asm_p434x2, @object
53.size   asm_p434x2, 56
54asm_p434x2:
55.quad   -2
56.quad   -1
57.quad   -1
58.quad   -323435682884222977
59.quad   -608908507014406841
60.quad   -2740229623445307220
61.quad   1240516715800200
62
63
// 4 * p434 as seven 64-bit limbs, least-significant limb first (56 bytes).
// Limbs 1 and 2 are both -1, matching the single load of offset +8 done by
// SUB434_PX, which is the macro this table is passed to.
64#define asm_p434x4 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x4)
65.align 32
66.type   asm_p434x4, @object
67.size   asm_p434x4, 56
68asm_p434x4:
69.quad   -4
70.quad   -1
71.quad   -1
72.quad   -646871365768445953
73.quad   -1217817014028813681
74.quad   -5480459246890614439
75.quad   2481033431600401
76
77
// p434 + 1 as seven 64-bit limbs, least-significant limb first (56 bytes).
// The three low limbs are zero, so rdc434_asm only ever multiplies by the
// four limbs that start at offset +24.
78#define asm_p434p1 S2N_SIKE_P434_R3_NAMESPACE(asm_p434p1)
79.align 32
80.type   asm_p434p1, @object
81.size   asm_p434p1, 56
82asm_p434p1:
83.quad   0
84.quad   0
85.quad   0
86.quad   -161717841442111488
87.quad   8918917783347572387
88.quad   7853257225132122198
89.quad   620258357900100
90
//***********************************************************************
//  Field addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  Operands are seven 64-bit limbs, least-significant limb first.
//  The routine computes a + b - 2*p434 and, if that subtraction borrows,
//  adds 2*p434 back. The correction is done with a mask, not a branch.
//  rdi and rsi are reused as scratch once a and b have been read.
//***********************************************************************
#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm)
.global fpadd434_asm
fpadd434_asm:
  push   r12
  push   r13
  push   r14
  push   r15
  push   rbx
  push   rbp

  // r8..r14 <- a + b, one limb at a time
  mov    r8, [reg_p1]
  add    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  adc    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  adc    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  adc    r11, [reg_p2+24]
  mov    r12, [reg_p1+32]
  adc    r12, [reg_p2+32]
  mov    r13, [reg_p1+40]
  adc    r13, [reg_p2+40]
  mov    r14, [reg_p1+48]
  adc    r14, [reg_p2+48]

  // Fetch 2*p434. Limbs 1 and 2 are equal, so rcx serves both.
  mov    rbx, [rip+asm_p434x2]
  mov    rcx, [rip+asm_p434x2+8]
  mov    rdi, [rip+asm_p434x2+24]
  mov    rsi, [rip+asm_p434x2+32]
  mov    rbp, [rip+asm_p434x2+40]
  mov    r15, [rip+asm_p434x2+48]

  // r8..r14 -= 2*p434; rax <- 0, or all-ones if the subtraction borrowed
  xor    rax, rax
  sub    r8, rbx
  sbb    r9, rcx
  sbb    r10, rcx
  sbb    r11, rdi
  sbb    r12, rsi
  sbb    r13, rbp
  sbb    r14, r15
  sbb    rax, 0

  // Keep 2*p434 only when a correction is needed
  and    rbx, rax
  and    rcx, rax
  and    rdi, rax
  and    rsi, rax
  and    rbp, rax
  and    r15, rax

  // Add the masked value back, storing each limb as soon as it is final
  // (mov leaves the carry flag untouched)
  add    r8, rbx
  mov    [reg_p3], r8
  adc    r9, rcx
  mov    [reg_p3+8], r9
  adc    r10, rcx
  mov    [reg_p3+16], r10
  adc    r11, rdi
  mov    [reg_p3+24], r11
  adc    r12, rsi
  mov    [reg_p3+32], r12
  adc    r13, rbp
  mov    [reg_p3+40], r13
  adc    r14, r15
  mov    [reg_p3+48], r14

  pop    rbp
  pop    rbx
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret
165
//***********************************************************************
//  Field subtraction
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
//
//  Operands are seven 64-bit limbs, least-significant limb first.
//  If a - b borrows, 2*p434 is added back; the decision is applied with
//  a mask, not a branch. rdi and rsi are reused as scratch once a and b
//  have been read.
//***********************************************************************
#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm)
.global fpsub434_asm
fpsub434_asm:
  push   r12
  push   r13
  push   r14

  // r8..r14 <- a - b; rax <- 0, or all-ones if the subtraction borrowed
  xor    rax, rax
  mov    r8, [reg_p1]
  sub    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  sbb    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  sbb    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  sbb    r11, [reg_p2+24]
  mov    r12, [reg_p1+32]
  sbb    r12, [reg_p2+32]
  mov    r13, [reg_p1+40]
  sbb    r13, [reg_p2+40]
  mov    r14, [reg_p1+48]
  sbb    r14, [reg_p2+48]
  sbb    rax, 0

  // Limbs 0-3: add (2*p434 & mask). Limbs 1 and 2 of 2*p434 are equal,
  // so rdi is used for both.
  mov    rcx, [rip+asm_p434x2]
  and    rcx, rax
  mov    rdi, [rip+asm_p434x2+8]
  and    rdi, rax
  mov    rsi, [rip+asm_p434x2+24]
  and    rsi, rax
  add    r8, rcx
  adc    r9, rdi
  adc    r10, rdi
  adc    r11, rsi
  setc   cl                       // park the carry; the ands below clear CF
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11

  // Limbs 4-6: same correction, resuming from the parked carry
  mov    r8, [rip+asm_p434x2+32]
  and    r8, rax
  mov    rdi, [rip+asm_p434x2+40]
  and    rdi, rax
  mov    rsi, [rip+asm_p434x2+48]
  and    rsi, rax
  bt     rcx, 0                   // CF <- parked carry
  adc    r12, r8
  mov    [reg_p3+32], r12
  adc    r13, rdi
  mov    [reg_p3+40], r13
  adc    r14, rsi
  mov    [reg_p3+48], r14

  pop    r14
  pop    r13
  pop    r12
  ret
228
///////////////////////////////////////////////////////////////// MACRO
// Subtraction followed by an unconditional addition of a constant:
//   c [reg_p3] = a [reg_p1] - b [reg_p2] + P0
// P0 names a seven-limb table whose limbs 1 and 2 are equal (only the
// limb at offset +8 is loaded and it is used twice).
// No borrow or carry is propagated beyond limb 6.
// Scratch: rax, rcx, rdi, rsi, r8-r11 and flags. r12 and r13 are saved
// and restored by the macro itself.
/////////////////////////////////////////////////////////////////
.macro SUB434_PX  P0
  push   r12
  push   r13

  // r8..r13, rcx <- a - b
  mov    r8, [reg_p1]
  sub    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  sbb    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  sbb    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  sbb    r11, [reg_p2+24]
  mov    r12, [reg_p1+32]
  sbb    r12, [reg_p2+32]
  mov    r13, [reg_p1+40]
  sbb    r13, [reg_p2+40]
  mov    rcx, [reg_p1+48]
  sbb    rcx, [reg_p2+48]

  // Add P0 limb by limb and store each limb once it is final.
  // a and b are fully read, so rdi and rsi are free to hold P0 limbs;
  // the interleaved movs do not touch the carry flag.
  mov    rax, [rip+\P0]
  add    r8, rax
  mov    [reg_p3], r8
  mov    rdi, [rip+\P0+8]
  adc    r9, rdi
  mov    [reg_p3+8], r9
  adc    r10, rdi
  mov    [reg_p3+16], r10
  mov    rsi, [rip+\P0+24]
  adc    r11, rsi
  mov    [reg_p3+24], r11
  mov    rax, [rip+\P0+32]
  adc    r12, rax
  mov    [reg_p3+32], r12
  mov    rdi, [rip+\P0+40]
  adc    r13, rdi
  mov    [reg_p3+40], r13
  mov    rsi, [rip+\P0+48]
  adc    rcx, rsi
  mov    [reg_p3+48], rcx

  pop    r13
  pop    r12
.endm
273
274//***********************************************************************
275//  Multiprecision subtraction with correction with 2*p434
276//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
277//***********************************************************************
// The body is the SUB434_PX expansion for the asm_p434x2 table followed by
// ret. The addition of 2*p434 is unconditional. The macro saves and restores
// r12/r13 itself and uses rax, rcx, rdi, rsi and r8-r11 as scratch.
278#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm)
279.global mp_sub434_p2_asm
280mp_sub434_p2_asm:
281  SUB434_PX  asm_p434x2
282  ret
283
284//***********************************************************************
285//  Multiprecision subtraction with correction with 4*p434
286//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
287//***********************************************************************
// The body is the SUB434_PX expansion for the asm_p434x4 table followed by
// ret. The addition of 4*p434 is unconditional. The macro saves and restores
// r12/r13 itself and uses rax, rcx, rdi, rsi and r8-r11 as scratch.
288#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm)
289.global mp_sub434_p4_asm
290mp_sub434_p4_asm:
291  SUB434_PX  asm_p434x4
292  ret
293
294///////////////////////////////////////////////////////////////// MACRO
295// Schoolbook integer multiplication
296// Inputs:  memory pointers M0 and M1
297// Outputs: memory pointer C and regs T1, T3, rax
298// Temps:   regs T0:T6
299/////////////////////////////////////////////////////////////////
300#ifdef S2N_ADX
301
// S2N_ADX variant: 3x3-limb schoolbook product built on mulx with two
// interleaved carry chains (adox carries through OF, adcx through CF).
//   \M0, \M1 : bracketed memory operands. The 8\M1 / 16\M1 spelling puts a
//              displacement in front of the bracket (e.g. 8[base]).
//   \C       : bracketed memory operand that receives product limbs 0..2.
//   Product limbs 3, 4 and 5 are left in \T1, \T3 and rax respectively.
// Clobbers rdx (implicit mulx multiplicand), rax, flags and \T0-\T6.
// Each "xor rax, rax" zeroes rax and clears CF and OF: it restarts both
// carry chains and supplies the zero addend that absorbs a row's last carry.
302.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
303  mov    rdx, \M0
304  mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
305  mov    \C, \T1           // C0_final
306  mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
307  xor    rax, rax
308  adox   \T0, \T2
309  mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
310  adox   \T1, \T3
311
312  mov    rdx, 8\M0
313  mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
314  adox   \T2, rax
315  xor    rax, rax
316  mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
317  adox   \T4, \T0
318  mov    8\C, \T4          // C1_final
319  adcx   \T3, \T6
320  mulx   \T6, \T0, 16\M1   // T6:T0 = A1*B2
321  adox   \T3, \T1
322  adcx   \T5, \T0
323  adcx   \T6, rax
324  adox   \T5, \T2
325
326  mov    rdx, 16\M0
327  mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
328  adox   \T6, rax
329  xor    rax, rax
330  mulx   \T4, \T2, 8\M1    // T4:T2 = A2*B1
331  adox   \T0, \T3
332  mov    16\C, \T0         // C2_final
333  adcx   \T1, \T5
334  mulx   \T0, \T3, 16\M1   // T0:T3 = A2*B2
335  adcx   \T4, \T6
336  adcx   \T0, rax
337  adox   \T1, \T2
338  adox   \T3, \T4
339  adox   rax, \T0
340.endm
341
342///////////////////////////////////////////////////////////////// MACRO
343// Schoolbook integer multiplication
344// Inputs:  memory pointers M0 and M1
345// Outputs: memory pointer C
346// Temps:   regs T0:T9
347/////////////////////////////////////////////////////////////////
// S2N_ADX variant: 4x4-limb product, eight limbs written to \C .. 56\C.
// \M0, \M1 and \C are bracketed memory operands; 8\M1 etc. put a displacement
// in front of the bracket. Two carry chains run in parallel (adox through OF,
// adcx through CF); every "xor rax, rax" clears both flags and leaves rax as
// the zero addend. Clobbers rdx, rax, flags and \T0-\T9.
// Store order matters: limb i (i < 3) is stored only after A_i has been
// loaded into rdx, and limbs 3..7 are stored after the last mulx. The call in
// mul434_asm that passes the same buffer as \M0 and \C (with \M1 at +32)
// relies on this.
348.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
349  mov    rdx, \M0
350  mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
351  mov    \C, \T1           // C0_final
352  mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
353  xor    rax, rax
354  adox   \T0, \T2
355  mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
356  adox   \T1, \T3
357  mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
358  adox   \T2, \T4
359
360  mov    rdx, 8\M0
361  mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
362  adox   \T3, rax
363  xor    rax, rax
364  mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
365  adox   \T4, \T0
366  mov    8\C, \T4          // C1_final
367  adcx   \T5, \T7
368  mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
369  adcx   \T6, \T8
370  adox   \T5, \T1
371  mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
372  adcx   \T7, \T9
373  adcx   \T8, rax
374  adox   \T6, \T2
375
376  mov    rdx, 16\M0
377  mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
378  adox   \T7, \T3
379  adox   \T8, rax
380  xor    rax, rax
381  mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
382  adox   \T0, \T5
383  mov    16\C, \T0         // C2_final
384  adcx   \T1, \T3
385  mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
386  adcx   \T2, \T4
387  adox   \T1, \T6
388  mulx   \T4,\T9, 24\M1    // T4:T9 = A2*B3
389  adcx   \T3, \T9
390  mov    rdx, 24\M0
391  adcx   \T4, rax
392
393  adox   \T2, \T7
394  adox   \T3, \T8
395  adox   \T4, rax
396
397  mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
398  xor    rax, rax
399  mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
400  adcx   \T5, \T7
401  adox   \T1, \T0
402  mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
403  adcx   \T6, \T8
404  adox   \T2, \T5
405  mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
406  adcx   \T7, \T9
407  adcx   \T8, rax
408
409  adox   \T3, \T6
410  adox   \T4, \T7
411  adox   \T8, rax
412  mov    24\C, \T1         // C3_final
413  mov    32\C, \T2         // C4_final
414  mov    40\C, \T3         // C5_final
415  mov    48\C, \T4         // C6_final
416  mov    56\C, \T8         // C7_final
417.endm
418
419#else // S2N_ADX
420
// Variant without adox/adcx: 3x3-limb schoolbook product using mulx and a
// single add/adc carry chain (mulx itself does not modify flags).
//   \M0, \M1 : bracketed memory operands. The 8\M1 / 16\M1 spelling puts a
//              displacement in front of the bracket (e.g. 8[base]).
//   \C       : bracketed memory operand that receives product limbs 0..2.
//   Product limbs 3, 4 and 5 are left in \T1, \T3 and rax respectively.
// Clobbers rdx (implicit mulx multiplicand), rax, flags and \T0-\T6.
421.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
422  mov    rdx, \M0
423  mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
424  mov    \C, \T1           // C0_final
425  mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
426  add    \T0, \T2
427  mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
428  adc    \T1, \T3
429
430  mov    rdx, 8\M0
431  mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
432  adc    \T2, 0
433  mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
434  add    \T4, \T0
435  mov    8\C, \T4          // C1_final
436  adc    \T3, \T1
437  adc    \T5, \T2
438  mulx   \T2, \T1, 16\M1   // T2:T1 = A1*B2
439  adc    \T2, 0
440
441  add    \T3, \T6
442  adc    \T5, \T1
443  adc    \T2, 0
444
445  mov    rdx, 16\M0
446  mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
447  add    \T0, \T3
448  mov    16\C, \T0         // C2_final
449  mulx   \T4, \T6, 8\M1    // T4:T6 = A2*B1
450  adc    \T1, \T5
451  adc    \T2, \T4
452  mulx   rax, \T3, 16\M1   // rax:T3 = A2*B2
453  adc    rax, 0
454  add    \T1, \T6
455  adc    \T3, \T2
456  adc    rax, 0
457.endm
458
// Variant without adox/adcx: 4x4-limb schoolbook product, eight limbs written
// to \C .. 56\C. \M0, \M1 and \C are bracketed memory operands; 8\M1 etc. put
// a displacement in front of the bracket. Each row of partial products is
// first summed on its own, then folded into the running result; all carries
// go through CF (mulx and mov leave flags untouched).
// Clobbers rdx, flags and \T0-\T9.
// Store order matters: limb i (i < 3) is stored only after A_i has been
// loaded into rdx, and limbs 3..7 are stored after the last mulx. The call in
// mul434_asm that passes the same buffer as \M0 and \C (with \M1 at +32)
// relies on this.
459.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
460  mov    rdx, \M0
461  mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
462  mov    \C, \T1           // C0_final
463  mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
464  add    \T0, \T2
465  mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
466  adc    \T1, \T3
467  mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
468  adc    \T2, \T4
469  mov    rdx, 8\M0
470  adc    \T3, 0
471
472  mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
473  mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
474  add    \T5, \T7
475  mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
476  adc    \T6, \T8
477  mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
478  adc    \T7, \T9
479  adc    \T8, 0
480
481  add    \T4, \T0
482  mov    8\C, \T4          // C1_final
483  adc    \T5, \T1
484  adc    \T6, \T2
485  adc    \T7, \T3
486  mov    rdx, 16\M0
487  adc    \T8, 0
488
489  mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
490  mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
491  add    \T1, \T3
492  mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
493  adc    \T2, \T4
494  mulx   \T4,\T9, 24\M1    // T4:T9 = A2*B3
495  adc    \T3, \T9
496  mov    rdx, 24\M0
497  adc    \T4, 0
498
499  add    \T0, \T5
500  mov    16\C, \T0         // C2_final
501  adc    \T1, \T6
502  adc    \T2, \T7
503  adc    \T3, \T8
504  adc    \T4, 0
505
506  mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
507  mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
508  add    \T5, \T7
509  mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
510  adc    \T6, \T8
511  mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
512  adc    \T7, \T9
513  adc    \T8, 0
514
515  add    \T1, \T0
516  mov    24\C, \T1         // C3_final
517  adc    \T2, \T5
518  mov    32\C, \T2         // C4_final
519  adc    \T3, \T6
520  mov    40\C, \T3         // C5_final
521  adc    \T4, \T7
522  mov    48\C, \T4         // C6_final
523  adc    \T8, 0
524  mov    56\C, \T8         // C7_final
525.endm
526
527#endif // S2N_ADX
528
529//*****************************************************************************
530//  434-bit multiplication using Karatsuba (one level), schoolbook (one level)
531//*****************************************************************************
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
//  a and b are read as seven 64-bit limbs (least-significant first) and
//  fourteen limbs are written to c. Each operand is split into a four-limb
//  low part (AL/BL, limbs 0-3) and a three-limb high part (AH/BH, limbs 4-6).
//  Stack scratch (96 bytes):
//    [rsp   .. rsp+24]  AH+AL, low four limbs
//    [rsp+32.. rsp+56]  BH+BL, low four limbs
//    [rsp+64.. rsp+88]  cross term for the carries out of the two sums
//  The first MUL256_SCHOOL call overwrites [rsp .. rsp+56] in place with the
//  eight-limb product of the two four-limb sums.
//  Saves and restores r12-r15, rbx and rbp.
532#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm)
533.global mul434_asm
534mul434_asm:
535  push   r12
536  push   r13
537  push   r14
538  push   r15
// reg_p3 is rdx, which mulx reads implicitly, so the output pointer lives in
// rcx for the rest of the routine.
539  mov    rcx, reg_p3
540
541  // r8-r11 <- AH + AL, rax <- mask
542  xor    rax, rax
543  mov    r8, [reg_p1]
544  mov    r9, [reg_p1+8]
545  mov    r10, [reg_p1+16]
546  mov    r11, [reg_p1+24]
547  push   rbx
548  push   rbp
549  sub    rsp, 96
550  add    r8, [reg_p1+32]
551  adc    r9, [reg_p1+40]
552  adc    r10, [reg_p1+48]
553  adc    r11, 0
// rax <- 0 - carry: all-ones when AH+AL did not fit in four limbs
554  sbb    rax, 0
555  mov    [rsp], r8
556  mov    [rsp+8], r9
557  mov    [rsp+16], r10
558  mov    [rsp+24], r11
559
560  // r12-r15 <- BH + BL, rbx <- mask
561  xor    rbx, rbx
562  mov    r12, [reg_p2]
563  mov    r13, [reg_p2+8]
564  mov    r14, [reg_p2+16]
565  mov    r15, [reg_p2+24]
566  add    r12, [reg_p2+32]
567  adc    r13, [reg_p2+40]
568  adc    r14, [reg_p2+48]
569  adc    r15, 0
570  sbb    rbx, 0
571  mov    [rsp+32], r12
572  mov    [rsp+40], r13
573  mov    [rsp+48], r14
574  mov    [rsp+56], r15
575
// Each sum is masked with the carry mask of the OTHER operand: (BH+BL) is
// kept when AH+AL carried out, and (AH+AL) is kept when BH+BL carried out.
576  // r12-r15 <- masked (BH + BL)
577  and    r12, rax
578  and    r13, rax
579  and    r14, rax
580  and    r15, rax
581
582  // r8-r11 <- masked (AH + AL)
583  and    r8, rbx
584  and    r9, rbx
585  and    r10, rbx
586  and    r11, rbx
587
588  // r8-r11 <- masked (AH + AL) + masked (BH + BL)
589  add    r8, r12
590  adc    r9, r13
591  adc    r10, r14
592  adc    r11, r15
593  mov    [rsp+64], r8
594  mov    [rsp+72], r9
595  mov    [rsp+80], r10
596  mov    [rsp+88], r11
597
598  // [rsp] <- (AH+AL) x (BH+BL), low part
599  MUL256_SCHOOL  [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
600
601  // [rcx] <- AL x BL
602  MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3
603
604  // [rcx+64], rbx, rbp, rax <- AH x BH
605  MUL192_SCHOOL  [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14
606
// The cross term saved at [rsp+64] is added to the upper four limbs of the
// in-place product (now at [rsp+32 .. rsp+56]).
607  // r8-r11 <- (AH+AL) x (BH+BL), final step
608  mov    r8, [rsp+64]
609  mov    r9, [rsp+72]
610  mov    r10, [rsp+80]
611  mov    r11, [rsp+88]
612  mov    rdx, [rsp+32]
613  add    r8, rdx
614  mov    rdx, [rsp+40]
615  adc    r9, rdx
616  mov    rdx, [rsp+48]
617  adc    r10, rdx
618  mov    rdx, [rsp+56]
619  adc    r11, rdx
620
// The eight-limb value is held with r12-r15 as the low half and r8-r11 as the
// high half.
621  // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
622  mov    r12, [rsp]
623  mov    r13, [rsp+8]
624  mov    r14, [rsp+16]
625  mov    r15, [rsp+24]
626  sub    r12, [rcx]
627  sbb    r13, [rcx+8]
628  sbb    r14, [rcx+16]
629  sbb    r15, [rcx+24]
630  sbb    r8, [rcx+32]
631  sbb    r9, [rcx+40]
632  sbb    r10, [rcx+48]
633  sbb    r11, [rcx+56]
634
635  // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
636  sub    r12, [rcx+64]
637  sbb    r13, [rcx+72]
638  sbb    r14, [rcx+80]
639  sbb    r15, rbx
640  sbb    r8, rbp
641  sbb    r9, rax
642  sbb    r10, 0
643  sbb    r11, 0
644
// Add the middle term at limb offset 4 of c and let the carry ripple through
// the AHxBH limbs (three in memory, then rbx, rbp, rax).
645  add    r12, [rcx+32]
646  mov    [rcx+32], r12    // Result C4-C7
647  adc    r13, [rcx+40]
648  mov    [rcx+40], r13
649  adc    r14, [rcx+48]
650  mov    [rcx+48], r14
651  adc    r15, [rcx+56]
652  mov    [rcx+56], r15
653  adc    r8, [rcx+64]
654  mov    [rcx+64], r8    // Result C8-C13
655  adc    r9, [rcx+72]
656  mov    [rcx+72], r9
657  adc    r10, [rcx+80]
658  mov    [rcx+80], r10
659  adc    r11, rbx
660  mov    [rcx+88], r11
661  adc    rbp, 0
662  mov    [rcx+96], rbp
663  adc    rax, 0
664  mov    [rcx+104], rax
665
666  add    rsp, 96
667  pop    rbp
668  pop    rbx
669  pop    r15
670  pop    r14
671  pop    r13
672  pop    r12
673  ret
674
675///////////////////////////////////////////////////////////////// MACRO
676// Schoolbook integer multiplication
677// Inputs:  reg I0 and memory pointer M1
678// Outputs: regs T0:T4
679// Temps:   regs T0:T5
680/////////////////////////////////////////////////////////////////
// The body never references \I0 or \T0. It relies on the caller having
// already placed the 64-bit multiplier in rdx and issued
// "mulx \T1, \T0, \M1" for limb 0 of M1 (see the mulx in front of the use in
// rdc434_asm). The macro then adds rdx * M1[1..3], so the five-limb product
// ends up in \T0:\T4.
// \M1 is a bracketed memory operand; 8\M1 etc. put a displacement in front of
// the bracket. Clobbers rax (left at zero), \T5 and flags.
681.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5
682  mulx   \T2, \T4, 8\M1
// zero addend for the last carry; also clears the flag the ADD1 chain uses
683  xor    rax, rax
684  mulx   \T3, \T5, 16\M1
685  ADD1   \T1, \T4            // T1 <- C1_final
686  ADC1   \T2, \T5            // T2 <- C2_final
687  mulx   \T4, \T5, 24\M1
688  ADC1   \T3, \T5            // T3 <- C3_final
689  ADC1   \T4, rax            // T4 <- C4_final
690.endm
691
692///////////////////////////////////////////////////////////////// MACRO
693// Schoolbook integer multiplication
694// Inputs:  regs I0 and I1, and memory pointer M1
695// Outputs: regs T0:T5
696// Temps:   regs T0:T5
697/////////////////////////////////////////////////////////////////
698#ifdef S2N_ADX
699
// S2N_ADX variant: (two 64-bit multipliers) x (four limbs at \M1), six-limb
// result in \T0:\T5.
// Caller contract visible at the call sites in rdc434_asm: rdx already holds
// the first multiplier and "mulx \T1, \T0, \M1" has already been issued, so
// \T1:\T0 is its product with limb 0. \I0 is not referenced by the body.
// \I1 is a register holding the second multiplier; it is copied to rdx and
// then reused as scratch, so its value is destroyed.
// First half: \T0:\T4 <- rdx * M1 on the ADD1/ADC1 chain.
// Second half: adds \I1 * M1 one limb higher, using the ADD2/ADC2 chain for
// the high halves and the ADD1/ADC1 chain for the low halves.
// Clobbers rdx, rax (left at zero), \I1 and flags.
700.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
701  mulx   \T2, \T4, 8\M1
702  xor    rax, rax
703  mulx   \T3, \T5, 16\M1
704  ADD1   \T1, \T4
705  ADC1   \T2, \T5
706  mulx   \T4, \T5, 24\M1
707  ADC1   \T3, \T5
708  ADC1   \T4, rax
709
// clears CF and OF again so both chains start clean for the second multiplier
710  xor    rax, rax
711  mov    rdx, \I1
712  mulx   \I1, \T5, \M1
713  ADD2   \T1, \T5            // T1 <- C1_final
714  ADC2   \T2, \I1
715  mulx   \T5, \I1, 8\M1
716  ADC2   \T3, \T5
717  ADD1   \T2, \I1
718  mulx   \T5, \I1, 16\M1
719  ADC2   \T4, \T5
720  ADC1   \T3, \I1
721  mulx   \T5, \I1, 24\M1
722  ADC2   \T5, rax
723  ADC1   \T4, \I1
724  ADC1   \T5, rax
725.endm
726
727#else // S2N_ADX
728
// Variant without adox/adcx: (two 64-bit multipliers) x (four limbs at \M1),
// six-limb result in \T0:\T5, using a single add/adc carry chain.
// Caller contract visible at the call sites in rdc434_asm: rdx already holds
// the first multiplier and "mulx \T1, \T0, \M1" has already been issued, so
// \T1:\T0 is its product with limb 0. \I0 is not referenced by the body.
// \I1 is a register holding the second multiplier; it is copied to rdx and
// then reused as scratch, so its value is destroyed.
// Clobbers rax and rdx (both receive low halves of partial products), \I1
// and flags.
729.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
730  mulx   \T2, \T4, 8\M1
731  mulx   \T3, \T5, 16\M1
732  add    \T1, \T4
733  adc    \T2, \T5
734  mulx   \T4, \T5, 24\M1
735  adc    \T3, \T5
736  adc    \T4, 0
737
738  mov    rdx, \I1
739  mulx   \I1, \T5, \M1
740  add    \T1, \T5            // T1 <- C1_final
741  adc    \T2, \I1
742  mulx   \T5, \I1, 8\M1
743  adc    \T3, \T5
744  mulx   \T5, rax, 16\M1
745  adc    \T4, \T5
// last mulx of the macro: rdx is both the multiplier and the low-half
// destination, so the multiplier is gone afterwards
746  mulx   \T5, rdx, 24\M1
747  adc    \T5, 0
748  add    \T2, \I1
749  adc    \T3, rax
750  adc    \T4, rdx
751  adc    \T5, 0
752.endm
753
754#endif // S2N_ADX
755
756//**************************************************************************************
757//  Montgomery reduction
758//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
759//  Operation: c [reg_p2] = a [reg_p1]
760//**************************************************************************************
//  a is read as fourteen 64-bit limbs (offsets 0..104); seven limbs are
//  written to c (offsets 0..48).
//  c is used as scratch before all limbs of a have been read, so c must not
//  overlap a.
//  Every multiplication uses only the four limbs of asm_p434p1 that start at
//  offset +24 (the three lower limbs of that table are zero).
//  Each mulx placed in front of a MUL128x256_SCHOOL / MUL64x256_SCHOOL use
//  computes the limb-0 partial product those macros expect in their first two
//  T registers.
//  Saves and restores r12-r15, rbx and rbp; rdi (reg_p1) is overwritten once
//  the last limb of a has been loaded.
761#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm)
762.global rdc434_asm
763rdc434_asm:
764  push   r14
765
766  // a[0-1] x p434p1_nz --> result: r8:r13
767  mov    rdx, [reg_p1]
768  mov    r14, [reg_p1+8]
769  mulx   r9, r8, [rip+asm_p434p1+24]   // result r8
770  push   r12
771  push   r13
772  push   r15
773  push   rbp
774  push   rbx
775  MUL128x256_SCHOOL rdx, r14, [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13
776
// Fold the product into a[3..8]; the carry keeps rippling into a[9..13]
// below (the interleaved mov and mulx instructions do not touch CF).
777  mov    rdx, [reg_p1+16]
778  mov    rcx, [reg_p1+72]
779  add    r8, [reg_p1+24]
780  adc    r9, [reg_p1+32]
781  adc    r10, [reg_p1+40]
782  adc    r11, [reg_p1+48]
783  adc    r12, [reg_p1+56]
784  adc    r13, [reg_p1+64]
785  adc    rcx, 0
786  mulx   rbp, rbx, [rip+asm_p434p1+24]   // result rbx
// park five limbs in c so r9-r13 can be reused
787  mov    [reg_p2], r9
788  mov    [reg_p2+8], r10
789  mov    [reg_p2+16], r11
790  mov    [reg_p2+24], r12
791  mov    [reg_p2+32], r13
792  mov    r9, [reg_p1+80]
793  mov    r10, [reg_p1+88]
794  mov    r11, [reg_p1+96]
// last read through reg_p1: rdi now holds a[13] instead of the pointer
795  mov    rdi, [reg_p1+104]
796  adc    r9, 0
797  adc    r10, 0
798  adc    r11, 0
799  adc    rdi, 0
800
801  // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
802  MUL128x256_SCHOOL rdx, r8, [rip+asm_p434p1+24], rbx, rbp, r12, r13, r14, r15
803
804  mov    rdx, [reg_p2]
805  add    rbx, [reg_p2+8]
806  adc    rbp, [reg_p2+16]
807  adc    r12, [reg_p2+24]
808  adc    r13, [reg_p2+32]
809  adc    r14, rcx
// mov (not xor) so that the carry flag of the running chain is preserved
810  mov    rcx, 0
811  adc    r15, r9
812  adc    rcx, r10
813  mulx   r9, r8, [rip+asm_p434p1+24]   // result r8
814  mov    [reg_p2], rbp
815  mov    [reg_p2+8], r12
816  mov    [reg_p2+16], r13
817  adc    r11, 0
818  adc    rdi, 0
819
820  // a[4-5] x p434p1_nz --> result: r8:r13
821  MUL128x256_SCHOOL rdx, rbx, [rip+asm_p434p1+24], r8, r9, r10, rbp, r12, r13
822
823  mov    rdx, [reg_p2]
824  add    r8, [reg_p2+8]
825  adc    r9, [reg_p2+16]
826  adc    r10, r14
827  adc    rbp, r15
828  adc    r12, rcx
829  adc    r13, r11
830  adc    rdi, 0
831  mulx   r15, r14, [rip+asm_p434p1+24]  // result r14
832  mov    [reg_p2], r8        // Final result c0-c1
833  mov    [reg_p2+8], r9
834
835  // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
836  MUL64x256_SCHOOL rdx, [rip+asm_p434p1+24], r14, r15, r8, r9, r11, rcx
837
838  // Final result c2:c6
839  add    r14, r10
840  adc    r15, rbp
// pops are interleaved with the chain; pop and mov leave CF untouched
841  pop    rbx
842  pop    rbp
843  adc    r8, r12
844  adc    r9, r13
845  adc    r11, rdi
846  mov    [reg_p2+16], r14
847  mov    [reg_p2+24], r15
848  pop    r15
849  pop    r13
850  mov    [reg_p2+32], r8
851  mov    [reg_p2+40], r9
852  mov    [reg_p2+48], r11
853
854  pop    r12
855  pop    r14
856  ret
857
//***********************************************************************
//  434-bit multiprecision addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  Plain seven-limb addition (least-significant limb first) with no
//  reduction; the carry out of limb 6 is not stored.
//  Only r8-r11 and flags are used as scratch.
//***********************************************************************
#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm)
.global mp_add434_asm
mp_add434_asm:
  // Limbs 0-3: load and add limb by limb, then store the group
  mov    r8, [reg_p1]
  add    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  adc    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  adc    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  adc    r11, [reg_p2+24]
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11

  // Limbs 4-6: the carry from limb 3 is still in CF (mov does not touch it)
  mov    r8, [reg_p1+32]
  adc    r8, [reg_p2+32]
  mov    r9, [reg_p1+40]
  adc    r9, [reg_p2+40]
  mov    r10, [reg_p1+48]
  adc    r10, [reg_p2+48]
  mov    [reg_p3+32], r8
  mov    [reg_p3+40], r9
  mov    [reg_p3+48], r10
  ret
888
//***************************************************************************
//  2x434-bit multiprecision subtraction/addition
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
//
//  a, b and c are fourteen 64-bit limbs, least-significant limb first.
//  The conditional addition is applied to limbs 7-13 through a mask (no
//  branch). rdi and rsi are reused as scratch once a and b have been read.
//***************************************************************************
#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm)
.global mp_subadd434x2_asm
mp_subadd434x2_asm:
  push   r12
  push   r13
  push   r14
  push   r15

  // rax will become the borrow mask; zero it before the chain starts
  xor    rax, rax

  // Limbs 0-4
  mov    r8, [reg_p1]
  sub    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  sbb    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  sbb    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  sbb    r11, [reg_p2+24]
  mov    r12, [reg_p1+32]
  sbb    r12, [reg_p2+32]
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12

  // Limbs 5-9. Limbs 5-7 are stored now; limbs 8 and 9 stay in r11/r12.
  mov    r8, [reg_p1+40]
  sbb    r8, [reg_p2+40]
  mov    r9, [reg_p1+48]
  sbb    r9, [reg_p2+48]
  mov    r10, [reg_p1+56]
  sbb    r10, [reg_p2+56]
  mov    r11, [reg_p1+64]
  sbb    r11, [reg_p2+64]
  mov    r12, [reg_p1+72]
  sbb    r12, [reg_p2+72]
  mov    [reg_p3+40], r8
  mov    [reg_p3+48], r9
  mov    [reg_p3+56], r10

  // Limbs 10-13 stay in r13, r14, r15, rcx
  mov    r13, [reg_p1+80]
  sbb    r13, [reg_p2+80]
  mov    r14, [reg_p1+88]
  sbb    r14, [reg_p2+88]
  mov    r15, [reg_p1+96]
  sbb    r15, [reg_p2+96]
  mov    rcx, [reg_p1+104]
  sbb    rcx, [reg_p2+104]
  sbb    rax, 0                   // rax <- all-ones if a - b borrowed

  // p434 & mask. Limbs 0-2 of p434 are identical, so r8 covers all three.
  mov    r8, [rip+asm_p434]
  and    r8, rax
  mov    r9, [rip+asm_p434+24]
  and    r9, rax
  mov    r10, [rip+asm_p434+32]
  and    r10, rax
  mov    rdi, [rip+asm_p434+40]
  and    rdi, rax
  mov    rsi, [rip+asm_p434+48]
  and    rsi, rax

  // Add the masked p434 to limbs 7-13 (limb 7 is reloaded from c) and
  // store each limb as soon as it is final
  mov    rax, [reg_p3+56]
  add    rax, r8
  mov    [reg_p3+56], rax
  adc    r11, r8
  mov    [reg_p3+64], r11
  adc    r12, r8
  mov    [reg_p3+72], r12
  adc    r13, r9
  mov    [reg_p3+80], r13
  adc    r14, r10
  mov    [reg_p3+88], r14
  adc    r15, rdi
  mov    [reg_p3+96], r15
  adc    rcx, rsi
  mov    [reg_p3+104], rcx

  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret
973
//***********************************************************************
//  Double 2x434-bit multiprecision subtraction
//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
//
//  All three operands are fourteen 64-bit limbs, least-significant limb
//  first, processed as two halves of seven limbs. The borrow of each
//  subtraction in the low half is parked in al / cl and re-injected into
//  the matching subtraction of the high half. The borrow out of the top
//  limb is not reported.
//***********************************************************************
#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm)
.global mp_dblsub434x2_asm
mp_dblsub434x2_asm:
  push   r12
  push   r13
  push   r14

  // Low half: r8..r14 <- c[0..6] - a[0..6]
  mov    r8, [reg_p3]
  sub    r8, [reg_p1]
  mov    r9, [reg_p3+8]
  sbb    r9, [reg_p1+8]
  mov    r10, [reg_p3+16]
  sbb    r10, [reg_p1+16]
  mov    r11, [reg_p3+24]
  sbb    r11, [reg_p1+24]
  mov    r12, [reg_p3+32]
  sbb    r12, [reg_p1+32]
  mov    r13, [reg_p3+40]
  sbb    r13, [reg_p1+40]
  mov    r14, [reg_p3+48]
  sbb    r14, [reg_p1+48]
  setc   al                       // borrow of (c - a), low half

  // Low half: r8..r14 -= b[0..6]
  sub    r8, [reg_p2]
  sbb    r9, [reg_p2+8]
  sbb    r10, [reg_p2+16]
  sbb    r11, [reg_p2+24]
  sbb    r12, [reg_p2+32]
  sbb    r13, [reg_p2+40]
  sbb    r14, [reg_p2+48]
  setc   cl                       // borrow of (... - b), low half

  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14

  // High half: r8..r14 <- c[7..13] - a[7..13] - first parked borrow
  bt     rax, 0                   // CF <- bit 0 of rax (the value set in al)
  mov    r8, [reg_p3+56]
  sbb    r8, [reg_p1+56]
  mov    r9, [reg_p3+64]
  sbb    r9, [reg_p1+64]
  mov    r10, [reg_p3+72]
  sbb    r10, [reg_p1+72]
  mov    r11, [reg_p3+80]
  sbb    r11, [reg_p1+80]
  mov    r12, [reg_p3+88]
  sbb    r12, [reg_p1+88]
  mov    r13, [reg_p3+96]
  sbb    r13, [reg_p1+96]
  mov    r14, [reg_p3+104]
  sbb    r14, [reg_p1+104]

  // High half: r8..r14 -= b[7..13] + second parked borrow
  bt     rcx, 0                   // CF <- bit 0 of rcx (the value set in cl)
  sbb    r8, [reg_p2+56]
  sbb    r9, [reg_p2+64]
  sbb    r10, [reg_p2+72]
  sbb    r11, [reg_p2+80]
  sbb    r12, [reg_p2+88]
  sbb    r13, [reg_p2+96]
  sbb    r14, [reg_p2+104]

  mov    [reg_p3+56], r8
  mov    [reg_p3+64], r9
  mov    [reg_p3+72], r10
  mov    [reg_p3+80], r11
  mov    [reg_p3+88], r12
  mov    [reg_p3+96], r13
  mov    [reg_p3+104], r14

  pop    r14
  pop    r13
  pop    r12
  ret
1051