//*******************************************************************************************
// SIDH: an efficient supersingular isogeny cryptography library
//
// Abstract: field arithmetic in x64 assembly for P434 on Linux
//*******************************************************************************************

// GNU assembler, Intel operand order (op dst, src), registers written without a % prefix.
.intel_syntax noprefix

// Format function and variable names for Mac OS X:
// fmt(f) yields _oqs_kem_sike_f when __APPLE__ is defined and oqs_kem_sike_f otherwise.
#if defined(__APPLE__)
    #define fmt(f)    _oqs_kem_sike_##f
#else
    #define fmt(f)    oqs_kem_sike_##f
#endif

// Registers that are used for parameter passing (first, second and third argument):
#define reg_p1  rdi
#define reg_p2  rsi
#define reg_p3  rdx

// Define addition instructions
//   _MULX_ and _ADX_ defined: ADD1/ADC1 expand to adox (carry kept in OF) and
//                             ADD2/ADC2 expand to adcx (carry kept in CF), so the
//                             two families form independent carry chains.
//   _MULX_ only:              all four expand to plain add/adc and share CF.
//   _MULX_ undefined:         none of the four names is defined.
#ifdef _MULX_
#ifdef _ADX_

#define ADD1    adox
#define ADC1    adox
#define ADD2    adcx
#define ADC2    adcx

#else

#define ADD1    add
#define ADC1    adc
#define ADD2    add
#define ADC2    adc

#endif
#endif

// Tell the ELF linker that this object does not need an executable stack.
// pushsection/popsection leave the current section unchanged afterwards.
#if defined(__linux__) && defined(__ELF__)
.pushsection .note.GNU-stack,"",@progbits
.popsection
#endif

.text
//***********************************************************************
//  Field addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  a, b and c are arrays of seven 64-bit limbs, least-significant limb
//  first. The routine computes a + b - p434x2 and, when that subtraction
//  borrows, adds p434x2 back. The selection uses an all-ones/all-zeros
//  mask, so there is no data-dependent branch.
//
//  Saved and restored: r12-r15, rbx, rbp.
//  Overwritten:        rax, rcx, rsi, rdi, r8-r11, flags.
//***********************************************************************
.global fmt(fpadd434_asm)
fmt(fpadd434_asm):
  push   r12
  push   r13
  push   r14
  push   r15
  push   rbx
  push   rbp

  // r8..r14 = a + b (the carry out of the top limb is not used)
  xor    rax, rax
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  mov    r14, [reg_p1+48]
  add    r8, [reg_p2]
  adc    r9, [reg_p2+8]
  adc    r10, [reg_p2+16]
  adc    r11, [reg_p2+24]
  adc    r12, [reg_p2+32]
  adc    r13, [reg_p2+40]
  adc    r14, [reg_p2+48]

  // r8..r14 -= p434x2. Limb 2 is subtracted using rcx (limb 1) and offset
  // +16 is never loaded, so the code relies on limbs 1 and 2 of p434x2
  // being equal. reg_p1/reg_p2 are no longer needed, so rdi/rsi are reused.
  mov    rbx, [rip+fmt(p434x2)]
  sub    r8, rbx
  mov    rcx, [rip+fmt(p434x2)+8]
  sbb    r9, rcx
  sbb    r10, rcx
  mov    rdi, [rip+fmt(p434x2)+24]
  sbb    r11, rdi
  mov    rsi, [rip+fmt(p434x2)+32]
  sbb    r12, rsi
  mov    rbp, [rip+fmt(p434x2)+40]
  sbb    r13, rbp
  mov    r15, [rip+fmt(p434x2)+48]
  sbb    r14, r15
  sbb    rax, 0                     // rax = -1 if the subtraction borrowed, else 0

  // Keep the constant only when the subtraction borrowed
  and    rbx, rax
  and    rcx, rax
  and    rdi, rax
  and    rsi, rax
  and    rbp, rax
  and    r15, rax

  // Add back the masked constant and store the result
  add    r8, rbx
  adc    r9, rcx
  adc    r10, rcx
  adc    r11, rdi
  adc    r12, rsi
  adc    r13, rbp
  adc    r14, r15
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14

  pop    rbp
  pop    rbx
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret


//***********************************************************************
//  Field subtraction
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
//
//  a, b and c are arrays of seven 64-bit limbs, least-significant limb
//  first. When a - b borrows, p434x2 is added to the difference; the
//  selection uses an all-ones/all-zeros mask instead of a branch.
//
//  Saved and restored: r12-r14.
//  Overwritten:        rax, rcx, rsi, rdi, r8-r11, flags.
//***********************************************************************
.global fmt(fpsub434_asm)
fmt(fpsub434_asm):
  push   r12
  push   r13
  push   r14

  // r8..r14 = a - b, rax = borrow mask
  xor    rax, rax
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  mov    r14, [reg_p1+48]
  sub    r8, [reg_p2]
  sbb    r9, [reg_p2+8]
  sbb    r10, [reg_p2+16]
  sbb    r11, [reg_p2+24]
  sbb    r12, [reg_p2+32]
  sbb    r13, [reg_p2+40]
  sbb    r14, [reg_p2+48]
  sbb    rax, 0                     // rax = -1 if a - b borrowed, else 0

  // Limbs 0-3: add the masked constant. rdi holds limb 1 and is also used
  // for limb 2 (offset +16 is never loaded). reg_p1/reg_p2 are no longer
  // needed, so rdi/rsi are reused.
  mov    rcx, [rip+fmt(p434x2)]
  mov    rdi, [rip+fmt(p434x2)+8]
  mov    rsi, [rip+fmt(p434x2)+24]
  and    rcx, rax
  and    rdi, rax
  and    rsi, rax
  add    r8, rcx
  adc    r9, rdi
  adc    r10, rdi
  adc    r11, rsi
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  setc   cl                         // park the carry: the and instructions below overwrite CF

  // Limbs 4-6: same correction, continuing the carry chain
  mov    r8, [rip+fmt(p434x2)+32]
  mov    rdi, [rip+fmt(p434x2)+40]
  mov    rsi, [rip+fmt(p434x2)+48]
  and    r8, rax
  and    rdi, rax
  and    rsi, rax
  bt     rcx, 0                     // CF = parked carry (bit 0 of cl)
  adc    r12, r8
  adc    r13, rdi
  adc    r14, rsi
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14

  pop    r14
  pop    r13
  pop    r12
  ret


///////////////////////////////////////////////////////////////// MACRO
// Subtraction followed by an unconditional addition of a constant
// Operation: [reg_p3] = [reg_p1] - [reg_p2] + P0, on seven 64-bit limbs
// Inputs:  P0 = symbol of a seven-limb constant, addressed rip-relative.
//          Limb 1 of P0 is also used for limb 2 (offset +16 is never
//          loaded), so limbs 1 and 2 of P0 are expected to be equal.
// Notes:   the borrow of the subtraction and the carry of the addition
//          out of the top limb are both discarded.
//          r12 and r13 are pushed and popped inside the macro.
//          Overwrites rax, rcx, rsi, rdi, r8-r11 and the flags.
/////////////////////////////////////////////////////////////////
.macro SUB434_PX  P0
  push   r12
  push   r13

  // r8..r13, rcx = a - b
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  mov    rcx, [reg_p1+48]
  sub    r8, [reg_p2]
  sbb    r9, [reg_p2+8]
  sbb    r10, [reg_p2+16]
  sbb    r11, [reg_p2+24]
  sbb    r12, [reg_p2+32]
  sbb    r13, [reg_p2+40]
  sbb    rcx, [reg_p2+48]

  // Add P0. The interleaved mov instructions do not modify the flags, so
  // the add/adc chain stays intact while rax, rdi and rsi are reloaded.
  mov    rax, [rip+\P0]
  mov    rdi, [rip+\P0+8]
  mov    rsi, [rip+\P0+24]
  add    r8, rax
  mov    rax, [rip+\P0+32]
  adc    r9, rdi
  adc    r10, rdi
  adc    r11, rsi
  mov    rdi, [rip+\P0+40]
  mov    rsi, [rip+\P0+48]
  adc    r12, rax
  adc    r13, rdi
  adc    rcx, rsi
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], rcx

  pop    r13
  pop    r12
  .endm


//***********************************************************************
//  Multiprecision subtraction with correction with 2*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
//
//  Thin wrapper: expands SUB434_PX with the constant fmt(p434x2) and
//  returns. Operands are seven 64-bit limbs; the constant is added
//  unconditionally.
//***********************************************************************
.global fmt(mp_sub434_p2_asm)
fmt(mp_sub434_p2_asm):

  SUB434_PX  fmt(p434x2)
  ret


//***********************************************************************
//  Multiprecision subtraction with correction with 4*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
//
//  Thin wrapper: expands SUB434_PX with the constant fmt(p434x4) and
//  returns. Operands are seven 64-bit limbs; the constant is added
//  unconditionally.
//***********************************************************************
.global fmt(mp_sub434_p4_asm)
fmt(mp_sub434_p4_asm):

  SUB434_PX  fmt(p434x4)
  ret


#ifdef _MULX_

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication, 192 x 192 bits (MULX + ADX variant)
// Inputs:  memory pointers M0 and M1 (three 64-bit limbs each), written
//          as bracketed operands so that 8\M1 expands to a displacement
//          in front of the bracket
// Outputs: limbs 0-2 of the product at memory pointer C,
//          limbs 3, 4 and 5 in regs T1, T3 and rax
// Temps:   regs T0:T6; rdx is overwritten with the limbs of M0
// Each xor rax, rax clears both CF and OF, which starts the two
// independent carry chains used by adcx (CF) and adox (OF).
/////////////////////////////////////////////////////////////////

#ifdef _ADX_
.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov    rdx, \M0
    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
    mov    \C, \T1           // C0_final
    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
    xor    rax, rax
    adox   \T0, \T2
    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
    adox   \T1, \T3

    mov    rdx, 8\M0
    mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
    adox   \T2, rax
    xor    rax, rax
    mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
    adox   \T4, \T0
    mov    8\C, \T4          // C1_final
    adcx   \T3, \T6
    mulx   \T6, \T0, 16\M1   // T6:T0 = A1*B2
    adox   \T3, \T1
    adcx   \T5, \T0
    adcx   \T6, rax
    adox   \T5, \T2

    mov    rdx, 16\M0
    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
    adox   \T6, rax
    xor    rax, rax
    mulx   \T4, \T2, 8\M1    // T4:T2 = A2*B1
    adox   \T0, \T3
    mov    16\C, \T0         // C2_final
    adcx   \T1, \T5
    mulx   \T0, \T3, 16\M1   // T0:T3 = A2*B2
    adcx   \T4, \T6
    adcx   \T0, rax
    adox   \T1, \T2          // T1 <- C3
    adox   \T3, \T4          // T3 <- C4
    adox   rax, \T0          // rax <- C5
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication, 256 x 256 bits (MULX + ADX variant)
// Inputs:  memory pointers M0 and M1 (four 64-bit limbs each)
// Outputs: memory pointer C (eight 64-bit limbs)
// Temps:   regs T0:T9, rax; rdx is overwritten with the limbs of M0
// Store order: limb i of C (i = 0..2) is written after limb i of M0 has
// been loaded into rdx, and limbs 3-7 of C are written after the last
// read of M0 and M1.
/////////////////////////////////////////////////////////////////

.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov    rdx, \M0
    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
    mov    \C, \T1           // C0_final
    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
    xor    rax, rax
    adox   \T0, \T2
    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
    adox   \T1, \T3
    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
    adox   \T2, \T4

    mov    rdx, 8\M0
    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
    adox   \T3, rax
    xor    rax, rax
    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
    adox   \T4, \T0
    mov    8\C, \T4          // C1_final
    adcx   \T5, \T7
    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
    adcx   \T6, \T8
    adox   \T5, \T1
    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
    adcx   \T7, \T9
    adcx   \T8, rax
    adox   \T6, \T2

    mov    rdx, 16\M0
    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
    adox   \T7, \T3
    adox   \T8, rax
    xor    rax, rax
    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
    adox   \T0, \T5
    mov    16\C, \T0         // C2_final
    adcx   \T1, \T3
    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
    adcx   \T2, \T4
    adox   \T1, \T6
    mulx   \T4,\T9, 24\M1    // T4:T9 = A2*B3
    adcx   \T3, \T9
    mov    rdx, 24\M0
    adcx   \T4, rax

    adox   \T2, \T7
    adox   \T3, \T8
    adox   \T4, rax

    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
    xor    rax, rax
    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
    adcx   \T5, \T7
    adox   \T1, \T0
    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
    adcx   \T6, \T8
    adox   \T2, \T5
    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
    adcx   \T7, \T9
    adcx   \T8, rax

    adox   \T3, \T6
    adox   \T4, \T7
    adox   \T8, rax
    mov    24\C, \T1         // C3_final
    mov    32\C, \T2         // C4_final
    mov    40\C, \T3         // C5_final
    mov    48\C, \T4         // C6_final
    mov    56\C, \T8         // C7_final
.endm

#else

// Schoolbook integer multiplication, 192 x 192 bits (MULX without ADX).
// Same interface as the ADX variant: limbs 0-2 of the product go to
// memory pointer C, limbs 3, 4 and 5 are left in T1, T3 and rax.
// A single add/adc carry chain in CF is used; mulx and mov leave the
// flags untouched, so they can sit between the adc instructions.
// Temps: regs T0:T6; rdx is overwritten with the limbs of M0.
.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov    rdx, \M0
    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
    mov    \C, \T1           // C0_final
    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
    add    \T0, \T2
    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
    adc    \T1, \T3

    mov    rdx, 8\M0
    mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
    adc    \T2, 0
    mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
    add    \T4, \T0
    mov    8\C, \T4          // C1_final
    adc    \T3, \T1
    adc    \T5, \T2
    mulx   \T2, \T1, 16\M1   // T2:T1 = A1*B2
    adc    \T2, 0

    add    \T3, \T6
    adc    \T5, \T1
    adc    \T2, 0

    mov    rdx, 16\M0
    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
    add    \T0, \T3
    mov    16\C, \T0         // C2_final
    mulx   \T4, \T6, 8\M1    // T4:T6 = A2*B1
    adc    \T1, \T5
    adc    \T2, \T4
    mulx   rax, \T3, 16\M1   // rax:T3 = A2*B2
    adc    rax, 0
    add    \T1, \T6          // T1 <- C3
    adc    \T3, \T2          // T3 <- C4
    adc    rax, 0            // rax <- C5
.endm

// Schoolbook integer multiplication, 256 x 256 bits (MULX without ADX).
// Inputs:  memory pointers M0 and M1 (four 64-bit limbs each)
// Outputs: memory pointer C (eight 64-bit limbs)
// Temps:   regs T0:T9; rdx is overwritten with the limbs of M0
// Each row Ai*B is first summed on its own, then added to the running
// result; mulx and mov do not modify the flags, so they can be placed
// inside the add/adc chains.
.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov    rdx, \M0
    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
    mov    \C, \T1           // C0_final
    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
    add    \T0, \T2
    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
    adc    \T1, \T3
    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
    adc    \T2, \T4
    mov    rdx, 8\M0
    adc    \T3, 0

    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
    add    \T5, \T7
    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
    adc    \T6, \T8
    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
    adc    \T7, \T9
    adc    \T8, 0

    add    \T4, \T0
    mov    8\C, \T4          // C1_final
    adc    \T5, \T1
    adc    \T6, \T2
    adc    \T7, \T3
    mov    rdx, 16\M0
    adc    \T8, 0

    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
    add    \T1, \T3
    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
    adc    \T2, \T4
    mulx   \T4,\T9, 24\M1    // T4:T9 = A2*B3
    adc    \T3, \T9
    mov    rdx, 24\M0
    adc    \T4, 0

    add    \T0, \T5
    mov    16\C, \T0         // C2_final
    adc    \T1, \T6
    adc    \T2, \T7
    adc    \T3, \T8
    adc    \T4, 0

    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
    add    \T5, \T7
    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
    adc    \T6, \T8
    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
    adc    \T7, \T9
    adc    \T8, 0

    add    \T1, \T0
    mov    24\C, \T1         // C3_final
    adc    \T2, \T5
    mov    32\C, \T2         // C4_final
    adc    \T3, \T6
    mov    40\C, \T3         // C5_final
    adc    \T4, \T7
    mov    48\C, \T4         // C6_final
    adc    \T8, 0
    mov    56\C, \T8         // C7_final
.endm
#endif


//*****************************************************************************
//  434-bit multiplication using Karatsuba (one level), schoolbook (one level)
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
//
//  a and b are seven 64-bit limbs each; c receives fourteen limbs (byte
//  offsets 0..104). Each input is split into a low part of four limbs
//  (AL, BL) and a high part of three limbs (AH, BH).
//
//  reg_p3 (rdx) is copied to rcx on entry because the multiplication macros
//  overwrite rdx.
//
//  Stack frame (96 bytes):
//    [rsp    .. rsp+31]  AH+AL (low 256 bits); later the low half of
//                        (AH+AL) x (BH+BL)
//    [rsp+32 .. rsp+63]  BH+BL (low 256 bits); later the high half of
//                        (AH+AL) x (BH+BL)
//    [rsp+64 .. rsp+95]  ((AH+AL) and maskB) + ((BH+BL) and maskA), where
//                        maskA (rax) / maskB (rbx) are all-ones when the
//                        corresponding sum carried out of 256 bits
//
//  Saved and restored: r12-r15, rbx, rbp.
//  Overwritten:        rax, rcx, rdx, r8-r11, flags.
//*****************************************************************************
.global fmt(mul434_asm)
fmt(mul434_asm):
    push   r12
    push   r13
    push   r14
    push   r15
    mov    rcx, reg_p3

    // r8-r11 <- AH + AL, rax <- mask
    xor    rax, rax
    mov    r8, [reg_p1]
    mov    r9, [reg_p1+8]
    mov    r10, [reg_p1+16]
    mov    r11, [reg_p1+24]
    push   rbx
    push   rbp
    sub    rsp, 96
    add    r8, [reg_p1+32]
    adc    r9, [reg_p1+40]
    adc    r10, [reg_p1+48]
    adc    r11, 0
    sbb    rax, 0
    mov    [rsp], r8
    mov    [rsp+8], r9
    mov    [rsp+16], r10
    mov    [rsp+24], r11

    // r12-r15 <- BH + BL, rbx <- mask
    xor    rbx, rbx
    mov    r12, [reg_p2]
    mov    r13, [reg_p2+8]
    mov    r14, [reg_p2+16]
    mov    r15, [reg_p2+24]
    add    r12, [reg_p2+32]
    adc    r13, [reg_p2+40]
    adc    r14, [reg_p2+48]
    adc    r15, 0
    sbb    rbx, 0
    mov    [rsp+32], r12
    mov    [rsp+40], r13
    mov    [rsp+48], r14
    mov    [rsp+56], r15

    // r12-r15 <- masked (BH + BL)
    and    r12, rax
    and    r13, rax
    and    r14, rax
    and    r15, rax

    // r8-r11 <- masked (AH + AL)
    and    r8, rbx
    and    r9, rbx
    and    r10, rbx
    and    r11, rbx

    // r8-r11 <- masked (AH + AL) + masked (BH + BL)
    add    r8, r12
    adc    r9, r13
    adc    r10, r14
    adc    r11, r15
    mov    [rsp+64], r8
    mov    [rsp+72], r9
    mov    [rsp+80], r10
    mov    [rsp+88], r11

    // [rsp] <- (AH+AL) x (BH+BL), low part
    // The eight-limb product overwrites both operands: [rsp..rsp+31] gets
    // the low half and [rsp+32..rsp+63] the high half.
    MUL256_SCHOOL  [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp

    // [rcx] <- AL x BL
    MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3

    // [rcx+64], rbx, rbp, rax <- AH x BH
    MUL192_SCHOOL  [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14

    // r8-r11 <- (AH+AL) x (BH+BL), final step
    // High half of the product plus the masked correction terms
    mov    r8, [rsp+64]
    mov    r9, [rsp+72]
    mov    r10, [rsp+80]
    mov    r11, [rsp+88]
    mov    rdx, [rsp+32]
    add    r8, rdx
    mov    rdx, [rsp+40]
    adc    r9, rdx
    mov    rdx, [rsp+48]
    adc    r10, rdx
    mov    rdx, [rsp+56]
    adc    r11, rdx

    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
    // Limb order, least significant first: r12, r13, r14, r15, r8, r9, r10, r11
    mov    r12, [rsp]
    mov    r13, [rsp+8]
    mov    r14, [rsp+16]
    mov    r15, [rsp+24]
    sub    r12, [rcx]
    sbb    r13, [rcx+8]
    sbb    r14, [rcx+16]
    sbb    r15, [rcx+24]
    sbb    r8, [rcx+32]
    sbb    r9, [rcx+40]
    sbb    r10, [rcx+48]
    sbb    r11, [rcx+56]

    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    sub    r12, [rcx+64]
    sbb    r13, [rcx+72]
    sbb    r14, [rcx+80]
    sbb    r15, rbx
    sbb    r8, rbp
    sbb    r9, rax
    sbb    r10, 0
    sbb    r11, 0

    // Add the middle term at limb offset 4 (byte offset 32) and propagate
    // the carry through the upper limbs of AH x BH.
    add    r12, [rcx+32]
    mov    [rcx+32], r12    // Result C4-C7
    adc    r13, [rcx+40]
    mov    [rcx+40], r13
    adc    r14, [rcx+48]
    mov    [rcx+48], r14
    adc    r15, [rcx+56]
    mov    [rcx+56], r15
    adc    r8, [rcx+64]
    mov    [rcx+64], r8    // Result C8-C13
    adc    r9, [rcx+72]
    mov    [rcx+72], r9
    adc    r10, [rcx+80]
    mov    [rcx+80], r10
    adc    r11, rbx
    mov    [rcx+88], r11
    adc    rbp, 0
    mov    [rcx+96], rbp
    adc    rax, 0
    mov    [rcx+104], rax

    add    rsp, 96
    pop    rbp
    pop    rbx
    pop    r15
    pop    r14
    pop    r13
    pop    r12
    ret

#else

# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"

#endif


#ifdef _MULX_

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication, 64 x 256 bits
// Inputs:  rdx = 64-bit multiplier, memory pointer M1 (four limbs), and
//          T1:T0 = rdx * M1[0], which the caller must compute with mulx
//          before expanding the macro. The body references neither I0
//          nor T0; the visible call site passes rdx as I0.
// Outputs: regs T0:T4 (T0 is left as supplied by the caller)
// Temps:   regs T0:T5, rax (zeroed), flags
// ADD1/ADC1 expand to adox or to add/adc depending on _ADX_; the
// xor rax, rax clears the carry flag used by either form.
/////////////////////////////////////////////////////////////////
.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5
    mulx   \T2, \T4, 8\M1
    xor    rax, rax
    mulx   \T3, \T5, 16\M1
    ADD1   \T1, \T4            // T1 <- C1_final
    ADC1   \T2, \T5            // T2 <- C2_final
    mulx   \T4, \T5, 24\M1
    ADC1   \T3, \T5            // T3 <- C3_final
    ADC1   \T4, rax            // T4 <- C4_final
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication, 128 x 256 bits (MULX + ADX variant)
// Inputs:  rdx = low 64-bit multiplier (the body does not reference I0),
//          reg I1 = high 64-bit multiplier, memory pointer M1 (four
//          limbs), and T1:T0 = rdx * M1[0], which the caller must compute
//          with mulx before expanding the macro.
// Outputs: regs T0:T5 (T0 is left as supplied by the caller)
// Temps:   regs T0:T5, I1 (overwritten), rax (zeroed), rdx (overwritten
//          with I1), flags
// In this variant ADD1/ADC1 are adox (OF chain) and ADD2/ADC2 are adcx
// (CF chain); each xor rax, rax clears both flags.
/////////////////////////////////////////////////////////////////

#ifdef _ADX_
.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // First row: finish rdx * M1 into T0:T4
    mulx   \T2, \T4, 8\M1
    xor    rax, rax
    mulx   \T3, \T5, 16\M1
    ADD1   \T1, \T4
    ADC1   \T2, \T5
    mulx   \T4, \T5, 24\M1
    ADC1   \T3, \T5
    ADC1   \T4, rax

    // Second row: add I1 * M1 one limb higher, into T1:T5
    xor    rax, rax
    mov    rdx, \I1
    mulx   \I1, \T5, \M1
    ADD2   \T1, \T5            // T1 <- C1_final
    ADC2   \T2, \I1
    mulx   \T5, \I1, 8\M1
    ADC2   \T3, \T5
    ADD1   \T2, \I1
    mulx   \T5, \I1, 16\M1
    ADC2   \T4, \T5
    ADC1   \T3, \I1
    mulx   \T5, \I1, 24\M1
    ADC2   \T5, rax
    ADC1   \T4, \I1
    ADC1   \T5, rax
.endm

#else

// Schoolbook integer multiplication, 128 x 256 bits (MULX without ADX).
// Inputs:  rdx = low 64-bit multiplier (the body does not reference I0),
//          reg I1 = high 64-bit multiplier, memory pointer M1 (four
//          limbs), and T1:T0 = rdx * M1[0] computed by the caller.
// Outputs: regs T0:T5 (T0 is left as supplied by the caller)
// Temps:   regs T0:T5, I1, rax and rdx (all three overwritten), flags
.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // First row: finish rdx * M1 into T0:T4
    mulx   \T2, \T4, 8\M1
    mulx   \T3, \T5, 16\M1
    add    \T1, \T4
    adc    \T2, \T5
    mulx   \T4, \T5, 24\M1
    adc    \T3, \T5
    adc    \T4, 0

    // Second row: add I1 * M1 one limb higher, into T1:T5.
    // High halves are added first, then the low halves held in I1, rax, rdx.
    mov    rdx, \I1
    mulx   \I1, \T5, \M1
    add    \T1, \T5            // T1 <- C1_final
    adc    \T2, \I1
    mulx   \T5, \I1, 8\M1
    adc    \T3, \T5
    mulx   \T5, rax, 16\M1
    adc    \T4, \T5
    mulx   \T5, rdx, 24\M1
    adc    \T5, 0
    add    \T2, \I1
    adc    \T3, rax
    adc    \T4, rdx
    adc    \T5, 0
.endm
#endif


//**************************************************************************************
//  Montgomery reduction
//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
//  Operation: c [reg_p2] = Montgomery reduction of a [reg_p1]
//
//  a is read as fourteen 64-bit limbs (byte offsets 0..104); c is written as
//  seven limbs (byte offsets 0..48). The multiplier constant is read from
//  fmt(p434p1) starting at byte offset 24 (four limbs).
//  NOTE(review): presumably c = a * 2^-448 mod p434, possibly not fully
//  reduced - confirm against the C caller.
//
//  c is also used as scratch storage for intermediate limbs before the final
//  result is stored. reg_p1 (rdi) is overwritten once the last limb of a has
//  been requested.
//
//  Flags: long adc chains run across mulx, mov, push and pop, none of which
//  modify the flags.
//
//  Saved and restored: r12-r15, rbp, rbx.
//  Overwritten:        rax, rcx, rdx, rdi, r8-r11, flags.
//**************************************************************************************
.global fmt(rdc434_asm)
fmt(rdc434_asm):
    push   r14

    // a[0-1] x p434p1_nz --> result: r8:r13
    mov    rdx, [reg_p1]
    mov    r14, [reg_p1+8]
    mulx   r9, r8, [rip+fmt(p434p1)+24]   // result r8
    push   r12
    push   r13
    push   r15
    push   rbp
    push   rbx
    MUL128x256_SCHOOL rdx, r14, [rip+fmt(p434p1)+24], r8, r9, r10, r11, r12, r13

    mov    rdx, [reg_p1+16]
    mov    rcx, [reg_p1+72]
    add    r8, [reg_p1+24]
    adc    r9, [reg_p1+32]
    adc    r10, [reg_p1+40]
    adc    r11, [reg_p1+48]
    adc    r12, [reg_p1+56]
    adc    r13, [reg_p1+64]
    adc    rcx, 0
    mulx   rbp, rbx, [rip+fmt(p434p1)+24]   // result rbx
    mov    [reg_p2], r9                     // park intermediate limbs in c
    mov    [reg_p2+8], r10
    mov    [reg_p2+16], r11
    mov    [reg_p2+24], r12
    mov    [reg_p2+32], r13
    mov    r9, [reg_p1+80]
    mov    r10, [reg_p1+88]
    mov    r11, [reg_p1+96]
    mov    rdi, [reg_p1+104]                // last read through reg_p1; rdi now holds a[13]
    adc    r9, 0
    adc    r10, 0
    adc    r11, 0
    adc    rdi, 0

    // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
    MUL128x256_SCHOOL rdx, r8, [rip+fmt(p434p1)+24], rbx, rbp, r12, r13, r14, r15

    mov    rdx, [reg_p2]
    add    rbx, [reg_p2+8]
    adc    rbp, [reg_p2+16]
    adc    r12, [reg_p2+24]
    adc    r13, [reg_p2+32]
    adc    r14, rcx
    mov    rcx, 0                           // mov, not xor: CF must survive for the adc below
    adc    r15, r9
    adc    rcx, r10
    mulx   r9, r8, [rip+fmt(p434p1)+24]   // result r8
    mov    [reg_p2], rbp
    mov    [reg_p2+8], r12
    mov    [reg_p2+16], r13
    adc    r11, 0
    adc    rdi, 0

    // a[4-5] x p434p1_nz --> result: r8:r13
    MUL128x256_SCHOOL rdx, rbx, [rip+fmt(p434p1)+24], r8, r9, r10, rbp, r12, r13

    mov    rdx, [reg_p2]
    add    r8, [reg_p2+8]
    adc    r9, [reg_p2+16]
    adc    r10, r14
    adc    rbp, r15
    adc    r12, rcx
    adc    r13, r11
    adc    rdi, 0
    mulx   r15, r14, [rip+fmt(p434p1)+24]  // result r14
    mov    [reg_p2], r8        // Final result c0-c1
    mov    [reg_p2+8], r9

    // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
    // (this last step uses the single 64-bit multiplier held in rdx)
    MUL64x256_SCHOOL rdx, [rip+fmt(p434p1)+24], r14, r15, r8, r9, r11, rcx

    // Final result c2:c6
    add    r14, r10
    adc    r15, rbp
    pop    rbx
    pop    rbp
    adc    r8, r12
    adc    r9, r13
    adc    r11, rdi
    mov    [reg_p2+16], r14
    mov    [reg_p2+24], r15
    pop    r15
    pop    r13
    mov    [reg_p2+32], r8
    mov    [reg_p2+40], r9
    mov    [reg_p2+48], r11

    pop    r12
    pop    r14
    ret

  #else

  # error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"

  #endif


//***********************************************************************
//  434-bit multiprecision addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  Plain seven-limb addition, least-significant limb first, with no
//  modular correction. The carry out of the top limb is discarded.
//  Only r8-r11 and the flags are overwritten; the stack is not used.
//***********************************************************************
.global fmt(mp_add434_asm)
fmt(mp_add434_asm):
  // Limbs 0-3
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  add    r8, [reg_p2]
  adc    r9, [reg_p2+8]
  adc    r10, [reg_p2+16]
  adc    r11, [reg_p2+24]
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11

  // Limbs 4-6: the mov instructions above and below leave CF untouched,
  // so the first adc continues the carry chain from limb 3.
  mov    r8, [reg_p1+32]
  mov    r9, [reg_p1+40]
  mov    r10, [reg_p1+48]
  adc    r8, [reg_p2+32]
  adc    r9, [reg_p2+40]
  adc    r10, [reg_p2+48]
  mov    [reg_p3+32], r8
  mov    [reg_p3+40], r9
  mov    [reg_p3+48], r10
  ret


//***************************************************************************
//  2x434-bit multiprecision subtraction/addition
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
//
//  a, b and c are fourteen 64-bit limbs (byte offsets 0..104). The
//  fourteen-limb subtraction produces a borrow mask in rax; p434 and-ed
//  with that mask is then added to limbs 7-13 (byte offset 56 = 448 bits),
//  so no branch depends on the data.
//
//  Saved and restored: r12-r15.
//  Overwritten:        rax, rcx, rsi, rdi, r8-r11, flags.
//***************************************************************************
.global fmt(mp_subadd434x2_asm)
fmt(mp_subadd434x2_asm):
  push   r12
  push   r13
  push   r14
  push   r15
  // Limbs 0-4: subtract and store
  xor    rax, rax
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  sub    r8, [reg_p2]
  sbb    r9, [reg_p2+8]
  sbb    r10, [reg_p2+16]
  sbb    r11, [reg_p2+24]
  sbb    r12, [reg_p2+32]
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12

  // Limbs 5-9: the borrow chain continues across the mov instructions.
  // Limbs 5-7 are stored now; limbs 8 and 9 stay in r11 and r12.
  mov    r8, [reg_p1+40]
  mov    r9, [reg_p1+48]
  mov    r10, [reg_p1+56]
  mov    r11, [reg_p1+64]
  mov    r12, [reg_p1+72]
  sbb    r8, [reg_p2+40]
  sbb    r9, [reg_p2+48]
  sbb    r10, [reg_p2+56]
  sbb    r11, [reg_p2+64]
  sbb    r12, [reg_p2+72]
  mov    [reg_p3+40], r8
  mov    [reg_p3+48], r9
  mov    [reg_p3+56], r10

  // Limbs 10-13 stay in r13, r14, r15, rcx
  mov    r13, [reg_p1+80]
  mov    r14, [reg_p1+88]
  mov    r15, [reg_p1+96]
  mov    rcx, [reg_p1+104]
  sbb    r13, [reg_p2+80]
  sbb    r14, [reg_p2+88]
  sbb    r15, [reg_p2+96]
  sbb    rcx, [reg_p2+104]
  sbb    rax, 0                     // rax = -1 if a - b borrowed, else 0

  // Add p434 anded with the mask in rax
  // r8 (limb 0 of p434) is also used for limbs 1 and 2; offsets +8 and +16
  // are never loaded. reg_p1/reg_p2 are no longer needed, so rdi/rsi are
  // reused for limbs 5 and 6.
  mov    r8, [rip+fmt(p434)]
  mov    r9, [rip+fmt(p434)+24]
  mov    r10, [rip+fmt(p434)+32]
  mov    rdi, [rip+fmt(p434)+40]
  mov    rsi, [rip+fmt(p434)+48]
  and    r8, rax
  and    r9, rax
  and    r10, rax
  and    rdi, rax
  and    rsi, rax
  mov    rax, [reg_p3+56]           // mask no longer needed; reload limb 7 of c
  add    rax, r8
  adc    r11, r8
  adc    r12, r8
  adc    r13, r9
  adc    r14, r10
  adc    r15, rdi
  adc    rcx, rsi

  mov    [reg_p3+56], rax
  mov    [reg_p3+64], r11
  mov    [reg_p3+72], r12
  mov    [reg_p3+80], r13
  mov    [reg_p3+88], r14
  mov    [reg_p3+96], r15
  mov    [reg_p3+104], rcx
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret


//***********************************************************************
//  Double 2x434-bit multiprecision subtraction
//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
//
//  a, b and c are fourteen 64-bit limbs (byte offsets 0..104), processed
//  as two halves of seven limbs. For each half, a is subtracted first and
//  b second. The borrow left by each low-half subtraction is parked in a
//  byte register (al for a, cl for b) and fed back into CF with bt before
//  the matching high-half subtraction. Only bit 0 of rax/rcx is tested,
//  so the upper bits of those registers do not matter.
//  The borrows out of the top limb are discarded.
//
//  Saved and restored: r12-r14.
//  Overwritten:        al, cl, r8-r11, flags.
//***********************************************************************
.global fmt(mp_dblsub434x2_asm)
fmt(mp_dblsub434x2_asm):
  push   r12
  push   r13
  push   r14

  // Low half: r8..r14 = c[0..6] - a[0..6] - b[0..6]
  mov    r8, [reg_p3]
  mov    r9, [reg_p3+8]
  mov    r10, [reg_p3+16]
  mov    r11, [reg_p3+24]
  mov    r12, [reg_p3+32]
  mov    r13, [reg_p3+40]
  mov    r14, [reg_p3+48]
  sub    r8, [reg_p1]
  sbb    r9, [reg_p1+8]
  sbb    r10, [reg_p1+16]
  sbb    r11, [reg_p1+24]
  sbb    r12, [reg_p1+32]
  sbb    r13, [reg_p1+40]
  sbb    r14, [reg_p1+48]
  setc   al                         // al = borrow of the subtraction of a
  sub    r8, [reg_p2]
  sbb    r9, [reg_p2+8]
  sbb    r10, [reg_p2+16]
  sbb    r11, [reg_p2+24]
  sbb    r12, [reg_p2+32]
  sbb    r13, [reg_p2+40]
  sbb    r14, [reg_p2+48]
  setc   cl                         // cl = borrow of the subtraction of b
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14

  // High half: r8..r14 = c[7..13] - a[7..13] - b[7..13] - parked borrows
  mov    r8, [reg_p3+56]
  mov    r9, [reg_p3+64]
  mov    r10, [reg_p3+72]
  mov    r11, [reg_p3+80]
  mov    r12, [reg_p3+88]
  mov    r13, [reg_p3+96]
  mov    r14, [reg_p3+104]
  bt     rax, 0                     // CF = borrow parked in al
  sbb    r8, [reg_p1+56]
  sbb    r9, [reg_p1+64]
  sbb    r10, [reg_p1+72]
  sbb    r11, [reg_p1+80]
  sbb    r12, [reg_p1+88]
  sbb    r13, [reg_p1+96]
  sbb    r14, [reg_p1+104]
  bt     rcx, 0                     // CF = borrow parked in cl
  sbb    r8, [reg_p2+56]
  sbb    r9, [reg_p2+64]
  sbb    r10, [reg_p2+72]
  sbb    r11, [reg_p2+80]
  sbb    r12, [reg_p2+88]
  sbb    r13, [reg_p2+96]
  sbb    r14, [reg_p2+104]
  mov    [reg_p3+56], r8
  mov    [reg_p3+64], r9
  mov    [reg_p3+72], r10
  mov    [reg_p3+80], r11
  mov    [reg_p3+88], r12
  mov    [reg_p3+96], r13
  mov    [reg_p3+104], r14

  pop    r14
  pop    r13
  pop    r12
  ret
