//*******************************************************************************************
// SIDH: an efficient supersingular isogeny cryptography library
//
// Abstract: field arithmetic in x64 assembly for P434 on Linux
//
// Syntax:   GNU as, Intel operand order (.intel_syntax noprefix); the file is run through
//           the C preprocessor first (it relies on #if/#define and on // comments).
// Calling:  the three pointer arguments are taken from rdi, rsi, rdx (see reg_p1..reg_p3),
//           i.e. the System V AMD64 integer-argument order.
// Layout:   multiprecision values are arrays of 64-bit limbs, least significant limb at
//           offset 0. Field elements use 7 limbs (offsets 0..48); double-length values use
//           14 limbs (offsets 0..104).
// Externs:  the constants p434, p434x2, p434x4 and p434p1 (all wrapped by fmt()) are
//           referenced RIP-relative and must be provided by another object file.
//*******************************************************************************************

.intel_syntax noprefix

// Format function and variable names for Mac OS X
#if defined(__APPLE__)
    #define fmt(f)    _oqs_kem_sike_##f
#else
    #define fmt(f)    oqs_kem_sike_##f
#endif

// Registers that are used for parameter passing:
#define reg_p1  rdi
#define reg_p2  rsi
#define reg_p3  rdx

// Define addition instructions
// ADD1/ADC1 and ADD2/ADC2 name two independent carry chains. With ADX they map to
// adox (OF chain) and adcx (CF chain); without ADX both collapse onto plain add/adc.
#ifdef _MULX_
#ifdef _ADX_

#define ADD1 adox
#define ADC1 adox
#define ADD2 adcx
#define ADC2 adcx

#else

#define ADD1 add
#define ADC1 adc
#define ADD2 add
#define ADC2 adc

#endif
#endif

// Declare that this object does not need an executable stack. Without this empty note,
// GNU ld on ELF targets falls back to marking the stack executable for any program that
// links this file. Guarded because the section name/type syntax is ELF-specific.
#if defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif


.text
//***********************************************************************
//  Field addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  a, b, c are 7-limb values. After the addition, p434x2 is subtracted;
//  if that subtraction borrows, p434x2 is added back under an all-ones
//  mask, so there is no data-dependent branch.
//  The carry out of the top limb of a + b is not used.
//  Limb 2 of p434x2 is never loaded: the value read from offset 8 is
//  reused for it, so limbs 1 and 2 of that constant must be equal.
//
//  Saves/restores: r12-r15, rbx, rbp
//  Clobbers:       rax, rcx, rdi, rsi, r8-r11, flags
//***********************************************************************
.global fmt(fpadd434_asm)
fmt(fpadd434_asm):
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp

    // r8-r14 <- a + b
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     r14, [reg_p1+48]
    add     r8, [reg_p2]
    adc     r9, [reg_p2+8]
    adc     r10, [reg_p2+16]
    adc     r11, [reg_p2+24]
    adc     r12, [reg_p2+32]
    adc     r13, [reg_p2+40]
    adc     r14, [reg_p2+48]

    // r8-r14 <- (a + b) - p434x2, rax <- 0 or all-ones (borrow mask).
    // reg_p1/reg_p2 (rdi/rsi) are reused as scratch from here on.
    mov     rbx, [rip+fmt(p434x2)]
    sub     r8, rbx
    mov     rcx, [rip+fmt(p434x2)+8]
    sbb     r9, rcx
    sbb     r10, rcx                        // limb 2 reuses the limb 1 value
    mov     rdi, [rip+fmt(p434x2)+24]
    sbb     r11, rdi
    mov     rsi, [rip+fmt(p434x2)+32]
    sbb     r12, rsi
    mov     rbp, [rip+fmt(p434x2)+40]
    sbb     r13, rbp
    mov     r15, [rip+fmt(p434x2)+48]
    sbb     r14, r15
    sbb     rax, 0

    // Keep p434x2 only if the subtraction borrowed
    and     rbx, rax
    and     rcx, rax
    and     rdi, rax
    and     rsi, rax
    and     rbp, rax
    and     r15, rax

    // c <- r8-r14 + (p434x2 & mask)
    add     r8, rbx
    adc     r9, rcx
    adc     r10, rcx
    adc     r11, rdi
    adc     r12, rsi
    adc     r13, rbp
    adc     r14, r15
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret


//***********************************************************************
//  Field subtraction
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
//
//  a, b, c are 7-limb values. The borrow of a - b is turned into an
//  all-ones mask in rax, and (p434x2 & mask) is added to the difference.
//  Limb 2 of p434x2 reuses the value loaded from offset 8.
//
//  Saves/restores: r12-r14
//  Clobbers:       rax, rcx, rdi, rsi, r8-r11, flags
//***********************************************************************
.global fmt(fpsub434_asm)
fmt(fpsub434_asm):
    push    r12
    push    r13
    push    r14

    // r8-r14 <- a - b, rax <- borrow mask
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     r14, [reg_p1+48]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     r14, [reg_p2+48]
    sbb     rax, 0

    // Limbs 0-3: add masked p434x2
    mov     rcx, [rip+fmt(p434x2)]
    mov     rdi, [rip+fmt(p434x2)+8]
    mov     rsi, [rip+fmt(p434x2)+24]
    and     rcx, rax
    and     rdi, rax
    and     rsi, rax
    add     r8, rcx
    adc     r9, rdi
    adc     r10, rdi
    adc     r11, rsi
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    setc    cl                              // park the carry: the ANDs below overwrite CF

    // Limbs 4-6: add masked p434x2
    mov     r8, [rip+fmt(p434x2)+32]
    mov     rdi, [rip+fmt(p434x2)+40]
    mov     rsi, [rip+fmt(p434x2)+48]
    and     r8, rax
    and     rdi, rax
    and     rsi, rax
    bt      rcx, 0                          // CF <- carry saved in bit 0 of cl
    adc     r12, r8
    adc     r13, rdi
    adc     r14, rsi
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    pop     r14
    pop     r13
    pop     r12
    ret


///////////////////////////////////////////////////////////////// MACRO
// 7-limb subtraction followed by an unconditional addition of a constant
// Operation: [reg_p3] = [reg_p1] - [reg_p2] + P0
// Input:  P0 = symbol of a 7-limb constant; its limb 2 is not loaded,
//         the value at offset 8 is reused for it.
// The borrow of the subtraction is not examined.
// Saves/restores: r12, r13
// Clobbers:       rax, rcx, rdi, rsi, r8-r11, flags
/////////////////////////////////////////////////////////////////
.macro SUB434_PX  P0
    push    r12
    push    r13

    // r8-r13, rcx <- a - b
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     rcx, [reg_p1+48]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     rcx, [reg_p2+48]

    // += P0. The constant loads are interleaved with the adc chain;
    // mov does not modify flags, so the carry survives them.
    mov     rax, [rip+\P0]
    mov     rdi, [rip+\P0+8]
    mov     rsi, [rip+\P0+24]
    add     r8, rax
    mov     rax, [rip+\P0+32]
    adc     r9, rdi
    adc     r10, rdi
    adc     r11, rsi
    mov     rdi, [rip+\P0+40]
    mov     rsi, [rip+\P0+48]
    adc     r12, rax
    adc     r13, rdi
    adc     rcx, rsi
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], rcx

    pop     r13
    pop     r12
.endm


//***********************************************************************
//  Multiprecision subtraction with correction with 2*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
//  Register usage: see SUB434_PX.
//***********************************************************************
.global fmt(mp_sub434_p2_asm)
fmt(mp_sub434_p2_asm):

    SUB434_PX  fmt(p434x2)
    ret


//***********************************************************************
//  Multiprecision subtraction with correction with 4*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
//  Register usage: see SUB434_PX.
//***********************************************************************
.global fmt(mp_sub434_p4_asm)
fmt(mp_sub434_p4_asm):

    SUB434_PX  fmt(p434x4)
    ret


#ifdef _MULX_

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (3 x 3 limbs)
// Inputs:  memory pointers M0 and M1
// Outputs: memory pointer C (limbs 0-2) and regs T1, T3, rax (limbs 3-5)
// Temps:   regs T0:T6
// Also overwrites rdx (implicit mulx multiplier) and the flags.
// M0, M1 and C are bracketed memory operands; "8\M1" expands to a
// displacement placed in front of the bracket.
/////////////////////////////////////////////////////////////////

#ifdef _ADX_
.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov     rdx, \M0
    mulx    \T0, \T1, \M1                   // T0:T1 = A0*B0
    mov     \C, \T1                         // C0_final
    mulx    \T1, \T2, 8\M1                  // T1:T2 = A0*B1
    xor     rax, rax                        // rax = 0, clears CF and OF
    adox    \T0, \T2
    mulx    \T2, \T3, 16\M1                 // T2:T3 = A0*B2
    adox    \T1, \T3

    mov     rdx, 8\M0
    mulx    \T3, \T4, \M1                   // T3:T4 = A1*B0
    adox    \T2, rax
    xor     rax, rax
    mulx    \T5, \T6, 8\M1                  // T5:T6 = A1*B1
    adox    \T4, \T0
    mov     8\C, \T4                        // C1_final
    adcx    \T3, \T6
    mulx    \T6, \T0, 16\M1                 // T6:T0 = A1*B2
    adox    \T3, \T1
    adcx    \T5, \T0
    adcx    \T6, rax
    adox    \T5, \T2

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1                   // T1:T0 = A2*B0
    adox    \T6, rax
    xor     rax, rax
    mulx    \T4, \T2, 8\M1                  // T4:T2 = A2*B1
    adox    \T0, \T3
    mov     16\C, \T0                       // C2_final
    adcx    \T1, \T5
    mulx    \T0, \T3, 16\M1                 // T0:T3 = A2*B2
    adcx    \T4, \T6
    adcx    \T0, rax
    adox    \T1, \T2
    adox    \T3, \T4
    adox    rax, \T0
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (4 x 4 limbs)
// Inputs:  memory pointers M0 and M1
// Outputs: memory pointer C (8 limbs, offsets 0..56)
// Temps:   regs T0:T9
// Also overwrites rax, rdx and the flags.
/////////////////////////////////////////////////////////////////

.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov     rdx, \M0
    mulx    \T0, \T1, \M1                   // T0:T1 = A0*B0
    mov     \C, \T1                         // C0_final
    mulx    \T1, \T2, 8\M1                  // T1:T2 = A0*B1
    xor     rax, rax
    adox    \T0, \T2
    mulx    \T2, \T3, 16\M1                 // T2:T3 = A0*B2
    adox    \T1, \T3
    mulx    \T3, \T4, 24\M1                 // T3:T4 = A0*B3
    adox    \T2, \T4

    mov     rdx, 8\M0
    mulx    \T5, \T4, \M1                   // T5:T4 = A1*B0
    adox    \T3, rax
    xor     rax, rax
    mulx    \T6, \T7, 8\M1                  // T6:T7 = A1*B1
    adox    \T4, \T0
    mov     8\C, \T4                        // C1_final
    adcx    \T5, \T7
    mulx    \T7, \T8, 16\M1                 // T7:T8 = A1*B2
    adcx    \T6, \T8
    adox    \T5, \T1
    mulx    \T8, \T9, 24\M1                 // T8:T9 = A1*B3
    adcx    \T7, \T9
    adcx    \T8, rax
    adox    \T6, \T2

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1                   // T1:T0 = A2*B0
    adox    \T7, \T3
    adox    \T8, rax
    xor     rax, rax
    mulx    \T2, \T3, 8\M1                  // T2:T3 = A2*B1
    adox    \T0, \T5
    mov     16\C, \T0                       // C2_final
    adcx    \T1, \T3
    mulx    \T3, \T4, 16\M1                 // T3:T4 = A2*B2
    adcx    \T2, \T4
    adox    \T1, \T6
    mulx    \T4, \T9, 24\M1                 // T4:T9 = A2*B3
    adcx    \T3, \T9
    mov     rdx, 24\M0
    adcx    \T4, rax

    adox    \T2, \T7
    adox    \T3, \T8
    adox    \T4, rax

    mulx    \T5, \T0, \M1                   // T5:T0 = A3*B0
    xor     rax, rax
    mulx    \T6, \T7, 8\M1                  // T6:T7 = A3*B1
    adcx    \T5, \T7
    adox    \T1, \T0
    mulx    \T7, \T8, 16\M1                 // T7:T8 = A3*B2
    adcx    \T6, \T8
    adox    \T2, \T5
    mulx    \T8, \T9, 24\M1                 // T8:T9 = A3*B3
    adcx    \T7, \T9
    adcx    \T8, rax

    adox    \T3, \T6
    adox    \T4, \T7
    adox    \T8, rax
    mov     24\C, \T1                       // C3_final
    mov     32\C, \T2                       // C4_final
    mov     40\C, \T3                       // C5_final
    mov     48\C, \T4                       // C6_final
    mov     56\C, \T8                       // C7_final
.endm

#else

// Variants of the two macros above for CPUs with MULX but without ADX:
// same parameters and outputs, single add/adc carry chain.

.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov     rdx, \M0
    mulx    \T0, \T1, \M1                   // T0:T1 = A0*B0
    mov     \C, \T1                         // C0_final
    mulx    \T1, \T2, 8\M1                  // T1:T2 = A0*B1
    add     \T0, \T2
    mulx    \T2, \T3, 16\M1                 // T2:T3 = A0*B2
    adc     \T1, \T3

    mov     rdx, 8\M0
    mulx    \T3, \T4, \M1                   // T3:T4 = A1*B0
    adc     \T2, 0
    mulx    \T5, \T6, 8\M1                  // T5:T6 = A1*B1
    add     \T4, \T0
    mov     8\C, \T4                        // C1_final
    adc     \T3, \T1
    adc     \T5, \T2
    mulx    \T2, \T1, 16\M1                 // T2:T1 = A1*B2
    adc     \T2, 0

    add     \T3, \T6
    adc     \T5, \T1
    adc     \T2, 0

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1                   // T1:T0 = A2*B0
    add     \T0, \T3
    mov     16\C, \T0                       // C2_final
    mulx    \T4, \T6, 8\M1                  // T4:T6 = A2*B1
    adc     \T1, \T5
    adc     \T2, \T4
    mulx    rax, \T3, 16\M1                 // rax:T3 = A2*B2
    adc     rax, 0
    add     \T1, \T6
    adc     \T3, \T2
    adc     rax, 0
.endm

.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov     rdx, \M0
    mulx    \T0, \T1, \M1                   // T0:T1 = A0*B0
    mov     \C, \T1                         // C0_final
    mulx    \T1, \T2, 8\M1                  // T1:T2 = A0*B1
    add     \T0, \T2
    mulx    \T2, \T3, 16\M1                 // T2:T3 = A0*B2
    adc     \T1, \T3
    mulx    \T3, \T4, 24\M1                 // T3:T4 = A0*B3
    adc     \T2, \T4
    mov     rdx, 8\M0
    adc     \T3, 0

    mulx    \T5, \T4, \M1                   // T5:T4 = A1*B0
    mulx    \T6, \T7, 8\M1                  // T6:T7 = A1*B1
    add     \T5, \T7
    mulx    \T7, \T8, 16\M1                 // T7:T8 = A1*B2
    adc     \T6, \T8
    mulx    \T8, \T9, 24\M1                 // T8:T9 = A1*B3
    adc     \T7, \T9
    adc     \T8, 0

    add     \T4, \T0
    mov     8\C, \T4                        // C1_final
    adc     \T5, \T1
    adc     \T6, \T2
    adc     \T7, \T3
    mov     rdx, 16\M0
    adc     \T8, 0

    mulx    \T1, \T0, \M1                   // T1:T0 = A2*B0
    mulx    \T2, \T3, 8\M1                  // T2:T3 = A2*B1
    add     \T1, \T3
    mulx    \T3, \T4, 16\M1                 // T3:T4 = A2*B2
    adc     \T2, \T4
    mulx    \T4, \T9, 24\M1                 // T4:T9 = A2*B3
    adc     \T3, \T9
    mov     rdx, 24\M0
    adc     \T4, 0

    add     \T0, \T5
    mov     16\C, \T0                       // C2_final
    adc     \T1, \T6
    adc     \T2, \T7
    adc     \T3, \T8
    adc     \T4, 0

    mulx    \T5, \T0, \M1                   // T5:T0 = A3*B0
    mulx    \T6, \T7, 8\M1                  // T6:T7 = A3*B1
    add     \T5, \T7
    mulx    \T7, \T8, 16\M1                 // T7:T8 = A3*B2
    adc     \T6, \T8
    mulx    \T8, \T9, 24\M1                 // T8:T9 = A3*B3
    adc     \T7, \T9
    adc     \T8, 0

    add     \T1, \T0
    mov     24\C, \T1                       // C3_final
    adc     \T2, \T5
    mov     32\C, \T2                       // C4_final
    adc     \T3, \T6
    mov     40\C, \T3                       // C5_final
    adc     \T4, \T7
    mov     48\C, \T4                       // C6_final
    adc     \T8, 0
    mov     56\C, \T8                       // C7_final
.endm
#endif


//*****************************************************************************
//  434-bit multiplication using Karatsuba (one level), schoolbook (one level)
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
//
//  a and b are 7-limb values, c receives 14 limbs (offsets 0..104).
//  Each input is split into a low part of 4 limbs (AL, BL) and a high part
//  of 3 limbs (AH, BH).
//
//  reg_p3 is copied to rcx first because rdx is the implicit multiplier
//  operand of mulx inside the multiplication macros.
//
//  Stack scratch (96 bytes):
//    [rsp   .. rsp+31]  AH+AL, then low  4 limbs of (AH+AL) x (BH+BL)
//    [rsp+32.. rsp+63]  BH+BL, then high 4 limbs of (AH+AL) x (BH+BL)
//    [rsp+64.. rsp+95]  correction term for the carries of the two sums
//
//  Saves/restores: r12-r15, rbx, rbp
//  Clobbers:       rax, rcx, rdx, r8-r11, flags
//*****************************************************************************
.global fmt(mul434_asm)
fmt(mul434_asm):
    push    r12
    push    r13
    push    r14
    push    r15
    mov     rcx, reg_p3

    // r8-r11 <- AH + AL, rax <- mask (all-ones if the 4-limb sum carried out)
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    push    rbx
    push    rbp
    sub     rsp, 96
    add     r8, [reg_p1+32]
    adc     r9, [reg_p1+40]
    adc     r10, [reg_p1+48]
    adc     r11, 0
    sbb     rax, 0
    mov     [rsp], r8
    mov     [rsp+8], r9
    mov     [rsp+16], r10
    mov     [rsp+24], r11

    // r12-r15 <- BH + BL, rbx <- mask (all-ones if the 4-limb sum carried out)
    xor     rbx, rbx
    mov     r12, [reg_p2]
    mov     r13, [reg_p2+8]
    mov     r14, [reg_p2+16]
    mov     r15, [reg_p2+24]
    add     r12, [reg_p2+32]
    adc     r13, [reg_p2+40]
    adc     r14, [reg_p2+48]
    adc     r15, 0
    sbb     rbx, 0
    mov     [rsp+32], r12
    mov     [rsp+40], r13
    mov     [rsp+48], r14
    mov     [rsp+56], r15

    // r12-r15 <- masked (BH + BL)
    and     r12, rax
    and     r13, rax
    and     r14, rax
    and     r15, rax

    // r8-r11 <- masked (AH + AL)
    and     r8, rbx
    and     r9, rbx
    and     r10, rbx
    and     r11, rbx

    // r8-r11 <- masked (AH + AL) + masked (BH + BL)
    add     r8, r12
    adc     r9, r13
    adc     r10, r14
    adc     r11, r15
    mov     [rsp+64], r8
    mov     [rsp+72], r9
    mov     [rsp+80], r10
    mov     [rsp+88], r11

    // [rsp] <- (AH+AL) x (BH+BL), low part
    MUL256_SCHOOL  [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp

    // [rcx] <- AL x BL
    MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3

    // [rcx+64], rbx, rbp, rax <- AH x BH
    MUL192_SCHOOL  [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14

    // r8-r11 <- (AH+AL) x (BH+BL), final step
    // ([rsp+32..56] now holds the upper 4 limbs written by the first MUL256_SCHOOL)
    mov     r8, [rsp+64]
    mov     r9, [rsp+72]
    mov     r10, [rsp+80]
    mov     r11, [rsp+88]
    mov     rdx, [rsp+32]
    add     r8, rdx
    mov     rdx, [rsp+40]
    adc     r9, rdx
    mov     rdx, [rsp+48]
    adc     r10, rdx
    mov     rdx, [rsp+56]
    adc     r11, rdx

    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
    // (low limbs in r12-r15, high limbs in r8-r11)
    mov     r12, [rsp]
    mov     r13, [rsp+8]
    mov     r14, [rsp+16]
    mov     r15, [rsp+24]
    sub     r12, [rcx]
    sbb     r13, [rcx+8]
    sbb     r14, [rcx+16]
    sbb     r15, [rcx+24]
    sbb     r8, [rcx+32]
    sbb     r9, [rcx+40]
    sbb     r10, [rcx+48]
    sbb     r11, [rcx+56]

    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    sub     r12, [rcx+64]
    sbb     r13, [rcx+72]
    sbb     r14, [rcx+80]
    sbb     r15, rbx
    sbb     r8, rbp
    sbb     r9, rax
    sbb     r10, 0
    sbb     r11, 0

    // Add the middle term into the result starting at limb 4
    add     r12, [rcx+32]
    mov     [rcx+32], r12                   // Result C4-C7
    adc     r13, [rcx+40]
    mov     [rcx+40], r13
    adc     r14, [rcx+48]
    mov     [rcx+48], r14
    adc     r15, [rcx+56]
    mov     [rcx+56], r15
    adc     r8, [rcx+64]
    mov     [rcx+64], r8                    // Result C8-C13
    adc     r9, [rcx+72]
    mov     [rcx+72], r9
    adc     r10, [rcx+80]
    mov     [rcx+80], r10
    adc     r11, rbx
    mov     [rcx+88], r11
    adc     rbp, 0
    mov     [rcx+96], rbp
    adc     rax, 0
    mov     [rcx+104], rax

    add     rsp, 96
    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

#else

# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"

#endif


#ifdef _MULX_

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (1 x 4 limbs)
// Inputs:  reg I0 and memory pointer M1
// Outputs: regs T0:T4
// Temps:   regs T0:T5
// I0 is not referenced in the body: mulx takes its multiplier from rdx,
// so the caller passes rdx. T1:T0 must already hold rdx * M1[0] on entry
// (the caller issues that first mulx itself). Overwrites rax and flags.
/////////////////////////////////////////////////////////////////
.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5
    mulx    \T2, \T4, 8\M1
    xor     rax, rax
    mulx    \T3, \T5, 16\M1
    ADD1    \T1, \T4                        // T1 <- C1_final
    ADC1    \T2, \T5                        // T2 <- C2_final
    mulx    \T4, \T5, 24\M1
    ADC1    \T3, \T5                        // T3 <- C3_final
    ADC1    \T4, rax                        // T4 <- C4_final
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (2 x 4 limbs)
// Inputs:  regs I0 and I1, and memory pointer M1
// Outputs: regs T0:T5
// Temps:   regs T0:T5
// I0 is not referenced in the body (it is the value already in rdx).
// T1:T0 must already hold rdx * M1[0] on entry. I1 is moved into rdx
// for the second row and is then reused as a temporary, so its value
// is destroyed. Overwrites rax, rdx and flags.
/////////////////////////////////////////////////////////////////

#ifdef _ADX_
.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // Row 0: I0 x M1[1..3]
    mulx    \T2, \T4, 8\M1
    xor     rax, rax
    mulx    \T3, \T5, 16\M1
    ADD1    \T1, \T4
    ADC1    \T2, \T5
    mulx    \T4, \T5, 24\M1
    ADC1    \T3, \T5
    ADC1    \T4, rax

    // Row 1: I1 x M1[0..3], accumulated on two carry chains
    xor     rax, rax
    mov     rdx, \I1
    mulx    \I1, \T5, \M1
    ADD2    \T1, \T5                        // T1 <- C1_final
    ADC2    \T2, \I1
    mulx    \T5, \I1, 8\M1
    ADC2    \T3, \T5
    ADD1    \T2, \I1
    mulx    \T5, \I1, 16\M1
    ADC2    \T4, \T5
    ADC1    \T3, \I1
    mulx    \T5, \I1, 24\M1
    ADC2    \T5, rax
    ADC1    \T4, \I1
    ADC1    \T5, rax
.endm

#else

.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // Row 0: I0 x M1[1..3]
    mulx    \T2, \T4, 8\M1
    mulx    \T3, \T5, 16\M1
    add     \T1, \T4
    adc     \T2, \T5
    mulx    \T4, \T5, 24\M1
    adc     \T3, \T5
    adc     \T4, 0

    // Row 1: I1 x M1[0..3]
    mov     rdx, \I1
    mulx    \I1, \T5, \M1
    add     \T1, \T5                        // T1 <- C1_final
    adc     \T2, \I1
    mulx    \T5, \I1, 8\M1
    adc     \T3, \T5
    mulx    \T5, rax, 16\M1
    adc     \T4, \T5
    mulx    \T5, rdx, 24\M1                 // last use of the multiplier: rdx takes the low half
    adc     \T5, 0
    add     \T2, \I1
    adc     \T3, rax
    adc     \T4, rdx
    adc     \T5, 0
.endm
#endif


//**************************************************************************************
//  Montgomery reduction
//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
//  Operation: c [reg_p2] = a [reg_p1]
//
//  a is read as 14 limbs (offsets 0..104); c receives 7 limbs (offsets 0..48).
//  The multiplier constant is read starting at p434p1+24; the limbs below that
//  offset are never loaded (the comments below call it p434p1_nz).
//  c is also used as scratch for intermediate limbs before the final values
//  are stored.
//  rdi (reg_p1) is overwritten with a[13] once the last limb of a has been
//  loaded; only reg_p2 is used as a pointer after that point.
//
//  Saves/restores: r12-r15, rbx, rbp
//  Clobbers:       rax, rcx, rdx, rdi, r8-r11, flags
//**************************************************************************************
.global fmt(rdc434_asm)
fmt(rdc434_asm):
    push    r14

    // a[0-1] x p434p1_nz --> result: r8:r13
    mov     rdx, [reg_p1]
    mov     r14, [reg_p1+8]
    mulx    r9, r8, [rip+fmt(p434p1)+24]    // result r8
    push    r12
    push    r13
    push    r15
    push    rbp
    push    rbx
    MUL128x256_SCHOOL  rdx, r14, [rip+fmt(p434p1)+24], r8, r9, r10, r11, r12, r13

    // Accumulate into a[3..8]; the carry keeps rippling through a[9..13].
    // mov and mulx leave the flags untouched, so they can sit inside the chain.
    mov     rdx, [reg_p1+16]
    mov     rcx, [reg_p1+72]
    add     r8, [reg_p1+24]
    adc     r9, [reg_p1+32]
    adc     r10, [reg_p1+40]
    adc     r11, [reg_p1+48]
    adc     r12, [reg_p1+56]
    adc     r13, [reg_p1+64]
    adc     rcx, 0
    mulx    rbp, rbx, [rip+fmt(p434p1)+24]  // result rbx
    mov     [reg_p2], r9
    mov     [reg_p2+8], r10
    mov     [reg_p2+16], r11
    mov     [reg_p2+24], r12
    mov     [reg_p2+32], r13
    mov     r9, [reg_p1+80]
    mov     r10, [reg_p1+88]
    mov     r11, [reg_p1+96]
    mov     rdi, [reg_p1+104]               // last read through reg_p1; rdi now holds a[13]
    adc     r9, 0
    adc     r10, 0
    adc     r11, 0
    adc     rdi, 0

    // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
    MUL128x256_SCHOOL  rdx, r8, [rip+fmt(p434p1)+24], rbx, rbp, r12, r13, r14, r15

    mov     rdx, [reg_p2]
    add     rbx, [reg_p2+8]
    adc     rbp, [reg_p2+16]
    adc     r12, [reg_p2+24]
    adc     r13, [reg_p2+32]
    adc     r14, rcx
    mov     rcx, 0                          // mov (not xor) so the pending carry in CF survives
    adc     r15, r9
    adc     rcx, r10
    mulx    r9, r8, [rip+fmt(p434p1)+24]    // result r8
    mov     [reg_p2], rbp
    mov     [reg_p2+8], r12
    mov     [reg_p2+16], r13
    adc     r11, 0
    adc     rdi, 0

    // a[4-5] x p434p1_nz --> result: r8:r13
    MUL128x256_SCHOOL  rdx, rbx, [rip+fmt(p434p1)+24], r8, r9, r10, rbp, r12, r13

    mov     rdx, [reg_p2]
    add     r8, [reg_p2+8]
    adc     r9, [reg_p2+16]
    adc     r10, r14
    adc     rbp, r15
    adc     r12, rcx
    adc     r13, r11
    adc     rdi, 0
    mulx    r15, r14, [rip+fmt(p434p1)+24]  // result r14
    mov     [reg_p2], r8                    // Final result c0-c1
    mov     [reg_p2+8], r9

    // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
    MUL64x256_SCHOOL  rdx, [rip+fmt(p434p1)+24], r14, r15, r8, r9, r11, rcx

    // Final result c2:c6
    // (the pops are interleaved with the adc chain; pop does not modify flags)
    add     r14, r10
    adc     r15, rbp
    pop     rbx
    pop     rbp
    adc     r8, r12
    adc     r9, r13
    adc     r11, rdi
    mov     [reg_p2+16], r14
    mov     [reg_p2+24], r15
    pop     r15
    pop     r13
    mov     [reg_p2+32], r8
    mov     [reg_p2+40], r9
    mov     [reg_p2+48], r11

    pop     r12
    pop     r14
    ret

#else

# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"

#endif


//***********************************************************************
//  434-bit multiprecision addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  Plain 7-limb addition; the carry out of the top limb is not returned.
//  Clobbers: r8-r11, flags
//***********************************************************************
.global fmt(mp_add434_asm)
fmt(mp_add434_asm):
    // Limbs 0-3
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    add     r8, [reg_p2]
    adc     r9, [reg_p2+8]
    adc     r10, [reg_p2+16]
    adc     r11, [reg_p2+24]
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11

    // Limbs 4-6 (the carry from limb 3 is still in CF: mov leaves flags alone)
    mov     r8, [reg_p1+32]
    mov     r9, [reg_p1+40]
    mov     r10, [reg_p1+48]
    adc     r8, [reg_p2+32]
    adc     r9, [reg_p2+40]
    adc     r10, [reg_p2+48]
    mov     [reg_p3+32], r8
    mov     [reg_p3+40], r9
    mov     [reg_p3+48], r10
    ret


//***************************************************************************
//  2x434-bit multiprecision subtraction/addition
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
//
//  a, b, c are 14-limb values. The final borrow becomes an all-ones mask
//  and (p434 & mask) is added to limbs 7..13 of the difference.
//  Only offsets 0, 24, 32, 40 and 48 of p434 are loaded; the value at
//  offset 0 is reused for limbs 1 and 2 of the constant.
//
//  Saves/restores: r12-r15
//  Clobbers:       rax, rcx, rdi, rsi, r8-r11, flags
//***************************************************************************
.global fmt(mp_subadd434x2_asm)
fmt(mp_subadd434x2_asm):
    push    r12
    push    r13
    push    r14
    push    r15
    xor     rax, rax

    // Limbs 0-4
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12

    // Limbs 5-9 (limbs 8 and 9 stay in r11, r12 for the correction below)
    mov     r8, [reg_p1+40]
    mov     r9, [reg_p1+48]
    mov     r10, [reg_p1+56]
    mov     r11, [reg_p1+64]
    mov     r12, [reg_p1+72]
    sbb     r8, [reg_p2+40]
    sbb     r9, [reg_p2+48]
    sbb     r10, [reg_p2+56]
    sbb     r11, [reg_p2+64]
    sbb     r12, [reg_p2+72]
    mov     [reg_p3+40], r8
    mov     [reg_p3+48], r9
    mov     [reg_p3+56], r10

    // Limbs 10-13, rax <- borrow mask
    mov     r13, [reg_p1+80]
    mov     r14, [reg_p1+88]
    mov     r15, [reg_p1+96]
    mov     rcx, [reg_p1+104]
    sbb     r13, [reg_p2+80]
    sbb     r14, [reg_p2+88]
    sbb     r15, [reg_p2+96]
    sbb     rcx, [reg_p2+104]
    sbb     rax, 0

    // Add p434 anded with the mask in rax
    mov     r8, [rip+fmt(p434)]
    mov     r9, [rip+fmt(p434)+24]
    mov     r10, [rip+fmt(p434)+32]
    mov     rdi, [rip+fmt(p434)+40]
    mov     rsi, [rip+fmt(p434)+48]
    and     r8, rax
    and     r9, rax
    and     r10, rax
    and     rdi, rax
    and     rsi, rax
    mov     rax, [reg_p3+56]                // reload limb 7 stored above; the mask is no longer needed
    add     rax, r8
    adc     r11, r8
    adc     r12, r8
    adc     r13, r9
    adc     r14, r10
    adc     r15, rdi
    adc     rcx, rsi

    mov     [reg_p3+56], rax
    mov     [reg_p3+64], r11
    mov     [reg_p3+72], r12
    mov     [reg_p3+80], r13
    mov     [reg_p3+88], r14
    mov     [reg_p3+96], r15
    mov     [reg_p3+104], rcx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret


//***********************************************************************
//  Double 2x434-bit multiprecision subtraction
//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
//
//  All operands are 14-limb values, processed as two halves of 7 limbs.
//  The two borrows leaving the low half are parked in al and cl and fed
//  back into the corresponding chains of the high half with bt.
//  The borrows leaving the high half are not returned.
//
//  Saves/restores: r12-r14
//  Clobbers:       al (low byte of rax), cl (low byte of rcx), r8-r11, flags
//***********************************************************************
.global fmt(mp_dblsub434x2_asm)
fmt(mp_dblsub434x2_asm):
    push    r12
    push    r13
    push    r14

    // Low half: r8-r14 <- c[0..6] - a[0..6] - b[0..6]
    mov     r8, [reg_p3]
    mov     r9, [reg_p3+8]
    mov     r10, [reg_p3+16]
    mov     r11, [reg_p3+24]
    mov     r12, [reg_p3+32]
    mov     r13, [reg_p3+40]
    mov     r14, [reg_p3+48]
    sub     r8, [reg_p1]
    sbb     r9, [reg_p1+8]
    sbb     r10, [reg_p1+16]
    sbb     r11, [reg_p1+24]
    sbb     r12, [reg_p1+32]
    sbb     r13, [reg_p1+40]
    sbb     r14, [reg_p1+48]
    setc    al                              // borrow of the "- a" chain
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     r14, [reg_p2+48]
    setc    cl                              // borrow of the "- b" chain
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    // High half: r8-r14 <- c[7..13] - a[7..13] - b[7..13] - saved borrows
    mov     r8, [reg_p3+56]
    mov     r9, [reg_p3+64]
    mov     r10, [reg_p3+72]
    mov     r11, [reg_p3+80]
    mov     r12, [reg_p3+88]
    mov     r13, [reg_p3+96]
    mov     r14, [reg_p3+104]
    bt      rax, 0                          // CF <- borrow saved in al (only bit 0 is tested)
    sbb     r8, [reg_p1+56]
    sbb     r9, [reg_p1+64]
    sbb     r10, [reg_p1+72]
    sbb     r11, [reg_p1+80]
    sbb     r12, [reg_p1+88]
    sbb     r13, [reg_p1+96]
    sbb     r14, [reg_p1+104]
    bt      rcx, 0                          // CF <- borrow saved in cl
    sbb     r8, [reg_p2+56]
    sbb     r9, [reg_p2+64]
    sbb     r10, [reg_p2+72]
    sbb     r11, [reg_p2+80]
    sbb     r12, [reg_p2+88]
    sbb     r13, [reg_p2+96]
    sbb     r14, [reg_p2+104]
    mov     [reg_p3+56], r8
    mov     [reg_p3+64], r9
    mov     [reg_p3+72], r10
    mov     [reg_p3+80], r11
    mov     [reg_p3+88], r12
    mov     [reg_p3+96], r13
    mov     [reg_p3+104], r14

    pop     r14
    pop     r13
    pop     r12
    ret