//*******************************************************************************************
// Supersingular Isogeny Key Encapsulation Library
//
// Abstract: field arithmetic in x64 assembly for P434 on Linux
//
// Conventions used throughout this file:
//   - GNU assembler with Intel operand order (see .intel_syntax below). The file relies on
//     the C preprocessor (#define / #ifdef and C++ style comments), so it must be
//     assembled as a preprocessed source.
//   - Multiprecision values are arrays of 64-bit limbs, least significant limb first.
//   - Routines receive up to three pointers in rdi, rsi and rdx (reg_p1, reg_p2, reg_p3
//     below), i.e. the first three integer argument registers of the System V AMD64 ABI.
//   - No routine in this file contains a branch; conditional corrections are done with
//     all-zero / all-one masks.
//*******************************************************************************************

/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */
.intel_syntax noprefix

// Prefix applied to every symbol defined in this file.
#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s

// Registers that are used for parameter passing:
#define reg_p1  rdi
#define reg_p2  rsi
#define reg_p3  rdx

// Define addition instructions
// With S2N_ADX the "1" family maps to adox (carry kept in OF) and the "2" family to adcx
// (carry kept in CF), so two independent carry chains can be interleaved. Without it both
// families collapse to the ordinary add/adc pair and share CF.
#ifdef S2N_ADX

#define ADD1    adox
#define ADC1    adox
#define ADD2    adcx
#define ADC2    adcx

#else

#define ADD1    add
#define ADC1    adc
#define ADD2    add
#define ADC2    adc

#endif

.text

// Constant tables. Each table is 7 limbs (56 bytes), least significant limb first. The
// limbs are written as signed 64-bit decimals, so -1 is the all-ones limb.

// asm_p434: the prime p434. The top limb is 0x0002341F27177344.
#define asm_p434 S2N_SIKE_P434_R3_NAMESPACE(asm_p434)
.align 32
.type   asm_p434, @object
.size   asm_p434, 56
asm_p434:
.quad   -1
.quad   -1
.quad   -1
.quad   -161717841442111489
.quad   8918917783347572387
.quad   7853257225132122198
.quad   620258357900100


// asm_p434x2: 2 * p434 (each limb is the doubled limb above plus the carry from below).
#define asm_p434x2 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x2)
.align 32
.type   asm_p434x2, @object
.size   asm_p434x2, 56
asm_p434x2:
.quad   -2
.quad   -1
.quad   -1
.quad   -323435682884222977
.quad   -608908507014406841
.quad   -2740229623445307220
.quad   1240516715800200


// asm_p434x4: 4 * p434.
#define asm_p434x4 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x4)
.align 32
.type   asm_p434x4, @object
.size   asm_p434x4, 56
asm_p434x4:
.quad   -4
.quad   -1
.quad   -1
.quad   -646871365768445953
.quad   -1217817014028813681
.quad   -5480459246890614439
.quad   2481033431600401


// asm_p434p1: p434 + 1. The three low limbs are zero, which is why the reduction code
// only ever multiplies by the four limbs starting at offset 24.
#define asm_p434p1 S2N_SIKE_P434_R3_NAMESPACE(asm_p434p1)
.align 32
.type   asm_p434p1, @object
.size   asm_p434p1, 56
asm_p434p1:
.quad   0
.quad   0
.quad   0
.quad   -161717841442111488
.quad   8918917783347572387
.quad   7853257225132122198
.quad   620258357900100

//***********************************************************************
//  Field addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  In:    rdi = a, rsi = b, rdx = c; each is 7 limbs (56 bytes).
//  Out:   c = a + b, minus 2*p434 whenever that subtraction does not borrow.
//  Notes: The carry out of the 7-limb addition is not used, so the sum is
//         assumed to fit in 448 bits (true when both inputs are below
//         2*p434 -- TODO confirm against callers).
//         rdi and rsi are reused as scratch once the inputs have been read.
//  Clobb: rax, rcx, rdi, rsi, r8-r11, flags. rbx, rbp, r12-r15 are saved
//         and restored.
//***********************************************************************
#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm)
.global fpadd434_asm
fpadd434_asm:
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    push    rbp

    // r8-r14 <- a + b
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     r14, [reg_p1+48]
    add     r8, [reg_p2]
    adc     r9, [reg_p2+8]
    adc     r10, [reg_p2+16]
    adc     r11, [reg_p2+24]
    adc     r12, [reg_p2+32]
    adc     r13, [reg_p2+40]
    adc     r14, [reg_p2+48]

    // r8-r14 <- (a + b) - 2*p434, rax <- 0 or all ones (borrow mask).
    // Limbs 1 and 2 of 2*p434 are equal, so rcx serves both. The loaded
    // limbs are kept in registers for the masked add-back below.
    mov     rbx, [rip+asm_p434x2]
    sub     r8, rbx
    mov     rcx, [rip+asm_p434x2+8]
    sbb     r9, rcx
    sbb     r10, rcx
    mov     rdi, [rip+asm_p434x2+24]
    sbb     r11, rdi
    mov     rsi, [rip+asm_p434x2+32]
    sbb     r12, rsi
    mov     rbp, [rip+asm_p434x2+40]
    sbb     r13, rbp
    mov     r15, [rip+asm_p434x2+48]
    sbb     r14, r15
    sbb     rax, 0

    // Keep 2*p434 only if the subtraction borrowed.
    and     rbx, rax
    and     rcx, rax
    and     rdi, rax
    and     rsi, rax
    and     rbp, rax
    and     r15, rax

    // c <- r8-r14 + (2*p434 & mask)
    add     r8, rbx
    adc     r9, rcx
    adc     r10, rcx
    adc     r11, rdi
    adc     r12, rsi
    adc     r13, rbp
    adc     r14, r15
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

//***********************************************************************
//  Field subtraction
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
//
//  In:    rdi = a, rsi = b, rdx = c; each is 7 limbs (56 bytes).
//  Out:   c = a - b, plus 2*p434 whenever the subtraction borrows.
//  Notes: rdi and rsi are reused as scratch once the inputs have been read.
//         The add-back is done in two halves (limbs 0-3, then 4-6) so that
//         only rcx, rdi, rsi and r8 are needed for the masked constant.
//  Clobb: rax, rcx, rdi, rsi, r8-r11, flags. r12-r14 are saved and restored.
//***********************************************************************
#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm)
.global fpsub434_asm
fpsub434_asm:
    push    r12
    push    r13
    push    r14

    // r8-r14 <- a - b, rax <- 0 or all ones (borrow mask)
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     r14, [reg_p1+48]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     r14, [reg_p2+48]
    sbb     rax, 0

    // Limbs 0-3: add (2*p434 & mask). Limbs 1 and 2 of 2*p434 are equal (rdi).
    mov     rcx, [rip+asm_p434x2]
    mov     rdi, [rip+asm_p434x2+8]
    mov     rsi, [rip+asm_p434x2+24]
    and     rcx, rax
    and     rdi, rax
    and     rsi, rax
    add     r8, rcx
    adc     r9, rdi
    adc     r10, rdi
    adc     r11, rsi
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    setc    cl                          // park the carry: the ANDs below clear CF

    // Limbs 4-6: add (2*p434 & mask), resuming the carry chain.
    mov     r8, [rip+asm_p434x2+32]
    mov     rdi, [rip+asm_p434x2+40]
    mov     rsi, [rip+asm_p434x2+48]
    and     r8, rax
    and     rdi, rax
    and     rsi, rax
    bt      rcx, 0                      // CF <- parked carry (only bit 0 is tested)
    adc     r12, r8
    adc     r13, rdi
    adc     r14, rsi
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    pop     r14
    pop     r13
    pop     r12
    ret

///////////////////////////////////////////////////////////////// MACRO
// Multiprecision subtraction followed by an unconditional addition of a
// 7-limb constant.
// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + P0
// Inputs:  P0 is the symbol of a 7-limb table whose limbs 1 and 2 are equal
//          (only offsets 0, 8, 24, 32, 40 and 48 are loaded).
// Clobbers: rax, rcx, rdi, rsi, r8-r11, flags. r12 and r13 are saved and
//          restored by the macro itself. The borrow of the subtraction and
//          the carry of the addition are both discarded.
/////////////////////////////////////////////////////////////////
.macro SUB434_PX  P0
    push    r12
    push    r13

    // r8-r13, rcx <- a - b
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    mov     r13, [reg_p1+40]
    mov     rcx, [reg_p1+48]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     rcx, [reg_p2+48]

    // c <- (a - b) + P0. The loads are interleaved with the carry chain;
    // mov does not modify flags.
    mov     rax, [rip+\P0]
    mov     rdi, [rip+\P0+8]
    mov     rsi, [rip+\P0+24]
    add     r8, rax
    mov     rax, [rip+\P0+32]
    adc     r9, rdi
    adc     r10, rdi
    adc     r11, rsi
    mov     rdi, [rip+\P0+40]
    mov     rsi, [rip+\P0+48]
    adc     r12, rax
    adc     r13, rdi
    adc     rcx, rsi
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], rcx

    pop     r13
    pop     r12
.endm

//***********************************************************************
//  Multiprecision subtraction with correction with 2*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
//
//  In:    rdi = a, rsi = b, rdx = c; each is 7 limbs (56 bytes).
//  Clobb: see SUB434_PX (rax, rcx, rdi, rsi, r8-r11, flags).
//***********************************************************************
#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm)
.global mp_sub434_p2_asm
mp_sub434_p2_asm:
    SUB434_PX  asm_p434x2
    ret

//***********************************************************************
//  Multiprecision subtraction with correction with 4*p434
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
//
//  In:    rdi = a, rsi = b, rdx = c; each is 7 limbs (56 bytes).
//  Clobb: see SUB434_PX (rax, rcx, rdi, rsi, r8-r11, flags).
//***********************************************************************
#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm)
.global mp_sub434_p4_asm
mp_sub434_p4_asm:
    SUB434_PX  asm_p434x4
    ret

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (3 limbs x 3 limbs)
// Inputs:  memory pointers M0 and M1
// Outputs: memory pointer C and regs T1, T3, rax
//          (C receives product limbs 0-2; T1, T3, rax hold limbs 3, 4, 5)
// Temps:   regs T0:T6
// Notes:   M0, M1 and C are bracketed memory operands; a limb offset is
//          formed by prefixing the displacement (for example 8\M1).
//          rdx is overwritten (implicit mulx source), as are rax and flags.
/////////////////////////////////////////////////////////////////
#ifdef S2N_ADX

// ADX variant: adox (OF chain) and adcx (CF chain) run as two interleaved
// carry chains. Each "xor rax, rax" clears both flags and supplies a zero.
.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov     rdx, \M0
    mulx    \T0, \T1, \M1               // T0:T1 = A0*B0
    mov     \C, \T1                     // C0_final
    mulx    \T1, \T2, 8\M1              // T1:T2 = A0*B1
    xor     rax, rax
    adox    \T0, \T2
    mulx    \T2, \T3, 16\M1             // T2:T3 = A0*B2
    adox    \T1, \T3

    mov     rdx, 8\M0
    mulx    \T3, \T4, \M1               // T3:T4 = A1*B0
    adox    \T2, rax
    xor     rax, rax
    mulx    \T5, \T6, 8\M1              // T5:T6 = A1*B1
    adox    \T4, \T0
    mov     8\C, \T4                    // C1_final
    adcx    \T3, \T6
    mulx    \T6, \T0, 16\M1             // T6:T0 = A1*B2
    adox    \T3, \T1
    adcx    \T5, \T0
    adcx    \T6, rax
    adox    \T5, \T2

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1               // T1:T0 = A2*B0
    adox    \T6, rax
    xor     rax, rax
    mulx    \T4, \T2, 8\M1              // T4:T2 = A2*B1
    adox    \T0, \T3
    mov     16\C, \T0                   // C2_final
    adcx    \T1, \T5
    mulx    \T0, \T3, 16\M1             // T0:T3 = A2*B2
    adcx    \T4, \T6
    adcx    \T0, rax
    adox    \T1, \T2
    adox    \T3, \T4
    adox    rax, \T0
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (4 limbs x 4 limbs)
// Inputs:  memory pointers M0 and M1
// Outputs: memory pointer C (8 limbs)
// Temps:   regs T0:T9
// Notes:   rdx, rax and flags are overwritten.
//          C may overlap M0 and M1 the way mul434_asm uses it (C = M0 and
//          M1 = M0 + 32 bytes): every limb of M0 is loaded into rdx before
//          the matching limb of C is stored, and limbs 3-7 of C are stored
//          only after the last mulx.
/////////////////////////////////////////////////////////////////
.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov     rdx, \M0
    mulx    \T0, \T1, \M1               // T0:T1 = A0*B0
    mov     \C, \T1                     // C0_final
    mulx    \T1, \T2, 8\M1              // T1:T2 = A0*B1
    xor     rax, rax
    adox    \T0, \T2
    mulx    \T2, \T3, 16\M1             // T2:T3 = A0*B2
    adox    \T1, \T3
    mulx    \T3, \T4, 24\M1             // T3:T4 = A0*B3
    adox    \T2, \T4

    mov     rdx, 8\M0
    mulx    \T5, \T4, \M1               // T5:T4 = A1*B0
    adox    \T3, rax
    xor     rax, rax
    mulx    \T6, \T7, 8\M1              // T6:T7 = A1*B1
    adox    \T4, \T0
    mov     8\C, \T4                    // C1_final
    adcx    \T5, \T7
    mulx    \T7, \T8, 16\M1             // T7:T8 = A1*B2
    adcx    \T6, \T8
    adox    \T5, \T1
    mulx    \T8, \T9, 24\M1             // T8:T9 = A1*B3
    adcx    \T7, \T9
    adcx    \T8, rax
    adox    \T6, \T2

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1               // T1:T0 = A2*B0
    adox    \T7, \T3
    adox    \T8, rax
    xor     rax, rax
    mulx    \T2, \T3, 8\M1              // T2:T3 = A2*B1
    adox    \T0, \T5
    mov     16\C, \T0                   // C2_final
    adcx    \T1, \T3
    mulx    \T3, \T4, 16\M1             // T3:T4 = A2*B2
    adcx    \T2, \T4
    adox    \T1, \T6
    mulx    \T4,\T9, 24\M1              // T4:T9 = A2*B3
    adcx    \T3, \T9
    mov     rdx, 24\M0
    adcx    \T4, rax

    adox    \T2, \T7
    adox    \T3, \T8
    adox    \T4, rax

    mulx    \T5, \T0, \M1               // T5:T0 = A3*B0
    xor     rax, rax
    mulx    \T6, \T7, 8\M1              // T6:T7 = A3*B1
    adcx    \T5, \T7
    adox    \T1, \T0
    mulx    \T7, \T8, 16\M1             // T7:T8 = A3*B2
    adcx    \T6, \T8
    adox    \T2, \T5
    mulx    \T8, \T9, 24\M1             // T8:T9 = A3*B3
    adcx    \T7, \T9
    adcx    \T8, rax

    adox    \T3, \T6
    adox    \T4, \T7
    adox    \T8, rax
    mov     24\C, \T1                   // C3_final
    mov     32\C, \T2                   // C4_final
    mov     40\C, \T3                   // C5_final
    mov     48\C, \T4                   // C6_final
    mov     56\C, \T8                   // C7_final
.endm

#else // S2N_ADX

// Non-ADX variant of MUL192_SCHOOL: same interface and outputs as the ADX
// variant (C limbs 0-2 in memory, limbs 3-5 in T1, T3, rax), using a single
// add/adc carry chain. rdx, rax and flags are overwritten.
.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
    mov     rdx, \M0
    mulx    \T0, \T1, \M1               // T0:T1 = A0*B0
    mov     \C, \T1                     // C0_final
    mulx    \T1, \T2, 8\M1              // T1:T2 = A0*B1
    add     \T0, \T2
    mulx    \T2, \T3, 16\M1             // T2:T3 = A0*B2
    adc     \T1, \T3

    mov     rdx, 8\M0
    mulx    \T3, \T4, \M1               // T3:T4 = A1*B0
    adc     \T2, 0
    mulx    \T5, \T6, 8\M1              // T5:T6 = A1*B1
    add     \T4, \T0
    mov     8\C, \T4                    // C1_final
    adc     \T3, \T1
    adc     \T5, \T2
    mulx    \T2, \T1, 16\M1             // T2:T1 = A1*B2
    adc     \T2, 0

    add     \T3, \T6
    adc     \T5, \T1
    adc     \T2, 0

    mov     rdx, 16\M0
    mulx    \T1, \T0, \M1               // T1:T0 = A2*B0
    add     \T0, \T3
    mov     16\C, \T0                   // C2_final
    mulx    \T4, \T6, 8\M1              // T4:T6 = A2*B1
    adc     \T1, \T5
    adc     \T2, \T4
    mulx    rax, \T3, 16\M1             // rax:T3 = A2*B2
    adc     rax, 0
    add     \T1, \T6
    adc     \T3, \T2
    adc     rax, 0
.endm

// Non-ADX variant of MUL256_SCHOOL: same interface as the ADX variant
// (8-limb product written to C). rdx and flags are overwritten. The same
// overlap property holds: each limb of M0 is loaded before the matching
// limb of C is stored, and limbs 3-7 of C are stored after the last mulx.
.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
    mov     rdx, \M0
    mulx    \T0, \T1, \M1               // T0:T1 = A0*B0
    mov     \C, \T1                     // C0_final
    mulx    \T1, \T2, 8\M1              // T1:T2 = A0*B1
    add     \T0, \T2
    mulx    \T2, \T3, 16\M1             // T2:T3 = A0*B2
    adc     \T1, \T3
    mulx    \T3, \T4, 24\M1             // T3:T4 = A0*B3
    adc     \T2, \T4
    mov     rdx, 8\M0
    adc     \T3, 0

    mulx    \T5, \T4, \M1               // T5:T4 = A1*B0
    mulx    \T6, \T7, 8\M1              // T6:T7 = A1*B1
    add     \T5, \T7
    mulx    \T7, \T8, 16\M1             // T7:T8 = A1*B2
    adc     \T6, \T8
    mulx    \T8, \T9, 24\M1             // T8:T9 = A1*B3
    adc     \T7, \T9
    adc     \T8, 0

    add     \T4, \T0
    mov     8\C, \T4                    // C1_final
    adc     \T5, \T1
    adc     \T6, \T2
    adc     \T7, \T3
    mov     rdx, 16\M0
    adc     \T8, 0

    mulx    \T1, \T0, \M1               // T1:T0 = A2*B0
    mulx    \T2, \T3, 8\M1              // T2:T3 = A2*B1
    add     \T1, \T3
    mulx    \T3, \T4, 16\M1             // T3:T4 = A2*B2
    adc     \T2, \T4
    mulx    \T4,\T9, 24\M1              // T4:T9 = A2*B3
    adc     \T3, \T9
    mov     rdx, 24\M0
    adc     \T4, 0

    add     \T0, \T5
    mov     16\C, \T0                   // C2_final
    adc     \T1, \T6
    adc     \T2, \T7
    adc     \T3, \T8
    adc     \T4, 0

    mulx    \T5, \T0, \M1               // T5:T0 = A3*B0
    mulx    \T6, \T7, 8\M1              // T6:T7 = A3*B1
    add     \T5, \T7
    mulx    \T7, \T8, 16\M1             // T7:T8 = A3*B2
    adc     \T6, \T8
    mulx    \T8, \T9, 24\M1             // T8:T9 = A3*B3
    adc     \T7, \T9
    adc     \T8, 0

    add     \T1, \T0
    mov     24\C, \T1                   // C3_final
    adc     \T2, \T5
    mov     32\C, \T2                   // C4_final
    adc     \T3, \T6
    mov     40\C, \T3                   // C5_final
    adc     \T4, \T7
    mov     48\C, \T4                   // C6_final
    adc     \T8, 0
    mov     56\C, \T8                   // C7_final
.endm

#endif // S2N_ADX

//*****************************************************************************
//  434-bit multiplication using Karatsuba (one level), schoolbook (one level)
//
//  In:    rdi = a (7 limbs), rsi = b (7 limbs), rdx = c (14 limbs, 112 bytes).
//  Out:   c = a * b.
//  Split: AL/BL are limbs 0-3 and AH/BH are limbs 4-6 of a and b.
//  Stack: 96 bytes of locals
//           [rsp   .. rsp+31]  AH+AL, later the low half of (AH+AL)x(BH+BL)
//           [rsp+32.. rsp+63]  BH+BL, later the high half of that product
//           [rsp+64.. rsp+95]  correction for the carries of the two sums
//  Notes: The output pointer is moved to rcx first because rdx is the implicit
//         mulx operand inside the multiplication macros.
//  Clobb: rax, rcx, rdx, r8-r11, flags. rbx, rbp, r12-r15 are saved and
//         restored.
//*****************************************************************************
#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm)
.global mul434_asm
mul434_asm:
    push    r12
    push    r13
    push    r14
    push    r15
    mov     rcx, reg_p3

    // r8-r11 <- AH + AL, rax <- mask
    // (rax becomes all ones when the 4-limb sum carries out, else 0)
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    push    rbx
    push    rbp
    sub     rsp, 96
    add     r8, [reg_p1+32]
    adc     r9, [reg_p1+40]
    adc     r10, [reg_p1+48]
    adc     r11, 0
    sbb     rax, 0
    mov     [rsp], r8
    mov     [rsp+8], r9
    mov     [rsp+16], r10
    mov     [rsp+24], r11

    // r12-r15 <- BH + BL, rbx <- mask
    xor     rbx, rbx
    mov     r12, [reg_p2]
    mov     r13, [reg_p2+8]
    mov     r14, [reg_p2+16]
    mov     r15, [reg_p2+24]
    add     r12, [reg_p2+32]
    adc     r13, [reg_p2+40]
    adc     r14, [reg_p2+48]
    adc     r15, 0
    sbb     rbx, 0
    mov     [rsp+32], r12
    mov     [rsp+40], r13
    mov     [rsp+48], r14
    mov     [rsp+56], r15

    // r12-r15 <- masked (BH + BL)
    // (kept only if AH + AL carried out)
    and     r12, rax
    and     r13, rax
    and     r14, rax
    and     r15, rax

    // r8-r11 <- masked (AH + AL)
    // (kept only if BH + BL carried out)
    and     r8, rbx
    and     r9, rbx
    and     r10, rbx
    and     r11, rbx

    // r8-r11 <- masked (AH + AL) + masked (BH + BL)
    add     r8, r12
    adc     r9, r13
    adc     r10, r14
    adc     r11, r15
    mov     [rsp+64], r8
    mov     [rsp+72], r9
    mov     [rsp+80], r10
    mov     [rsp+88], r11

    // [rsp] <- (AH+AL) x (BH+BL), low part
    // (all 8 product limbs land in [rsp .. rsp+63], overwriting both sums)
    MUL256_SCHOOL  [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp

    // [rcx] <- AL x BL
    MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3

    // [rcx+64], rbx, rbp, rax <- AH x BH
    MUL192_SCHOOL  [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14

    // r8-r11 <- (AH+AL) x (BH+BL), final step
    // (high half of the product plus the carry correction saved at rsp+64)
    mov     r8, [rsp+64]
    mov     r9, [rsp+72]
    mov     r10, [rsp+80]
    mov     r11, [rsp+88]
    mov     rdx, [rsp+32]
    add     r8, rdx
    mov     rdx, [rsp+40]
    adc     r9, rdx
    mov     rdx, [rsp+48]
    adc     r10, rdx
    mov     rdx, [rsp+56]
    adc     r11, rdx

    // r12-r15 (low), r8-r11 (high) <- (AH+AL) x (BH+BL) - ALxBL
    mov     r12, [rsp]
    mov     r13, [rsp+8]
    mov     r14, [rsp+16]
    mov     r15, [rsp+24]
    sub     r12, [rcx]
    sbb     r13, [rcx+8]
    sbb     r14, [rcx+16]
    sbb     r15, [rcx+24]
    sbb     r8, [rcx+32]
    sbb     r9, [rcx+40]
    sbb     r10, [rcx+48]
    sbb     r11, [rcx+56]

    // r12-r15 (low), r8-r11 (high) <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    sub     r12, [rcx+64]
    sbb     r13, [rcx+72]
    sbb     r14, [rcx+80]
    sbb     r15, rbx
    sbb     r8, rbp
    sbb     r9, rax
    sbb     r10, 0
    sbb     r11, 0

    // Add the middle term into the result at limb offset 4 and propagate
    // the carry through the top limbs of AH x BH.
    add     r12, [rcx+32]
    mov     [rcx+32], r12               // Result C4-C7
    adc     r13, [rcx+40]
    mov     [rcx+40], r13
    adc     r14, [rcx+48]
    mov     [rcx+48], r14
    adc     r15, [rcx+56]
    mov     [rcx+56], r15
    adc     r8, [rcx+64]
    mov     [rcx+64], r8                // Result C8-C13
    adc     r9, [rcx+72]
    mov     [rcx+72], r9
    adc     r10, [rcx+80]
    mov     [rcx+80], r10
    adc     r11, rbx
    mov     [rcx+88], r11
    adc     rbp, 0
    mov     [rcx+96], rbp
    adc     rax, 0
    mov     [rcx+104], rax

    add     rsp, 96
    pop     rbp
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (1 limb x 4 limbs)
// Inputs:  reg I0 and memory pointer M1
// Outputs: regs T0:T4
// Temps:   regs T0:T5
// Precondition: rdx already holds the multiplier and the caller has already
//          computed T1:T0 = rdx * M1[0] with mulx; the macro only adds the
//          products with M1[1], M1[2] and M1[3]. The I0 parameter is not
//          referenced in the body.
// Clobbers: rax (set to zero), T5, flags.
/////////////////////////////////////////////////////////////////
.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5
    mulx    \T2, \T4, 8\M1
    xor     rax, rax                    // clears CF and OF, rax = 0
    mulx    \T3, \T5, 16\M1
    ADD1    \T1, \T4                    // T1 <- C1_final
    ADC1    \T2, \T5                    // T2 <- C2_final
    mulx    \T4, \T5, 24\M1
    ADC1    \T3, \T5                    // T3 <- C3_final
    ADC1    \T4, rax                    // T4 <- C4_final
.endm

///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (2 limbs x 4 limbs)
// Inputs:  regs I0 and I1, and memory pointer M1
// Outputs: regs T0:T5
// Temps:   regs T0:T5
// Precondition: rdx already holds the first multiplier limb and the caller
//          has already computed T1:T0 = rdx * M1[0] with mulx. The I0
//          parameter is not referenced in the body.
// Clobbers: I1 (loaded into rdx, then reused as scratch), rdx, rax, flags.
/////////////////////////////////////////////////////////////////
#ifdef S2N_ADX

// ADX variant. In this branch ADD1/ADC1 are adox (OF chain) and ADD2/ADC2
// are adcx (CF chain), so in the second half the high and low product halves
// are accumulated on two independent carry chains.
.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // First multiplier limb (already in rdx): same steps as MUL64x256_SCHOOL.
    mulx    \T2, \T4, 8\M1
    xor     rax, rax
    mulx    \T3, \T5, 16\M1
    ADD1    \T1, \T4
    ADC1    \T2, \T5
    mulx    \T4, \T5, 24\M1
    ADC1    \T3, \T5
    ADC1    \T4, rax

    // Second multiplier limb, accumulated one limb higher.
    xor     rax, rax
    mov     rdx, \I1
    mulx    \I1, \T5, \M1
    ADD2    \T1, \T5                    // T1 <- C1_final
    ADC2    \T2, \I1
    mulx    \T5, \I1, 8\M1
    ADC2    \T3, \T5
    ADD1    \T2, \I1
    mulx    \T5, \I1, 16\M1
    ADC2    \T4, \T5
    ADC1    \T3, \I1
    mulx    \T5, \I1, 24\M1
    ADC2    \T5, rax
    ADC1    \T4, \I1
    ADC1    \T5, rax
.endm

#else // S2N_ADX

// Non-ADX variant. With a single carry flag the second half first adds the
// high product halves, parking two of the low halves in rax and rdx, and then
// adds the low halves in a second pass.
.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
    // First multiplier limb (already in rdx).
    mulx    \T2, \T4, 8\M1
    mulx    \T3, \T5, 16\M1
    add     \T1, \T4
    adc     \T2, \T5
    mulx    \T4, \T5, 24\M1
    adc     \T3, \T5
    adc     \T4, 0

    // Second multiplier limb, accumulated one limb higher.
    mov     rdx, \I1
    mulx    \I1, \T5, \M1
    add     \T1, \T5                    // T1 <- C1_final
    adc     \T2, \I1
    mulx    \T5, \I1, 8\M1
    adc     \T3, \T5
    mulx    \T5, rax, 16\M1
    adc     \T4, \T5
    mulx    \T5, rdx, 24\M1             // last use of the multiplier: rdx takes the low half
    adc     \T5, 0
    add     \T2, \I1
    adc     \T3, rax
    adc     \T4, rdx
    adc     \T5, 0
.endm

#endif // S2N_ADX

//**************************************************************************************
//  Montgomery reduction
//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
//  Operation: c [reg_p2] = a [reg_p1]
//
//  In:    rdi = a, 14 limbs (offsets 0..104 are read); rsi = c, 7 limbs.
//  Out:   c = Montgomery-reduced a (offsets 0..48 are written).
//  Notes: Only the four limbs of p434+1 starting at offset 24 ("p434p1_nz") are used as
//         multiplicand; the three low limbs of that table are zero.
//         Each round issues the first mulx itself and then lets MUL128x256_SCHOOL or
//         MUL64x256_SCHOOL finish the product.
//         The output buffer doubles as scratch for intermediate limbs, and rdi is
//         overwritten by the last load from a.
//  Clobb: rax, rcx, rdx, rdi, r8-r11, flags. rbx, rbp, r12-r15 are saved and restored
//         (the pops are interleaved with the final carry chain; pop does not modify
//         flags).
//**************************************************************************************
#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm)
.global rdc434_asm
rdc434_asm:
    push    r14

    // a[0-1] x p434p1_nz --> result: r8:r13
    mov     rdx, [reg_p1]
    mov     r14, [reg_p1+8]
    mulx    r9, r8, [rip+asm_p434p1+24]     // result r8
    push    r12
    push    r13
    push    r15
    push    rbp
    push    rbx
    MUL128x256_SCHOOL rdx, r14, [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13

    // Add the product to a[3..8] and ripple the carry through a[9..13].
    // mulx and mov do not modify flags, so the carry chain continues across them.
    mov     rdx, [reg_p1+16]
    mov     rcx, [reg_p1+72]
    add     r8, [reg_p1+24]
    adc     r9, [reg_p1+32]
    adc     r10, [reg_p1+40]
    adc     r11, [reg_p1+48]
    adc     r12, [reg_p1+56]
    adc     r13, [reg_p1+64]
    adc     rcx, 0
    mulx    rbp, rbx, [rip+asm_p434p1+24]   // result rbx
    mov     [reg_p2], r9
    mov     [reg_p2+8], r10
    mov     [reg_p2+16], r11
    mov     [reg_p2+24], r12
    mov     [reg_p2+32], r13
    mov     r9, [reg_p1+80]
    mov     r10, [reg_p1+88]
    mov     r11, [reg_p1+96]
    mov     rdi, [reg_p1+104]               // last read of a; rdi now holds a[13]
    adc     r9, 0
    adc     r10, 0
    adc     r11, 0
    adc     rdi, 0

    // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
    MUL128x256_SCHOOL rdx, r8, [rip+asm_p434p1+24], rbx, rbp, r12, r13, r14, r15

    mov     rdx, [reg_p2]
    add     rbx, [reg_p2+8]
    adc     rbp, [reg_p2+16]
    adc     r12, [reg_p2+24]
    adc     r13, [reg_p2+32]
    adc     r14, rcx
    mov     rcx, 0                          // mov, not xor: CF must survive
    adc     r15, r9
    adc     rcx, r10
    mulx    r9, r8, [rip+asm_p434p1+24]     // result r8
    mov     [reg_p2], rbp
    mov     [reg_p2+8], r12
    mov     [reg_p2+16], r13
    adc     r11, 0
    adc     rdi, 0

    // a[4-5] x p434p1_nz --> result: r8:r13
    MUL128x256_SCHOOL rdx, rbx, [rip+asm_p434p1+24], r8, r9, r10, rbp, r12, r13

    mov     rdx, [reg_p2]
    add     r8, [reg_p2+8]
    adc     r9, [reg_p2+16]
    adc     r10, r14
    adc     rbp, r15
    adc     r12, rcx
    adc     r13, r11
    adc     rdi, 0
    mulx    r15, r14, [rip+asm_p434p1+24]   // result r14
    mov     [reg_p2], r8                    // Final result c0-c1
    mov     [reg_p2+8], r9

    // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
    MUL64x256_SCHOOL rdx, [rip+asm_p434p1+24], r14, r15, r8, r9, r11, rcx

    // Final result c2:c6
    add     r14, r10
    adc     r15, rbp
    pop     rbx
    pop     rbp
    adc     r8, r12
    adc     r9, r13
    adc     r11, rdi
    mov     [reg_p2+16], r14
    mov     [reg_p2+24], r15
    pop     r15
    pop     r13
    mov     [reg_p2+32], r8
    mov     [reg_p2+40], r9
    mov     [reg_p2+48], r11

    pop     r12
    pop     r14
    ret

//***********************************************************************
//  434-bit multiprecision addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  In:    rdi = a, rsi = b, rdx = c; each is 7 limbs (56 bytes).
//  Out:   c = a + b with no reduction; the carry out of the top limb is
//         discarded.
//  Notes: Done in two groups (limbs 0-3, then 4-6); mov does not modify
//         flags, so the carry chain continues across the stores and loads.
//  Clobb: r8-r11, flags. No stack use.
//***********************************************************************
#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm)
.global mp_add434_asm
mp_add434_asm:
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    add     r8, [reg_p2]
    adc     r9, [reg_p2+8]
    adc     r10, [reg_p2+16]
    adc     r11, [reg_p2+24]
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11

    mov     r8, [reg_p1+32]
    mov     r9, [reg_p1+40]
    mov     r10, [reg_p1+48]
    adc     r8, [reg_p2+32]
    adc     r9, [reg_p2+40]
    adc     r10, [reg_p2+48]
    mov     [reg_p3+32], r8
    mov     [reg_p3+40], r9
    mov     [reg_p3+48], r10
    ret

//***************************************************************************
//  2x434-bit multiprecision subtraction/addition
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
//
//  In:    rdi = a, rsi = b, rdx = c; each is 14 limbs (112 bytes).
//  Out:   c = a - b; when the subtraction borrows, p434 is added to the
//         upper 7 limbs (offsets 56..104). The carry out of that addition
//         is discarded.
//  Notes: Limbs 7-13 of the difference are held back in registers (limb 7 is
//         stored and reloaded through c) until the borrow mask is known.
//         rdi and rsi are reused as scratch after the last input read.
//  Clobb: rax, rcx, rdi, rsi, r8-r11, flags. r12-r15 are saved and restored.
//***************************************************************************
#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm)
.global mp_subadd434x2_asm
mp_subadd434x2_asm:
    push    r12
    push    r13
    push    r14
    push    r15

    // Limbs 0-4 of a - b, stored straight to c.
    xor     rax, rax
    mov     r8, [reg_p1]
    mov     r9, [reg_p1+8]
    mov     r10, [reg_p1+16]
    mov     r11, [reg_p1+24]
    mov     r12, [reg_p1+32]
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12

    // Limbs 5-9. Limbs 5-7 are stored; limbs 8-9 stay in r11, r12.
    mov     r8, [reg_p1+40]
    mov     r9, [reg_p1+48]
    mov     r10, [reg_p1+56]
    mov     r11, [reg_p1+64]
    mov     r12, [reg_p1+72]
    sbb     r8, [reg_p2+40]
    sbb     r9, [reg_p2+48]
    sbb     r10, [reg_p2+56]
    sbb     r11, [reg_p2+64]
    sbb     r12, [reg_p2+72]
    mov     [reg_p3+40], r8
    mov     [reg_p3+48], r9
    mov     [reg_p3+56], r10

    // Limbs 10-13 stay in r13, r14, r15, rcx. rax <- 0 or all ones (borrow mask).
    mov     r13, [reg_p1+80]
    mov     r14, [reg_p1+88]
    mov     r15, [reg_p1+96]
    mov     rcx, [reg_p1+104]
    sbb     r13, [reg_p2+80]
    sbb     r14, [reg_p2+88]
    sbb     r15, [reg_p2+96]
    sbb     rcx, [reg_p2+104]
    sbb     rax, 0

    // Add p434 anded with the mask in rax
    // (limbs 0, 1 and 2 of p434 are equal, so r8 serves all three)
    mov     r8, [rip+asm_p434]
    mov     r9, [rip+asm_p434+24]
    mov     r10, [rip+asm_p434+32]
    mov     rdi, [rip+asm_p434+40]
    mov     rsi, [rip+asm_p434+48]
    and     r8, rax
    and     r9, rax
    and     r10, rax
    and     rdi, rax
    and     rsi, rax
    mov     rax, [reg_p3+56]            // reload limb 7; the mask is no longer needed
    add     rax, r8
    adc     r11, r8
    adc     r12, r8
    adc     r13, r9
    adc     r14, r10
    adc     r15, rdi
    adc     rcx, rsi

    mov     [reg_p3+56], rax
    mov     [reg_p3+64], r11
    mov     [reg_p3+72], r12
    mov     [reg_p3+80], r13
    mov     [reg_p3+88], r14
    mov     [reg_p3+96], r15
    mov     [reg_p3+104], rcx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

//***********************************************************************
//  Double 2x434-bit multiprecision subtraction
//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
//
//  In:    rdi = a, rsi = b, rdx = c; each is 14 limbs (112 bytes).
//  Out:   c updated in place. The final borrows are discarded.
//  Notes: The work is split into limbs 0-6 and limbs 7-13. Within each half
//         a is subtracted first and b second, so two borrows cross the
//         split: the one from the a chain is parked in al and the one from
//         the b chain in cl. "bt reg, 0" reloads CF from bit 0 only, so the
//         upper bits of rax and rcx do not need to be cleared.
//  Clobb: rax (low byte), rcx (low byte), r8-r11, flags. r12-r14 are saved
//         and restored.
//***********************************************************************
#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm)
.global mp_dblsub434x2_asm
mp_dblsub434x2_asm:
    push    r12
    push    r13
    push    r14

    // Limbs 0-6: c - a, then - b.
    mov     r8, [reg_p3]
    mov     r9, [reg_p3+8]
    mov     r10, [reg_p3+16]
    mov     r11, [reg_p3+24]
    mov     r12, [reg_p3+32]
    mov     r13, [reg_p3+40]
    mov     r14, [reg_p3+48]
    sub     r8, [reg_p1]
    sbb     r9, [reg_p1+8]
    sbb     r10, [reg_p1+16]
    sbb     r11, [reg_p1+24]
    sbb     r12, [reg_p1+32]
    sbb     r13, [reg_p1+40]
    sbb     r14, [reg_p1+48]
    setc    al                          // borrow of the a chain
    sub     r8, [reg_p2]
    sbb     r9, [reg_p2+8]
    sbb     r10, [reg_p2+16]
    sbb     r11, [reg_p2+24]
    sbb     r12, [reg_p2+32]
    sbb     r13, [reg_p2+40]
    sbb     r14, [reg_p2+48]
    setc    cl                          // borrow of the b chain
    mov     [reg_p3], r8
    mov     [reg_p3+8], r9
    mov     [reg_p3+16], r10
    mov     [reg_p3+24], r11
    mov     [reg_p3+32], r12
    mov     [reg_p3+40], r13
    mov     [reg_p3+48], r14

    // Limbs 7-13: resume each chain from its parked borrow.
    mov     r8, [reg_p3+56]
    mov     r9, [reg_p3+64]
    mov     r10, [reg_p3+72]
    mov     r11, [reg_p3+80]
    mov     r12, [reg_p3+88]
    mov     r13, [reg_p3+96]
    mov     r14, [reg_p3+104]
    bt      rax, 0
    sbb     r8, [reg_p1+56]
    sbb     r9, [reg_p1+64]
    sbb     r10, [reg_p1+72]
    sbb     r11, [reg_p1+80]
    sbb     r12, [reg_p1+88]
    sbb     r13, [reg_p1+96]
    sbb     r14, [reg_p1+104]
    bt      rcx, 0
    sbb     r8, [reg_p2+56]
    sbb     r9, [reg_p2+64]
    sbb     r10, [reg_p2+72]
    sbb     r11, [reg_p2+80]
    sbb     r12, [reg_p2+88]
    sbb     r13, [reg_p2+96]
    sbb     r14, [reg_p2+104]
    mov     [reg_p3+56], r8
    mov     [reg_p3+64], r9
    mov     [reg_p3+72], r10
    mov     [reg_p3+80], r11
    mov     [reg_p3+88], r12
    mov     [reg_p3+96], r13
    mov     [reg_p3+104], r14

    pop     r14
    pop     r13
    pop     r12
    ret

// Declare that this object does not need an executable stack. Hand-written
// assembly does not get this note automatically, and GNU ld treats an input
// object without it as requesting an executable stack. pushsection/popsection
// leave the current section unchanged for anything that follows.
#if defined(__ELF__)
.pushsection .note.GNU-stack, "", @progbits
.popsection
#endif