# Montgomery multiplication for x86_64.
# AT&T/GAS syntax, System V AMD64 ABI, ELF. Generated-style OpenSSL/CRYPTOGAMS
# code (see trailing copyright string).
#
# All entry points compute r = a*b*2^(-64*num) mod n (Montgomery product):
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#   %rdi = rp, %rsi = ap, %rdx = bp, %rcx = np (modulus),
#   %r8  = &n0 (-n^-1 mod 2^64, passed by reference), %r9d = num (limb count).
# bn_mul_mont dispatches to a 4x-unrolled path, a MULX/ADCX/ADOX path, or a
# dedicated squaring path depending on num and CPU capability bits.
.text

.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# remember original %rsp for unwind/restore
.cfi_def_cfa_register	%rax
	testl	$3,%r9d			# num divisible by 4?
	jnz	.Lmul_enter		# no: generic one-limb-at-a-time loop
	cmpl	$8,%r9d			# num >= 8?
	jb	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word (BMI2/ADX bits)
	cmpq	%rsi,%rdx		# squaring (ap == bp)?
	jne	.Lmul4x_enter
	testl	$7,%r9d			# num divisible by 8?
	jz	.Lsqr8x_enter		# yes: dedicated squaring path
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9			# -num
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	# future alloca(8*num+16)
	negq	%r9			# restore num
	andq	$-1024,%r10		# align/minimize TLB usage

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe new stack top
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

# Touch the stack one page at a time so the guard page is always hit
# (large num would otherwise skip over it).
.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = original %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		# reassign bp (rdx is clobbered by mulq)
	movq	(%r8),%r8		# pull n0[0]
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# low half discarded (cancels by construction)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j++
	jmp	.L1st_enter

# First outer iteration (i == 0): tp[] does not exist yet, so there is no
# tp[j] term to add in.
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13		# np[j]*m + ap[j]*bp[0]
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15		# j++
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13		# peel off the last iteration
	movq	(%rsi),%rax		# reload ap[0] for next outer pass
	adcq	$0,%rdx
	addq	%r11,%r13		# np[j]*m + ap[j]*bp[0]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# store upmost overflow bit

	leaq	1(%r14),%r14		# i++
	jmp	.Louter
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10		# ap[0]*bp[i] + tp[0]
	movq	(%rcx),%rax		# np[0]
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# low half discarded
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# np[j]*m + ap[j]*bp[i] + tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10		# ap[j]*bp[i] + tp[j]
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15		# j++

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13		# peel off the last iteration
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r10,%r13		# np[j]*m + ap[j]*bp[i] + tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# pull upmost overflow bit
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# store upmost overflow bit

	leaq	1(%r14),%r14		# i++
	cmpq	%r9,%r14
	jb	.Louter

	# Final reduction: compute tp - np, then select tp or tp-np with a
	# constant-time masked copy (no data-dependent branch).
	xorq	%r14,%r14		# i = 0; also clears CF for first sbb
	movq	(%rsp),%rax		# tp[0]
	movq	%r9,%r15		# num

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[i] = tp[i] - np[i]
	movq	8(%rsp,%r14,8),%rax	# tp[i+1]
	leaq	1(%r14),%r14		# i++
	decq	%r15			# dec does not affect CF
	jnz	.Lsub

	sbbq	$0,%rax			# handle upmost overflow bit
	movq	$-1,%rbx
	xorq	%rax,%rbx		# %rbx = ~borrow-mask
	xorq	%r14,%r14
	movq	%r9,%r15		# num

.Lcopy:					# constant-time conditional copy
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)	# zap temporary vector
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)	# rp[i] = (tp[i] & mask) | (rp[i] & ~mask)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# original %rsp saved at .Lmul_body
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont

# 4x-unrolled Montgomery multiplication (num % 4 == 0, num >= 8).
# Same arguments as bn_mul_mont; %rdi (rp) is spilled to the stack and the
# register reused as an accumulator inside the loops.
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		# BMI2 + ADX available?
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter		# yes: MULX/ADCX/ADOX flavour
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10	# future alloca(8*num+32)
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe new stack top
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:			# touch guard pages one at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = original %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# save rp; %rdi becomes a scratch limb
	movq	%rdx,%r12		# reassign bp
	movq	(%r8),%r8		# n0
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m
	addq	%rax,%r10		# discarded
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j += 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:				# i == 0 pass, four limbs per iteration
	mulq	%rbx			# ap[j]*bp[0]
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j]*m
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# np[j]*m + ap[j]*bp[0]
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	mulq	%rbx			# tail: last two limbs of the i == 0 pass
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# store upmost overflow bit

	leaq	1(%r14),%r14		# i++
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15		# j = 0
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[i]
	addq	%rax,%r10		# + tp[0]
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10		# discarded
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1]*bp[i]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1]*m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi		# np[j]*m + ap[j]*bp[i] + tp[j]
	leaq	4(%r15),%r15		# j += 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[j-1]
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:				# four limbs per iteration, accumulates tp[]
	mulq	%rbx			# ap[j]*bp[i]
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j]*m
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx			# tail: last two limbs of this outer pass
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# pull upmost overflow bit
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# store upmost overflow bit

	cmpq	%r9,%r14
	jb	.Louter4x

	# Final reduction: rp = tp - np with 4x-unrolled borrow chain, then a
	# constant-time masked copy selects tp or tp-np.
	movq	16(%rsp,%r9,8),%rdi	# restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax		# tp[0]
	movq	8(%rsp),%rdx		# tp[1]
	shrq	$2,%r15			# (num-4)/4 loop iterations
	leaq	(%rsp),%rsi		# tp
	xorq	%r14,%r14		# i = 0; clears CF for the sub/sbb chain

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)	# rp[i] = tp[i] - np[i]
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14		# i += 4
	decq	%r15			# dec does not affect CF
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)	# rp[i] = tp[i] - np[i]
	movq	32(%rsi,%r14,8),%rax	# load upmost overflow word
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# handle upmost overflow bit
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224		# movq %rax,%xmm4 (borrow mask)
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# xmm5 = ~mask
	shrq	$2,%r15			# num/4
	xorl	%eax,%eax		# byte offset = 0

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# constant-time conditional copy, 32 B/iter
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)	# zap temporary vector
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)	# zap temporary vector
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	# original %rsp saved at .Lmul4x_body
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont


# Montgomery squaring front end (num % 8 == 0, ap == bp). Sets up the frame,
# calls bn_sqr8x_internal or bn_sqrx8x_internal (defined elsewhere), then does
# the final conditional subtraction of the modulus in constant time.
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember original %rsp
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# num in bytes
	shlq	$3+2,%r10		# 4*num in bytes
	negq	%r9

	# Pick a stack pointer so that the temporary vector and ap[] fall into
	# distinct 4 KB pages (cache/TLB aliasing avoidance).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp	# alloca(frame+2*num*8)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10	# alternative page-straddling offset
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe new stack top
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:			# touch guard pages one at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# save n0
	movq	%rax,40(%rsp)		# save original %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209		# movq %rcx,%xmm2 (stash np)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1 (stash rp)
.byte	102,73,15,110,218		# movq %r10,%xmm3 (stash -num*8)
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax		# BMI2 + ADX?
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# MULX/ADCX/ADOX flavour (defined elsewhere)

	# NOTE(review): %r8/%rcx/%rbp here are outputs of bn_sqrx8x_internal,
	# not this function's incoming arguments — contract not visible here.
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx		# num/4 negated iteration count; clears CF
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# classic MULQ flavour (defined elsewhere)

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx		# num/4 negated iteration count; clears CF
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:				# rp = tp - np, 4 limbs per iteration
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx			# preserves CF
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# fold in top-most carry -> borrow mask
	leaq	(%rbx,%r9,1),%rbx	# rewind tp
	leaq	(%rdi,%r9,1),%rdi	# rewind rp

.byte	102,72,15,110,200		# movq %rax,%xmm1 (borrow mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask to all lanes
	movq	40(%rsp),%rsi		# original %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:			# constant-time select tp vs tp-np
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zap temporary vector
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont

# MULX/ADCX/ADOX Montgomery multiplication (requires BMI2 + ADX).
# Uses the two independent carry chains: ADCX propagates CF, ADOX propagates
# OF, with %rbp held at zero to terminate both chains.
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember original %rsp
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# -num
	movq	(%r8),%r8		# n0
	leaq	-72(%rsp,%r10,1),%rbp	# future alloca(frame+8*num)
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe new stack top
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:			# touch guard pages one at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# &bp[num]

	# Frame layout (offsets from %rsp):
	#   +0  num in bytes      +8  off-loaded &bp[i]   +16 end of bp[]
	#   +24 n0                +32 rp                  +40 original %rsp
	#   +48 inner-loop counter
	movq	%r9,0(%rsp)
	shrq	$5,%r9			# num/4 - 1 iterations (after subq below)
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	leaq	8(%rdx),%rdi		# &bp[1]
	movq	(%rdx),%rdx		# b[0]; %rdx is mulx's implicit operand
	leaq	64+32(%rsp),%rbx	# tp
	movq	%rdx,%r9		# stash b[i]

	mulxq	0(%rsi),%r8,%rax	# a[0]*b[0]
	mulxq	8(%rsi),%r11,%r14	# a[1]*b[0]
	addq	%rax,%r11
	movq	%rdi,8(%rsp)		# off-load &b[i]
	mulxq	16(%rsi),%r12,%r13	# a[2]*b[0]
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi		# keep t[0] copy
	imulq	24(%rsp),%r8		# m = t[0] * n0
	xorq	%rbp,%rbp		# %rbp = 0; also cf = 0, of = 0

	mulxq	24(%rsi),%rax,%r14	# a[3]*b[0]
	movq	%r8,%rdx		# switch implicit operand to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14		# cf = 0

	mulxq	0(%rcx),%rax,%r10	# n[0]*m
	adcxq	%rax,%rdi		# low half discarded
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11	# n[1]*m
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulxq 16(%rcx),%rax,%r12
	movq	48(%rsp),%rdi		# inner counter
	movq	%r10,-32(%rbx)		# tp[1]
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15	# n[3]*m
	movq	%r9,%rdx		# restore b[i]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# of = 0
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:				# i == 0 pass: no tp[] to accumulate
	adcxq	%rbp,%r15		# cf = 0, modulo-scheduled
	mulxq	0(%rsi),%r10,%rax	# a[4..7]*b[0]
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# address-size prefixes (alignment padding)
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14		# cf = 0
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10		# n[4..7]*m via the OF chain
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# restore b[i]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi			# of = 0; CF passes to next iteration
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# num
	movq	8(%rsp),%rdi		# reload &b[i]
	adcq	%rbp,%r15		# top-most carry
	addq	%r15,%r14
	sbbq	%r15,%r15		# top-most carry as full-width mask
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx		# b[i]
	leaq	8(%rdi),%rdi		# &b[i+1]
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# save top-most carry
	leaq	64+32(%rsp),%rbx	# rewind tp
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11	# a[0]*b[i]
	xorl	%ebp,%ebp		# %rbp = 0; cf = 0, of = 0
	movq	%rdx,%r9		# stash b[i]
	mulxq	8(%rsi),%r14,%r12	# a[1]*b[i]
	adoxq	-32(%rbx),%r8		# + tp[0] via OF chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13	# a[2]*b[i]
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)		# off-load &b[i]
	movq	%r8,%r15		# keep t[0] copy
	imulq	24(%rsp),%r8		# m = t[0] * n0
	xorl	%ebp,%ebp		# cf = 0, of = 0

	mulxq	24(%rsi),%rax,%r14	# a[3]*b[i]
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# n[0]*m
	adcxq	%rax,%r15		# low half discarded
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)		# tp[1]
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# restore b[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# of = 0
	movq	48(%rsp),%rdi		# inner counter
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:				# accumulates tp[] on both carry chains
	mulxq	0(%rsi),%r10,%rax	# a[4..7]*b[i]
	adcxq	%rbp,%r15		# cf = 0, modulo-scheduled
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10		# + tp[j] via CF chain
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# switch to m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14		# of = 0
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14		# cf = 0

	adoxq	%r15,%r10		# n[4..7]*m
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# restore b[i]
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi			# of = 0; CF passes to next iteration
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# num
	movq	8(%rsp),%rdi		# reload &b[i]
	adcq	%rbp,%r15		# top-most carry
	subq	0(%rbx),%rbp		# pull saved top-most carry (sets CF)
	adcq	%r15,%r14
	sbbq	%r15,%r15		# top-most carry as full-width mask
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		# reached &bp[num]?
	jne	.Lmulx4x_outer

	# Final reduction: rp = tp - np, then constant-time select.
	leaq	64(%rsp),%rbx		# tp
	subq	%rax,%rcx		# rewind np
	negq	%r15			# top-most carry: 0 or 1
	movq	%rax,%rdx		# num in bytes
	shrq	$3+2,%rax		# num/4 iterations
	movq	32(%rsp),%rdi		# rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:				# 4 limbs per iteration, borrow chain in CF
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax			# dec preserves CF
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# fold borrow into top-most carry -> mask
	leaq	64(%rsp),%rbx		# rewind tp
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1 (borrow mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask to all lanes
	movq	40(%rsp),%rsi		# original %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:			# constant-time select tp vs tp-np
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zap temporary vector
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# %rdx is 0 here: zap one more temp word

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
# "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16