
; AMD64 mpn_mul_basecase

; Copyright 2008,2009 Jason Moxham

; This file is part of the MPIR Library.

; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.

; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.

; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.

%include 'yasm_mac.inc'

; C (rdi,rdx+r8)=(rsi,rdx)*(rcx,r8)
; C Version 1.0.7

; NASM/YASM syntax, x86-64, SysV AMD64 ABI.
; Entry registers: rdi = product limbs, rsi = multiplicand limbs, rdx = its
; size, rcx = multiplier limbs, r8 = its size (see the contract line above).
;
; addmul2lp %1
;   Unrolled-by-4 inner loop of an "addmul_2" pass: multiplies source limbs
;   [rsi+rbx*8...] by the two multiplier limbs held in rcx and r8 and ADDS
;   the double products into the destination [rdi+rbx*8...].
;   rbx is a negative limb index counting up; the loop runs until
;   "add rbx, 4" leaves the negative range (jnc falls through).
;   r9/r10/r11/r12 rotate as the partial-sum/carry pipeline between
;   iterations; on entry r12:r9 holds the first pending product and rax the
;   current source limb.  The macro parameter is unused in the body; it only
;   mirrors the numbering of the matching addmul2pro%1/addmul2epi%1 macros.
%macro addmul2lp 1
    align 16
%%1:
    mov rax, [rsi+rbx*8]
    mul r8
    add r9, rax
    mov rax, [rsi+rbx*8+8]
    adc r10, rdx
    mov r11, 0
    mul rcx
    add [rdi+rbx*8], r12
    adc r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+rbx*8+8]
    adc r11, 0
    mul r8
    add [rdi+rbx*8+8], r9
    adc r10, rax
    adc r11, rdx
    mov rax, [rsi+rbx*8+16]
    mul rcx
    add r10, rax
    mov rax, [rsi+rbx*8+16]
    adc r11, rdx
    adc r12, 0
    mul r8
    add [rdi+rbx*8+16], r10
    mov r9, 0
    adc r11, rax
    mov r10, 0
    mov rax, [rsi+rbx*8+24]
    adc r12, rdx
    mov r15, r15                ; no-op; presumably a padding byte to tune code alignment/scheduling -- NOTE(review): confirm
    mul rcx
    add r11, rax
    mov rax, [rsi+rbx*8+24]
    adc r12, rdx
    adc r9, 0
    mul r8
    add [rdi+rbx*8+24], r11
    adc r12, rax
    adc r9, rdx
    mov rax, [rsi+rbx*8+32]
    mul rcx
    add r12, rax
    adc r9, rdx
    adc r10, 0
    add rbx, 4
    jnc %%1
%endmacro

; addmul2pro0
;   Prologue of an addmul_2 pass (variant 0 of four; all four prologues
;   perform the same work, only the instruction order differs): loads the
;   next two multiplier limbs from [r13+r15*8] into rcx and r8, starts the
;   first product (rax holds the first source limb on entry) and clears the
;   carry accumulator r10.
%macro addmul2pro0 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r9, rdx
    mov r10, 0
    mov r8, [r13+r15*8+8]
%endmacro


; addmul2epi0 .. addmul2epi3
;   Epilogues of an addmul_2 pass; the suffix k matches the residual source
;   length handled after the unrolled loop (epi0 finishes 1 remaining limb,
;   epi3 finishes 4).  Each variant flushes the pending partial products and
;   the two final carry limbs to the destination, advances the multiplier
;   index r15 by 2, resets the limb index rbx from r14, advances rdi, and
;   reloads rax with the first source limb for the next pass.  The final
;   "add r15, 2" also leaves ZF set exactly when the multiplier is
;   exhausted, which the jnz/jz in mpn_addmul_2_int consumes (only mov/lea,
;   which preserve flags, come after it).
%macro addmul2epi0 0
    mov rbx, r14
    mov rax, [rsi+24]
    mul r8
    add [rdi+24], r12
    adc r9, rax
    adc r10, rdx
    add r15, 2
    mov rax, [rsi+r14*8]
    mov [rdi+32], r9
    lea rdi, [rdi+16]
    mov [rdi+24], r10
%endmacro

; addmul2pro1: same work as addmul2pro0, reordered (see addmul2pro0).
%macro addmul2pro1 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r10, 0
    mov r9, rdx
    mov r8, [r13+r15*8+8]
%endmacro

; addmul2epi1: see addmul2epi0 (finishes 2 remaining limbs).
%macro addmul2epi1 0
    mov rax, [rsi+16]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+24]
    mov r11, 0
    adc r10, rdx
    mul rcx
    add [rdi], r12
    adc r9, rax
    adc r10, rdx
    adc r11, 0
    mov rax, [rsi+24]
    mul r8
    add [rdi+8], r9
    adc r10, rax
    adc r11, rdx
    add r15, 2
    mov rbx, r14
    mov rax, [rsi+r14*8]
    mov [rdi+24], r11
    mov [rdi+16], r10
%endmacro

; addmul2pro2: same work as addmul2pro0, reordered (see addmul2pro0).
%macro addmul2pro2 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r10, 0
    mov r12, rax
    mov r9, rdx
    mov r8, [r13+r15*8+8]
%endmacro

; addmul2epi2: see addmul2epi0 (finishes 3 remaining limbs).
%macro addmul2epi2 0
    mov rax, [rsi+8]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+16]
    adc r10, rdx
    mov r11, 0
    mul rcx
    add [rdi-8], r12
    adc r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+16]
    adc r11, 0
    mul r8
    add [rdi], r9
    adc r10, rax
    adc r11, rdx
    mov rax, [rsi+24]
    mul rcx
    add r10, rax
    mov rax, [rsi+24]
    adc r11, rdx
    adc r12, 0
    mul r8
    add [rdi+8], r10
    adc r11, rax
    adc r12, rdx
    mov rax, [rsi+r14*8]
    mov [rdi+16], r11
    mov [rdi+24], r12
    add r15, 2
    mov rbx, r14
%endmacro

; addmul2pro3: same work as addmul2pro0, reordered (see addmul2pro0).
%macro addmul2pro3 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r9, rdx
    mov r8, [r13+r15*8+8]
    mov r10, 0
%endmacro

; addmul2epi3: see addmul2epi0 (finishes 4 remaining limbs).
%macro addmul2epi3 0
    mov rax, [rsi]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+8]
    adc r10, rdx
    mov r11, 0
    mul rcx
    add [rdi-16], r12
    adc r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+8]
    adc r11, 0
    mul r8
    add [rdi-8], r9
    adc r10, rax
    adc r11, rdx
    mov rax, [rsi+16]
    mul rcx
    add r10, rax
    mov rax, [rsi+16]
    adc r11, rdx
    adc r12, 0
    mul r8
    add [rdi], r10
    mov r9, 0
    adc r11, rax
    mov r10, 0
    mov rax, [rsi+24]
    adc r12, rdx
    mov r15, r15                ; no-op; presumably alignment/scheduling padding -- NOTE(review): confirm
    mul rcx
    add r11, rax
    mov rax, [rsi+24]
    adc r12, rdx
    adc r9, 0
    mul r8
    add [rdi+8], r11
    adc r12, rax
    adc r9, rdx
    mov rax, [rsi+r14*8]
    mov [rdi+16], r12
    mov [rdi+24], r9
    add r15, 2
    mov rbx, r14
%endmacro

; mul2lp
;   Identical structure to addmul2lp, but this is the FIRST pass over the
;   destination: products are STORED (mov) into [rdi+rbx*8...] rather than
;   added, so the destination need not be zeroed beforehand.
%macro mul2lp 0
    align 16
%%1:
    mov rax, [rsi+rbx*8]
    mul r8
    add r9, rax
    mov rax, [rsi+rbx*8+8]
    adc r10, rdx
    mov r11, 0
    mul rcx
    mov [rdi+rbx*8], r12
    add r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+rbx*8+8]
    adc r11, 0
    mul r8
    mov [rdi+rbx*8+8], r9
    add r10, rax
    adc r11, rdx
    mov rax, [rsi+rbx*8+16]
    mul rcx
    add r10, rax
    mov rax, [rsi+rbx*8+16]
    adc r11, rdx
    adc r12, 0
    mul r8
    mov [rdi+rbx*8+16], r10
    mov r9, 0
    add r11, rax
    mov r10, 0
    mov rax, [rsi+rbx*8+24]
    adc r12, rdx
    mov r15, r15                ; no-op; presumably alignment/scheduling padding -- NOTE(review): confirm
    mul rcx
    add r11, rax
    mov rax, [rsi+rbx*8+24]
    adc r12, rdx
    adc r9, 0
    mul r8
    mov [rdi+rbx*8+24], r11
    add r12, rax
    adc r9, rdx
    mov rax, [rsi+rbx*8+32]
    mul rcx
    add r12, rax
    adc r9, rdx
    adc r10, 0
    add rbx, 4
    jnc %%1
%endmacro

; mul2pro0 .. mul2pro3
;   Prologues of the first (storing) mul_2 pass; all four do the same work
;   (load two multiplier limbs into rcx/r8, start the first product, clear
;   r10) -- only mul2pro0 is actually invoked, see the comment at L_even.
%macro mul2pro0 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r9, rdx
    mov r10, 0
    mov r8, [r13+r15*8+8]
%endmacro

; mul2epi0 .. mul2epi3: epilogues of the storing mul_2 pass; same tail
; handling as addmul2epi0..3 but with stores instead of adds.
%macro mul2epi0 0
    mov rbx, r14
    mov rax, [rsi+24]
    mul r8
    mov [rdi+24], r12
    add r9, rax
    adc r10, rdx
    add r15, 2
    mov rax, [rsi+r14*8]
    mov [rdi+32], r9
    lea rdi, [rdi+16]
    mov [rdi+24], r10
%endmacro

%macro mul2pro1 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r10, 0
    mov r9, rdx
    mov r8, [r13+r15*8+8]
%endmacro

%macro mul2epi1 0
    mov rax, [rsi+16]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+24]
    mov r11, 0
    adc r10, rdx
    mul rcx
    mov [rdi], r12
    add r9, rax
    adc r10, rdx
    adc r11, 0
    mov rax, [rsi+24]
    mul r8
    mov [rdi+8], r9
    add r10, rax
    adc r11, rdx
    add r15, 2
    mov rbx, r14
    mov rax, [rsi+r14*8]
    mov [rdi+24], r11
    mov [rdi+16], r10
%endmacro

%macro mul2pro2 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r10, 0
    mov r12, rax
    mov r9, rdx
    mov r8, [r13+r15*8+8]
%endmacro

%macro mul2epi2 0
    mov rax, [rsi+8]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+16]
    adc r10, rdx
    mov r11, 0
    mul rcx
    mov [rdi-8], r12
    add r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+16]
    adc r11, 0
    mul r8
    mov [rdi], r9
    add r10, rax
    adc r11, rdx
    mov rax, [rsi+24]
    mul rcx
    add r10, rax
    mov rax, [rsi+24]
    adc r11, rdx
    adc r12, 0
    mul r8
    mov [rdi+8], r10
    add r11, rax
    adc r12, rdx
    mov rax, [rsi+r14*8]
    mov [rdi+16], r11
    mov [rdi+24], r12
    add r15, 2
    mov rbx, r14
%endmacro

%macro mul2pro3 0
    mov rcx, [r13+r15*8]
    mul rcx
    mov r12, rax
    mov r9, rdx
    mov r8, [r13+r15*8+8]
    mov r10, 0
%endmacro

%macro mul2epi3 0
    mov rax, [rsi]
    lea rdi, [rdi+16]
    mul r8
    add r9, rax
    mov rax, [rsi+8]
    adc r10, rdx
    mov r11, 0
    mul rcx
    mov [rdi-16], r12
    add r9, rax
    mov r12, 0
    adc r10, rdx
    mov rax, [rsi+8]
    adc r11, 0
    mul r8
    mov [rdi-8], r9
    add r10, rax
    adc r11, rdx
    mov rax, [rsi+16]
    mul rcx
    add r10, rax
    mov rax, [rsi+16]
    adc r11, rdx
    adc r12, 0
    mul r8
    mov [rdi], r10
    mov r9, 0
    add r11, rax
    mov r10, 0
    mov rax, [rsi+24]
    adc r12, rdx
    mov r15, r15                ; no-op; presumably alignment/scheduling padding -- NOTE(review): confirm
    mul rcx
    add r11, rax
    mov rax, [rsi+24]
    adc r12, rdx
    adc r9, 0
    mul r8
    mov [rdi+8], r11
    add r12, rax
    adc r9, rdx
    mov rax, [rsi+r14*8]
    mov [rdi+16], r12
    mov [rdi+24], r9
    add r15, 2
    mov rbx, r14
%endmacro

; mul1lp
;   Unrolled-by-4 mul_1 loop: multiplies source limbs [rsi+rbx*8...] by the
;   single multiplier limb in r8 and stores the products to [rdi+rbx*8...].
;   Used on the L_odd path so that the remaining multiplier count becomes
;   even for the mul_2/addmul_2 machinery.  r9..r12 form the carry pipeline.
;   The bare "db 0x26" bytes are ES-segment-override prefixes on the
;   following instructions; functionally inert in 64-bit mode, presumably
;   inserted to pad code size for alignment/decoder tuning -- NOTE(review).
%macro mul1lp 0
    align 16
%%1:
    mov r10, 0
    mul r8
    mov [rdi+rbx*8-8], r12
    add r9, rax
    db 0x26
    adc r10, rdx
    mov rax, [rsi+rbx*8+8]
    mul r8
    mov [rdi+rbx*8], r9
    add r10, rax
    mov r11d, 0
    adc r11, rdx
    mov rax, [rsi+rbx*8+16]
    mov r12, 0
    mov r9, 0
    mul r8
    mov [rdi+rbx*8+8], r10
    db 0x26
    add r11, rax
    db 0x26
    adc r12, rdx
    mov rax, [rsi+rbx*8+24]
    mul r8
    mov [rdi+rbx*8+16], r11
    db 0x26
    add r12, rax
    db 0x26
    adc r9, rdx
    add rbx, 4
    mov rax, [rsi+rbx*8]
    jnc %%1
%endmacro

; mulnext0 .. mulnext3
;   Tails of the mul_1 pass for the four possible residual positions of rbx
;   (see the "rbx is k" notes).  Each stores the remaining product limbs and
;   the final carry, bumps the multiplier index r15 (inc r15 also sets ZF
;   for the following mpn_addmul_2_int's jz), advances rdi by one limb, and
;   resets rbx/rax for the next pass.

; rbx is 0
%macro mulnext0 0
    mov rax, [rsi+8]
    mul r8
    mov [rdi], r9
    add r10, rax
    mov r11d, 0
    adc r11, rdx
    mov rax, [rsi+16]
    mov r12d, 0
    mul r8
    mov [rdi+8], r10
    add r11, rax
    adc r12, rdx
    mov rax, [rsi+24]
    mul r8
    mov [rdi+16], r11
    add r12, rax
    adc rdx, 0
    mov [rdi+24], r12
    mov rax, [rsi+r14*8]
    mov [rdi+32], rdx
    inc r15
    lea rdi, [rdi+8]
    mov rbx, r14
%endmacro

; rbx is 1
%macro mulnext1 0
    mov rax, [rsi+16]
    mul r8
    mov [rdi+8], r9
    add r10, rax
    mov r12d, 0
    adc r12, rdx
    mov rax, [rsi+24]
    mul r8
    mov [rdi+16], r10
    add r12, rax
    adc rdx, 0
    mov [rdi+24], r12
    mov [rdi+32], rdx
    inc r15
    lea rdi, [rdi+8]
    mov rbx, r14
    mov rax, [rsi+r14*8]
%endmacro

; rbx is 2
%macro mulnext2 0
    mov rax, [rsi+24]
    mul r8
    mov [rdi+16], r9
    add r10, rax
    mov r11d, 0
    adc r11, rdx
    mov [rdi+24], r10
    mov [rdi+32], r11
    inc r15
    lea rdi, [rdi+8]
    mov rax, [rsi+r14*8]
    mov rbx, r14
%endmacro

; rbx is 3
%macro mulnext3 0
    mov [rdi+24], r9
    mov [rdi+32], r10
    inc r15
    lea rdi, [rdi+8]
    mov rax, [rsi+r14*8]
    mov rbx, r14
%endmacro

; mpn_addmul_2_int %1
;   Driver for the addmul_2 passes, tail variant %1.  Entered with the ZF
;   from the preceding r15 update: ZF set means the multiplier is exhausted
;   and we fall straight to the epilogue.  Otherwise runs
;   pro/lp/epi until r15 reaches zero (ZF from "add r15, 2" in the epi
;   macro), then restores the callee-saved registers that were stashed in
;   the red zone at function entry and returns.
%macro mpn_addmul_2_int 1
    jz %%2
    align 16
%%1:
    addmul2pro%1
    addmul2lp %1
    addmul2epi%1
    jnz %%1
%%2:
    mov r13, [rsp-8]
    mov r14, [rsp-16]
    mov rbx, [rsp-24]
    mov r12, [rsp-32]
    mov r15, [rsp-40]
    ret
%endmacro

; oldmulnext0 .. oldmulnext3, oldaddmulpro0..3, oldaddmulnext0..3
;   The "old" small-operand path (2 <= un <= 4), structured as one mul_1
;   pass (oldmulnextK finishes it) followed by one addmul_1 pass per
;   remaining multiplier limb (oldaddmulproK starts it, oldaddmulnextK
;   finishes it and bumps the multiplier counter r8; "inc r8" leaves ZF for
;   the driver's jnz).  Here r13 holds the current multiplier limb, r11 the
;   limb index, r14 the index reset value, and r15 is NOT used (it is not
;   saved on this path).
%macro oldmulnext0 0
    mov rax, [rsi+r11*8+16]
    mul r13
    mov [rdi+r11*8+8], r9
    add r10, rax
    mov ebx, 0
    adc rbx, rdx
    mov rax, [rsi+r11*8+24]
    mov r12d, 0
    mul r13
    mov [rdi+r11*8+16], r10
    add rbx, rax
    adc r12, rdx
    mov rax, [rsi+r11*8+32]
    mul r13
    mov [rdi+r11*8+24], rbx
    add r12, rax
    adc rdx, 0
    mov [rdi+r11*8+32], r12
    mov rax, [rsi+r14*8]
    mov [rdi+r11*8+40], rdx
    inc r8
    mov r11, r14
%endmacro

%macro oldmulnext1 0
    mov rax, [rsi+r11*8+16]
    mul r13
    mov [rdi+r11*8+8], r9
    add r10, rax
    mov r12d, 0
    adc r12, rdx
    mov rax, [rsi+r11*8+24]
    mul r13
    mov [rdi+r11*8+16], r10
    add r12, rax
    adc rdx, 0
    mov [rdi+r11*8+24], r12
    mov [rdi+r11*8+32], rdx
    inc r8
    lea rdi, [rdi+8]
    mov r11, r14
    mov rax, [rsi+r14*8]
%endmacro

%macro oldmulnext2 0
    mov rax, [rsi+r11*8+16]
    mul r13
    mov [rdi+r11*8+8], r9
    add r10, rax
    mov ebx, 0
    adc rbx, rdx
    mov [rdi+r11*8+16], r10
    mov [rdi+r11*8+24], rbx
    inc r8
    mov rax, [rsi+r14*8]
    mov r11, r14
%endmacro

%macro oldmulnext3 0
    mov [rdi+r11*8+8], r9
    mov [rdi+r11*8+16], r10
    inc r8
    mov rax, [rsi+r14*8]
    mov r11, r14
%endmacro

; oldaddmulpro0: loads the next multiplier limb into r13 and starts the
; first product of an addmul_1 pass.  "db 0x26" = inert ES prefix, presumed
; code-size padding -- NOTE(review).
%macro oldaddmulpro0 0
    mov r13, [rcx+r8*8]
    db 0x26
    mul r13
    db 0x26
    mov r12, rax
    mov rax, [rsi+r14*8+8]
    db 0x26
    mov r9, rdx
    lea rdi, [rdi+8]
%endmacro

%macro oldaddmulnext0 0
    mov r10d, 0
    mul r13
    add [rdi], r12
    adc r9, rax
    adc r10, rdx
    mov rax, [rsi+16]
    mul r13
    add [rdi+8], r9
    adc r10, rax
    mov ebx, 0
    adc rbx, rdx
    mov rax, [rsi+24]
    mov r12d, 0
    mov r11, r14
    mul r13
    add [rdi+16], r10
    adc rbx, rax
    adc r12, rdx
    mov rax, [rsi+32]
    mul r13
    add [rdi+24], rbx
    adc r12, rax
    adc rdx, 0
    add [rdi+32], r12
    mov rax, [rsi+r14*8]
    adc rdx, 0
    inc r8
    mov [rdi+40], rdx
%endmacro

%macro oldaddmulpro1 0
    mov r13, [rcx+r8*8]
    mul r13
    mov r12, rax
    mov rax, [rsi+r14*8+8]
    mov r9, rdx
%endmacro

%macro oldaddmulnext1 0
    mov r10d, 0
    mul r13
    add [rdi+8], r12
    adc r9, rax
    adc r10, rdx
    mov rax, [rsi+24]
    mul r13
    lea rdi, [rdi+8]
    add [rdi+8], r9
    adc r10, rax
    mov r12d, 0
    mov rax, [rsi+32]
    adc r12, rdx
    mov r11, r14
    mul r13
    add [rdi+16], r10
    adc r12, rax
    adc rdx, 0
    add [rdi+24], r12
    adc rdx, 0
    mov [rdi+32], rdx
    inc r8
    mov rax, [rsi+r14*8]
%endmacro

%macro oldaddmulpro2 0
    mov r13, [rcx+r8*8]
    lea rdi, [rdi+8]
    mul r13
    mov r12, rax
    mov rax, [rsi+r14*8+8]
    mov r9, rdx
%endmacro

%macro oldaddmulnext2 0
    mov r10d, 0
    mul r13
    add [rdi+r11*8], r12
    adc r9, rax
    adc r10, rdx
    mov rax, [rsi+r11*8+16]
    mul r13
    mov ebx, 0
    add [rdi+r11*8+8], r9
    adc r10, rax
    adc rbx, rdx
    mov rax, [rsi+r14*8]
    add [rdi+r11*8+16], r10
    adc rbx, 0
    mov [rdi+r11*8+24], rbx
    inc r8
    mov r11, r14
%endmacro

%macro oldaddmulpro3 0
    mov r13, [rcx+r8*8]
    db 0x26
    mul r13
    db 0x26
    mov r12, rax
    db 0x26
    lea rdi, [rdi+8]
    db 0x26
    mov r9, rdx
    mov rax, [rsi+r14*8+8]
%endmacro

%macro oldaddmulnext3 0
    mov r11, r14
    mul r13
    add [rdi+24], r12
    adc r9, rax
    adc rdx, 0
    add [rdi+32], r9
    mov rax, [rsi+r14*8]
    adc rdx, 0
    inc r8
    mov [rdi+40], rdx
%endmacro

; oldmpn_muladdmul_1_int %1
;   Driver for the small-operand path, tail variant %1: finish the initial
;   mul_1 pass, then loop one addmul_1 pass per remaining multiplier limb
;   (ZF from "inc r8" terminates the loop).  Restores the four callee-saved
;   registers saved in the red zone (r15 is untouched on this path) and
;   returns.
%macro oldmpn_muladdmul_1_int 1
    oldmulnext%1
    jz %%2
    align 16
%%1:
    oldaddmulpro%1
    oldaddmulnext%1
    jnz %%1
%%2:
    mov r13, [rsp-8]
    mov r14, [rsp-16]
    mov rbx, [rsp-24]
    mov r12, [rsp-32]
    ret
%endmacro

    ASM_START
; void mpn_mul_basecase(mp_ptr rdi, mp_srcptr rsi, mp_size_t rdx,
;                       mp_srcptr rcx, mp_size_t r8)
;   (rdi, rdx+r8) = (rsi, rdx) * (rcx, r8)
;   Leaf function: callee-saved registers are stashed in the SysV red zone
;   ([rsp-8] .. [rsp-40]) instead of pushing a frame.
;   Dispatch: un == 1 -> L_one; 2 <= un <= 4 -> the "old" mul_1/addmul_1
;   path; un >= 5 -> the mul_2/addmul_2 path (L_fiveormore).
    GLOBAL_FUNC mpn_mul_basecase
; the mul_2-based path below does not handle a size-one multiplicand, and
; the small path handles 2..4 limbs only, hence the explicit dispatch here
    cmp rdx, 4
    jg L_fiveormore
    cmp rdx, 1
    je L_one
; small path: save callee-saved regs in the red zone (r15 unused here)
    mov [rsp-8], r13
    mov [rsp-16], r14
    mov [rsp-24], rbx
    mov [rsp-32], r12
    mov r14, 5
    sub r14, rdx                ; r14 = 5 - un, the (negative-offset) index reset value
    lea rdi, [rdi+rdx*8-40]
    lea rcx, [rcx+r8*8]         ; point rcx past the multiplier; r8 becomes a negative counter
    neg r8
    lea rsi, [rsi+rdx*8-40]
    mov rax, [rsi+r14*8]
    mov r13, [rcx+r8*8]         ; r13 = first multiplier limb
    mov r11, r14
    mul r13
    mov r12, rax
    mov rax, [rsi+r14*8+8]
    mov r9, rdx
    mov r10d, 0
    mul r13
    mov [rdi+r11*8], r12
    add r9, rax
    adc r10, rdx
; dispatch on r11 = 5 - un (1, 2 or 3): ja -> 3, jz -> 2, jp -> 1
; (r11 - 2 = -1 has even low-byte parity, so jp catches r11 == 1)
    cmp r11, 2
    ja L_oldcase3
    jz L_oldcase2
    jp L_oldcase1
L_oldcase0:
    oldmpn_muladdmul_1_int 0
L_oldcase1:
    oldmpn_muladdmul_1_int 1
L_oldcase2:
    oldmpn_muladdmul_1_int 2
L_oldcase3:
    oldmpn_muladdmul_1_int 3
    align 16
L_fiveormore:
; rdx >= 5 as we don't have an inner jump
; (rdi,rdx+r8)=(rsi,rdx)*(rcx,r8)
    mov [rsp-8], r13            ; red-zone saves; r15 is used (and saved) on this path
    mov [rsp-16], r14
    mov [rsp-24], rbx
    mov [rsp-32], r12
    mov [rsp-40], r15
    mov r14, 4
    sub r14, rdx                ; r14 = 4 - un (negative index reset value)
    lea rdi, [rdi+rdx*8-32]
    lea rsi, [rsi+rdx*8-32]
    mov r13, rcx                ; r13 = end of multiplier, r15 = negative multiplier counter
    mov r15, r8
    lea r13, [r13+r15*8]
    neg r15
    mov rbx, r14
    mov rax, [rsi+r14*8]
; if vn is odd, do one mul_1 pass first so the remainder is even
    bt r15, 0
    jnc L_even
L_odd:
    inc rbx
    mov r8, [r13+r15*8]
    mul r8
    mov r12, rax
    mov rax, [rsi+r14*8+8]
    mov r9, rdx
    cmp rbx, 0
    jge L_mulskiploop
    mul1lp
L_mulskiploop:
    mov r10d, 0
    mul r8
    mov [rdi+rbx*8-8], r12
    add r9, rax
    adc r10, rdx
; dispatch mul_1 tail on rbx residue (same ja/jz/jp trick as above)
    cmp rbx, 2
    ja L_mul1case3
    jz L_mul1case2
    jp L_mul1case1
L_mul1case0:
    mulnext0
    jmp L_case0
L_mul1case1:
    mulnext1
    jmp L_case3
L_mul1case2:
    mulnext2
    jmp L_case2
L_mul1case3:
    mulnext3
    jmp L_case1
L_even:
    ; as all the mul2pro? are the same
    mul2pro0
    mul2lp
    cmp rbx, 2
    ja L_mul2case0
    jz L_mul2case1
    jp L_mul2case2
L_mul2case3:
    mul2epi3
L_case3:
    mpn_addmul_2_int 3
L_mul2case2:
    mul2epi2
L_case2:
    mpn_addmul_2_int 2
L_mul2case1:
    mul2epi1
L_case1:
    mpn_addmul_2_int 1
L_mul2case0:
    mul2epi0
L_case0:
    mpn_addmul_2_int 0
    align 16
L_one:
; un == 1 (and vn == 1): a single 64x64 -> 128-bit product
    mov rax, [rsi]
    mul qword [rcx]
    mov [rdi], rax
    mov [rdi+8], rdx
    ret
    end
