#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
69# <appro@openssl.org> 70# 71# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) 72# ----------------+--------------------------- 73# Opteron +13% |+5% +20% 74# Bulldozer -0% |-1% +10% 75# P4 +11% |+7% +8% 76# Westmere +5% |+14% +17% 77# Sandy Bridge +2% |+12% +29% 78# Ivy Bridge +1% |+11% +35% 79# Haswell(**) -0% |+12% +39% 80# Atom +13% |+11% +4% 81# VIA Nano +70% |+9% +25% 82# 83# (*) rsax engine and fips numbers are presented for reference 84# purposes; 85# (**) MULX was attempted, but found to give only marginal improvement; 86 87$flavour = shift; 88$output = shift; 89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 90 91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 92 93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 95( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 96die "can't locate x86_64-xlate.pl"; 97 98open OUT,"| \"$^X\" $xlate $flavour $output"; 99*STDOUT=*OUT; 100 101if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 103 $addx = ($1>=2.23); 104} 105 106if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 108 $addx = ($1>=2.10); 109} 110 111if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 112 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 113 $addx = ($1>=12); 114} 115 116if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { 117 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 118 $addx = ($ver>=3.03); 119} 120 121($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API 122{ 123my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); 124 125$code.=<<___; 126.text 127 128.extern OPENSSL_ia32cap_P 129 130.globl rsaz_512_sqr 131.type rsaz_512_sqr,\@function,5 132.align 32 133rsaz_512_sqr: # 25-29% faster than 
rsaz_512_mul 134 push %rbx 135 push %rbp 136 push %r12 137 push %r13 138 push %r14 139 push %r15 140 141 subq \$128+24, %rsp 142.Lsqr_body: 143 movq $mod, %rbp # common argument 144 movq ($inp), %rdx 145 movq 8($inp), %rax 146 movq $n0, 128(%rsp) 147___ 148$code.=<<___ if ($addx); 149 movl \$0x80100,%r11d 150 andl OPENSSL_ia32cap_P+8(%rip),%r11d 151 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 152 je .Loop_sqrx 153___ 154$code.=<<___; 155 jmp .Loop_sqr 156 157.align 32 158.Loop_sqr: 159 movl $times,128+8(%rsp) 160#first iteration 161 movq %rdx, %rbx 162 mulq %rdx 163 movq %rax, %r8 164 movq 16($inp), %rax 165 movq %rdx, %r9 166 167 mulq %rbx 168 addq %rax, %r9 169 movq 24($inp), %rax 170 movq %rdx, %r10 171 adcq \$0, %r10 172 173 mulq %rbx 174 addq %rax, %r10 175 movq 32($inp), %rax 176 movq %rdx, %r11 177 adcq \$0, %r11 178 179 mulq %rbx 180 addq %rax, %r11 181 movq 40($inp), %rax 182 movq %rdx, %r12 183 adcq \$0, %r12 184 185 mulq %rbx 186 addq %rax, %r12 187 movq 48($inp), %rax 188 movq %rdx, %r13 189 adcq \$0, %r13 190 191 mulq %rbx 192 addq %rax, %r13 193 movq 56($inp), %rax 194 movq %rdx, %r14 195 adcq \$0, %r14 196 197 mulq %rbx 198 addq %rax, %r14 199 movq %rbx, %rax 200 movq %rdx, %r15 201 adcq \$0, %r15 202 203 addq %r8, %r8 #shlq \$1, %r8 204 movq %r9, %rcx 205 adcq %r9, %r9 #shld \$1, %r8, %r9 206 207 mulq %rax 208 movq %rax, (%rsp) 209 addq %rdx, %r8 210 adcq \$0, %r9 211 212 movq %r8, 8(%rsp) 213 shrq \$63, %rcx 214 215#second iteration 216 movq 8($inp), %r8 217 movq 16($inp), %rax 218 mulq %r8 219 addq %rax, %r10 220 movq 24($inp), %rax 221 movq %rdx, %rbx 222 adcq \$0, %rbx 223 224 mulq %r8 225 addq %rax, %r11 226 movq 32($inp), %rax 227 adcq \$0, %rdx 228 addq %rbx, %r11 229 movq %rdx, %rbx 230 adcq \$0, %rbx 231 232 mulq %r8 233 addq %rax, %r12 234 movq 40($inp), %rax 235 adcq \$0, %rdx 236 addq %rbx, %r12 237 movq %rdx, %rbx 238 adcq \$0, %rbx 239 240 mulq %r8 241 addq %rax, %r13 242 movq 48($inp), %rax 243 adcq \$0, %rdx 244 addq %rbx, %r13 
245 movq %rdx, %rbx 246 adcq \$0, %rbx 247 248 mulq %r8 249 addq %rax, %r14 250 movq 56($inp), %rax 251 adcq \$0, %rdx 252 addq %rbx, %r14 253 movq %rdx, %rbx 254 adcq \$0, %rbx 255 256 mulq %r8 257 addq %rax, %r15 258 movq %r8, %rax 259 adcq \$0, %rdx 260 addq %rbx, %r15 261 movq %rdx, %r8 262 movq %r10, %rdx 263 adcq \$0, %r8 264 265 add %rdx, %rdx 266 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 267 movq %r11, %rbx 268 adcq %r11, %r11 #shld \$1, %r10, %r11 269 270 mulq %rax 271 addq %rax, %r9 272 adcq %rdx, %r10 273 adcq \$0, %r11 274 275 movq %r9, 16(%rsp) 276 movq %r10, 24(%rsp) 277 shrq \$63, %rbx 278 279#third iteration 280 movq 16($inp), %r9 281 movq 24($inp), %rax 282 mulq %r9 283 addq %rax, %r12 284 movq 32($inp), %rax 285 movq %rdx, %rcx 286 adcq \$0, %rcx 287 288 mulq %r9 289 addq %rax, %r13 290 movq 40($inp), %rax 291 adcq \$0, %rdx 292 addq %rcx, %r13 293 movq %rdx, %rcx 294 adcq \$0, %rcx 295 296 mulq %r9 297 addq %rax, %r14 298 movq 48($inp), %rax 299 adcq \$0, %rdx 300 addq %rcx, %r14 301 movq %rdx, %rcx 302 adcq \$0, %rcx 303 304 mulq %r9 305 movq %r12, %r10 306 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 307 addq %rax, %r15 308 movq 56($inp), %rax 309 adcq \$0, %rdx 310 addq %rcx, %r15 311 movq %rdx, %rcx 312 adcq \$0, %rcx 313 314 mulq %r9 315 shrq \$63, %r10 316 addq %rax, %r8 317 movq %r9, %rax 318 adcq \$0, %rdx 319 addq %rcx, %r8 320 movq %rdx, %r9 321 adcq \$0, %r9 322 323 movq %r13, %rcx 324 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 325 326 mulq %rax 327 addq %rax, %r11 328 adcq %rdx, %r12 329 adcq \$0, %r13 330 331 movq %r11, 32(%rsp) 332 movq %r12, 40(%rsp) 333 shrq \$63, %rcx 334 335#fourth iteration 336 movq 24($inp), %r10 337 movq 32($inp), %rax 338 mulq %r10 339 addq %rax, %r14 340 movq 40($inp), %rax 341 movq %rdx, %rbx 342 adcq \$0, %rbx 343 344 mulq %r10 345 addq %rax, %r15 346 movq 48($inp), %rax 347 adcq \$0, %rdx 348 addq %rbx, %r15 349 movq %rdx, %rbx 350 adcq \$0, %rbx 351 352 mulq %r10 353 movq %r14, %r12 354 leaq 
(%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 355 addq %rax, %r8 356 movq 56($inp), %rax 357 adcq \$0, %rdx 358 addq %rbx, %r8 359 movq %rdx, %rbx 360 adcq \$0, %rbx 361 362 mulq %r10 363 shrq \$63, %r12 364 addq %rax, %r9 365 movq %r10, %rax 366 adcq \$0, %rdx 367 addq %rbx, %r9 368 movq %rdx, %r10 369 adcq \$0, %r10 370 371 movq %r15, %rbx 372 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 373 374 mulq %rax 375 addq %rax, %r13 376 adcq %rdx, %r14 377 adcq \$0, %r15 378 379 movq %r13, 48(%rsp) 380 movq %r14, 56(%rsp) 381 shrq \$63, %rbx 382 383#fifth iteration 384 movq 32($inp), %r11 385 movq 40($inp), %rax 386 mulq %r11 387 addq %rax, %r8 388 movq 48($inp), %rax 389 movq %rdx, %rcx 390 adcq \$0, %rcx 391 392 mulq %r11 393 addq %rax, %r9 394 movq 56($inp), %rax 395 adcq \$0, %rdx 396 movq %r8, %r12 397 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 398 addq %rcx, %r9 399 movq %rdx, %rcx 400 adcq \$0, %rcx 401 402 mulq %r11 403 shrq \$63, %r12 404 addq %rax, %r10 405 movq %r11, %rax 406 adcq \$0, %rdx 407 addq %rcx, %r10 408 movq %rdx, %r11 409 adcq \$0, %r11 410 411 movq %r9, %rcx 412 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 413 414 mulq %rax 415 addq %rax, %r15 416 adcq %rdx, %r8 417 adcq \$0, %r9 418 419 movq %r15, 64(%rsp) 420 movq %r8, 72(%rsp) 421 shrq \$63, %rcx 422 423#sixth iteration 424 movq 40($inp), %r12 425 movq 48($inp), %rax 426 mulq %r12 427 addq %rax, %r10 428 movq 56($inp), %rax 429 movq %rdx, %rbx 430 adcq \$0, %rbx 431 432 mulq %r12 433 addq %rax, %r11 434 movq %r12, %rax 435 movq %r10, %r15 436 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 437 adcq \$0, %rdx 438 shrq \$63, %r15 439 addq %rbx, %r11 440 movq %rdx, %r12 441 adcq \$0, %r12 442 443 movq %r11, %rbx 444 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 445 446 mulq %rax 447 addq %rax, %r9 448 adcq %rdx, %r10 449 adcq \$0, %r11 450 451 movq %r9, 80(%rsp) 452 movq %r10, 88(%rsp) 453 454#seventh iteration 455 movq 48($inp), %r13 456 movq 56($inp), %rax 457 mulq %r13 458 addq %rax, %r12 459 movq 
%r13, %rax 460 movq %rdx, %r13 461 adcq \$0, %r13 462 463 xorq %r14, %r14 464 shlq \$1, %rbx 465 adcq %r12, %r12 #shld \$1, %rbx, %r12 466 adcq %r13, %r13 #shld \$1, %r12, %r13 467 adcq %r14, %r14 #shld \$1, %r13, %r14 468 469 mulq %rax 470 addq %rax, %r11 471 adcq %rdx, %r12 472 adcq \$0, %r13 473 474 movq %r11, 96(%rsp) 475 movq %r12, 104(%rsp) 476 477#eighth iteration 478 movq 56($inp), %rax 479 mulq %rax 480 addq %rax, %r13 481 adcq \$0, %rdx 482 483 addq %rdx, %r14 484 485 movq %r13, 112(%rsp) 486 movq %r14, 120(%rsp) 487 488 movq (%rsp), %r8 489 movq 8(%rsp), %r9 490 movq 16(%rsp), %r10 491 movq 24(%rsp), %r11 492 movq 32(%rsp), %r12 493 movq 40(%rsp), %r13 494 movq 48(%rsp), %r14 495 movq 56(%rsp), %r15 496 497 call __rsaz_512_reduce 498 499 addq 64(%rsp), %r8 500 adcq 72(%rsp), %r9 501 adcq 80(%rsp), %r10 502 adcq 88(%rsp), %r11 503 adcq 96(%rsp), %r12 504 adcq 104(%rsp), %r13 505 adcq 112(%rsp), %r14 506 adcq 120(%rsp), %r15 507 sbbq %rcx, %rcx 508 509 call __rsaz_512_subtract 510 511 movq %r8, %rdx 512 movq %r9, %rax 513 movl 128+8(%rsp), $times 514 movq $out, $inp 515 516 decl $times 517 jnz .Loop_sqr 518___ 519if ($addx) { 520$code.=<<___; 521 jmp .Lsqr_tail 522 523.align 32 524.Loop_sqrx: 525 movl $times,128+8(%rsp) 526 movq $out, %xmm0 # off-load 527 movq %rbp, %xmm1 # off-load 528#first iteration 529 mulx %rax, %r8, %r9 530 531 mulx 16($inp), %rcx, %r10 532 xor %rbp, %rbp # cf=0, of=0 533 534 mulx 24($inp), %rax, %r11 535 adcx %rcx, %r9 536 537 mulx 32($inp), %rcx, %r12 538 adcx %rax, %r10 539 540 mulx 40($inp), %rax, %r13 541 adcx %rcx, %r11 542 543 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 544 adcx %rax, %r12 545 adcx %rcx, %r13 546 547 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 548 adcx %rax, %r14 549 adcx %rbp, %r15 # %rbp is 0 550 551 mov %r9, %rcx 552 shld \$1, %r8, %r9 553 shl \$1, %r8 554 555 xor %ebp, %ebp 556 mulx %rdx, %rax, %rdx 557 adcx %rdx, %r8 558 mov 8($inp), 
%rdx 559 adcx %rbp, %r9 560 561 mov %rax, (%rsp) 562 mov %r8, 8(%rsp) 563 564#second iteration 565 mulx 16($inp), %rax, %rbx 566 adox %rax, %r10 567 adcx %rbx, %r11 568 569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 570 adox $out, %r11 571 adcx %r8, %r12 572 573 mulx 32($inp), %rax, %rbx 574 adox %rax, %r12 575 adcx %rbx, %r13 576 577 mulx 40($inp), $out, %r8 578 adox $out, %r13 579 adcx %r8, %r14 580 581 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 582 adox %rax, %r14 583 adcx %rbx, %r15 584 585 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 586 adox $out, %r15 587 adcx %rbp, %r8 588 adox %rbp, %r8 589 590 mov %r11, %rbx 591 shld \$1, %r10, %r11 592 shld \$1, %rcx, %r10 593 594 xor %ebp,%ebp 595 mulx %rdx, %rax, %rcx 596 mov 16($inp), %rdx 597 adcx %rax, %r9 598 adcx %rcx, %r10 599 adcx %rbp, %r11 600 601 mov %r9, 16(%rsp) 602 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) 603 604#third iteration 605 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 606 adox $out, %r12 607 adcx %r9, %r13 608 609 mulx 32($inp), %rax, %rcx 610 adox %rax, %r13 611 adcx %rcx, %r14 612 613 mulx 40($inp), $out, %r9 614 adox $out, %r14 615 adcx %r9, %r15 616 617 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx 618 adox %rax, %r15 619 adcx %rcx, %r8 620 621 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 622 adox $out, %r8 623 adcx %rbp, %r9 624 adox %rbp, %r9 625 626 mov %r13, %rcx 627 shld \$1, %r12, %r13 628 shld \$1, %rbx, %r12 629 630 xor %ebp, %ebp 631 mulx %rdx, %rax, %rdx 632 adcx %rax, %r11 633 adcx %rdx, %r12 634 mov 24($inp), %rdx 635 adcx %rbp, %r13 636 637 mov %r11, 32(%rsp) 638 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) 639 640#fourth iteration 641 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx 642 adox %rax, %r14 
643 adcx %rbx, %r15 644 645 mulx 40($inp), $out, %r10 646 adox $out, %r15 647 adcx %r10, %r8 648 649 mulx 48($inp), %rax, %rbx 650 adox %rax, %r8 651 adcx %rbx, %r9 652 653 mulx 56($inp), $out, %r10 654 adox $out, %r9 655 adcx %rbp, %r10 656 adox %rbp, %r10 657 658 .byte 0x66 659 mov %r15, %rbx 660 shld \$1, %r14, %r15 661 shld \$1, %rcx, %r14 662 663 xor %ebp, %ebp 664 mulx %rdx, %rax, %rdx 665 adcx %rax, %r13 666 adcx %rdx, %r14 667 mov 32($inp), %rdx 668 adcx %rbp, %r15 669 670 mov %r13, 48(%rsp) 671 mov %r14, 56(%rsp) 672 673#fifth iteration 674 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 675 adox $out, %r8 676 adcx %r11, %r9 677 678 mulx 48($inp), %rax, %rcx 679 adox %rax, %r9 680 adcx %rcx, %r10 681 682 mulx 56($inp), $out, %r11 683 adox $out, %r10 684 adcx %rbp, %r11 685 adox %rbp, %r11 686 687 mov %r9, %rcx 688 shld \$1, %r8, %r9 689 shld \$1, %rbx, %r8 690 691 xor %ebp, %ebp 692 mulx %rdx, %rax, %rdx 693 adcx %rax, %r15 694 adcx %rdx, %r8 695 mov 40($inp), %rdx 696 adcx %rbp, %r9 697 698 mov %r15, 64(%rsp) 699 mov %r8, 72(%rsp) 700 701#sixth iteration 702 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 703 adox %rax, %r10 704 adcx %rbx, %r11 705 706 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 707 adox $out, %r11 708 adcx %rbp, %r12 709 adox %rbp, %r12 710 711 mov %r11, %rbx 712 shld \$1, %r10, %r11 713 shld \$1, %rcx, %r10 714 715 xor %ebp, %ebp 716 mulx %rdx, %rax, %rdx 717 adcx %rax, %r9 718 adcx %rdx, %r10 719 mov 48($inp), %rdx 720 adcx %rbp, %r11 721 722 mov %r9, 80(%rsp) 723 mov %r10, 88(%rsp) 724 725#seventh iteration 726 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 727 adox %rax, %r12 728 adox %rbp, %r13 729 730 xor %r14, %r14 731 shld \$1, %r13, %r14 732 shld \$1, %r12, %r13 733 shld \$1, %rbx, %r12 734 735 xor %ebp, %ebp 736 mulx %rdx, %rax, %rdx 737 adcx %rax, %r11 738 adcx %rdx, %r12 739 mov 56($inp), %rdx 740 
adcx %rbp, %r13 741 742 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) 743 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) 744 745#eighth iteration 746 mulx %rdx, %rax, %rdx 747 adox %rax, %r13 748 adox %rbp, %rdx 749 750 .byte 0x66 751 add %rdx, %r14 752 753 movq %r13, 112(%rsp) 754 movq %r14, 120(%rsp) 755 movq %xmm0, $out 756 movq %xmm1, %rbp 757 758 movq 128(%rsp), %rdx # pull $n0 759 movq (%rsp), %r8 760 movq 8(%rsp), %r9 761 movq 16(%rsp), %r10 762 movq 24(%rsp), %r11 763 movq 32(%rsp), %r12 764 movq 40(%rsp), %r13 765 movq 48(%rsp), %r14 766 movq 56(%rsp), %r15 767 768 call __rsaz_512_reducex 769 770 addq 64(%rsp), %r8 771 adcq 72(%rsp), %r9 772 adcq 80(%rsp), %r10 773 adcq 88(%rsp), %r11 774 adcq 96(%rsp), %r12 775 adcq 104(%rsp), %r13 776 adcq 112(%rsp), %r14 777 adcq 120(%rsp), %r15 778 sbbq %rcx, %rcx 779 780 call __rsaz_512_subtract 781 782 movq %r8, %rdx 783 movq %r9, %rax 784 movl 128+8(%rsp), $times 785 movq $out, $inp 786 787 decl $times 788 jnz .Loop_sqrx 789 790.Lsqr_tail: 791___ 792} 793$code.=<<___; 794 795 leaq 128+24+48(%rsp), %rax 796 movq -48(%rax), %r15 797 movq -40(%rax), %r14 798 movq -32(%rax), %r13 799 movq -24(%rax), %r12 800 movq -16(%rax), %rbp 801 movq -8(%rax), %rbx 802 leaq (%rax), %rsp 803.Lsqr_epilogue: 804 ret 805.size rsaz_512_sqr,.-rsaz_512_sqr 806___ 807} 808{ 809my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); 810$code.=<<___; 811.globl rsaz_512_mul 812.type rsaz_512_mul,\@function,5 813.align 32 814rsaz_512_mul: 815 push %rbx 816 push %rbp 817 push %r12 818 push %r13 819 push %r14 820 push %r15 821 822 subq \$128+24, %rsp 823.Lmul_body: 824 movq $out, %xmm0 # off-load arguments 825 movq $mod, %xmm1 826 movq $n0, 128(%rsp) 827___ 828$code.=<<___ if ($addx); 829 movl \$0x80100,%r11d 830 andl OPENSSL_ia32cap_P+8(%rip),%r11d 831 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 832 je .Lmulx 833___ 834$code.=<<___; 835 movq ($bp), %rbx # pass b[0] 836 movq $bp, %rbp # 
pass argument 837 call __rsaz_512_mul 838 839 movq %xmm0, $out 840 movq %xmm1, %rbp 841 842 movq (%rsp), %r8 843 movq 8(%rsp), %r9 844 movq 16(%rsp), %r10 845 movq 24(%rsp), %r11 846 movq 32(%rsp), %r12 847 movq 40(%rsp), %r13 848 movq 48(%rsp), %r14 849 movq 56(%rsp), %r15 850 851 call __rsaz_512_reduce 852___ 853$code.=<<___ if ($addx); 854 jmp .Lmul_tail 855 856.align 32 857.Lmulx: 858 movq $bp, %rbp # pass argument 859 movq ($bp), %rdx # pass b[0] 860 call __rsaz_512_mulx 861 862 movq %xmm0, $out 863 movq %xmm1, %rbp 864 865 movq 128(%rsp), %rdx # pull $n0 866 movq (%rsp), %r8 867 movq 8(%rsp), %r9 868 movq 16(%rsp), %r10 869 movq 24(%rsp), %r11 870 movq 32(%rsp), %r12 871 movq 40(%rsp), %r13 872 movq 48(%rsp), %r14 873 movq 56(%rsp), %r15 874 875 call __rsaz_512_reducex 876.Lmul_tail: 877___ 878$code.=<<___; 879 addq 64(%rsp), %r8 880 adcq 72(%rsp), %r9 881 adcq 80(%rsp), %r10 882 adcq 88(%rsp), %r11 883 adcq 96(%rsp), %r12 884 adcq 104(%rsp), %r13 885 adcq 112(%rsp), %r14 886 adcq 120(%rsp), %r15 887 sbbq %rcx, %rcx 888 889 call __rsaz_512_subtract 890 891 leaq 128+24+48(%rsp), %rax 892 movq -48(%rax), %r15 893 movq -40(%rax), %r14 894 movq -32(%rax), %r13 895 movq -24(%rax), %r12 896 movq -16(%rax), %rbp 897 movq -8(%rax), %rbx 898 leaq (%rax), %rsp 899.Lmul_epilogue: 900 ret 901.size rsaz_512_mul,.-rsaz_512_mul 902___ 903} 904{ 905my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 906$code.=<<___; 907.globl rsaz_512_mul_gather4 908.type rsaz_512_mul_gather4,\@function,6 909.align 32 910rsaz_512_mul_gather4: 911 push %rbx 912 push %rbp 913 push %r12 914 push %r13 915 push %r14 916 push %r15 917 918 subq \$`128+24+($win64?0xb0:0)`, %rsp 919___ 920$code.=<<___ if ($win64); 921 movaps %xmm6,0xa0(%rsp) 922 movaps %xmm7,0xb0(%rsp) 923 movaps %xmm8,0xc0(%rsp) 924 movaps %xmm9,0xd0(%rsp) 925 movaps %xmm10,0xe0(%rsp) 926 movaps %xmm11,0xf0(%rsp) 927 movaps %xmm12,0x100(%rsp) 928 movaps %xmm13,0x110(%rsp) 929 movaps %xmm14,0x120(%rsp) 930 
movaps %xmm15,0x130(%rsp) 931___ 932$code.=<<___; 933.Lmul_gather4_body: 934 movd $pwr,%xmm8 935 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 936 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 937 938 pshufd \$0,%xmm8,%xmm8 # broadcast $power 939 movdqa %xmm1,%xmm7 940 movdqa %xmm1,%xmm2 941___ 942######################################################################## 943# calculate mask by comparing 0..15 to $power 944# 945for($i=0;$i<4;$i++) { 946$code.=<<___; 947 paddd %xmm`$i`,%xmm`$i+1` 948 pcmpeqd %xmm8,%xmm`$i` 949 movdqa %xmm7,%xmm`$i+3` 950___ 951} 952for(;$i<7;$i++) { 953$code.=<<___; 954 paddd %xmm`$i`,%xmm`$i+1` 955 pcmpeqd %xmm8,%xmm`$i` 956___ 957} 958$code.=<<___; 959 pcmpeqd %xmm8,%xmm7 960 961 movdqa 16*0($bp),%xmm8 962 movdqa 16*1($bp),%xmm9 963 movdqa 16*2($bp),%xmm10 964 movdqa 16*3($bp),%xmm11 965 pand %xmm0,%xmm8 966 movdqa 16*4($bp),%xmm12 967 pand %xmm1,%xmm9 968 movdqa 16*5($bp),%xmm13 969 pand %xmm2,%xmm10 970 movdqa 16*6($bp),%xmm14 971 pand %xmm3,%xmm11 972 movdqa 16*7($bp),%xmm15 973 leaq 128($bp), %rbp 974 pand %xmm4,%xmm12 975 pand %xmm5,%xmm13 976 pand %xmm6,%xmm14 977 pand %xmm7,%xmm15 978 por %xmm10,%xmm8 979 por %xmm11,%xmm9 980 por %xmm12,%xmm8 981 por %xmm13,%xmm9 982 por %xmm14,%xmm8 983 por %xmm15,%xmm9 984 985 por %xmm9,%xmm8 986 pshufd \$0x4e,%xmm8,%xmm9 987 por %xmm9,%xmm8 988___ 989$code.=<<___ if ($addx); 990 movl \$0x80100,%r11d 991 andl OPENSSL_ia32cap_P+8(%rip),%r11d 992 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 993 je .Lmulx_gather 994___ 995$code.=<<___; 996 movq %xmm8,%rbx 997 998 movq $n0, 128(%rsp) # off-load arguments 999 movq $out, 128+8(%rsp) 1000 movq $mod, 128+16(%rsp) 1001 1002 movq ($ap), %rax 1003 movq 8($ap), %rcx 1004 mulq %rbx # 0 iteration 1005 movq %rax, (%rsp) 1006 movq %rcx, %rax 1007 movq %rdx, %r8 1008 1009 mulq %rbx 1010 addq %rax, %r8 1011 movq 16($ap), %rax 1012 movq %rdx, %r9 1013 adcq \$0, %r9 1014 1015 mulq %rbx 1016 addq %rax, %r9 1017 movq 24($ap), 
%rax 1018 movq %rdx, %r10 1019 adcq \$0, %r10 1020 1021 mulq %rbx 1022 addq %rax, %r10 1023 movq 32($ap), %rax 1024 movq %rdx, %r11 1025 adcq \$0, %r11 1026 1027 mulq %rbx 1028 addq %rax, %r11 1029 movq 40($ap), %rax 1030 movq %rdx, %r12 1031 adcq \$0, %r12 1032 1033 mulq %rbx 1034 addq %rax, %r12 1035 movq 48($ap), %rax 1036 movq %rdx, %r13 1037 adcq \$0, %r13 1038 1039 mulq %rbx 1040 addq %rax, %r13 1041 movq 56($ap), %rax 1042 movq %rdx, %r14 1043 adcq \$0, %r14 1044 1045 mulq %rbx 1046 addq %rax, %r14 1047 movq ($ap), %rax 1048 movq %rdx, %r15 1049 adcq \$0, %r15 1050 1051 leaq 8(%rsp), %rdi 1052 movl \$7, %ecx 1053 jmp .Loop_mul_gather 1054 1055.align 32 1056.Loop_mul_gather: 1057 movdqa 16*0(%rbp),%xmm8 1058 movdqa 16*1(%rbp),%xmm9 1059 movdqa 16*2(%rbp),%xmm10 1060 movdqa 16*3(%rbp),%xmm11 1061 pand %xmm0,%xmm8 1062 movdqa 16*4(%rbp),%xmm12 1063 pand %xmm1,%xmm9 1064 movdqa 16*5(%rbp),%xmm13 1065 pand %xmm2,%xmm10 1066 movdqa 16*6(%rbp),%xmm14 1067 pand %xmm3,%xmm11 1068 movdqa 16*7(%rbp),%xmm15 1069 leaq 128(%rbp), %rbp 1070 pand %xmm4,%xmm12 1071 pand %xmm5,%xmm13 1072 pand %xmm6,%xmm14 1073 pand %xmm7,%xmm15 1074 por %xmm10,%xmm8 1075 por %xmm11,%xmm9 1076 por %xmm12,%xmm8 1077 por %xmm13,%xmm9 1078 por %xmm14,%xmm8 1079 por %xmm15,%xmm9 1080 1081 por %xmm9,%xmm8 1082 pshufd \$0x4e,%xmm8,%xmm9 1083 por %xmm9,%xmm8 1084 movq %xmm8,%rbx 1085 1086 mulq %rbx 1087 addq %rax, %r8 1088 movq 8($ap), %rax 1089 movq %r8, (%rdi) 1090 movq %rdx, %r8 1091 adcq \$0, %r8 1092 1093 mulq %rbx 1094 addq %rax, %r9 1095 movq 16($ap), %rax 1096 adcq \$0, %rdx 1097 addq %r9, %r8 1098 movq %rdx, %r9 1099 adcq \$0, %r9 1100 1101 mulq %rbx 1102 addq %rax, %r10 1103 movq 24($ap), %rax 1104 adcq \$0, %rdx 1105 addq %r10, %r9 1106 movq %rdx, %r10 1107 adcq \$0, %r10 1108 1109 mulq %rbx 1110 addq %rax, %r11 1111 movq 32($ap), %rax 1112 adcq \$0, %rdx 1113 addq %r11, %r10 1114 movq %rdx, %r11 1115 adcq \$0, %r11 1116 1117 mulq %rbx 1118 addq %rax, %r12 1119 movq 40($ap), %rax 1120 
adcq \$0, %rdx 1121 addq %r12, %r11 1122 movq %rdx, %r12 1123 adcq \$0, %r12 1124 1125 mulq %rbx 1126 addq %rax, %r13 1127 movq 48($ap), %rax 1128 adcq \$0, %rdx 1129 addq %r13, %r12 1130 movq %rdx, %r13 1131 adcq \$0, %r13 1132 1133 mulq %rbx 1134 addq %rax, %r14 1135 movq 56($ap), %rax 1136 adcq \$0, %rdx 1137 addq %r14, %r13 1138 movq %rdx, %r14 1139 adcq \$0, %r14 1140 1141 mulq %rbx 1142 addq %rax, %r15 1143 movq ($ap), %rax 1144 adcq \$0, %rdx 1145 addq %r15, %r14 1146 movq %rdx, %r15 1147 adcq \$0, %r15 1148 1149 leaq 8(%rdi), %rdi 1150 1151 decl %ecx 1152 jnz .Loop_mul_gather 1153 1154 movq %r8, (%rdi) 1155 movq %r9, 8(%rdi) 1156 movq %r10, 16(%rdi) 1157 movq %r11, 24(%rdi) 1158 movq %r12, 32(%rdi) 1159 movq %r13, 40(%rdi) 1160 movq %r14, 48(%rdi) 1161 movq %r15, 56(%rdi) 1162 1163 movq 128+8(%rsp), $out 1164 movq 128+16(%rsp), %rbp 1165 1166 movq (%rsp), %r8 1167 movq 8(%rsp), %r9 1168 movq 16(%rsp), %r10 1169 movq 24(%rsp), %r11 1170 movq 32(%rsp), %r12 1171 movq 40(%rsp), %r13 1172 movq 48(%rsp), %r14 1173 movq 56(%rsp), %r15 1174 1175 call __rsaz_512_reduce 1176___ 1177$code.=<<___ if ($addx); 1178 jmp .Lmul_gather_tail 1179 1180.align 32 1181.Lmulx_gather: 1182 movq %xmm8,%rdx 1183 1184 mov $n0, 128(%rsp) # off-load arguments 1185 mov $out, 128+8(%rsp) 1186 mov $mod, 128+16(%rsp) 1187 1188 mulx ($ap), %rbx, %r8 # 0 iteration 1189 mov %rbx, (%rsp) 1190 xor %edi, %edi # cf=0, of=0 1191 1192 mulx 8($ap), %rax, %r9 1193 1194 mulx 16($ap), %rbx, %r10 1195 adcx %rax, %r8 1196 1197 mulx 24($ap), %rax, %r11 1198 adcx %rbx, %r9 1199 1200 mulx 32($ap), %rbx, %r12 1201 adcx %rax, %r10 1202 1203 mulx 40($ap), %rax, %r13 1204 adcx %rbx, %r11 1205 1206 mulx 48($ap), %rbx, %r14 1207 adcx %rax, %r12 1208 1209 mulx 56($ap), %rax, %r15 1210 adcx %rbx, %r13 1211 adcx %rax, %r14 1212 .byte 0x67 1213 mov %r8, %rbx 1214 adcx %rdi, %r15 # %rdi is 0 1215 1216 mov \$-7, %rcx 1217 jmp .Loop_mulx_gather 1218 1219.align 32 1220.Loop_mulx_gather: 1221 movdqa 16*0(%rbp),%xmm8 1222 
movdqa 16*1(%rbp),%xmm9 1223 movdqa 16*2(%rbp),%xmm10 1224 movdqa 16*3(%rbp),%xmm11 1225 pand %xmm0,%xmm8 1226 movdqa 16*4(%rbp),%xmm12 1227 pand %xmm1,%xmm9 1228 movdqa 16*5(%rbp),%xmm13 1229 pand %xmm2,%xmm10 1230 movdqa 16*6(%rbp),%xmm14 1231 pand %xmm3,%xmm11 1232 movdqa 16*7(%rbp),%xmm15 1233 leaq 128(%rbp), %rbp 1234 pand %xmm4,%xmm12 1235 pand %xmm5,%xmm13 1236 pand %xmm6,%xmm14 1237 pand %xmm7,%xmm15 1238 por %xmm10,%xmm8 1239 por %xmm11,%xmm9 1240 por %xmm12,%xmm8 1241 por %xmm13,%xmm9 1242 por %xmm14,%xmm8 1243 por %xmm15,%xmm9 1244 1245 por %xmm9,%xmm8 1246 pshufd \$0x4e,%xmm8,%xmm9 1247 por %xmm9,%xmm8 1248 movq %xmm8,%rdx 1249 1250 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 1251 adcx %rax, %rbx 1252 adox %r9, %r8 1253 1254 mulx 8($ap), %rax, %r9 1255 adcx %rax, %r8 1256 adox %r10, %r9 1257 1258 mulx 16($ap), %rax, %r10 1259 adcx %rax, %r9 1260 adox %r11, %r10 1261 1262 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1263 adcx %rax, %r10 1264 adox %r12, %r11 1265 1266 mulx 32($ap), %rax, %r12 1267 adcx %rax, %r11 1268 adox %r13, %r12 1269 1270 mulx 40($ap), %rax, %r13 1271 adcx %rax, %r12 1272 adox %r14, %r13 1273 1274 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1275 adcx %rax, %r13 1276 .byte 0x67 1277 adox %r15, %r14 1278 1279 mulx 56($ap), %rax, %r15 1280 mov %rbx, 64(%rsp,%rcx,8) 1281 adcx %rax, %r14 1282 adox %rdi, %r15 1283 mov %r8, %rbx 1284 adcx %rdi, %r15 # cf=0 1285 1286 inc %rcx # of=0 1287 jnz .Loop_mulx_gather 1288 1289 mov %r8, 64(%rsp) 1290 mov %r9, 64+8(%rsp) 1291 mov %r10, 64+16(%rsp) 1292 mov %r11, 64+24(%rsp) 1293 mov %r12, 64+32(%rsp) 1294 mov %r13, 64+40(%rsp) 1295 mov %r14, 64+48(%rsp) 1296 mov %r15, 64+56(%rsp) 1297 1298 mov 128(%rsp), %rdx # pull arguments 1299 mov 128+8(%rsp), $out 1300 mov 128+16(%rsp), %rbp 1301 1302 mov (%rsp), %r8 1303 mov 8(%rsp), %r9 1304 mov 16(%rsp), %r10 1305 mov 24(%rsp), %r11 1306 mov 32(%rsp), %r12 1307 
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	# add the upper half of the double-width product kept at 64(%rsp)
	# and derive an all-ones/all-zeros borrow mask in %rcx for the
	# conditional final subtraction of the modulus
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	# Win64: restore non-volatile xmm6-xmm15 (saved by the prologue,
	# which is outside this view) before unwinding the stack
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
	# restore callee-saved GPRs and return
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_gather4_epilogue:
	ret
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{ # rsaz_512_mul_scatter4(out, ap, mod, n0, tbl, pwr)
  # Multiply out[] by ap[], Montgomery-reduce, and scatter the result
  # into tbl[] at power index pwr with a 128-byte stride.
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr	# zero-extend 32-bit power index
	subq	\$128+24, %rsp
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	# reload the low half of the product from the stack for reduction
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	# fold in the upper half and build the borrow mask for the
	# conditional subtraction of the modulus
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp		# recover saved $tbl pointer
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_scatter4_epilogue:
	ret
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{ # rsaz_512_mul_by_one(out, inp, mod, n0)
  # Montgomery-reduce inp[] (i.e. multiply by 1 in the Montgomery
  # domain) and store the 512-bit result to out[].
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp	# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	# clear the scratch area on the stack
	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{ # __rsaz_512_reduce
	#
	# Montgomery reduction, 8 word-by-word passes using mul/adc.
	# n0 lives at 128(%rsp) in the caller's frame; the +8 below
	# accounts for the return address pushed by `call`.
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx	# m = r8 * n0
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8			# low word cancels by construction
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi	# preload n0 for next iteration's m
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi		# next m = r8 * n0, overlapped with muls
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx		# rotate in the next multiplier
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# Montgomery reduction as above, but using MULX/ADCX/ADOX so the
	# two carry chains (CF and OF) run in parallel.
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0, %rdx - n0 preloaded
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx		# pull $n0
	imulq	%r8, %rdx			# m = r8 * n0
	xorq	%rsi, %rsi			# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax			# stash current m
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx		# next m = r8 * n0
	mov	%rax, %rdx			# restore current m

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx			# rotate in the next multiplier
	adcx	%rax, %r14
	adox	%rsi, %r15			# %rsi is 0
	adcx	%rsi, %r15			# cf=0

	decl	%ecx				# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{ # __rsaz_512_subtract
	# Store %r8-%r15 to $out, then add (mask & -mod) back, i.e.
	# conditionally subtract the modulus (two's complement add of
	# ~mod+1 under an all-ones mask, no-op under an all-zeros mask).
	# input:	%r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers:	everything but %rdi, %rsi and %rbp
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8			# -mod[0]; remaining words are ~mod[i]
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{ # __rsaz_512_mul
	#
	# Schoolbook 512x512->1024-bit multiply; the 16-word product is
	# written to 8(%rsp) onward (+8 skips the return address), with
	# b[0] preloaded in %rbx by the caller.
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx			# first pass: ap[] * b[0]
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx		# 7 more passes: ap[] * b[1..7]
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)		# emit lowest live word
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)		# flush the top 8 words
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# MULX/ADCX/ADOX flavour of the 512x512->1024-bit multiply;
	# b[0] is preloaded in %rdx by the caller, product goes to
	# 8(%rsp) onward.
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx		# negative counter, runs up to 0

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx		# preload b[1]
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx	# preload next b[i]
	movq	%rbx, 8+64-8(%rsp,%rcx,8)	# emit lowest live word
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	# final pass for b[7], unrolled out of the loop
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{ # rsaz_512_scatter4 / rsaz_512_gather4
  # Scatter stores a 512-bit value into a table with 128-byte stride;
  # gather reads it back in constant time by masking all 16 table
  # entries against an equality mask on $power.
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	# read all 16 table entries, AND each against its mask, OR the
	# results together — constant-time, no power-dependent addressing
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	# mul_gather4 additionally saved xmm6-xmm15; copy them into the
	# CONTEXT record and account for the extra 0xb0 bytes of frame
	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00	# UWOP-encoded unwind (no handler)
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

# expand `...` expressions in the generated code and emit it
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;