#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# at the moment of this writing!] Nor does this module implement a
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to a more modular mixture of C and assembly. And it's optimized even
# for processors other than the Intel Core family (see the table below
# for improvement coefficients).
37# <appro@openssl.org> 38# 39# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) 40# ----------------+--------------------------- 41# Opteron +13% |+5% +20% 42# Bulldozer -0% |-1% +10% 43# P4 +11% |+7% +8% 44# Westmere +5% |+14% +17% 45# Sandy Bridge +2% |+12% +29% 46# Ivy Bridge +1% |+11% +35% 47# Haswell(**) -0% |+12% +39% 48# Atom +13% |+11% +4% 49# VIA Nano +70% |+9% +25% 50# 51# (*) rsax engine and fips numbers are presented for reference 52# purposes; 53# (**) MULX was attempted, but found to give only marginal improvement; 54 55$flavour = shift; 56$output = shift; 57if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 58 59$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 60 61$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 62( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 63( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 64die "can't locate x86_64-xlate.pl"; 65 66open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 67*STDOUT=*OUT; 68 69if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 70 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 71 $addx = ($1>=2.23); 72} 73 74if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 75 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 76 $addx = ($1>=2.10); 77} 78 79if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 80 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 81 $addx = ($1>=12); 82} 83 84if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { 85 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 86 $addx = ($ver>=3.03); 87} 88 89($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API 90{ 91my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); 92 93$code.=<<___; 94.text 95 96.extern OPENSSL_ia32cap_P 97 98.globl rsaz_512_sqr 99.type rsaz_512_sqr,\@function,5 100.align 32 101rsaz_512_sqr: # 25-29% faster than rsaz_512_mul 102.cfi_startproc 
103 push %rbx 104.cfi_push %rbx 105 push %rbp 106.cfi_push %rbp 107 push %r12 108.cfi_push %r12 109 push %r13 110.cfi_push %r13 111 push %r14 112.cfi_push %r14 113 push %r15 114.cfi_push %r15 115 116 subq \$128+24, %rsp 117.cfi_adjust_cfa_offset 128+24 118.Lsqr_body: 119 movq $mod, %rbp # common argument 120 movq ($inp), %rdx 121 movq 8($inp), %rax 122 movq $n0, 128(%rsp) 123___ 124$code.=<<___ if ($addx); 125 movl \$0x80100,%r11d 126 andl OPENSSL_ia32cap_P+8(%rip),%r11d 127 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 128 je .Loop_sqrx 129___ 130$code.=<<___; 131 jmp .Loop_sqr 132 133.align 32 134.Loop_sqr: 135 movl $times,128+8(%rsp) 136#first iteration 137 movq %rdx, %rbx 138 mulq %rdx 139 movq %rax, %r8 140 movq 16($inp), %rax 141 movq %rdx, %r9 142 143 mulq %rbx 144 addq %rax, %r9 145 movq 24($inp), %rax 146 movq %rdx, %r10 147 adcq \$0, %r10 148 149 mulq %rbx 150 addq %rax, %r10 151 movq 32($inp), %rax 152 movq %rdx, %r11 153 adcq \$0, %r11 154 155 mulq %rbx 156 addq %rax, %r11 157 movq 40($inp), %rax 158 movq %rdx, %r12 159 adcq \$0, %r12 160 161 mulq %rbx 162 addq %rax, %r12 163 movq 48($inp), %rax 164 movq %rdx, %r13 165 adcq \$0, %r13 166 167 mulq %rbx 168 addq %rax, %r13 169 movq 56($inp), %rax 170 movq %rdx, %r14 171 adcq \$0, %r14 172 173 mulq %rbx 174 addq %rax, %r14 175 movq %rbx, %rax 176 movq %rdx, %r15 177 adcq \$0, %r15 178 179 addq %r8, %r8 #shlq \$1, %r8 180 movq %r9, %rcx 181 adcq %r9, %r9 #shld \$1, %r8, %r9 182 183 mulq %rax 184 movq %rax, (%rsp) 185 addq %rdx, %r8 186 adcq \$0, %r9 187 188 movq %r8, 8(%rsp) 189 shrq \$63, %rcx 190 191#second iteration 192 movq 8($inp), %r8 193 movq 16($inp), %rax 194 mulq %r8 195 addq %rax, %r10 196 movq 24($inp), %rax 197 movq %rdx, %rbx 198 adcq \$0, %rbx 199 200 mulq %r8 201 addq %rax, %r11 202 movq 32($inp), %rax 203 adcq \$0, %rdx 204 addq %rbx, %r11 205 movq %rdx, %rbx 206 adcq \$0, %rbx 207 208 mulq %r8 209 addq %rax, %r12 210 movq 40($inp), %rax 211 adcq \$0, %rdx 212 addq %rbx, %r12 213 movq 
%rdx, %rbx 214 adcq \$0, %rbx 215 216 mulq %r8 217 addq %rax, %r13 218 movq 48($inp), %rax 219 adcq \$0, %rdx 220 addq %rbx, %r13 221 movq %rdx, %rbx 222 adcq \$0, %rbx 223 224 mulq %r8 225 addq %rax, %r14 226 movq 56($inp), %rax 227 adcq \$0, %rdx 228 addq %rbx, %r14 229 movq %rdx, %rbx 230 adcq \$0, %rbx 231 232 mulq %r8 233 addq %rax, %r15 234 movq %r8, %rax 235 adcq \$0, %rdx 236 addq %rbx, %r15 237 movq %rdx, %r8 238 movq %r10, %rdx 239 adcq \$0, %r8 240 241 add %rdx, %rdx 242 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 243 movq %r11, %rbx 244 adcq %r11, %r11 #shld \$1, %r10, %r11 245 246 mulq %rax 247 addq %rax, %r9 248 adcq %rdx, %r10 249 adcq \$0, %r11 250 251 movq %r9, 16(%rsp) 252 movq %r10, 24(%rsp) 253 shrq \$63, %rbx 254 255#third iteration 256 movq 16($inp), %r9 257 movq 24($inp), %rax 258 mulq %r9 259 addq %rax, %r12 260 movq 32($inp), %rax 261 movq %rdx, %rcx 262 adcq \$0, %rcx 263 264 mulq %r9 265 addq %rax, %r13 266 movq 40($inp), %rax 267 adcq \$0, %rdx 268 addq %rcx, %r13 269 movq %rdx, %rcx 270 adcq \$0, %rcx 271 272 mulq %r9 273 addq %rax, %r14 274 movq 48($inp), %rax 275 adcq \$0, %rdx 276 addq %rcx, %r14 277 movq %rdx, %rcx 278 adcq \$0, %rcx 279 280 mulq %r9 281 movq %r12, %r10 282 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 283 addq %rax, %r15 284 movq 56($inp), %rax 285 adcq \$0, %rdx 286 addq %rcx, %r15 287 movq %rdx, %rcx 288 adcq \$0, %rcx 289 290 mulq %r9 291 shrq \$63, %r10 292 addq %rax, %r8 293 movq %r9, %rax 294 adcq \$0, %rdx 295 addq %rcx, %r8 296 movq %rdx, %r9 297 adcq \$0, %r9 298 299 movq %r13, %rcx 300 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 301 302 mulq %rax 303 addq %rax, %r11 304 adcq %rdx, %r12 305 adcq \$0, %r13 306 307 movq %r11, 32(%rsp) 308 movq %r12, 40(%rsp) 309 shrq \$63, %rcx 310 311#fourth iteration 312 movq 24($inp), %r10 313 movq 32($inp), %rax 314 mulq %r10 315 addq %rax, %r14 316 movq 40($inp), %rax 317 movq %rdx, %rbx 318 adcq \$0, %rbx 319 320 mulq %r10 321 addq %rax, %r15 322 movq 48($inp), 
%rax 323 adcq \$0, %rdx 324 addq %rbx, %r15 325 movq %rdx, %rbx 326 adcq \$0, %rbx 327 328 mulq %r10 329 movq %r14, %r12 330 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 331 addq %rax, %r8 332 movq 56($inp), %rax 333 adcq \$0, %rdx 334 addq %rbx, %r8 335 movq %rdx, %rbx 336 adcq \$0, %rbx 337 338 mulq %r10 339 shrq \$63, %r12 340 addq %rax, %r9 341 movq %r10, %rax 342 adcq \$0, %rdx 343 addq %rbx, %r9 344 movq %rdx, %r10 345 adcq \$0, %r10 346 347 movq %r15, %rbx 348 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 349 350 mulq %rax 351 addq %rax, %r13 352 adcq %rdx, %r14 353 adcq \$0, %r15 354 355 movq %r13, 48(%rsp) 356 movq %r14, 56(%rsp) 357 shrq \$63, %rbx 358 359#fifth iteration 360 movq 32($inp), %r11 361 movq 40($inp), %rax 362 mulq %r11 363 addq %rax, %r8 364 movq 48($inp), %rax 365 movq %rdx, %rcx 366 adcq \$0, %rcx 367 368 mulq %r11 369 addq %rax, %r9 370 movq 56($inp), %rax 371 adcq \$0, %rdx 372 movq %r8, %r12 373 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 374 addq %rcx, %r9 375 movq %rdx, %rcx 376 adcq \$0, %rcx 377 378 mulq %r11 379 shrq \$63, %r12 380 addq %rax, %r10 381 movq %r11, %rax 382 adcq \$0, %rdx 383 addq %rcx, %r10 384 movq %rdx, %r11 385 adcq \$0, %r11 386 387 movq %r9, %rcx 388 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 389 390 mulq %rax 391 addq %rax, %r15 392 adcq %rdx, %r8 393 adcq \$0, %r9 394 395 movq %r15, 64(%rsp) 396 movq %r8, 72(%rsp) 397 shrq \$63, %rcx 398 399#sixth iteration 400 movq 40($inp), %r12 401 movq 48($inp), %rax 402 mulq %r12 403 addq %rax, %r10 404 movq 56($inp), %rax 405 movq %rdx, %rbx 406 adcq \$0, %rbx 407 408 mulq %r12 409 addq %rax, %r11 410 movq %r12, %rax 411 movq %r10, %r15 412 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 413 adcq \$0, %rdx 414 shrq \$63, %r15 415 addq %rbx, %r11 416 movq %rdx, %r12 417 adcq \$0, %r12 418 419 movq %r11, %rbx 420 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 421 422 mulq %rax 423 addq %rax, %r9 424 adcq %rdx, %r10 425 adcq \$0, %r11 426 427 movq %r9, 80(%rsp) 428 movq 
%r10, 88(%rsp) 429 430#seventh iteration 431 movq 48($inp), %r13 432 movq 56($inp), %rax 433 mulq %r13 434 addq %rax, %r12 435 movq %r13, %rax 436 movq %rdx, %r13 437 adcq \$0, %r13 438 439 xorq %r14, %r14 440 shlq \$1, %rbx 441 adcq %r12, %r12 #shld \$1, %rbx, %r12 442 adcq %r13, %r13 #shld \$1, %r12, %r13 443 adcq %r14, %r14 #shld \$1, %r13, %r14 444 445 mulq %rax 446 addq %rax, %r11 447 adcq %rdx, %r12 448 adcq \$0, %r13 449 450 movq %r11, 96(%rsp) 451 movq %r12, 104(%rsp) 452 453#eighth iteration 454 movq 56($inp), %rax 455 mulq %rax 456 addq %rax, %r13 457 adcq \$0, %rdx 458 459 addq %rdx, %r14 460 461 movq %r13, 112(%rsp) 462 movq %r14, 120(%rsp) 463 464 movq (%rsp), %r8 465 movq 8(%rsp), %r9 466 movq 16(%rsp), %r10 467 movq 24(%rsp), %r11 468 movq 32(%rsp), %r12 469 movq 40(%rsp), %r13 470 movq 48(%rsp), %r14 471 movq 56(%rsp), %r15 472 473 call __rsaz_512_reduce 474 475 addq 64(%rsp), %r8 476 adcq 72(%rsp), %r9 477 adcq 80(%rsp), %r10 478 adcq 88(%rsp), %r11 479 adcq 96(%rsp), %r12 480 adcq 104(%rsp), %r13 481 adcq 112(%rsp), %r14 482 adcq 120(%rsp), %r15 483 sbbq %rcx, %rcx 484 485 call __rsaz_512_subtract 486 487 movq %r8, %rdx 488 movq %r9, %rax 489 movl 128+8(%rsp), $times 490 movq $out, $inp 491 492 decl $times 493 jnz .Loop_sqr 494___ 495if ($addx) { 496$code.=<<___; 497 jmp .Lsqr_tail 498 499.align 32 500.Loop_sqrx: 501 movl $times,128+8(%rsp) 502 movq $out, %xmm0 # off-load 503 movq %rbp, %xmm1 # off-load 504#first iteration 505 mulx %rax, %r8, %r9 506 507 mulx 16($inp), %rcx, %r10 508 xor %rbp, %rbp # cf=0, of=0 509 510 mulx 24($inp), %rax, %r11 511 adcx %rcx, %r9 512 513 mulx 32($inp), %rcx, %r12 514 adcx %rax, %r10 515 516 mulx 40($inp), %rax, %r13 517 adcx %rcx, %r11 518 519 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 520 adcx %rax, %r12 521 adcx %rcx, %r13 522 523 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 524 adcx %rax, %r14 525 adcx %rbp, %r15 # %rbp is 0 526 527 mov %r9, 
%rcx 528 shld \$1, %r8, %r9 529 shl \$1, %r8 530 531 xor %ebp, %ebp 532 mulx %rdx, %rax, %rdx 533 adcx %rdx, %r8 534 mov 8($inp), %rdx 535 adcx %rbp, %r9 536 537 mov %rax, (%rsp) 538 mov %r8, 8(%rsp) 539 540#second iteration 541 mulx 16($inp), %rax, %rbx 542 adox %rax, %r10 543 adcx %rbx, %r11 544 545 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 546 adox $out, %r11 547 adcx %r8, %r12 548 549 mulx 32($inp), %rax, %rbx 550 adox %rax, %r12 551 adcx %rbx, %r13 552 553 mulx 40($inp), $out, %r8 554 adox $out, %r13 555 adcx %r8, %r14 556 557 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 558 adox %rax, %r14 559 adcx %rbx, %r15 560 561 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 562 adox $out, %r15 563 adcx %rbp, %r8 564 adox %rbp, %r8 565 566 mov %r11, %rbx 567 shld \$1, %r10, %r11 568 shld \$1, %rcx, %r10 569 570 xor %ebp,%ebp 571 mulx %rdx, %rax, %rcx 572 mov 16($inp), %rdx 573 adcx %rax, %r9 574 adcx %rcx, %r10 575 adcx %rbp, %r11 576 577 mov %r9, 16(%rsp) 578 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) 579 580#third iteration 581 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 582 adox $out, %r12 583 adcx %r9, %r13 584 585 mulx 32($inp), %rax, %rcx 586 adox %rax, %r13 587 adcx %rcx, %r14 588 589 mulx 40($inp), $out, %r9 590 adox $out, %r14 591 adcx %r9, %r15 592 593 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx 594 adox %rax, %r15 595 adcx %rcx, %r8 596 597 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 598 adox $out, %r8 599 adcx %rbp, %r9 600 adox %rbp, %r9 601 602 mov %r13, %rcx 603 shld \$1, %r12, %r13 604 shld \$1, %rbx, %r12 605 606 xor %ebp, %ebp 607 mulx %rdx, %rax, %rdx 608 adcx %rax, %r11 609 adcx %rdx, %r12 610 mov 24($inp), %rdx 611 adcx %rbp, %r13 612 613 mov %r11, 32(%rsp) 614 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) 
615 616#fourth iteration 617 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx 618 adox %rax, %r14 619 adcx %rbx, %r15 620 621 mulx 40($inp), $out, %r10 622 adox $out, %r15 623 adcx %r10, %r8 624 625 mulx 48($inp), %rax, %rbx 626 adox %rax, %r8 627 adcx %rbx, %r9 628 629 mulx 56($inp), $out, %r10 630 adox $out, %r9 631 adcx %rbp, %r10 632 adox %rbp, %r10 633 634 .byte 0x66 635 mov %r15, %rbx 636 shld \$1, %r14, %r15 637 shld \$1, %rcx, %r14 638 639 xor %ebp, %ebp 640 mulx %rdx, %rax, %rdx 641 adcx %rax, %r13 642 adcx %rdx, %r14 643 mov 32($inp), %rdx 644 adcx %rbp, %r15 645 646 mov %r13, 48(%rsp) 647 mov %r14, 56(%rsp) 648 649#fifth iteration 650 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 651 adox $out, %r8 652 adcx %r11, %r9 653 654 mulx 48($inp), %rax, %rcx 655 adox %rax, %r9 656 adcx %rcx, %r10 657 658 mulx 56($inp), $out, %r11 659 adox $out, %r10 660 adcx %rbp, %r11 661 adox %rbp, %r11 662 663 mov %r9, %rcx 664 shld \$1, %r8, %r9 665 shld \$1, %rbx, %r8 666 667 xor %ebp, %ebp 668 mulx %rdx, %rax, %rdx 669 adcx %rax, %r15 670 adcx %rdx, %r8 671 mov 40($inp), %rdx 672 adcx %rbp, %r9 673 674 mov %r15, 64(%rsp) 675 mov %r8, 72(%rsp) 676 677#sixth iteration 678 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 679 adox %rax, %r10 680 adcx %rbx, %r11 681 682 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 683 adox $out, %r11 684 adcx %rbp, %r12 685 adox %rbp, %r12 686 687 mov %r11, %rbx 688 shld \$1, %r10, %r11 689 shld \$1, %rcx, %r10 690 691 xor %ebp, %ebp 692 mulx %rdx, %rax, %rdx 693 adcx %rax, %r9 694 adcx %rdx, %r10 695 mov 48($inp), %rdx 696 adcx %rbp, %r11 697 698 mov %r9, 80(%rsp) 699 mov %r10, 88(%rsp) 700 701#seventh iteration 702 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 703 adox %rax, %r12 704 adox %rbp, %r13 705 706 xor %r14, %r14 707 shld \$1, %r13, %r14 708 shld \$1, %r12, %r13 709 shld \$1, 
%rbx, %r12 710 711 xor %ebp, %ebp 712 mulx %rdx, %rax, %rdx 713 adcx %rax, %r11 714 adcx %rdx, %r12 715 mov 56($inp), %rdx 716 adcx %rbp, %r13 717 718 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) 719 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) 720 721#eighth iteration 722 mulx %rdx, %rax, %rdx 723 adox %rax, %r13 724 adox %rbp, %rdx 725 726 .byte 0x66 727 add %rdx, %r14 728 729 movq %r13, 112(%rsp) 730 movq %r14, 120(%rsp) 731 movq %xmm0, $out 732 movq %xmm1, %rbp 733 734 movq 128(%rsp), %rdx # pull $n0 735 movq (%rsp), %r8 736 movq 8(%rsp), %r9 737 movq 16(%rsp), %r10 738 movq 24(%rsp), %r11 739 movq 32(%rsp), %r12 740 movq 40(%rsp), %r13 741 movq 48(%rsp), %r14 742 movq 56(%rsp), %r15 743 744 call __rsaz_512_reducex 745 746 addq 64(%rsp), %r8 747 adcq 72(%rsp), %r9 748 adcq 80(%rsp), %r10 749 adcq 88(%rsp), %r11 750 adcq 96(%rsp), %r12 751 adcq 104(%rsp), %r13 752 adcq 112(%rsp), %r14 753 adcq 120(%rsp), %r15 754 sbbq %rcx, %rcx 755 756 call __rsaz_512_subtract 757 758 movq %r8, %rdx 759 movq %r9, %rax 760 movl 128+8(%rsp), $times 761 movq $out, $inp 762 763 decl $times 764 jnz .Loop_sqrx 765 766.Lsqr_tail: 767___ 768} 769$code.=<<___; 770 771 leaq 128+24+48(%rsp), %rax 772.cfi_def_cfa %rax,8 773 movq -48(%rax), %r15 774.cfi_restore %r15 775 movq -40(%rax), %r14 776.cfi_restore %r14 777 movq -32(%rax), %r13 778.cfi_restore %r13 779 movq -24(%rax), %r12 780.cfi_restore %r12 781 movq -16(%rax), %rbp 782.cfi_restore %rbp 783 movq -8(%rax), %rbx 784.cfi_restore %rbx 785 leaq (%rax), %rsp 786.cfi_def_cfa_register %rsp 787.Lsqr_epilogue: 788 ret 789.cfi_endproc 790.size rsaz_512_sqr,.-rsaz_512_sqr 791___ 792} 793{ 794my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); 795$code.=<<___; 796.globl rsaz_512_mul 797.type rsaz_512_mul,\@function,5 798.align 32 799rsaz_512_mul: 800.cfi_startproc 801 push %rbx 802.cfi_push %rbx 803 push %rbp 804.cfi_push %rbp 805 push %r12 806.cfi_push %r12 807 push %r13 808.cfi_push 
%r13 809 push %r14 810.cfi_push %r14 811 push %r15 812.cfi_push %r15 813 814 subq \$128+24, %rsp 815.cfi_adjust_cfa_offset 128+24 816.Lmul_body: 817 movq $out, %xmm0 # off-load arguments 818 movq $mod, %xmm1 819 movq $n0, 128(%rsp) 820___ 821$code.=<<___ if ($addx); 822 movl \$0x80100,%r11d 823 andl OPENSSL_ia32cap_P+8(%rip),%r11d 824 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 825 je .Lmulx 826___ 827$code.=<<___; 828 movq ($bp), %rbx # pass b[0] 829 movq $bp, %rbp # pass argument 830 call __rsaz_512_mul 831 832 movq %xmm0, $out 833 movq %xmm1, %rbp 834 835 movq (%rsp), %r8 836 movq 8(%rsp), %r9 837 movq 16(%rsp), %r10 838 movq 24(%rsp), %r11 839 movq 32(%rsp), %r12 840 movq 40(%rsp), %r13 841 movq 48(%rsp), %r14 842 movq 56(%rsp), %r15 843 844 call __rsaz_512_reduce 845___ 846$code.=<<___ if ($addx); 847 jmp .Lmul_tail 848 849.align 32 850.Lmulx: 851 movq $bp, %rbp # pass argument 852 movq ($bp), %rdx # pass b[0] 853 call __rsaz_512_mulx 854 855 movq %xmm0, $out 856 movq %xmm1, %rbp 857 858 movq 128(%rsp), %rdx # pull $n0 859 movq (%rsp), %r8 860 movq 8(%rsp), %r9 861 movq 16(%rsp), %r10 862 movq 24(%rsp), %r11 863 movq 32(%rsp), %r12 864 movq 40(%rsp), %r13 865 movq 48(%rsp), %r14 866 movq 56(%rsp), %r15 867 868 call __rsaz_512_reducex 869.Lmul_tail: 870___ 871$code.=<<___; 872 addq 64(%rsp), %r8 873 adcq 72(%rsp), %r9 874 adcq 80(%rsp), %r10 875 adcq 88(%rsp), %r11 876 adcq 96(%rsp), %r12 877 adcq 104(%rsp), %r13 878 adcq 112(%rsp), %r14 879 adcq 120(%rsp), %r15 880 sbbq %rcx, %rcx 881 882 call __rsaz_512_subtract 883 884 leaq 128+24+48(%rsp), %rax 885.cfi_def_cfa %rax,8 886 movq -48(%rax), %r15 887.cfi_restore %r15 888 movq -40(%rax), %r14 889.cfi_restore %r14 890 movq -32(%rax), %r13 891.cfi_restore %r13 892 movq -24(%rax), %r12 893.cfi_restore %r12 894 movq -16(%rax), %rbp 895.cfi_restore %rbp 896 movq -8(%rax), %rbx 897.cfi_restore %rbx 898 leaq (%rax), %rsp 899.cfi_def_cfa_register %rsp 900.Lmul_epilogue: 901 ret 902.cfi_endproc 903.size 
rsaz_512_mul,.-rsaz_512_mul 904___ 905} 906{ 907my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 908$code.=<<___; 909.globl rsaz_512_mul_gather4 910.type rsaz_512_mul_gather4,\@function,6 911.align 32 912rsaz_512_mul_gather4: 913.cfi_startproc 914 push %rbx 915.cfi_push %rbx 916 push %rbp 917.cfi_push %rbp 918 push %r12 919.cfi_push %r12 920 push %r13 921.cfi_push %r13 922 push %r14 923.cfi_push %r14 924 push %r15 925.cfi_push %r15 926 927 subq \$`128+24+($win64?0xb0:0)`, %rsp 928.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)` 929___ 930$code.=<<___ if ($win64); 931 movaps %xmm6,0xa0(%rsp) 932 movaps %xmm7,0xb0(%rsp) 933 movaps %xmm8,0xc0(%rsp) 934 movaps %xmm9,0xd0(%rsp) 935 movaps %xmm10,0xe0(%rsp) 936 movaps %xmm11,0xf0(%rsp) 937 movaps %xmm12,0x100(%rsp) 938 movaps %xmm13,0x110(%rsp) 939 movaps %xmm14,0x120(%rsp) 940 movaps %xmm15,0x130(%rsp) 941___ 942$code.=<<___; 943.Lmul_gather4_body: 944 movd $pwr,%xmm8 945 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 946 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 947 948 pshufd \$0,%xmm8,%xmm8 # broadcast $power 949 movdqa %xmm1,%xmm7 950 movdqa %xmm1,%xmm2 951___ 952######################################################################## 953# calculate mask by comparing 0..15 to $power 954# 955for($i=0;$i<4;$i++) { 956$code.=<<___; 957 paddd %xmm`$i`,%xmm`$i+1` 958 pcmpeqd %xmm8,%xmm`$i` 959 movdqa %xmm7,%xmm`$i+3` 960___ 961} 962for(;$i<7;$i++) { 963$code.=<<___; 964 paddd %xmm`$i`,%xmm`$i+1` 965 pcmpeqd %xmm8,%xmm`$i` 966___ 967} 968$code.=<<___; 969 pcmpeqd %xmm8,%xmm7 970 971 movdqa 16*0($bp),%xmm8 972 movdqa 16*1($bp),%xmm9 973 movdqa 16*2($bp),%xmm10 974 movdqa 16*3($bp),%xmm11 975 pand %xmm0,%xmm8 976 movdqa 16*4($bp),%xmm12 977 pand %xmm1,%xmm9 978 movdqa 16*5($bp),%xmm13 979 pand %xmm2,%xmm10 980 movdqa 16*6($bp),%xmm14 981 pand %xmm3,%xmm11 982 movdqa 16*7($bp),%xmm15 983 leaq 128($bp), %rbp 984 pand %xmm4,%xmm12 985 pand %xmm5,%xmm13 986 pand 
%xmm6,%xmm14 987 pand %xmm7,%xmm15 988 por %xmm10,%xmm8 989 por %xmm11,%xmm9 990 por %xmm12,%xmm8 991 por %xmm13,%xmm9 992 por %xmm14,%xmm8 993 por %xmm15,%xmm9 994 995 por %xmm9,%xmm8 996 pshufd \$0x4e,%xmm8,%xmm9 997 por %xmm9,%xmm8 998___ 999$code.=<<___ if ($addx); 1000 movl \$0x80100,%r11d 1001 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1002 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1003 je .Lmulx_gather 1004___ 1005$code.=<<___; 1006 movq %xmm8,%rbx 1007 1008 movq $n0, 128(%rsp) # off-load arguments 1009 movq $out, 128+8(%rsp) 1010 movq $mod, 128+16(%rsp) 1011 1012 movq ($ap), %rax 1013 movq 8($ap), %rcx 1014 mulq %rbx # 0 iteration 1015 movq %rax, (%rsp) 1016 movq %rcx, %rax 1017 movq %rdx, %r8 1018 1019 mulq %rbx 1020 addq %rax, %r8 1021 movq 16($ap), %rax 1022 movq %rdx, %r9 1023 adcq \$0, %r9 1024 1025 mulq %rbx 1026 addq %rax, %r9 1027 movq 24($ap), %rax 1028 movq %rdx, %r10 1029 adcq \$0, %r10 1030 1031 mulq %rbx 1032 addq %rax, %r10 1033 movq 32($ap), %rax 1034 movq %rdx, %r11 1035 adcq \$0, %r11 1036 1037 mulq %rbx 1038 addq %rax, %r11 1039 movq 40($ap), %rax 1040 movq %rdx, %r12 1041 adcq \$0, %r12 1042 1043 mulq %rbx 1044 addq %rax, %r12 1045 movq 48($ap), %rax 1046 movq %rdx, %r13 1047 adcq \$0, %r13 1048 1049 mulq %rbx 1050 addq %rax, %r13 1051 movq 56($ap), %rax 1052 movq %rdx, %r14 1053 adcq \$0, %r14 1054 1055 mulq %rbx 1056 addq %rax, %r14 1057 movq ($ap), %rax 1058 movq %rdx, %r15 1059 adcq \$0, %r15 1060 1061 leaq 8(%rsp), %rdi 1062 movl \$7, %ecx 1063 jmp .Loop_mul_gather 1064 1065.align 32 1066.Loop_mul_gather: 1067 movdqa 16*0(%rbp),%xmm8 1068 movdqa 16*1(%rbp),%xmm9 1069 movdqa 16*2(%rbp),%xmm10 1070 movdqa 16*3(%rbp),%xmm11 1071 pand %xmm0,%xmm8 1072 movdqa 16*4(%rbp),%xmm12 1073 pand %xmm1,%xmm9 1074 movdqa 16*5(%rbp),%xmm13 1075 pand %xmm2,%xmm10 1076 movdqa 16*6(%rbp),%xmm14 1077 pand %xmm3,%xmm11 1078 movdqa 16*7(%rbp),%xmm15 1079 leaq 128(%rbp), %rbp 1080 pand %xmm4,%xmm12 1081 pand %xmm5,%xmm13 1082 pand %xmm6,%xmm14 1083 pand 
%xmm7,%xmm15 1084 por %xmm10,%xmm8 1085 por %xmm11,%xmm9 1086 por %xmm12,%xmm8 1087 por %xmm13,%xmm9 1088 por %xmm14,%xmm8 1089 por %xmm15,%xmm9 1090 1091 por %xmm9,%xmm8 1092 pshufd \$0x4e,%xmm8,%xmm9 1093 por %xmm9,%xmm8 1094 movq %xmm8,%rbx 1095 1096 mulq %rbx 1097 addq %rax, %r8 1098 movq 8($ap), %rax 1099 movq %r8, (%rdi) 1100 movq %rdx, %r8 1101 adcq \$0, %r8 1102 1103 mulq %rbx 1104 addq %rax, %r9 1105 movq 16($ap), %rax 1106 adcq \$0, %rdx 1107 addq %r9, %r8 1108 movq %rdx, %r9 1109 adcq \$0, %r9 1110 1111 mulq %rbx 1112 addq %rax, %r10 1113 movq 24($ap), %rax 1114 adcq \$0, %rdx 1115 addq %r10, %r9 1116 movq %rdx, %r10 1117 adcq \$0, %r10 1118 1119 mulq %rbx 1120 addq %rax, %r11 1121 movq 32($ap), %rax 1122 adcq \$0, %rdx 1123 addq %r11, %r10 1124 movq %rdx, %r11 1125 adcq \$0, %r11 1126 1127 mulq %rbx 1128 addq %rax, %r12 1129 movq 40($ap), %rax 1130 adcq \$0, %rdx 1131 addq %r12, %r11 1132 movq %rdx, %r12 1133 adcq \$0, %r12 1134 1135 mulq %rbx 1136 addq %rax, %r13 1137 movq 48($ap), %rax 1138 adcq \$0, %rdx 1139 addq %r13, %r12 1140 movq %rdx, %r13 1141 adcq \$0, %r13 1142 1143 mulq %rbx 1144 addq %rax, %r14 1145 movq 56($ap), %rax 1146 adcq \$0, %rdx 1147 addq %r14, %r13 1148 movq %rdx, %r14 1149 adcq \$0, %r14 1150 1151 mulq %rbx 1152 addq %rax, %r15 1153 movq ($ap), %rax 1154 adcq \$0, %rdx 1155 addq %r15, %r14 1156 movq %rdx, %r15 1157 adcq \$0, %r15 1158 1159 leaq 8(%rdi), %rdi 1160 1161 decl %ecx 1162 jnz .Loop_mul_gather 1163 1164 movq %r8, (%rdi) 1165 movq %r9, 8(%rdi) 1166 movq %r10, 16(%rdi) 1167 movq %r11, 24(%rdi) 1168 movq %r12, 32(%rdi) 1169 movq %r13, 40(%rdi) 1170 movq %r14, 48(%rdi) 1171 movq %r15, 56(%rdi) 1172 1173 movq 128+8(%rsp), $out 1174 movq 128+16(%rsp), %rbp 1175 1176 movq (%rsp), %r8 1177 movq 8(%rsp), %r9 1178 movq 16(%rsp), %r10 1179 movq 24(%rsp), %r11 1180 movq 32(%rsp), %r12 1181 movq 40(%rsp), %r13 1182 movq 48(%rsp), %r14 1183 movq 56(%rsp), %r15 1184 1185 call __rsaz_512_reduce 1186___ 1187$code.=<<___ if ($addx); 
1188 jmp .Lmul_gather_tail 1189 1190.align 32 1191.Lmulx_gather: 1192 movq %xmm8,%rdx 1193 1194 mov $n0, 128(%rsp) # off-load arguments 1195 mov $out, 128+8(%rsp) 1196 mov $mod, 128+16(%rsp) 1197 1198 mulx ($ap), %rbx, %r8 # 0 iteration 1199 mov %rbx, (%rsp) 1200 xor %edi, %edi # cf=0, of=0 1201 1202 mulx 8($ap), %rax, %r9 1203 1204 mulx 16($ap), %rbx, %r10 1205 adcx %rax, %r8 1206 1207 mulx 24($ap), %rax, %r11 1208 adcx %rbx, %r9 1209 1210 mulx 32($ap), %rbx, %r12 1211 adcx %rax, %r10 1212 1213 mulx 40($ap), %rax, %r13 1214 adcx %rbx, %r11 1215 1216 mulx 48($ap), %rbx, %r14 1217 adcx %rax, %r12 1218 1219 mulx 56($ap), %rax, %r15 1220 adcx %rbx, %r13 1221 adcx %rax, %r14 1222 .byte 0x67 1223 mov %r8, %rbx 1224 adcx %rdi, %r15 # %rdi is 0 1225 1226 mov \$-7, %rcx 1227 jmp .Loop_mulx_gather 1228 1229.align 32 1230.Loop_mulx_gather: 1231 movdqa 16*0(%rbp),%xmm8 1232 movdqa 16*1(%rbp),%xmm9 1233 movdqa 16*2(%rbp),%xmm10 1234 movdqa 16*3(%rbp),%xmm11 1235 pand %xmm0,%xmm8 1236 movdqa 16*4(%rbp),%xmm12 1237 pand %xmm1,%xmm9 1238 movdqa 16*5(%rbp),%xmm13 1239 pand %xmm2,%xmm10 1240 movdqa 16*6(%rbp),%xmm14 1241 pand %xmm3,%xmm11 1242 movdqa 16*7(%rbp),%xmm15 1243 leaq 128(%rbp), %rbp 1244 pand %xmm4,%xmm12 1245 pand %xmm5,%xmm13 1246 pand %xmm6,%xmm14 1247 pand %xmm7,%xmm15 1248 por %xmm10,%xmm8 1249 por %xmm11,%xmm9 1250 por %xmm12,%xmm8 1251 por %xmm13,%xmm9 1252 por %xmm14,%xmm8 1253 por %xmm15,%xmm9 1254 1255 por %xmm9,%xmm8 1256 pshufd \$0x4e,%xmm8,%xmm9 1257 por %xmm9,%xmm8 1258 movq %xmm8,%rdx 1259 1260 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 1261 adcx %rax, %rbx 1262 adox %r9, %r8 1263 1264 mulx 8($ap), %rax, %r9 1265 adcx %rax, %r8 1266 adox %r10, %r9 1267 1268 mulx 16($ap), %rax, %r10 1269 adcx %rax, %r9 1270 adox %r11, %r10 1271 1272 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1273 adcx %rax, %r10 1274 adox %r12, %r11 1275 1276 mulx 32($ap), %rax, %r12 1277 adcx %rax, %r11 1278 adox %r13, %r12 
1279 1280 mulx 40($ap), %rax, %r13 1281 adcx %rax, %r12 1282 adox %r14, %r13 1283 1284 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1285 adcx %rax, %r13 1286 .byte 0x67 1287 adox %r15, %r14 1288 1289 mulx 56($ap), %rax, %r15 1290 mov %rbx, 64(%rsp,%rcx,8) 1291 adcx %rax, %r14 1292 adox %rdi, %r15 1293 mov %r8, %rbx 1294 adcx %rdi, %r15 # cf=0 1295 1296 inc %rcx # of=0 1297 jnz .Loop_mulx_gather 1298 1299 mov %r8, 64(%rsp) 1300 mov %r9, 64+8(%rsp) 1301 mov %r10, 64+16(%rsp) 1302 mov %r11, 64+24(%rsp) 1303 mov %r12, 64+32(%rsp) 1304 mov %r13, 64+40(%rsp) 1305 mov %r14, 64+48(%rsp) 1306 mov %r15, 64+56(%rsp) 1307 1308 mov 128(%rsp), %rdx # pull arguments 1309 mov 128+8(%rsp), $out 1310 mov 128+16(%rsp), %rbp 1311 1312 mov (%rsp), %r8 1313 mov 8(%rsp), %r9 1314 mov 16(%rsp), %r10 1315 mov 24(%rsp), %r11 1316 mov 32(%rsp), %r12 1317 mov 40(%rsp), %r13 1318 mov 48(%rsp), %r14 1319 mov 56(%rsp), %r15 1320 1321 call __rsaz_512_reducex 1322 1323.Lmul_gather_tail: 1324___ 1325$code.=<<___; 1326 addq 64(%rsp), %r8 1327 adcq 72(%rsp), %r9 1328 adcq 80(%rsp), %r10 1329 adcq 88(%rsp), %r11 1330 adcq 96(%rsp), %r12 1331 adcq 104(%rsp), %r13 1332 adcq 112(%rsp), %r14 1333 adcq 120(%rsp), %r15 1334 sbbq %rcx, %rcx 1335 1336 call __rsaz_512_subtract 1337 1338 leaq 128+24+48(%rsp), %rax 1339___ 1340$code.=<<___ if ($win64); 1341 movaps 0xa0-0xc8(%rax),%xmm6 1342 movaps 0xb0-0xc8(%rax),%xmm7 1343 movaps 0xc0-0xc8(%rax),%xmm8 1344 movaps 0xd0-0xc8(%rax),%xmm9 1345 movaps 0xe0-0xc8(%rax),%xmm10 1346 movaps 0xf0-0xc8(%rax),%xmm11 1347 movaps 0x100-0xc8(%rax),%xmm12 1348 movaps 0x110-0xc8(%rax),%xmm13 1349 movaps 0x120-0xc8(%rax),%xmm14 1350 movaps 0x130-0xc8(%rax),%xmm15 1351 lea 0xb0(%rax),%rax 1352___ 1353$code.=<<___; 1354.cfi_def_cfa %rax,8 1355 movq -48(%rax), %r15 1356.cfi_restore %r15 1357 movq -40(%rax), %r14 1358.cfi_restore %r14 1359 movq -32(%rax), %r13 1360.cfi_restore %r13 1361 movq -24(%rax), %r12 1362.cfi_restore %r12 1363 movq -16(%rax), 
%rbp 1364.cfi_restore %rbp 1365 movq -8(%rax), %rbx 1366.cfi_restore %rbx 1367 leaq (%rax), %rsp 1368.cfi_def_cfa_register %rsp 1369.Lmul_gather4_epilogue: 1370 ret 1371.cfi_endproc 1372.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1373___ 1374} 1375{ 1376my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1377$code.=<<___; 1378.globl rsaz_512_mul_scatter4 1379.type rsaz_512_mul_scatter4,\@function,6 1380.align 32 1381rsaz_512_mul_scatter4: 1382.cfi_startproc 1383 push %rbx 1384.cfi_push %rbx 1385 push %rbp 1386.cfi_push %rbp 1387 push %r12 1388.cfi_push %r12 1389 push %r13 1390.cfi_push %r13 1391 push %r14 1392.cfi_push %r14 1393 push %r15 1394.cfi_push %r15 1395 1396 mov $pwr, $pwr 1397 subq \$128+24, %rsp 1398.cfi_adjust_cfa_offset 128+24 1399.Lmul_scatter4_body: 1400 leaq ($tbl,$pwr,8), $tbl 1401 movq $out, %xmm0 # off-load arguments 1402 movq $mod, %xmm1 1403 movq $tbl, %xmm2 1404 movq $n0, 128(%rsp) 1405 1406 movq $out, %rbp 1407___ 1408$code.=<<___ if ($addx); 1409 movl \$0x80100,%r11d 1410 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1411 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1412 je .Lmulx_scatter 1413___ 1414$code.=<<___; 1415 movq ($out),%rbx # pass b[0] 1416 call __rsaz_512_mul 1417 1418 movq %xmm0, $out 1419 movq %xmm1, %rbp 1420 1421 movq (%rsp), %r8 1422 movq 8(%rsp), %r9 1423 movq 16(%rsp), %r10 1424 movq 24(%rsp), %r11 1425 movq 32(%rsp), %r12 1426 movq 40(%rsp), %r13 1427 movq 48(%rsp), %r14 1428 movq 56(%rsp), %r15 1429 1430 call __rsaz_512_reduce 1431___ 1432$code.=<<___ if ($addx); 1433 jmp .Lmul_scatter_tail 1434 1435.align 32 1436.Lmulx_scatter: 1437 movq ($out), %rdx # pass b[0] 1438 call __rsaz_512_mulx 1439 1440 movq %xmm0, $out 1441 movq %xmm1, %rbp 1442 1443 movq 128(%rsp), %rdx # pull $n0 1444 movq (%rsp), %r8 1445 movq 8(%rsp), %r9 1446 movq 16(%rsp), %r10 1447 movq 24(%rsp), %r11 1448 movq 32(%rsp), %r12 1449 movq 40(%rsp), %r13 1450 movq 48(%rsp), %r14 1451 movq 56(%rsp), %r15 1452 1453 call 
__rsaz_512_reducex 1454 1455.Lmul_scatter_tail: 1456___ 1457$code.=<<___; 1458 addq 64(%rsp), %r8 1459 adcq 72(%rsp), %r9 1460 adcq 80(%rsp), %r10 1461 adcq 88(%rsp), %r11 1462 adcq 96(%rsp), %r12 1463 adcq 104(%rsp), %r13 1464 adcq 112(%rsp), %r14 1465 adcq 120(%rsp), %r15 1466 movq %xmm2, $inp 1467 sbbq %rcx, %rcx 1468 1469 call __rsaz_512_subtract 1470 1471 movq %r8, 128*0($inp) # scatter 1472 movq %r9, 128*1($inp) 1473 movq %r10, 128*2($inp) 1474 movq %r11, 128*3($inp) 1475 movq %r12, 128*4($inp) 1476 movq %r13, 128*5($inp) 1477 movq %r14, 128*6($inp) 1478 movq %r15, 128*7($inp) 1479 1480 leaq 128+24+48(%rsp), %rax 1481.cfi_def_cfa %rax,8 1482 movq -48(%rax), %r15 1483.cfi_restore %r15 1484 movq -40(%rax), %r14 1485.cfi_restore %r14 1486 movq -32(%rax), %r13 1487.cfi_restore %r13 1488 movq -24(%rax), %r12 1489.cfi_restore %r12 1490 movq -16(%rax), %rbp 1491.cfi_restore %rbp 1492 movq -8(%rax), %rbx 1493.cfi_restore %rbx 1494 leaq (%rax), %rsp 1495.cfi_def_cfa_register %rsp 1496.Lmul_scatter4_epilogue: 1497 ret 1498.cfi_endproc 1499.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1500___ 1501} 1502{ 1503my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); 1504$code.=<<___; 1505.globl rsaz_512_mul_by_one 1506.type rsaz_512_mul_by_one,\@function,4 1507.align 32 1508rsaz_512_mul_by_one: 1509.cfi_startproc 1510 push %rbx 1511.cfi_push %rbx 1512 push %rbp 1513.cfi_push %rbp 1514 push %r12 1515.cfi_push %r12 1516 push %r13 1517.cfi_push %r13 1518 push %r14 1519.cfi_push %r14 1520 push %r15 1521.cfi_push %r15 1522 1523 subq \$128+24, %rsp 1524.cfi_adjust_cfa_offset 128+24 1525.Lmul_by_one_body: 1526___ 1527$code.=<<___ if ($addx); 1528 movl OPENSSL_ia32cap_P+8(%rip),%eax 1529___ 1530$code.=<<___; 1531 movq $mod, %rbp # reassign argument 1532 movq $n0, 128(%rsp) 1533 1534 movq ($inp), %r8 1535 pxor %xmm0, %xmm0 1536 movq 8($inp), %r9 1537 movq 16($inp), %r10 1538 movq 24($inp), %r11 1539 movq 32($inp), %r12 1540 movq 40($inp), %r13 1541 movq 48($inp), %r14 1542 
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull \$n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
	#
	# Classic mulq-based word-by-word reduction: 8 passes, one per limb.
	# Each pass multiplies the modulus by the current multiplier in %rbx
	# and folds the products into %r8-%r15; the multiplier for the NEXT
	# pass (n0 * new low limb) is computed into %rsi mid-pass (imulq)
	# to hide its latency, and swapped into %rbx at the end of the pass.
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
	#
	# ADX variant of the reduction above, using mulx with interleaved
	# adcx/adox carry chains.  The caller preloads n0 into %rdx (see the
	# commented-out pull below); %rsi is kept at zero to terminate both
	# carry chains at the end of each pass.  The raw .byte sequences are
	# mulx encodings kept explicit for old-assembler compatibility.
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx		# pull \$n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi			# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15	# %rsi is 0
	adcx	%rsi, %r15	# cf=0

	decl	%ecx		# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
	#
	# Stores %r8-%r15 to $out, then adds (-mod & mask): the first limb is
	# negated and the rest complemented (two's-complement negation across
	# the 8-limb value), each ANDed with the all-zeros/all-ones mask in
	# %rcx.  Net effect: subtract mod iff mask is all-ones, branch-free.
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
	#
	# Schoolbook 8x8-limb multiply using mulq.  The caller preloads b[0]
	# into %rbx; the first pass writes the low limb to 8(%rsp) (via %rdi)
	# and accumulates the rest in %r8-%r15, then 7 more passes fold in
	# b[1..7], storing one finished limb per pass and leaving the top 8
	# limbs in registers, flushed to (%rdi) at the end.
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
	#
	# MULX/ADX variant of the multiply; b[0] is preloaded into %rdx by
	# the caller.  %rcx counts from -6 up so that 64($bp,%rcx,8) walks
	# b[2..7] while the matching store lands the finished low limbs at
	# 8+64-8(%rsp,%rcx,8); $zero (%rdi) stays 0 to close both carry
	# chains.  The .byte sequences are explicit mulx encodings for
	# assemblers without ADX support.
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{	# rsaz_512_scatter4 / rsaz_512_gather4
	# Register assignment differs by ABI because these are
	# \@abi-omnipotent (no xlate argument translation).
my ($out,$inp,$power)= $win64 ?
			("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
# rsaz_512_scatter4: store 8 limbs from $inp into the table at $out with a
# 128-byte stride, starting at 8*$power (inverse of rsaz_512_gather4).
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
# Win64 prologue is emitted as raw bytes so the matching SEH unwind data in
# .LSEH_info_rsaz_512_gather4 (below) exactly describes it.
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
# Constant-time gather: every iteration reads all 16 table lines and ANDs
# them with the equality masks built above, so the memory access pattern
# does not depend on $power (cache-timing defence).
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

# Expand `...` expressions (register-number arithmetic in the gather mask
# loops) and stream the generated assembly through the xlate filter opened
# on STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# NOTE(review): consider `close STDOUT or die "error closing STDOUT: $!"`
# so that a failure in the xlate pipe is not silently ignored.
close STDOUT;