#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%

# Command line: perlasm "flavour" (elf/macosx/mingw64/nasm/masm) and
# output file name; a single dotted argument is treated as the output.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler toolchain; the AVX code path is emitted only when
# the assembler is new enough to understand AVX mnemonics
# (GNU as >= 2.19, nasm >= 2.09, or MSVC ml64 >= 10).
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

# All generated code is printed through the xlate filter.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers and the rotating five-register SHA-1 state a..e.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

# Emit one integer-ALU round for rounds 0..19: F=Ch(b,c,d),
# K=0x5a827999. Rounds 0..14 also load+bswap the next input word;
# rounds 15+ compute the message-schedule Xupdate from the %rsp ring.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));		# rotate the two-element W[] pipeline
}

# Emit one integer-ALU round for rounds 20..39 and 60..79:
# F=Parity(b,c,d); $K selects 0x6ed9eba1 vs 0xca62c1d6 on $i<40.
# Round 79 omits the Xupdate (no further W[] is needed); stores to the
# %rsp ring stop after round 76 for the same reason.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Emit one integer-ALU round for rounds 40..59: F=Maj(b,c,d),
# K=0x8f1bbcdc, computed as (c&d)+(b&(c^d)) split across $t0/$t1.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));
}

# Entry point: dispatch on OPENSSL_ia32cap_P capability bits —
# ECX bit 9 (SSSE3) and, when emitted, ECX bit 28 (AVX); otherwise
# fall through to the plain integer-ALU implementation at .Lialu.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r8
	mov	4(%r8),%r8d
	bt	\$9,%r8d
	jnc	.Lialu
___
$code.=<<___ if ($avx);
	bt	\$28,%r8d
	jc	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Unroll all 80 rounds, rotating the a..e register assignment each round.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
# ------------------------------------------------------------------
# SSSE3 code path: message schedule (Xupdate) runs in the XMM unit,
# four W[]+K values at a time, interleaved with the scalar rounds.
# ------------------------------------------------------------------
my $Xi=4;			# index of next W[] quadruple to schedule
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# Any undefined &mnemonic() below falls through to this thunk, which
# appends "mnemonic dst,src" (AT&T order) to $code — simplified
# 32-bit-perlasm-style instruction emission.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Schedule W[16..31] in SIMD while interleaving four scalar rounds
# supplied by $body; each eval(shift(@insns)) drops in one scalar
# instruction between SIMD ops to keep both units busy.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Schedule W[32..79]: simpler recurrence (no dword extraction) since
# all four source words are already 16 dwords apart; K constant is
# reloaded from K_XX_XX every fifth quadruple ($Xi%5==0).
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Final schedule step for rounds 76..79; then either finish
# (.Ldone_ssse3) or preload/byte-swap the next 64-byte block.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Rounds 0..11 of the next block: byte-swap and add K_00_19 to the
# preloaded input, storing W[]+K for the scalar rounds, one XMM
# register per call.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Tail rounds of the last block: scalar rounds only, no SIMD work left.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# The three body_* subs return lists of perl snippets, one scalar
# instruction each, that the Xupdate routines interleave with SIMD
# code. They rotate @V/@T themselves as a side effect of the last
# snippet. Note $_ror uses $j?7:2: rounds after the first rotate by
# 7 because shld/shrd-based emulation in the AVX path accumulates
# differently than plain ror (both variants share these bodies).
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

# Stitch the SSSE3 main loop: 16 Xupdate quadruples cover rounds
# 0..63 of this block interleaved with scheduling of the next one,
# Xuplast covers 64..79, Xloop preps the next block's first 12 rounds.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# snapshot generator state so the
				# .Ldone_ssse3 tail can replay rounds
				# 68..79 with identical register rotation
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

# ------------------------------------------------------------------
# AVX code path: same structure as SSSE3 but with non-destructive
# three-operand VEX instructions, emitted only if the assembler
# probe above set $avx.
# ------------------------------------------------------------------
if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

# rol/ror emulated via shld/shrd with the register as both operands
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroupper

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

# AVX counterpart of Xupdate_ssse3_16_31; three-operand forms remove
# the copy movdqa instructions the SSSE3 version needed.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);	# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);	# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xupdate_ssse3_32_79.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# AVX counterpart of Xuplast_ssse3_80: finish rounds 76..79, then
# either exit via .Ldone_avx or preload the next block.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # NOTE(review): legacy (non-VEX) movdqa inside the AVX path;
	  # functionally equivalent here but may incur an SSE/AVX
	  # transition penalty — confirm whether vmovdqa was intended.
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");	# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");	# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");	# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

# AVX counterpart of Xloop_ssse3: byte-swap + add K for the next
# block's first rounds; vpaddd writes to a fresh register, so no
# psubd restore is needed.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

# Tail rounds of the last block: scalar rounds only.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# Stitch the AVX main loop; identical round layout to .Loop_ssse3.
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				# snapshot for the .Ldone_avx replay
				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}

# Round-constant table (each K broadcast to 4 dwords) plus the
# pshufb byte-swap mask, shared by the SSSE3 and AVX paths.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	64(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

####################################################################

# Expand `...` arithmetic in the accumulated assembly, then emit it
# through the xlate pipe attached to STDOUT above.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;