1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# Multi-buffer SHA256 procedure processes n buffers in parallel by 11# placing buffer data to designated lane of SIMD register. n is 12# naturally limited to 4 on pre-AVX2 processors and to 8 on 13# AVX2-capable processors such as Haswell. 14# 15# this +aesni(i) sha256 aesni-sha256 gain(iv) 16# ------------------------------------------------------------------- 17# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% 18# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95% 19# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% 20# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% 21# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% 22# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% 23# 24# (i) multi-block CBC encrypt with 128-bit key; 25# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, 26# because of lower AES-NI instruction throughput, nor is there 27# AES-NI-SHA256 stitch for these processors; 28# (iii) "this" is for n=8, when we gather twice as much data, result 29# for n=4 is 20.3+4.44=24.7; 30# (iv) presented improvement coefficients are asymptotic limits and 31# in real-life application are somewhat lower, e.g. for 2KB 32# fragments they range from 75% to 130% (on Haswell); 33 34$flavour = shift; 35$output = shift; 36if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 37 38$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 39 40$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 41( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 42( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 43die "can't locate x86_64-xlate.pl"; 44 45$avx=0; 46 47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 49 $avx = ($1>=2.19) + ($1>=2.22); 50} 51 52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 54 $avx = ($1>=2.09) + ($1>=2.10); 55} 56 57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 58 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 59 $avx = ($1>=10) + ($1>=11); 60} 61 62if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 63 $avx = ($2>=3.0) + ($2>3.0); 64} 65 66open OUT,"| \"$^X\" $xlate $flavour $output"; 67*STDOUT=*OUT; 68 69# void sha256_multi_block ( 70# struct { unsigned int A[8]; 71# unsigned int B[8]; 72# unsigned int C[8]; 73# unsigned int D[8]; 74# unsigned int E[8]; 75# unsigned int F[8]; 76# unsigned int G[8]; 77# unsigned int H[8]; } *ctx, 78# struct { void *ptr; int blocks; } inp[8], 79# int num); /* 1 or 2 */ 80# 81$ctx="%rdi"; # 1st arg 82$inp="%rsi"; # 2nd arg 83$num="%edx"; # 3rd arg 84@ptr=map("%r$_",(8..11)); 85$Tbl="%rbp"; 86 87@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); 88($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); 89 90$REG_SZ=16; 91 92sub Xi_off { 93my $off = shift; 94 95 $off %= 16; $off *= $REG_SZ; 96 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; 97} 98 99sub ROUND_00_15 { 100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 101 102$code.=<<___ if ($i<15); 103 movd `4*$i`(@ptr[0]),$Xi 104 movd `4*$i`(@ptr[1]),$t1 105 movd `4*$i`(@ptr[2]),$t2 106 movd `4*$i`(@ptr[3]),$t3 107 punpckldq $t2,$Xi 108 punpckldq $t3,$t1 109 punpckldq $t1,$Xi 110___ 111$code.=<<___ if ($i==15); 112 movd `4*$i`(@ptr[0]),$Xi 113 lea `16*4`(@ptr[0]),@ptr[0] 114 movd `4*$i`(@ptr[1]),$t1 115 lea `16*4`(@ptr[1]),@ptr[1] 116 movd `4*$i`(@ptr[2]),$t2 117 lea `16*4`(@ptr[2]),@ptr[2] 118 movd `4*$i`(@ptr[3]),$t3 119 lea `16*4`(@ptr[3]),@ptr[3] 120 punpckldq $t2,$Xi 121 punpckldq $t3,$t1 122 punpckldq $t1,$Xi 123___ 124$code.=<<___; 125 movdqa $e,$sigma 126 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)` 127 movdqa $e,$t3 128 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)` 129 psrld \$6,$sigma 130 movdqa $e,$t2 131 pslld \$7,$t3 132 movdqa $Xi,`&Xi_off($i)` 133 paddd $h,$Xi # Xi+=h 134 135 psrld \$11,$t2 136 pxor $t3,$sigma 137 pslld \$21-7,$t3 138 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] 139 pxor $t2,$sigma 140 141 psrld \$25-11,$t2 142 movdqa $e,$t1 143 `"prefetcht0 63(@ptr[0])" if ($i==15)` 144 pxor $t3,$sigma 145 movdqa $e,$axb # borrow $axb 146 pslld \$26-21,$t3 147 pandn $g,$t1 148 pand $f,$axb 149 pxor $t2,$sigma 150 151 `"prefetcht0 63(@ptr[1])" if ($i==15)` 152 movdqa $a,$t2 153 pxor $t3,$sigma # Sigma1(e) 154 movdqa $a,$t3 155 psrld \$2,$t2 156 paddd $sigma,$Xi # Xi+=Sigma1(e) 157 pxor $axb,$t1 # Ch(e,f,g) 158 movdqa $b,$axb 159 movdqa $a,$sigma 160 pslld \$10,$t3 161 pxor $a,$axb # a^b, b^c in next round 162 163 `"prefetcht0 63(@ptr[2])" if ($i==15)` 164 psrld \$13,$sigma 165 pxor $t3,$t2 166 paddd $t1,$Xi # Xi+=Ch(e,f,g) 167 pslld \$19-10,$t3 168 pand $axb,$bxc 169 pxor $sigma,$t2 170 171 `"prefetcht0 63(@ptr[3])" if ($i==15)` 172 psrld \$22-13,$sigma 173 pxor $t3,$t2 174 movdqa $b,$h 175 pslld \$30-19,$t3 176 pxor $t2,$sigma 177 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 178 paddd $Xi,$d # d+=Xi 179 pxor $t3,$sigma # Sigma0(a) 180 181 paddd $Xi,$h # h+=Xi 182 paddd $sigma,$h # h+=Sigma0(a) 183___ 184$code.=<<___ if (($i%8)==7); 185 lea `32*8`($Tbl),$Tbl 186___ 187 ($axb,$bxc)=($bxc,$axb); 188} 189 190sub ROUND_16_XX { 191my $i=shift; 192 193$code.=<<___; 194 movdqa `&Xi_off($i+1)`,$Xn 195 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] 196 197 movdqa $Xn,$sigma 198 movdqa $Xn,$t2 199 psrld \$3,$sigma 200 movdqa $Xn,$t3 201 202 psrld \$7,$t2 203 movdqa `&Xi_off($i+14)`,$t1 204 pslld \$14,$t3 205 pxor $t2,$sigma 206 psrld \$18-7,$t2 207 movdqa $t1,$axb # borrow $axb 208 pxor $t3,$sigma 209 pslld \$25-14,$t3 210 pxor $t2,$sigma 211 psrld \$10,$t1 212 movdqa $axb,$t2 213 214 psrld \$17,$axb 215 pxor $t3,$sigma # sigma0(X[i+1]) 216 pslld \$13,$t2 217 paddd $sigma,$Xi # Xi+=sigma0(e) 218 pxor $axb,$t1 219 psrld \$19-17,$axb 220 pxor $t2,$t1 221 pslld \$15-13,$t2 222 pxor $axb,$t1 223 pxor $t2,$t1 # sigma0(X[i+14]) 224 paddd $t1,$Xi # Xi+=sigma1(X[i+14]) 225___ 226 &ROUND_00_15($i,@_); 227 ($Xi,$Xn)=($Xn,$Xi); 228} 229 230$code.=<<___; 231.text 232 233.extern OPENSSL_ia32cap_P 234 235.globl sha256_multi_block 236.type sha256_multi_block,\@function,3 237.align 32 238sha256_multi_block: 239 mov OPENSSL_ia32cap_P+4(%rip),%rcx 240 bt \$61,%rcx # check SHA bit 241 jc _shaext_shortcut 242___ 243$code.=<<___ if ($avx); 244 test \$`1<<28`,%ecx 245 jnz _avx_shortcut 246___ 247$code.=<<___; 248 mov %rsp,%rax 249 push %rbx 250 push %rbp 251___ 252$code.=<<___ if ($win64); 253 lea -0xa8(%rsp),%rsp 254 movaps %xmm6,(%rsp) 255 movaps %xmm7,0x10(%rsp) 256 movaps %xmm8,0x20(%rsp) 257 movaps %xmm9,0x30(%rsp) 258 movaps %xmm10,-0x78(%rax) 259 movaps %xmm11,-0x68(%rax) 260 movaps %xmm12,-0x58(%rax) 261 movaps %xmm13,-0x48(%rax) 262 movaps %xmm14,-0x38(%rax) 263 movaps %xmm15,-0x28(%rax) 264___ 265$code.=<<___; 266 sub \$`$REG_SZ*18`, %rsp 267 and \$-256,%rsp 268 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 269.Lbody: 270 lea K256+128(%rip),$Tbl 271 lea `$REG_SZ*16`(%rsp),%rbx 272 lea 0x80($ctx),$ctx # size optimization 273 274.Loop_grande: 275 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 276 xor $num,$num 277___ 278for($i=0;$i<4;$i++) { 279 $code.=<<___; 280 mov `16*$i+0`($inp),@ptr[$i] # input pointer 281 mov `16*$i+8`($inp),%ecx # number of blocks 282 cmp $num,%ecx 283 cmovg %ecx,$num # find maximum 284 test %ecx,%ecx 285 mov %ecx,`4*$i`(%rbx) # initialize counters 286 cmovle $Tbl,@ptr[$i] # cancel input 287___ 288} 289$code.=<<___; 290 test $num,$num 291 jz .Ldone 292 293 movdqu 0x00-0x80($ctx),$A # load context 294 lea 128(%rsp),%rax 295 movdqu 0x20-0x80($ctx),$B 296 movdqu 0x40-0x80($ctx),$C 297 movdqu 0x60-0x80($ctx),$D 298 movdqu 0x80-0x80($ctx),$E 299 movdqu 0xa0-0x80($ctx),$F 300 movdqu 0xc0-0x80($ctx),$G 301 movdqu 0xe0-0x80($ctx),$H 302 movdqu .Lpbswap(%rip),$Xn 303 jmp .Loop 304 305.align 32 306.Loop: 307 movdqa $C,$bxc 308 pxor $B,$bxc # magic seed 309___ 310for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } 311$code.=<<___; 312 movdqu `&Xi_off($i)`,$Xi 313 mov \$3,%ecx 314 jmp .Loop_16_xx 315.align 32 316.Loop_16_xx: 317___ 318for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } 319$code.=<<___; 320 dec %ecx 321 jnz .Loop_16_xx 322 323 mov \$1,%ecx 324 lea K256+128(%rip),$Tbl 325 326 movdqa (%rbx),$sigma # pull counters 327 cmp 4*0(%rbx),%ecx # examine counters 328 pxor $t1,$t1 329 cmovge $Tbl,@ptr[0] # cancel input 330 cmp 4*1(%rbx),%ecx 331 movdqa $sigma,$Xn 332 cmovge $Tbl,@ptr[1] 333 cmp 4*2(%rbx),%ecx 334 pcmpgtd $t1,$Xn # mask value 335 cmovge $Tbl,@ptr[2] 336 cmp 4*3(%rbx),%ecx 337 paddd $Xn,$sigma # counters-- 338 cmovge $Tbl,@ptr[3] 339 340 movdqu 0x00-0x80($ctx),$t1 341 pand $Xn,$A 342 movdqu 0x20-0x80($ctx),$t2 343 pand $Xn,$B 344 movdqu 0x40-0x80($ctx),$t3 345 pand $Xn,$C 346 movdqu 0x60-0x80($ctx),$Xi 347 pand $Xn,$D 348 paddd $t1,$A 349 movdqu 0x80-0x80($ctx),$t1 350 pand $Xn,$E 351 paddd $t2,$B 352 movdqu 0xa0-0x80($ctx),$t2 353 pand $Xn,$F 354 paddd $t3,$C 355 movdqu 0xc0-0x80($ctx),$t3 356 pand $Xn,$G 357 paddd $Xi,$D 358 movdqu 0xe0-0x80($ctx),$Xi 359 pand $Xn,$H 360 paddd $t1,$E 361 paddd $t2,$F 362 movdqu $A,0x00-0x80($ctx) 363 paddd $t3,$G 364 movdqu $B,0x20-0x80($ctx) 365 paddd $Xi,$H 366 movdqu $C,0x40-0x80($ctx) 367 movdqu $D,0x60-0x80($ctx) 368 movdqu $E,0x80-0x80($ctx) 369 movdqu $F,0xa0-0x80($ctx) 370 movdqu $G,0xc0-0x80($ctx) 371 movdqu $H,0xe0-0x80($ctx) 372 373 movdqa $sigma,(%rbx) # save counters 374 movdqa .Lpbswap(%rip),$Xn 375 dec $num 376 jnz .Loop 377 378 mov `$REG_SZ*17+8`(%rsp),$num 379 lea $REG_SZ($ctx),$ctx 380 lea `16*$REG_SZ/4`($inp),$inp 381 dec $num 382 jnz .Loop_grande 383 384.Ldone: 385 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 386___ 387$code.=<<___ if ($win64); 388 movaps -0xb8(%rax),%xmm6 389 movaps -0xa8(%rax),%xmm7 390 movaps -0x98(%rax),%xmm8 391 movaps -0x88(%rax),%xmm9 392 movaps -0x78(%rax),%xmm10 393 movaps -0x68(%rax),%xmm11 394 movaps -0x58(%rax),%xmm12 395 movaps -0x48(%rax),%xmm13 396 movaps -0x38(%rax),%xmm14 397 movaps -0x28(%rax),%xmm15 398___ 399$code.=<<___; 400 mov -16(%rax),%rbp 401 mov -8(%rax),%rbx 402 lea (%rax),%rsp 403.Lepilogue: 404 ret 405.size sha256_multi_block,.-sha256_multi_block 406___ 407 {{{ 408my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15)); 409my @MSG0=map("%xmm$_",(4..7)); 410my @MSG1=map("%xmm$_",(8..11)); 411 412$code.=<<___; 413.type sha256_multi_block_shaext,\@function,3 414.align 32 415sha256_multi_block_shaext: 416_shaext_shortcut: 417 mov %rsp,%rax 418 push %rbx 419 push %rbp 420___ 421$code.=<<___ if ($win64); 422 lea -0xa8(%rsp),%rsp 423 movaps %xmm6,(%rsp) 424 movaps %xmm7,0x10(%rsp) 425 movaps %xmm8,0x20(%rsp) 426 movaps %xmm9,0x30(%rsp) 427 movaps %xmm10,-0x78(%rax) 428 movaps %xmm11,-0x68(%rax) 429 movaps %xmm12,-0x58(%rax) 430 movaps %xmm13,-0x48(%rax) 431 movaps %xmm14,-0x38(%rax) 432 movaps %xmm15,-0x28(%rax) 433___ 434$code.=<<___; 435 sub \$`$REG_SZ*18`,%rsp 436 shl \$1,$num # we process pair at a time 437 and \$-256,%rsp 438 lea 0x80($ctx),$ctx # size optimization 439 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 440.Lbody_shaext: 441 lea `$REG_SZ*16`(%rsp),%rbx 442 lea K256_shaext+0x80(%rip),$Tbl 443 444.Loop_grande_shaext: 445 mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num 446 xor $num,$num 447___ 448for($i=0;$i<2;$i++) { 449 $code.=<<___; 450 mov `16*$i+0`($inp),@ptr[$i] # input pointer 451 mov `16*$i+8`($inp),%ecx # number of blocks 452 cmp $num,%ecx 453 cmovg %ecx,$num # find maximum 454 test %ecx,%ecx 455 mov %ecx,`4*$i`(%rbx) # initialize counters 456 cmovle %rsp,@ptr[$i] # cancel input 457___ 458} 459$code.=<<___; 460 test $num,$num 461 jz .Ldone_shaext 462 463 movq 0x00-0x80($ctx),$ABEF0 # A1.A0 464 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0 465 movq 0x40-0x80($ctx),$CDGH0 # C1.C0 466 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0 467 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0 468 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0 469 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0 470 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0 471 472 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0 473 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0 474 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0 475 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0 476 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap 477 478 movdqa $ABEF0,$ABEF1 479 movdqa $CDGH0,$CDGH1 480 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0 481 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0 482 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1 483 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1 484 485 pshufd \$0b00011011,$ABEF0,$ABEF0 486 pshufd \$0b00011011,$CDGH0,$CDGH0 487 pshufd \$0b00011011,$ABEF1,$ABEF1 488 pshufd \$0b00011011,$CDGH1,$CDGH1 489 jmp .Loop_shaext 490 491.align 32 492.Loop_shaext: 493 movdqu 0x00(@ptr[0]),@MSG0[0] 494 movdqu 0x00(@ptr[1]),@MSG1[0] 495 movdqu 0x10(@ptr[0]),@MSG0[1] 496 movdqu 0x10(@ptr[1]),@MSG1[1] 497 movdqu 0x20(@ptr[0]),@MSG0[2] 498 pshufb $TMPx,@MSG0[0] 499 movdqu 0x20(@ptr[1]),@MSG1[2] 500 pshufb $TMPx,@MSG1[0] 501 movdqu 0x30(@ptr[0]),@MSG0[3] 502 lea 0x40(@ptr[0]),@ptr[0] 503 movdqu 0x30(@ptr[1]),@MSG1[3] 504 lea 0x40(@ptr[1]),@ptr[1] 505 506 movdqa 0*16-0x80($Tbl),$Wi 507 pshufb $TMPx,@MSG0[1] 508 paddd @MSG0[0],$Wi 509 pxor $ABEF0,@MSG0[0] # black magic 510 movdqa $Wi,$TMP0 511 movdqa 0*16-0x80($Tbl),$TMP1 512 pshufb $TMPx,@MSG1[1] 513 paddd @MSG1[0],$TMP1 514 movdqa $CDGH0,0x50(%rsp) # offload 515 sha256rnds2 $ABEF0,$CDGH0 # 0-3 516 pxor $ABEF1,@MSG1[0] # black magic 517 movdqa $TMP1,$Wi 518 movdqa $CDGH1,0x70(%rsp) 519 sha256rnds2 $ABEF1,$CDGH1 # 0-3 520 pshufd \$0x0e,$TMP0,$Wi 521 pxor $ABEF0,@MSG0[0] # black magic 522 movdqa $ABEF0,0x40(%rsp) # offload 523 sha256rnds2 $CDGH0,$ABEF0 524 pshufd \$0x0e,$TMP1,$Wi 525 pxor $ABEF1,@MSG1[0] # black magic 526 movdqa $ABEF1,0x60(%rsp) 527 movdqa 1*16-0x80($Tbl),$TMP0 528 paddd @MSG0[1],$TMP0 529 pshufb $TMPx,@MSG0[2] 530 sha256rnds2 $CDGH1,$ABEF1 531 532 movdqa $TMP0,$Wi 533 movdqa 1*16-0x80($Tbl),$TMP1 534 paddd @MSG1[1],$TMP1 535 sha256rnds2 $ABEF0,$CDGH0 # 4-7 536 movdqa $TMP1,$Wi 537 prefetcht0 127(@ptr[0]) 538 pshufb $TMPx,@MSG0[3] 539 pshufb $TMPx,@MSG1[2] 540 prefetcht0 127(@ptr[1]) 541 sha256rnds2 $ABEF1,$CDGH1 # 4-7 542 pshufd \$0x0e,$TMP0,$Wi 543 pshufb $TMPx,@MSG1[3] 544 sha256msg1 @MSG0[1],@MSG0[0] 545 sha256rnds2 $CDGH0,$ABEF0 546 pshufd \$0x0e,$TMP1,$Wi 547 movdqa 2*16-0x80($Tbl),$TMP0 548 paddd @MSG0[2],$TMP0 549 sha256rnds2 $CDGH1,$ABEF1 550 551 movdqa $TMP0,$Wi 552 movdqa 2*16-0x80($Tbl),$TMP1 553 paddd @MSG1[2],$TMP1 554 sha256rnds2 $ABEF0,$CDGH0 # 8-11 555 sha256msg1 @MSG1[1],@MSG1[0] 556 movdqa $TMP1,$Wi 557 movdqa @MSG0[3],$TMPx 558 sha256rnds2 $ABEF1,$CDGH1 # 8-11 559 pshufd \$0x0e,$TMP0,$Wi 560 palignr \$4,@MSG0[2],$TMPx 561 paddd $TMPx,@MSG0[0] 562 movdqa @MSG1[3],$TMPx 563 palignr \$4,@MSG1[2],$TMPx 564 sha256msg1 @MSG0[2],@MSG0[1] 565 sha256rnds2 $CDGH0,$ABEF0 566 pshufd \$0x0e,$TMP1,$Wi 567 movdqa 3*16-0x80($Tbl),$TMP0 568 paddd @MSG0[3],$TMP0 569 sha256rnds2 $CDGH1,$ABEF1 570 sha256msg1 @MSG1[2],@MSG1[1] 571 572 movdqa $TMP0,$Wi 573 movdqa 3*16-0x80($Tbl),$TMP1 574 paddd $TMPx,@MSG1[0] 575 paddd @MSG1[3],$TMP1 576 sha256msg2 @MSG0[3],@MSG0[0] 577 sha256rnds2 $ABEF0,$CDGH0 # 12-15 578 movdqa $TMP1,$Wi 579 movdqa @MSG0[0],$TMPx 580 palignr \$4,@MSG0[3],$TMPx 581 sha256rnds2 $ABEF1,$CDGH1 # 12-15 582 sha256msg2 @MSG1[3],@MSG1[0] 583 pshufd \$0x0e,$TMP0,$Wi 584 paddd $TMPx,@MSG0[1] 585 movdqa @MSG1[0],$TMPx 586 palignr \$4,@MSG1[3],$TMPx 587 sha256msg1 @MSG0[3],@MSG0[2] 588 sha256rnds2 $CDGH0,$ABEF0 589 pshufd \$0x0e,$TMP1,$Wi 590 movdqa 4*16-0x80($Tbl),$TMP0 591 paddd @MSG0[0],$TMP0 592 sha256rnds2 $CDGH1,$ABEF1 593 sha256msg1 @MSG1[3],@MSG1[2] 594___ 595for($i=4;$i<16-3;$i++) { 596$code.=<<___; 597 movdqa $TMP0,$Wi 598 movdqa $i*16-0x80($Tbl),$TMP1 599 paddd $TMPx,@MSG1[1] 600 paddd @MSG1[0],$TMP1 601 sha256msg2 @MSG0[0],@MSG0[1] 602 sha256rnds2 $ABEF0,$CDGH0 # 16-19... 603 movdqa $TMP1,$Wi 604 movdqa @MSG0[1],$TMPx 605 palignr \$4,@MSG0[0],$TMPx 606 sha256rnds2 $ABEF1,$CDGH1 # 16-19... 607 sha256msg2 @MSG1[0],@MSG1[1] 608 pshufd \$0x0e,$TMP0,$Wi 609 paddd $TMPx,@MSG0[2] 610 movdqa @MSG1[1],$TMPx 611 palignr \$4,@MSG1[0],$TMPx 612 sha256msg1 @MSG0[0],@MSG0[3] 613 sha256rnds2 $CDGH0,$ABEF0 614 pshufd \$0x0e,$TMP1,$Wi 615 movdqa `($i+1)*16`-0x80($Tbl),$TMP0 616 paddd @MSG0[1],$TMP0 617 sha256rnds2 $CDGH1,$ABEF1 618 sha256msg1 @MSG1[0],@MSG1[3] 619___ 620 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 621} 622$code.=<<___; 623 movdqa $TMP0,$Wi 624 movdqa 13*16-0x80($Tbl),$TMP1 625 paddd $TMPx,@MSG1[1] 626 paddd @MSG1[0],$TMP1 627 sha256msg2 @MSG0[0],@MSG0[1] 628 sha256rnds2 $ABEF0,$CDGH0 # 52-55 629 movdqa $TMP1,$Wi 630 movdqa @MSG0[1],$TMPx 631 palignr \$4,@MSG0[0],$TMPx 632 sha256rnds2 $ABEF1,$CDGH1 # 52-55 633 sha256msg2 @MSG1[0],@MSG1[1] 634 pshufd \$0x0e,$TMP0,$Wi 635 paddd $TMPx,@MSG0[2] 636 movdqa @MSG1[1],$TMPx 637 palignr \$4,@MSG1[0],$TMPx 638 nop 639 sha256rnds2 $CDGH0,$ABEF0 640 pshufd \$0x0e,$TMP1,$Wi 641 movdqa 14*16-0x80($Tbl),$TMP0 642 paddd @MSG0[1],$TMP0 643 sha256rnds2 $CDGH1,$ABEF1 644 645 movdqa $TMP0,$Wi 646 movdqa 14*16-0x80($Tbl),$TMP1 647 paddd $TMPx,@MSG1[2] 648 paddd @MSG1[1],$TMP1 649 sha256msg2 @MSG0[1],@MSG0[2] 650 nop 651 sha256rnds2 $ABEF0,$CDGH0 # 56-59 652 movdqa $TMP1,$Wi 653 mov \$1,%ecx 654 pxor @MSG0[1],@MSG0[1] # zero 655 sha256rnds2 $ABEF1,$CDGH1 # 56-59 656 sha256msg2 @MSG1[1],@MSG1[2] 657 pshufd \$0x0e,$TMP0,$Wi 658 movdqa 15*16-0x80($Tbl),$TMP0 659 paddd @MSG0[2],$TMP0 660 movq (%rbx),@MSG0[2] # pull counters 661 nop 662 sha256rnds2 $CDGH0,$ABEF0 663 pshufd \$0x0e,$TMP1,$Wi 664 movdqa 15*16-0x80($Tbl),$TMP1 665 paddd @MSG1[2],$TMP1 666 sha256rnds2 $CDGH1,$ABEF1 667 668 movdqa $TMP0,$Wi 669 cmp 4*0(%rbx),%ecx # examine counters 670 cmovge %rsp,@ptr[0] # cancel input 671 cmp 4*1(%rbx),%ecx 672 cmovge %rsp,@ptr[1] 673 pshufd \$0x00,@MSG0[2],@MSG1[0] 674 sha256rnds2 $ABEF0,$CDGH0 # 60-63 675 movdqa $TMP1,$Wi 676 pshufd \$0x55,@MSG0[2],@MSG1[1] 677 movdqa @MSG0[2],@MSG1[2] 678 sha256rnds2 $ABEF1,$CDGH1 # 60-63 679 pshufd \$0x0e,$TMP0,$Wi 680 pcmpgtd @MSG0[1],@MSG1[0] 681 pcmpgtd @MSG0[1],@MSG1[1] 682 sha256rnds2 $CDGH0,$ABEF0 683 pshufd \$0x0e,$TMP1,$Wi 684 pcmpgtd @MSG0[1],@MSG1[2] # counter mask 685 movdqa K256_shaext-0x10(%rip),$TMPx 686 sha256rnds2 $CDGH1,$ABEF1 687 688 pand @MSG1[0],$CDGH0 689 pand @MSG1[1],$CDGH1 690 pand @MSG1[0],$ABEF0 691 pand @MSG1[1],$ABEF1 692 paddd @MSG0[2],@MSG1[2] # counters-- 693 694 paddd 0x50(%rsp),$CDGH0 695 paddd 0x70(%rsp),$CDGH1 696 paddd 0x40(%rsp),$ABEF0 697 paddd 0x60(%rsp),$ABEF1 698 699 movq @MSG1[2],(%rbx) # save counters 700 dec $num 701 jnz .Loop_shaext 702 703 mov `$REG_SZ*17+8`(%rsp),$num 704 705 pshufd \$0b00011011,$ABEF0,$ABEF0 706 pshufd \$0b00011011,$CDGH0,$CDGH0 707 pshufd \$0b00011011,$ABEF1,$ABEF1 708 pshufd \$0b00011011,$CDGH1,$CDGH1 709 710 movdqa $ABEF0,@MSG0[0] 711 movdqa $CDGH0,@MSG0[1] 712 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0 713 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0 714 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0 715 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0 716 717 movq $ABEF0,0x00-0x80($ctx) # A1.A0 718 psrldq \$8,$ABEF0 719 movq @MSG0[0],0x80-0x80($ctx) # E1.E0 720 psrldq \$8,@MSG0[0] 721 movq $ABEF0,0x20-0x80($ctx) # B1.B0 722 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0 723 724 movq $CDGH0,0x40-0x80($ctx) # C1.C0 725 psrldq \$8,$CDGH0 726 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0 727 psrldq \$8,@MSG0[1] 728 movq $CDGH0,0x60-0x80($ctx) # D1.D0 729 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 730 731 lea `$REG_SZ/2`($ctx),$ctx 732 lea `16*2`($inp),$inp 733 dec $num 734 jnz .Loop_grande_shaext 735 736.Ldone_shaext: 737 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp 738___ 739$code.=<<___ if ($win64); 740 movaps -0xb8(%rax),%xmm6 741 movaps -0xa8(%rax),%xmm7 742 movaps -0x98(%rax),%xmm8 743 movaps -0x88(%rax),%xmm9 744 movaps -0x78(%rax),%xmm10 745 movaps -0x68(%rax),%xmm11 746 movaps -0x58(%rax),%xmm12 747 movaps -0x48(%rax),%xmm13 748 movaps -0x38(%rax),%xmm14 749 movaps -0x28(%rax),%xmm15 750___ 751$code.=<<___; 752 mov -16(%rax),%rbp 753 mov -8(%rax),%rbx 754 lea (%rax),%rsp 755.Lepilogue_shaext: 756 ret 757.size sha256_multi_block_shaext,.-sha256_multi_block_shaext 758___ 759 }}} 760 if ($avx) {{{ 761sub ROUND_00_15_avx { 762my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 763 764$code.=<<___ if ($i<15 && $REG_SZ==16); 765 vmovd `4*$i`(@ptr[0]),$Xi 766 vmovd `4*$i`(@ptr[1]),$t1 767 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 768 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 769 vpunpckldq $t1,$Xi,$Xi 770 vpshufb $Xn,$Xi,$Xi 771___ 772$code.=<<___ if ($i==15 && $REG_SZ==16); 773 vmovd `4*$i`(@ptr[0]),$Xi 774 lea `16*4`(@ptr[0]),@ptr[0] 775 vmovd `4*$i`(@ptr[1]),$t1 776 lea `16*4`(@ptr[1]),@ptr[1] 777 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 778 lea `16*4`(@ptr[2]),@ptr[2] 779 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 780 lea `16*4`(@ptr[3]),@ptr[3] 781 vpunpckldq $t1,$Xi,$Xi 782 vpshufb $Xn,$Xi,$Xi 783___ 784$code.=<<___ if ($i<15 && $REG_SZ==32); 785 vmovd `4*$i`(@ptr[0]),$Xi 786 vmovd `4*$i`(@ptr[4]),$t1 787 vmovd `4*$i`(@ptr[1]),$t2 788 vmovd `4*$i`(@ptr[5]),$t3 789 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 790 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 791 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 792 vpunpckldq $t2,$Xi,$Xi 793 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 794 vpunpckldq $t3,$t1,$t1 795 vinserti128 $t1,$Xi,$Xi 796 vpshufb $Xn,$Xi,$Xi 797___ 798$code.=<<___ if ($i==15 && $REG_SZ==32); 799 vmovd `4*$i`(@ptr[0]),$Xi 800 lea `16*4`(@ptr[0]),@ptr[0] 801 vmovd `4*$i`(@ptr[4]),$t1 802 lea `16*4`(@ptr[4]),@ptr[4] 803 vmovd `4*$i`(@ptr[1]),$t2 804 lea `16*4`(@ptr[1]),@ptr[1] 805 vmovd `4*$i`(@ptr[5]),$t3 806 lea `16*4`(@ptr[5]),@ptr[5] 807 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 808 lea `16*4`(@ptr[2]),@ptr[2] 809 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 810 lea `16*4`(@ptr[6]),@ptr[6] 811 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 812 lea `16*4`(@ptr[3]),@ptr[3] 813 vpunpckldq $t2,$Xi,$Xi 814 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 815 lea `16*4`(@ptr[7]),@ptr[7] 816 vpunpckldq $t3,$t1,$t1 817 vinserti128 $t1,$Xi,$Xi 818 vpshufb $Xn,$Xi,$Xi 819___ 820$code.=<<___; 821 vpsrld \$6,$e,$sigma 822 vpslld \$26,$e,$t3 823 vmovdqu $Xi,`&Xi_off($i)` 824 vpaddd $h,$Xi,$Xi # Xi+=h 825 826 vpsrld \$11,$e,$t2 827 vpxor $t3,$sigma,$sigma 828 vpslld \$21,$e,$t3 829 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] 830 vpxor $t2,$sigma,$sigma 831 832 vpsrld \$25,$e,$t2 833 vpxor $t3,$sigma,$sigma 834 `"prefetcht0 63(@ptr[0])" if ($i==15)` 835 vpslld \$7,$e,$t3 836 vpandn $g,$e,$t1 837 vpand $f,$e,$axb # borrow $axb 838 `"prefetcht0 63(@ptr[1])" if ($i==15)` 839 vpxor $t2,$sigma,$sigma 840 841 vpsrld \$2,$a,$h # borrow $h 842 vpxor $t3,$sigma,$sigma # Sigma1(e) 843 `"prefetcht0 63(@ptr[2])" if ($i==15)` 844 vpslld \$30,$a,$t2 845 vpxor $axb,$t1,$t1 # Ch(e,f,g) 846 vpxor $a,$b,$axb # a^b, b^c in next round 847 `"prefetcht0 63(@ptr[3])" if ($i==15)` 848 vpxor $t2,$h,$h 849 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) 850 851 vpsrld \$13,$a,$t2 852 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 853 vpslld \$19,$a,$t3 854 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) 855 vpand $axb,$bxc,$bxc 856 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 857 vpxor $t2,$h,$sigma 858 859 vpsrld \$22,$a,$t2 860 vpxor $t3,$sigma,$sigma 861 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 862 vpslld \$10,$a,$t3 863 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 864 vpaddd $Xi,$d,$d # d+=Xi 865 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 866 vpxor $t2,$sigma,$sigma 867 vpxor $t3,$sigma,$sigma # Sigma0(a) 868 869 vpaddd $Xi,$h,$h # h+=Xi 870 vpaddd $sigma,$h,$h # h+=Sigma0(a) 871___ 872$code.=<<___ if (($i%8)==7); 873 add \$`32*8`,$Tbl 874___ 875 ($axb,$bxc)=($bxc,$axb); 876} 877 878sub ROUND_16_XX_avx { 879my $i=shift; 880 881$code.=<<___; 882 vmovdqu `&Xi_off($i+1)`,$Xn 883 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] 884 885 vpsrld \$3,$Xn,$sigma 886 vpsrld \$7,$Xn,$t2 887 vpslld \$25,$Xn,$t3 888 vpxor $t2,$sigma,$sigma 889 vpsrld \$18,$Xn,$t2 890 vpxor $t3,$sigma,$sigma 891 vpslld \$14,$Xn,$t3 892 vmovdqu `&Xi_off($i+14)`,$t1 893 vpsrld \$10,$t1,$axb # borrow $axb 894 895 vpxor $t2,$sigma,$sigma 896 vpsrld \$17,$t1,$t2 897 vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) 898 vpslld \$15,$t1,$t3 899 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e) 900 vpxor $t2,$axb,$sigma 901 vpsrld \$19,$t1,$t2 902 vpxor $t3,$sigma,$sigma 903 vpslld \$13,$t1,$t3 904 vpxor $t2,$sigma,$sigma 905 vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) 906 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) 907___ 908 &ROUND_00_15_avx($i,@_); 909 ($Xi,$Xn)=($Xn,$Xi); 910} 911 912$code.=<<___; 913.type sha256_multi_block_avx,\@function,3 914.align 32 915sha256_multi_block_avx: 916_avx_shortcut: 917___ 918$code.=<<___ if ($avx>1); 919 shr \$32,%rcx 920 cmp \$2,$num 921 jb .Lavx 922 test \$`1<<5`,%ecx 923 jnz _avx2_shortcut 924 jmp .Lavx 925.align 32 926.Lavx: 927___ 928$code.=<<___; 929 mov %rsp,%rax 930 push %rbx 931 push %rbp 932___ 933$code.=<<___ if ($win64); 934 lea -0xa8(%rsp),%rsp 935 movaps %xmm6,(%rsp) 936 movaps %xmm7,0x10(%rsp) 937 movaps %xmm8,0x20(%rsp) 938 movaps %xmm9,0x30(%rsp) 939 movaps %xmm10,-0x78(%rax) 940 movaps %xmm11,-0x68(%rax) 941 movaps %xmm12,-0x58(%rax) 942 movaps %xmm13,-0x48(%rax) 943 movaps %xmm14,-0x38(%rax) 944 movaps %xmm15,-0x28(%rax) 945___ 946$code.=<<___; 947 sub \$`$REG_SZ*18`, %rsp 948 and \$-256,%rsp 949 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 950.Lbody_avx: 951 lea K256+128(%rip),$Tbl 952 lea `$REG_SZ*16`(%rsp),%rbx 953 lea 0x80($ctx),$ctx # size optimization 954 955.Loop_grande_avx: 956 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 957 xor $num,$num 958___ 959for($i=0;$i<4;$i++) { 960 $code.=<<___; 961 mov `16*$i+0`($inp),@ptr[$i] # input pointer 962 mov `16*$i+8`($inp),%ecx # number of blocks 963 cmp $num,%ecx 964 cmovg %ecx,$num # find maximum 965 test %ecx,%ecx 966 mov %ecx,`4*$i`(%rbx) # initialize counters 967 cmovle $Tbl,@ptr[$i] # cancel input 968___ 969} 970$code.=<<___; 971 test $num,$num 972 jz .Ldone_avx 973 974 vmovdqu 0x00-0x80($ctx),$A # load context 975 lea 128(%rsp),%rax 976 vmovdqu 0x20-0x80($ctx),$B 977 vmovdqu 0x40-0x80($ctx),$C 978 vmovdqu 0x60-0x80($ctx),$D 979 vmovdqu 0x80-0x80($ctx),$E 980 vmovdqu 0xa0-0x80($ctx),$F 981 vmovdqu 0xc0-0x80($ctx),$G 982 vmovdqu 0xe0-0x80($ctx),$H 983 vmovdqu .Lpbswap(%rip),$Xn 984 jmp .Loop_avx 985 986.align 32 987.Loop_avx: 988 vpxor $B,$C,$bxc # magic seed 989___ 990for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } 991$code.=<<___; 992 vmovdqu `&Xi_off($i)`,$Xi 993 mov \$3,%ecx 994 jmp .Loop_16_xx_avx 995.align 32 996.Loop_16_xx_avx: 997___ 998for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } 999$code.=<<___; 1000 dec %ecx 1001 jnz .Loop_16_xx_avx 1002 1003 mov \$1,%ecx 1004 lea K256+128(%rip),$Tbl 1005___ 1006for($i=0;$i<4;$i++) { 1007 $code.=<<___; 1008 cmp `4*$i`(%rbx),%ecx # examine counters 1009 cmovge $Tbl,@ptr[$i] # cancel input 1010___ 1011} 1012$code.=<<___; 1013 vmovdqa (%rbx),$sigma # pull counters 1014 vpxor $t1,$t1,$t1 1015 vmovdqa $sigma,$Xn 1016 vpcmpgtd $t1,$Xn,$Xn # mask value 1017 vpaddd $Xn,$sigma,$sigma # counters-- 1018 1019 vmovdqu 0x00-0x80($ctx),$t1 1020 vpand $Xn,$A,$A 1021 vmovdqu 0x20-0x80($ctx),$t2 1022 vpand $Xn,$B,$B 1023 vmovdqu 0x40-0x80($ctx),$t3 1024 vpand $Xn,$C,$C 1025 vmovdqu 0x60-0x80($ctx),$Xi 1026 vpand $Xn,$D,$D 1027 vpaddd $t1,$A,$A 1028 vmovdqu 0x80-0x80($ctx),$t1 1029 vpand $Xn,$E,$E 1030 vpaddd $t2,$B,$B 1031 vmovdqu 0xa0-0x80($ctx),$t2 1032 vpand $Xn,$F,$F 1033 vpaddd $t3,$C,$C 1034 vmovdqu 0xc0-0x80($ctx),$t3 1035 vpand $Xn,$G,$G 1036 vpaddd $Xi,$D,$D 1037 vmovdqu 0xe0-0x80($ctx),$Xi 1038 vpand $Xn,$H,$H 1039 vpaddd $t1,$E,$E 1040 vpaddd $t2,$F,$F 1041 vmovdqu $A,0x00-0x80($ctx) 1042 vpaddd $t3,$G,$G 1043 vmovdqu $B,0x20-0x80($ctx) 1044 vpaddd $Xi,$H,$H 1045 vmovdqu $C,0x40-0x80($ctx) 1046 vmovdqu $D,0x60-0x80($ctx) 1047 vmovdqu $E,0x80-0x80($ctx) 1048 vmovdqu $F,0xa0-0x80($ctx) 1049 vmovdqu $G,0xc0-0x80($ctx) 1050 vmovdqu $H,0xe0-0x80($ctx) 1051 1052 vmovdqu $sigma,(%rbx) # save counters 1053 vmovdqu .Lpbswap(%rip),$Xn 1054 dec $num 1055 jnz .Loop_avx 1056 1057 mov `$REG_SZ*17+8`(%rsp),$num 1058 lea $REG_SZ($ctx),$ctx 1059 lea `16*$REG_SZ/4`($inp),$inp 1060 dec $num 1061 jnz .Loop_grande_avx 1062 1063.Ldone_avx: 1064 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 1065 vzeroupper 1066___ 1067$code.=<<___ if ($win64); 1068 movaps -0xb8(%rax),%xmm6 1069 movaps -0xa8(%rax),%xmm7 1070 movaps -0x98(%rax),%xmm8 1071 movaps -0x88(%rax),%xmm9 1072 movaps -0x78(%rax),%xmm10 1073 movaps -0x68(%rax),%xmm11 1074 movaps -0x58(%rax),%xmm12 1075 movaps -0x48(%rax),%xmm13 1076 movaps -0x38(%rax),%xmm14 1077 movaps -0x28(%rax),%xmm15 1078___ 1079$code.=<<___; 1080 mov -16(%rax),%rbp 1081 mov -8(%rax),%rbx 1082 lea (%rax),%rsp 1083.Lepilogue_avx: 1084 ret 1085.size sha256_multi_block_avx,.-sha256_multi_block_avx 1086___ 1087 if ($avx>1) { 1088$code =~ s/\`([^\`]*)\`/eval $1/gem; 1089 1090$REG_SZ=32; 1091@ptr=map("%r$_",(12..15,8..11)); 1092 1093@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); 1094($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); 1095 1096$code.=<<___; 1097.type sha256_multi_block_avx2,\@function,3 1098.align 32 1099sha256_multi_block_avx2: 1100_avx2_shortcut: 1101 mov %rsp,%rax 1102 push %rbx 1103 push %rbp 1104 push %r12 1105 push %r13 1106 push %r14 1107 push %r15 1108___ 1109$code.=<<___ if ($win64); 1110 lea -0xa8(%rsp),%rsp 1111 movaps %xmm6,(%rsp) 1112 movaps %xmm7,0x10(%rsp) 1113 movaps %xmm8,0x20(%rsp) 1114 movaps %xmm9,0x30(%rsp) 1115 movaps %xmm10,0x40(%rsp) 1116 movaps %xmm11,0x50(%rsp) 1117 movaps %xmm12,-0x78(%rax) 1118 movaps %xmm13,-0x68(%rax) 1119 movaps %xmm14,-0x58(%rax) 1120 movaps %xmm15,-0x48(%rax) 1121___ 1122$code.=<<___; 1123 sub \$`$REG_SZ*18`, %rsp 1124 and \$-256,%rsp 1125 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1126.Lbody_avx2: 1127 lea K256+128(%rip),$Tbl 1128 lea 0x80($ctx),$ctx # size optimization 1129 1130.Loop_grande_avx2: 1131 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1132 xor $num,$num 1133 lea `$REG_SZ*16`(%rsp),%rbx 1134___ 1135for($i=0;$i<8;$i++) { 1136 $code.=<<___; 1137 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1138 mov `16*$i+8`($inp),%ecx # number of blocks 1139 cmp $num,%ecx 1140 cmovg %ecx,$num # find maximum 1141 test %ecx,%ecx 1142 mov %ecx,`4*$i`(%rbx) # initialize counters 1143 cmovle $Tbl,@ptr[$i] # cancel input 1144___ 1145} 1146$code.=<<___; 1147 vmovdqu 0x00-0x80($ctx),$A # load context 1148 lea 128(%rsp),%rax 1149 vmovdqu 0x20-0x80($ctx),$B 1150 lea 256+128(%rsp),%rbx 1151 vmovdqu 0x40-0x80($ctx),$C 1152 vmovdqu 0x60-0x80($ctx),$D 1153 vmovdqu 0x80-0x80($ctx),$E 1154 vmovdqu 0xa0-0x80($ctx),$F 1155 vmovdqu 0xc0-0x80($ctx),$G 1156 vmovdqu 0xe0-0x80($ctx),$H 1157 vmovdqu .Lpbswap(%rip),$Xn 1158 jmp .Loop_avx2 1159 1160.align 32 1161.Loop_avx2: 1162 vpxor $B,$C,$bxc # magic seed 1163___ 1164for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } 1165$code.=<<___; 1166 vmovdqu `&Xi_off($i)`,$Xi 1167 mov \$3,%ecx 1168 jmp .Loop_16_xx_avx2 1169.align 32 1170.Loop_16_xx_avx2: 1171___ 1172for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } 1173$code.=<<___; 1174 dec %ecx 1175 jnz .Loop_16_xx_avx2 1176 1177 mov \$1,%ecx 1178 lea `$REG_SZ*16`(%rsp),%rbx 1179 lea K256+128(%rip),$Tbl 1180___ 1181for($i=0;$i<8;$i++) { 1182 $code.=<<___; 1183 cmp `4*$i`(%rbx),%ecx # examine counters 1184 cmovge $Tbl,@ptr[$i] # cancel input 1185___ 1186} 1187$code.=<<___; 1188 vmovdqa (%rbx),$sigma # pull counters 1189 vpxor $t1,$t1,$t1 1190 vmovdqa $sigma,$Xn 1191 vpcmpgtd $t1,$Xn,$Xn # mask value 1192 vpaddd $Xn,$sigma,$sigma # counters-- 1193 1194 vmovdqu 0x00-0x80($ctx),$t1 1195 vpand $Xn,$A,$A 1196 vmovdqu 0x20-0x80($ctx),$t2 1197 vpand $Xn,$B,$B 1198 vmovdqu 0x40-0x80($ctx),$t3 1199 vpand $Xn,$C,$C 1200 vmovdqu 0x60-0x80($ctx),$Xi 1201 vpand $Xn,$D,$D 1202 vpaddd $t1,$A,$A 1203 vmovdqu 0x80-0x80($ctx),$t1 1204 vpand $Xn,$E,$E 1205 vpaddd $t2,$B,$B 1206 vmovdqu 0xa0-0x80($ctx),$t2 1207 vpand $Xn,$F,$F 1208 vpaddd $t3,$C,$C 1209 vmovdqu 0xc0-0x80($ctx),$t3 1210 vpand $Xn,$G,$G 1211 vpaddd $Xi,$D,$D 1212 vmovdqu 0xe0-0x80($ctx),$Xi 1213 vpand $Xn,$H,$H 1214 vpaddd $t1,$E,$E 1215 vpaddd $t2,$F,$F 1216 vmovdqu $A,0x00-0x80($ctx) 1217 vpaddd $t3,$G,$G 1218 vmovdqu $B,0x20-0x80($ctx) 1219 vpaddd $Xi,$H,$H 1220 vmovdqu $C,0x40-0x80($ctx) 1221 vmovdqu $D,0x60-0x80($ctx) 1222 vmovdqu $E,0x80-0x80($ctx) 1223 vmovdqu $F,0xa0-0x80($ctx) 1224 vmovdqu $G,0xc0-0x80($ctx) 1225 vmovdqu $H,0xe0-0x80($ctx) 1226 1227 vmovdqu $sigma,(%rbx) # save counters 1228 lea 256+128(%rsp),%rbx 1229 vmovdqu .Lpbswap(%rip),$Xn 1230 dec $num 1231 jnz .Loop_avx2 1232 1233 #mov `$REG_SZ*17+8`(%rsp),$num 1234 #lea $REG_SZ($ctx),$ctx 1235 #lea `16*$REG_SZ/4`($inp),$inp 1236 #dec $num 1237 #jnz .Loop_grande_avx2 1238 1239.Ldone_avx2: 1240 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp 1241 vzeroupper 1242___ 1243$code.=<<___ if ($win64); 1244 movaps -0xd8(%rax),%xmm6 1245 movaps -0xc8(%rax),%xmm7 1246 movaps -0xb8(%rax),%xmm8 1247 movaps -0xa8(%rax),%xmm9 1248 movaps -0x98(%rax),%xmm10 1249 movaps -0x88(%rax),%xmm11 1250 movaps -0x78(%rax),%xmm12 1251 movaps -0x68(%rax),%xmm13 1252 movaps -0x58(%rax),%xmm14 1253 movaps -0x48(%rax),%xmm15 1254___ 1255$code.=<<___; 1256 mov -48(%rax),%r15 1257 mov -40(%rax),%r14 1258 mov -32(%rax),%r13 1259 mov -24(%rax),%r12 1260 mov -16(%rax),%rbp 1261 mov -8(%rax),%rbx 1262 lea (%rax),%rsp 1263.Lepilogue_avx2: 1264 ret 1265.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 1266___ 1267 } }}} 1268$code.=<<___; 1269.align 256 1270K256: 1271___ 1272sub TABLE { 1273 foreach (@_) { 1274 $code.=<<___; 1275 .long $_,$_,$_,$_ 1276 .long $_,$_,$_,$_ 1277___ 1278 } 1279} 1280&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 1281 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 1282 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 1283 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 1284 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 1285 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 1286 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 1287 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 1288 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 1289 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 1290 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 1291 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 1292 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 1293 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 1294 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 1295 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 1296$code.=<<___; 1297.Lpbswap: 1298 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1299 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1300K256_shaext: 1301 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 1302 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 1303 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 1304 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 1305 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 1306 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 1307 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 1308 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 1309 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 1310 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 1311 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 1312 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 1313 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 1314 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 1315 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 1316 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 1317 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1318___ 1319 1320if ($win64) { 1321# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1322# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1323$rec="%rcx"; 1324$frame="%rdx"; 1325$context="%r8"; 1326$disp="%r9"; 1327 1328$code.=<<___; 1329.extern __imp_RtlVirtualUnwind 1330.type se_handler,\@abi-omnipotent 1331.align 16 1332se_handler: 1333 push %rsi 1334 push %rdi 1335 push %rbx 1336 push %rbp 1337 push %r12 1338 push %r13 1339 push %r14 1340 push %r15 1341 pushfq 1342 sub \$64,%rsp 1343 1344 mov 120($context),%rax # pull context->Rax 1345 mov 248($context),%rbx # pull context->Rip 1346 1347 mov 8($disp),%rsi # disp->ImageBase 1348 mov 56($disp),%r11 # disp->HandlerData 1349 1350 mov 0(%r11),%r10d # HandlerData[0] 1351 lea (%rsi,%r10),%r10 # end of prologue label 1352 cmp %r10,%rbx # context->Rip<.Lbody 1353 jb .Lin_prologue 1354 1355 mov 152($context),%rax # pull context->Rsp 1356 1357 mov 4(%r11),%r10d # HandlerData[1] 1358 lea (%rsi,%r10),%r10 # epilogue label 1359 cmp %r10,%rbx # context->Rip>=.Lepilogue 1360 jae .Lin_prologue 1361 1362 mov `16*17`(%rax),%rax # pull saved stack pointer 1363 1364 mov -8(%rax),%rbx 1365 mov -16(%rax),%rbp 1366 mov %rbx,144($context) # restore context->Rbx 1367 mov %rbp,160($context) # restore context->Rbp 1368 1369 lea -24-10*16(%rax),%rsi 1370 lea 512($context),%rdi # &context.Xmm6 1371 mov \$20,%ecx 1372 .long 0xa548f3fc # cld; rep movsq 1373 1374.Lin_prologue: 1375 mov 8(%rax),%rdi 1376 mov 16(%rax),%rsi 1377 mov %rax,152($context) # restore context->Rsp 1378 mov %rsi,168($context) # restore context->Rsi 1379 mov %rdi,176($context) # restore context->Rdi 1380 1381 mov 40($disp),%rdi # disp->ContextRecord 1382 mov $context,%rsi # context 1383 mov \$154,%ecx # sizeof(CONTEXT) 1384 .long 0xa548f3fc # cld; rep movsq 1385 1386 mov $disp,%rsi 1387 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1388 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1389 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1390 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1391 mov 40(%rsi),%r10 # disp->ContextRecord 1392 lea 56(%rsi),%r11 # &disp->HandlerData 1393 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1394 mov %r10,32(%rsp) # arg5 1395 mov %r11,40(%rsp) # arg6 1396 mov %r12,48(%rsp) # arg7 1397 mov %rcx,56(%rsp) # arg8, (NULL) 1398 call *__imp_RtlVirtualUnwind(%rip) 1399 1400 mov \$1,%eax # ExceptionContinueSearch 1401 add \$64,%rsp 1402 popfq 1403 pop %r15 1404 pop %r14 1405 pop %r13 1406 pop %r12 1407 pop %rbp 1408 pop %rbx 1409 pop %rdi 1410 pop %rsi 1411 ret 1412.size se_handler,.-se_handler 1413___ 1414$code.=<<___ if ($avx>1); 1415.type avx2_handler,\@abi-omnipotent 1416.align 16 1417avx2_handler: 1418 push %rsi 1419 push %rdi 1420 push %rbx 1421 push %rbp 1422 push %r12 1423 push %r13 1424 push %r14 1425 push %r15 1426 pushfq 1427 sub \$64,%rsp 1428 1429 mov 120($context),%rax # pull context->Rax 1430 mov 248($context),%rbx # pull context->Rip 1431 1432 mov 8($disp),%rsi # disp->ImageBase 1433 mov 56($disp),%r11 # disp->HandlerData 1434 1435 mov 0(%r11),%r10d # HandlerData[0] 1436 lea (%rsi,%r10),%r10 # end of prologue label 1437 cmp %r10,%rbx # context->Rip<body label 1438 jb .Lin_prologue 1439 1440 mov 152($context),%rax # pull context->Rsp 1441 1442 mov 4(%r11),%r10d # HandlerData[1] 1443 lea (%rsi,%r10),%r10 # epilogue label 1444 cmp %r10,%rbx # context->Rip>=epilogue label 1445 jae .Lin_prologue 1446 1447 mov `32*17`($context),%rax # pull saved stack pointer 1448 1449 mov -8(%rax),%rbx 1450 mov -16(%rax),%rbp 1451 mov -24(%rax),%r12 1452 mov -32(%rax),%r13 1453 mov -40(%rax),%r14 1454 mov -48(%rax),%r15 1455 mov %rbx,144($context) # restore context->Rbx 1456 mov %rbp,160($context) # restore context->Rbp 1457 mov %r12,216($context) # restore cotnext->R12 1458 mov %r13,224($context) # restore cotnext->R13 1459 mov %r14,232($context) # restore cotnext->R14 1460 mov %r15,240($context) # restore cotnext->R15 1461 1462 lea -56-10*16(%rax),%rsi 1463 lea 512($context),%rdi # &context.Xmm6 1464 mov \$20,%ecx 1465 .long 0xa548f3fc # cld; rep movsq 1466 1467 jmp .Lin_prologue 1468.size avx2_handler,.-avx2_handler 1469___ 1470$code.=<<___; 1471.section .pdata 1472.align 4 1473 .rva .LSEH_begin_sha256_multi_block 1474 .rva .LSEH_end_sha256_multi_block 1475 .rva .LSEH_info_sha256_multi_block 1476 .rva .LSEH_begin_sha256_multi_block_shaext 1477 .rva .LSEH_end_sha256_multi_block_shaext 1478 .rva .LSEH_info_sha256_multi_block_shaext 1479___ 1480$code.=<<___ if ($avx); 1481 .rva .LSEH_begin_sha256_multi_block_avx 1482 .rva .LSEH_end_sha256_multi_block_avx 1483 .rva .LSEH_info_sha256_multi_block_avx 1484___ 1485$code.=<<___ if ($avx>1); 1486 .rva .LSEH_begin_sha256_multi_block_avx2 1487 .rva .LSEH_end_sha256_multi_block_avx2 1488 .rva .LSEH_info_sha256_multi_block_avx2 1489___ 1490$code.=<<___; 1491.section .xdata 1492.align 8 1493.LSEH_info_sha256_multi_block: 1494 .byte 9,0,0,0 1495 .rva se_handler 1496 .rva .Lbody,.Lepilogue # HandlerData[] 1497.LSEH_info_sha256_multi_block_shaext: 1498 .byte 9,0,0,0 1499 .rva se_handler 1500 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] 1501___ 1502$code.=<<___ if ($avx); 1503.LSEH_info_sha256_multi_block_avx: 1504 .byte 9,0,0,0 1505 .rva se_handler 1506 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] 1507___ 1508$code.=<<___ if ($avx>1); 1509.LSEH_info_sha256_multi_block_avx2: 1510 .byte 9,0,0,0 1511 .rva avx2_handler 1512 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] 1513___ 1514} 1515#################################################################### 1516 1517sub rex { 1518 local *opcode=shift; 1519 my ($dst,$src)=@_; 1520 my $rex=0; 1521 1522 $rex|=0x04 if ($dst>=8); 1523 $rex|=0x01 if ($src>=8); 1524 unshift @opcode,$rex|0x40 if ($rex); 1525} 1526 1527sub sha256op38 { 1528 my $instr = shift; 1529 my %opcodelet = ( 1530 "sha256rnds2" => 0xcb, 1531 "sha256msg1" => 0xcc, 1532 "sha256msg2" => 0xcd ); 1533 1534 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1535 my @opcode=(0x0f,0x38); 1536 rex(\@opcode,$2,$1); 1537 push @opcode,$opcodelet{$instr}; 1538 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1539 return ".byte\t".join(',',@opcode); 1540 } else { 1541 return $instr."\t".@_[0]; 1542 } 1543} 1544 1545foreach (split("\n",$code)) { 1546 s/\`([^\`]*)\`/eval($1)/ge; 1547 1548 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or 1549 1550 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1551 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1552 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or 1553 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1554 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or 1555 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1556 1557 print $_,"\n"; 1558} 1559 1560close STDOUT; 1561