1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# Multi-buffer SHA256 procedure processes n buffers in parallel by 18# placing buffer data to designated lane of SIMD register. n is 19# naturally limited to 4 on pre-AVX2 processors and to 8 on 20# AVX2-capable processors such as Haswell. 21# 22# this +aesni(i) sha256 aesni-sha256 gain(iv) 23# ------------------------------------------------------------------- 24# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% 25# Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95% 26# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% 27# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% 28# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% 29# Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% 30# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% 31# 32# (i) multi-block CBC encrypt with 128-bit key; 33# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, 34# because of lower AES-NI instruction throughput, nor is there 35# AES-NI-SHA256 stitch for these processors; 36# (iii) "this" is for n=8, when we gather twice as much data, result 37# for n=4 is 20.3+4.44=24.7; 38# (iv) presented improvement coefficients are asymptotic limits and 39# in real-life application are somewhat lower, e.g. for 2KB 40# fragments they range from 75% to 130% (on Haswell); 41 42$flavour = shift; 43$output = shift; 44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 45 46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 47 48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 51die "can't locate x86_64-xlate.pl"; 52 53$avx=0; 54 55if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 57 $avx = ($1>=2.19) + ($1>=2.22); 58} 59 60if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 62 $avx = ($1>=2.09) + ($1>=2.10); 63} 64 65if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 66 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 67 $avx = ($1>=10) + ($1>=11); 68} 69 70if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 71 $avx = ($2>=3.0) + ($2>3.0); 72} 73 74open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 75*STDOUT=*OUT; 76 77# void sha256_multi_block ( 78# struct { unsigned int A[8]; 79# unsigned int B[8]; 80# unsigned int C[8]; 81# unsigned int D[8]; 82# unsigned int E[8]; 83# unsigned int F[8]; 84# unsigned int G[8]; 85# unsigned int H[8]; } *ctx, 86# struct { void *ptr; int blocks; } inp[8], 87# int num); /* 1 or 2 */ 88# 89$ctx="%rdi"; # 1st arg 90$inp="%rsi"; # 2nd arg 91$num="%edx"; # 3rd arg 92@ptr=map("%r$_",(8..11)); 93$Tbl="%rbp"; 94 95@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); 96($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); 97 98$REG_SZ=16; 99 100sub Xi_off { 101my $off = shift; 102 103 $off %= 16; $off *= $REG_SZ; 104 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; 105} 106 107sub ROUND_00_15 { 108my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 109 110$code.=<<___ if ($i<15); 111 movd `4*$i`(@ptr[0]),$Xi 112 movd `4*$i`(@ptr[1]),$t1 113 movd `4*$i`(@ptr[2]),$t2 114 movd `4*$i`(@ptr[3]),$t3 115 punpckldq $t2,$Xi 116 punpckldq $t3,$t1 117 punpckldq $t1,$Xi 118___ 119$code.=<<___ if ($i==15); 120 movd `4*$i`(@ptr[0]),$Xi 121 lea `16*4`(@ptr[0]),@ptr[0] 122 movd `4*$i`(@ptr[1]),$t1 123 lea `16*4`(@ptr[1]),@ptr[1] 124 movd `4*$i`(@ptr[2]),$t2 125 lea `16*4`(@ptr[2]),@ptr[2] 126 movd `4*$i`(@ptr[3]),$t3 127 lea `16*4`(@ptr[3]),@ptr[3] 128 punpckldq $t2,$Xi 129 punpckldq $t3,$t1 130 punpckldq $t1,$Xi 131___ 132$code.=<<___; 133 movdqa $e,$sigma 134 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)` 135 movdqa $e,$t3 136 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)` 137 psrld \$6,$sigma 138 movdqa $e,$t2 139 pslld \$7,$t3 140 movdqa $Xi,`&Xi_off($i)` 141 paddd $h,$Xi # Xi+=h 142 143 psrld \$11,$t2 144 pxor $t3,$sigma 145 pslld \$21-7,$t3 146 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] 147 pxor $t2,$sigma 148 149 psrld \$25-11,$t2 150 movdqa $e,$t1 151 `"prefetcht0 63(@ptr[0])" if ($i==15)` 152 pxor $t3,$sigma 153 movdqa $e,$axb # borrow $axb 154 pslld \$26-21,$t3 155 pandn $g,$t1 156 pand $f,$axb 157 pxor $t2,$sigma 158 159 `"prefetcht0 63(@ptr[1])" if ($i==15)` 160 movdqa $a,$t2 161 pxor $t3,$sigma # Sigma1(e) 162 movdqa $a,$t3 163 psrld \$2,$t2 164 paddd $sigma,$Xi # Xi+=Sigma1(e) 165 pxor $axb,$t1 # Ch(e,f,g) 166 movdqa $b,$axb 167 movdqa $a,$sigma 168 pslld \$10,$t3 169 pxor $a,$axb # a^b, b^c in next round 170 171 `"prefetcht0 63(@ptr[2])" if ($i==15)` 172 psrld \$13,$sigma 173 pxor $t3,$t2 174 paddd $t1,$Xi # Xi+=Ch(e,f,g) 175 pslld \$19-10,$t3 176 pand $axb,$bxc 177 pxor $sigma,$t2 178 179 `"prefetcht0 63(@ptr[3])" if ($i==15)` 180 psrld \$22-13,$sigma 181 pxor $t3,$t2 182 movdqa $b,$h 183 pslld \$30-19,$t3 184 pxor $t2,$sigma 185 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 186 paddd $Xi,$d # d+=Xi 187 pxor $t3,$sigma # Sigma0(a) 188 189 paddd $Xi,$h # h+=Xi 190 paddd $sigma,$h # h+=Sigma0(a) 191___ 192$code.=<<___ if (($i%8)==7); 193 lea `32*8`($Tbl),$Tbl 194___ 195 ($axb,$bxc)=($bxc,$axb); 196} 197 198sub ROUND_16_XX { 199my $i=shift; 200 201$code.=<<___; 202 movdqa `&Xi_off($i+1)`,$Xn 203 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] 204 205 movdqa $Xn,$sigma 206 movdqa $Xn,$t2 207 psrld \$3,$sigma 208 movdqa $Xn,$t3 209 210 psrld \$7,$t2 211 movdqa `&Xi_off($i+14)`,$t1 212 pslld \$14,$t3 213 pxor $t2,$sigma 214 psrld \$18-7,$t2 215 movdqa $t1,$axb # borrow $axb 216 pxor $t3,$sigma 217 pslld \$25-14,$t3 218 pxor $t2,$sigma 219 psrld \$10,$t1 220 movdqa $axb,$t2 221 222 psrld \$17,$axb 223 pxor $t3,$sigma # sigma0(X[i+1]) 224 pslld \$13,$t2 225 paddd $sigma,$Xi # Xi+=sigma0(e) 226 pxor $axb,$t1 227 psrld \$19-17,$axb 228 pxor $t2,$t1 229 pslld \$15-13,$t2 230 pxor $axb,$t1 231 pxor $t2,$t1 # sigma0(X[i+14]) 232 paddd $t1,$Xi # Xi+=sigma1(X[i+14]) 233___ 234 &ROUND_00_15($i,@_); 235 ($Xi,$Xn)=($Xn,$Xi); 236} 237 238$code.=<<___; 239.text 240 241.extern OPENSSL_ia32cap_P 242 243.globl sha256_multi_block 244.type sha256_multi_block,\@function,3 245.align 32 246sha256_multi_block: 247.cfi_startproc 248 mov OPENSSL_ia32cap_P+4(%rip),%rcx 249 bt \$61,%rcx # check SHA bit 250 jc _shaext_shortcut 251___ 252$code.=<<___ if ($avx); 253 test \$`1<<28`,%ecx 254 jnz _avx_shortcut 255___ 256$code.=<<___; 257 mov %rsp,%rax 258.cfi_def_cfa_register %rax 259 push %rbx 260.cfi_push %rbx 261 push %rbp 262.cfi_push %rbp 263___ 264$code.=<<___ if ($win64); 265 lea -0xa8(%rsp),%rsp 266 movaps %xmm6,(%rsp) 267 movaps %xmm7,0x10(%rsp) 268 movaps %xmm8,0x20(%rsp) 269 movaps %xmm9,0x30(%rsp) 270 movaps %xmm10,-0x78(%rax) 271 movaps %xmm11,-0x68(%rax) 272 movaps %xmm12,-0x58(%rax) 273 movaps %xmm13,-0x48(%rax) 274 movaps %xmm14,-0x38(%rax) 275 movaps %xmm15,-0x28(%rax) 276___ 277$code.=<<___; 278 sub \$`$REG_SZ*18`, %rsp 279 and \$-256,%rsp 280 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 281.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 282.Lbody: 283 lea K256+128(%rip),$Tbl 284 lea `$REG_SZ*16`(%rsp),%rbx 285 lea 0x80($ctx),$ctx # size optimization 286 287.Loop_grande: 288 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 289 xor $num,$num 290___ 291for($i=0;$i<4;$i++) { 292 $code.=<<___; 293 mov `16*$i+0`($inp),@ptr[$i] # input pointer 294 mov `16*$i+8`($inp),%ecx # number of blocks 295 cmp $num,%ecx 296 cmovg %ecx,$num # find maximum 297 test %ecx,%ecx 298 mov %ecx,`4*$i`(%rbx) # initialize counters 299 cmovle $Tbl,@ptr[$i] # cancel input 300___ 301} 302$code.=<<___; 303 test $num,$num 304 jz .Ldone 305 306 movdqu 0x00-0x80($ctx),$A # load context 307 lea 128(%rsp),%rax 308 movdqu 0x20-0x80($ctx),$B 309 movdqu 0x40-0x80($ctx),$C 310 movdqu 0x60-0x80($ctx),$D 311 movdqu 0x80-0x80($ctx),$E 312 movdqu 0xa0-0x80($ctx),$F 313 movdqu 0xc0-0x80($ctx),$G 314 movdqu 0xe0-0x80($ctx),$H 315 movdqu .Lpbswap(%rip),$Xn 316 jmp .Loop 317 318.align 32 319.Loop: 320 movdqa $C,$bxc 321 pxor $B,$bxc # magic seed 322___ 323for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } 324$code.=<<___; 325 movdqu `&Xi_off($i)`,$Xi 326 mov \$3,%ecx 327 jmp .Loop_16_xx 328.align 32 329.Loop_16_xx: 330___ 331for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } 332$code.=<<___; 333 dec %ecx 334 jnz .Loop_16_xx 335 336 mov \$1,%ecx 337 lea K256+128(%rip),$Tbl 338 339 movdqa (%rbx),$sigma # pull counters 340 cmp 4*0(%rbx),%ecx # examine counters 341 pxor $t1,$t1 342 cmovge $Tbl,@ptr[0] # cancel input 343 cmp 4*1(%rbx),%ecx 344 movdqa $sigma,$Xn 345 cmovge $Tbl,@ptr[1] 346 cmp 4*2(%rbx),%ecx 347 pcmpgtd $t1,$Xn # mask value 348 cmovge $Tbl,@ptr[2] 349 cmp 4*3(%rbx),%ecx 350 paddd $Xn,$sigma # counters-- 351 cmovge $Tbl,@ptr[3] 352 353 movdqu 0x00-0x80($ctx),$t1 354 pand $Xn,$A 355 movdqu 0x20-0x80($ctx),$t2 356 pand $Xn,$B 357 movdqu 0x40-0x80($ctx),$t3 358 pand $Xn,$C 359 movdqu 0x60-0x80($ctx),$Xi 360 pand $Xn,$D 361 paddd $t1,$A 362 movdqu 0x80-0x80($ctx),$t1 363 pand $Xn,$E 364 paddd $t2,$B 365 movdqu 0xa0-0x80($ctx),$t2 366 pand $Xn,$F 367 paddd $t3,$C 368 movdqu 0xc0-0x80($ctx),$t3 369 pand $Xn,$G 370 paddd $Xi,$D 371 movdqu 0xe0-0x80($ctx),$Xi 372 pand $Xn,$H 373 paddd $t1,$E 374 paddd $t2,$F 375 movdqu $A,0x00-0x80($ctx) 376 paddd $t3,$G 377 movdqu $B,0x20-0x80($ctx) 378 paddd $Xi,$H 379 movdqu $C,0x40-0x80($ctx) 380 movdqu $D,0x60-0x80($ctx) 381 movdqu $E,0x80-0x80($ctx) 382 movdqu $F,0xa0-0x80($ctx) 383 movdqu $G,0xc0-0x80($ctx) 384 movdqu $H,0xe0-0x80($ctx) 385 386 movdqa $sigma,(%rbx) # save counters 387 movdqa .Lpbswap(%rip),$Xn 388 dec $num 389 jnz .Loop 390 391 mov `$REG_SZ*17+8`(%rsp),$num 392 lea $REG_SZ($ctx),$ctx 393 lea `16*$REG_SZ/4`($inp),$inp 394 dec $num 395 jnz .Loop_grande 396 397.Ldone: 398 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 399.cfi_def_cfa %rax,8 400___ 401$code.=<<___ if ($win64); 402 movaps -0xb8(%rax),%xmm6 403 movaps -0xa8(%rax),%xmm7 404 movaps -0x98(%rax),%xmm8 405 movaps -0x88(%rax),%xmm9 406 movaps -0x78(%rax),%xmm10 407 movaps -0x68(%rax),%xmm11 408 movaps -0x58(%rax),%xmm12 409 movaps -0x48(%rax),%xmm13 410 movaps -0x38(%rax),%xmm14 411 movaps -0x28(%rax),%xmm15 412___ 413$code.=<<___; 414 mov -16(%rax),%rbp 415.cfi_restore %rbp 416 mov -8(%rax),%rbx 417.cfi_restore %rbx 418 lea (%rax),%rsp 419.cfi_def_cfa_register %rsp 420.Lepilogue: 421 ret 422.cfi_endproc 423.size sha256_multi_block,.-sha256_multi_block 424___ 425 {{{ 426my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15)); 427my @MSG0=map("%xmm$_",(4..7)); 428my @MSG1=map("%xmm$_",(8..11)); 429 430$code.=<<___; 431.type sha256_multi_block_shaext,\@function,3 432.align 32 433sha256_multi_block_shaext: 434.cfi_startproc 435_shaext_shortcut: 436 mov %rsp,%rax 437.cfi_def_cfa_register %rax 438 push %rbx 439.cfi_push %rbx 440 push %rbp 441.cfi_push %rbp 442___ 443$code.=<<___ if ($win64); 444 lea -0xa8(%rsp),%rsp 445 movaps %xmm6,(%rsp) 446 movaps %xmm7,0x10(%rsp) 447 movaps %xmm8,0x20(%rsp) 448 movaps %xmm9,0x30(%rsp) 449 movaps %xmm10,-0x78(%rax) 450 movaps %xmm11,-0x68(%rax) 451 movaps %xmm12,-0x58(%rax) 452 movaps %xmm13,-0x48(%rax) 453 movaps %xmm14,-0x38(%rax) 454 movaps %xmm15,-0x28(%rax) 455___ 456$code.=<<___; 457 sub \$`$REG_SZ*18`,%rsp 458 shl \$1,$num # we process pair at a time 459 and \$-256,%rsp 460 lea 0x80($ctx),$ctx # size optimization 461 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 462.Lbody_shaext: 463 lea `$REG_SZ*16`(%rsp),%rbx 464 lea K256_shaext+0x80(%rip),$Tbl 465 466.Loop_grande_shaext: 467 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 468 xor $num,$num 469___ 470for($i=0;$i<2;$i++) { 471 $code.=<<___; 472 mov `16*$i+0`($inp),@ptr[$i] # input pointer 473 mov `16*$i+8`($inp),%ecx # number of blocks 474 cmp $num,%ecx 475 cmovg %ecx,$num # find maximum 476 test %ecx,%ecx 477 mov %ecx,`4*$i`(%rbx) # initialize counters 478 cmovle %rsp,@ptr[$i] # cancel input 479___ 480} 481$code.=<<___; 482 test $num,$num 483 jz .Ldone_shaext 484 485 movq 0x00-0x80($ctx),$ABEF0 # A1.A0 486 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0 487 movq 0x40-0x80($ctx),$CDGH0 # C1.C0 488 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0 489 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0 490 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0 491 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0 492 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0 493 494 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0 495 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0 496 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0 497 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0 498 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap 499 500 movdqa $ABEF0,$ABEF1 501 movdqa $CDGH0,$CDGH1 502 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0 503 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0 504 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1 505 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1 506 507 pshufd \$0b00011011,$ABEF0,$ABEF0 508 pshufd \$0b00011011,$CDGH0,$CDGH0 509 pshufd \$0b00011011,$ABEF1,$ABEF1 510 pshufd \$0b00011011,$CDGH1,$CDGH1 511 jmp .Loop_shaext 512 513.align 32 514.Loop_shaext: 515 movdqu 0x00(@ptr[0]),@MSG0[0] 516 movdqu 0x00(@ptr[1]),@MSG1[0] 517 movdqu 0x10(@ptr[0]),@MSG0[1] 518 movdqu 0x10(@ptr[1]),@MSG1[1] 519 movdqu 0x20(@ptr[0]),@MSG0[2] 520 pshufb $TMPx,@MSG0[0] 521 movdqu 0x20(@ptr[1]),@MSG1[2] 522 pshufb $TMPx,@MSG1[0] 523 movdqu 0x30(@ptr[0]),@MSG0[3] 524 lea 0x40(@ptr[0]),@ptr[0] 525 movdqu 0x30(@ptr[1]),@MSG1[3] 526 lea 0x40(@ptr[1]),@ptr[1] 527 528 movdqa 0*16-0x80($Tbl),$Wi 529 pshufb $TMPx,@MSG0[1] 530 paddd @MSG0[0],$Wi 531 pxor $ABEF0,@MSG0[0] # black magic 532 movdqa $Wi,$TMP0 533 movdqa 0*16-0x80($Tbl),$TMP1 534 pshufb $TMPx,@MSG1[1] 535 paddd @MSG1[0],$TMP1 536 movdqa $CDGH0,0x50(%rsp) # offload 537 sha256rnds2 $ABEF0,$CDGH0 # 0-3 538 pxor $ABEF1,@MSG1[0] # black magic 539 movdqa $TMP1,$Wi 540 movdqa $CDGH1,0x70(%rsp) 541 sha256rnds2 $ABEF1,$CDGH1 # 0-3 542 pshufd \$0x0e,$TMP0,$Wi 543 pxor $ABEF0,@MSG0[0] # black magic 544 movdqa $ABEF0,0x40(%rsp) # offload 545 sha256rnds2 $CDGH0,$ABEF0 546 pshufd \$0x0e,$TMP1,$Wi 547 pxor $ABEF1,@MSG1[0] # black magic 548 movdqa $ABEF1,0x60(%rsp) 549 movdqa 1*16-0x80($Tbl),$TMP0 550 paddd @MSG0[1],$TMP0 551 pshufb $TMPx,@MSG0[2] 552 sha256rnds2 $CDGH1,$ABEF1 553 554 movdqa $TMP0,$Wi 555 movdqa 1*16-0x80($Tbl),$TMP1 556 paddd @MSG1[1],$TMP1 557 sha256rnds2 $ABEF0,$CDGH0 # 4-7 558 movdqa $TMP1,$Wi 559 prefetcht0 127(@ptr[0]) 560 pshufb $TMPx,@MSG0[3] 561 pshufb $TMPx,@MSG1[2] 562 prefetcht0 127(@ptr[1]) 563 sha256rnds2 $ABEF1,$CDGH1 # 4-7 564 pshufd \$0x0e,$TMP0,$Wi 565 pshufb $TMPx,@MSG1[3] 566 sha256msg1 @MSG0[1],@MSG0[0] 567 sha256rnds2 $CDGH0,$ABEF0 568 pshufd \$0x0e,$TMP1,$Wi 569 movdqa 2*16-0x80($Tbl),$TMP0 570 paddd @MSG0[2],$TMP0 571 sha256rnds2 $CDGH1,$ABEF1 572 573 movdqa $TMP0,$Wi 574 movdqa 2*16-0x80($Tbl),$TMP1 575 paddd @MSG1[2],$TMP1 576 sha256rnds2 $ABEF0,$CDGH0 # 8-11 577 sha256msg1 @MSG1[1],@MSG1[0] 578 movdqa $TMP1,$Wi 579 movdqa @MSG0[3],$TMPx 580 sha256rnds2 $ABEF1,$CDGH1 # 8-11 581 pshufd \$0x0e,$TMP0,$Wi 582 palignr \$4,@MSG0[2],$TMPx 583 paddd $TMPx,@MSG0[0] 584 movdqa @MSG1[3],$TMPx 585 palignr \$4,@MSG1[2],$TMPx 586 sha256msg1 @MSG0[2],@MSG0[1] 587 sha256rnds2 $CDGH0,$ABEF0 588 pshufd \$0x0e,$TMP1,$Wi 589 movdqa 3*16-0x80($Tbl),$TMP0 590 paddd @MSG0[3],$TMP0 591 sha256rnds2 $CDGH1,$ABEF1 592 sha256msg1 @MSG1[2],@MSG1[1] 593 594 movdqa $TMP0,$Wi 595 movdqa 3*16-0x80($Tbl),$TMP1 596 paddd $TMPx,@MSG1[0] 597 paddd @MSG1[3],$TMP1 598 sha256msg2 @MSG0[3],@MSG0[0] 599 sha256rnds2 $ABEF0,$CDGH0 # 12-15 600 movdqa $TMP1,$Wi 601 movdqa @MSG0[0],$TMPx 602 palignr \$4,@MSG0[3],$TMPx 603 sha256rnds2 $ABEF1,$CDGH1 # 12-15 604 sha256msg2 @MSG1[3],@MSG1[0] 605 pshufd \$0x0e,$TMP0,$Wi 606 paddd $TMPx,@MSG0[1] 607 movdqa @MSG1[0],$TMPx 608 palignr \$4,@MSG1[3],$TMPx 609 sha256msg1 @MSG0[3],@MSG0[2] 610 sha256rnds2 $CDGH0,$ABEF0 611 pshufd \$0x0e,$TMP1,$Wi 612 movdqa 4*16-0x80($Tbl),$TMP0 613 paddd @MSG0[0],$TMP0 614 sha256rnds2 $CDGH1,$ABEF1 615 sha256msg1 @MSG1[3],@MSG1[2] 616___ 617for($i=4;$i<16-3;$i++) { 618$code.=<<___; 619 movdqa $TMP0,$Wi 620 movdqa $i*16-0x80($Tbl),$TMP1 621 paddd $TMPx,@MSG1[1] 622 paddd @MSG1[0],$TMP1 623 sha256msg2 @MSG0[0],@MSG0[1] 624 sha256rnds2 $ABEF0,$CDGH0 # 16-19... 625 movdqa $TMP1,$Wi 626 movdqa @MSG0[1],$TMPx 627 palignr \$4,@MSG0[0],$TMPx 628 sha256rnds2 $ABEF1,$CDGH1 # 16-19... 629 sha256msg2 @MSG1[0],@MSG1[1] 630 pshufd \$0x0e,$TMP0,$Wi 631 paddd $TMPx,@MSG0[2] 632 movdqa @MSG1[1],$TMPx 633 palignr \$4,@MSG1[0],$TMPx 634 sha256msg1 @MSG0[0],@MSG0[3] 635 sha256rnds2 $CDGH0,$ABEF0 636 pshufd \$0x0e,$TMP1,$Wi 637 movdqa `($i+1)*16`-0x80($Tbl),$TMP0 638 paddd @MSG0[1],$TMP0 639 sha256rnds2 $CDGH1,$ABEF1 640 sha256msg1 @MSG1[0],@MSG1[3] 641___ 642 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 643} 644$code.=<<___; 645 movdqa $TMP0,$Wi 646 movdqa 13*16-0x80($Tbl),$TMP1 647 paddd $TMPx,@MSG1[1] 648 paddd @MSG1[0],$TMP1 649 sha256msg2 @MSG0[0],@MSG0[1] 650 sha256rnds2 $ABEF0,$CDGH0 # 52-55 651 movdqa $TMP1,$Wi 652 movdqa @MSG0[1],$TMPx 653 palignr \$4,@MSG0[0],$TMPx 654 sha256rnds2 $ABEF1,$CDGH1 # 52-55 655 sha256msg2 @MSG1[0],@MSG1[1] 656 pshufd \$0x0e,$TMP0,$Wi 657 paddd $TMPx,@MSG0[2] 658 movdqa @MSG1[1],$TMPx 659 palignr \$4,@MSG1[0],$TMPx 660 nop 661 sha256rnds2 $CDGH0,$ABEF0 662 pshufd \$0x0e,$TMP1,$Wi 663 movdqa 14*16-0x80($Tbl),$TMP0 664 paddd @MSG0[1],$TMP0 665 sha256rnds2 $CDGH1,$ABEF1 666 667 movdqa $TMP0,$Wi 668 movdqa 14*16-0x80($Tbl),$TMP1 669 paddd $TMPx,@MSG1[2] 670 paddd @MSG1[1],$TMP1 671 sha256msg2 @MSG0[1],@MSG0[2] 672 nop 673 sha256rnds2 $ABEF0,$CDGH0 # 56-59 674 movdqa $TMP1,$Wi 675 mov \$1,%ecx 676 pxor @MSG0[1],@MSG0[1] # zero 677 sha256rnds2 $ABEF1,$CDGH1 # 56-59 678 sha256msg2 @MSG1[1],@MSG1[2] 679 pshufd \$0x0e,$TMP0,$Wi 680 movdqa 15*16-0x80($Tbl),$TMP0 681 paddd @MSG0[2],$TMP0 682 movq (%rbx),@MSG0[2] # pull counters 683 nop 684 sha256rnds2 $CDGH0,$ABEF0 685 pshufd \$0x0e,$TMP1,$Wi 686 movdqa 15*16-0x80($Tbl),$TMP1 687 paddd @MSG1[2],$TMP1 688 sha256rnds2 $CDGH1,$ABEF1 689 690 movdqa $TMP0,$Wi 691 cmp 4*0(%rbx),%ecx # examine counters 692 cmovge %rsp,@ptr[0] # cancel input 693 cmp 4*1(%rbx),%ecx 694 cmovge %rsp,@ptr[1] 695 pshufd \$0x00,@MSG0[2],@MSG1[0] 696 sha256rnds2 $ABEF0,$CDGH0 # 60-63 697 movdqa $TMP1,$Wi 698 pshufd \$0x55,@MSG0[2],@MSG1[1] 699 movdqa @MSG0[2],@MSG1[2] 700 sha256rnds2 $ABEF1,$CDGH1 # 60-63 701 pshufd \$0x0e,$TMP0,$Wi 702 pcmpgtd @MSG0[1],@MSG1[0] 703 pcmpgtd @MSG0[1],@MSG1[1] 704 sha256rnds2 $CDGH0,$ABEF0 705 pshufd \$0x0e,$TMP1,$Wi 706 pcmpgtd @MSG0[1],@MSG1[2] # counter mask 707 movdqa K256_shaext-0x10(%rip),$TMPx 708 sha256rnds2 $CDGH1,$ABEF1 709 710 pand @MSG1[0],$CDGH0 711 pand @MSG1[1],$CDGH1 712 pand @MSG1[0],$ABEF0 713 pand @MSG1[1],$ABEF1 714 paddd @MSG0[2],@MSG1[2] # counters-- 715 716 paddd 0x50(%rsp),$CDGH0 717 paddd 0x70(%rsp),$CDGH1 718 paddd 0x40(%rsp),$ABEF0 719 paddd 0x60(%rsp),$ABEF1 720 721 movq @MSG1[2],(%rbx) # save counters 722 dec $num 723 jnz .Loop_shaext 724 725 mov `$REG_SZ*17+8`(%rsp),$num 726 727 pshufd \$0b00011011,$ABEF0,$ABEF0 728 pshufd \$0b00011011,$CDGH0,$CDGH0 729 pshufd \$0b00011011,$ABEF1,$ABEF1 730 pshufd \$0b00011011,$CDGH1,$CDGH1 731 732 movdqa $ABEF0,@MSG0[0] 733 movdqa $CDGH0,@MSG0[1] 734 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0 735 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0 736 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0 737 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0 738 739 movq $ABEF0,0x00-0x80($ctx) # A1.A0 740 psrldq \$8,$ABEF0 741 movq @MSG0[0],0x80-0x80($ctx) # E1.E0 742 psrldq \$8,@MSG0[0] 743 movq $ABEF0,0x20-0x80($ctx) # B1.B0 744 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0 745 746 movq $CDGH0,0x40-0x80($ctx) # C1.C0 747 psrldq \$8,$CDGH0 748 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0 749 psrldq \$8,@MSG0[1] 750 movq $CDGH0,0x60-0x80($ctx) # D1.D0 751 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 752 753 lea `$REG_SZ/2`($ctx),$ctx 754 lea `16*2`($inp),$inp 755 dec $num 756 jnz .Loop_grande_shaext 757 758.Ldone_shaext: 759 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp 760___ 761$code.=<<___ if ($win64); 762 movaps -0xb8(%rax),%xmm6 763 movaps -0xa8(%rax),%xmm7 764 movaps -0x98(%rax),%xmm8 765 movaps -0x88(%rax),%xmm9 766 movaps -0x78(%rax),%xmm10 767 movaps -0x68(%rax),%xmm11 768 movaps -0x58(%rax),%xmm12 769 movaps -0x48(%rax),%xmm13 770 movaps -0x38(%rax),%xmm14 771 movaps -0x28(%rax),%xmm15 772___ 773$code.=<<___; 774 mov -16(%rax),%rbp 775.cfi_restore %rbp 776 mov -8(%rax),%rbx 777.cfi_restore %rbx 778 lea (%rax),%rsp 779.cfi_def_cfa_register %rsp 780.Lepilogue_shaext: 781 ret 782.cfi_endproc 783.size sha256_multi_block_shaext,.-sha256_multi_block_shaext 784___ 785 }}} 786 if ($avx) {{{ 787sub ROUND_00_15_avx { 788my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; 789 790$code.=<<___ if ($i<15 && $REG_SZ==16); 791 vmovd `4*$i`(@ptr[0]),$Xi 792 vmovd `4*$i`(@ptr[1]),$t1 793 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 794 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 795 vpunpckldq $t1,$Xi,$Xi 796 vpshufb $Xn,$Xi,$Xi 797___ 798$code.=<<___ if ($i==15 && $REG_SZ==16); 799 vmovd `4*$i`(@ptr[0]),$Xi 800 lea `16*4`(@ptr[0]),@ptr[0] 801 vmovd `4*$i`(@ptr[1]),$t1 802 lea `16*4`(@ptr[1]),@ptr[1] 803 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 804 lea `16*4`(@ptr[2]),@ptr[2] 805 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 806 lea `16*4`(@ptr[3]),@ptr[3] 807 vpunpckldq $t1,$Xi,$Xi 808 vpshufb $Xn,$Xi,$Xi 809___ 810$code.=<<___ if ($i<15 && $REG_SZ==32); 811 vmovd `4*$i`(@ptr[0]),$Xi 812 vmovd `4*$i`(@ptr[4]),$t1 813 vmovd `4*$i`(@ptr[1]),$t2 814 vmovd `4*$i`(@ptr[5]),$t3 815 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 816 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 817 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 818 vpunpckldq $t2,$Xi,$Xi 819 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 820 vpunpckldq $t3,$t1,$t1 821 vinserti128 $t1,$Xi,$Xi 822 vpshufb $Xn,$Xi,$Xi 823___ 824$code.=<<___ if ($i==15 && $REG_SZ==32); 825 vmovd `4*$i`(@ptr[0]),$Xi 826 lea `16*4`(@ptr[0]),@ptr[0] 827 vmovd `4*$i`(@ptr[4]),$t1 828 lea `16*4`(@ptr[4]),@ptr[4] 829 vmovd `4*$i`(@ptr[1]),$t2 830 lea `16*4`(@ptr[1]),@ptr[1] 831 vmovd `4*$i`(@ptr[5]),$t3 832 lea `16*4`(@ptr[5]),@ptr[5] 833 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi 834 lea `16*4`(@ptr[2]),@ptr[2] 835 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 836 lea `16*4`(@ptr[6]),@ptr[6] 837 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 838 lea `16*4`(@ptr[3]),@ptr[3] 839 vpunpckldq $t2,$Xi,$Xi 840 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 841 lea `16*4`(@ptr[7]),@ptr[7] 842 vpunpckldq $t3,$t1,$t1 843 vinserti128 $t1,$Xi,$Xi 844 vpshufb $Xn,$Xi,$Xi 845___ 846$code.=<<___; 847 vpsrld \$6,$e,$sigma 848 vpslld \$26,$e,$t3 849 vmovdqu $Xi,`&Xi_off($i)` 850 vpaddd $h,$Xi,$Xi # Xi+=h 851 852 vpsrld \$11,$e,$t2 853 vpxor $t3,$sigma,$sigma 854 vpslld \$21,$e,$t3 855 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] 856 vpxor $t2,$sigma,$sigma 857 858 vpsrld \$25,$e,$t2 859 vpxor $t3,$sigma,$sigma 860 `"prefetcht0 63(@ptr[0])" if ($i==15)` 861 vpslld \$7,$e,$t3 862 vpandn $g,$e,$t1 863 vpand $f,$e,$axb # borrow $axb 864 `"prefetcht0 63(@ptr[1])" if ($i==15)` 865 vpxor $t2,$sigma,$sigma 866 867 vpsrld \$2,$a,$h # borrow $h 868 vpxor $t3,$sigma,$sigma # Sigma1(e) 869 `"prefetcht0 63(@ptr[2])" if ($i==15)` 870 vpslld \$30,$a,$t2 871 vpxor $axb,$t1,$t1 # Ch(e,f,g) 872 vpxor $a,$b,$axb # a^b, b^c in next round 873 `"prefetcht0 63(@ptr[3])" if ($i==15)` 874 vpxor $t2,$h,$h 875 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) 876 877 vpsrld \$13,$a,$t2 878 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` 879 vpslld \$19,$a,$t3 880 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) 881 vpand $axb,$bxc,$bxc 882 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` 883 vpxor $t2,$h,$sigma 884 885 vpsrld \$22,$a,$t2 886 vpxor $t3,$sigma,$sigma 887 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` 888 vpslld \$10,$a,$t3 889 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 890 vpaddd $Xi,$d,$d # d+=Xi 891 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` 892 vpxor $t2,$sigma,$sigma 893 vpxor $t3,$sigma,$sigma # Sigma0(a) 894 895 vpaddd $Xi,$h,$h # h+=Xi 896 vpaddd $sigma,$h,$h # h+=Sigma0(a) 897___ 898$code.=<<___ if (($i%8)==7); 899 add \$`32*8`,$Tbl 900___ 901 ($axb,$bxc)=($bxc,$axb); 902} 903 904sub ROUND_16_XX_avx { 905my $i=shift; 906 907$code.=<<___; 908 vmovdqu `&Xi_off($i+1)`,$Xn 909 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] 910 911 vpsrld \$3,$Xn,$sigma 912 vpsrld \$7,$Xn,$t2 913 vpslld \$25,$Xn,$t3 914 vpxor $t2,$sigma,$sigma 915 vpsrld \$18,$Xn,$t2 916 vpxor $t3,$sigma,$sigma 917 vpslld \$14,$Xn,$t3 918 vmovdqu `&Xi_off($i+14)`,$t1 919 vpsrld \$10,$t1,$axb # borrow $axb 920 921 vpxor $t2,$sigma,$sigma 922 vpsrld \$17,$t1,$t2 923 vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) 924 vpslld \$15,$t1,$t3 925 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e) 926 vpxor $t2,$axb,$sigma 927 vpsrld \$19,$t1,$t2 928 vpxor $t3,$sigma,$sigma 929 vpslld \$13,$t1,$t3 930 vpxor $t2,$sigma,$sigma 931 vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) 932 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) 933___ 934 &ROUND_00_15_avx($i,@_); 935 ($Xi,$Xn)=($Xn,$Xi); 936} 937 938$code.=<<___; 939.type sha256_multi_block_avx,\@function,3 940.align 32 941sha256_multi_block_avx: 942.cfi_startproc 943_avx_shortcut: 944___ 945$code.=<<___ if ($avx>1); 946 shr \$32,%rcx 947 cmp \$2,$num 948 jb .Lavx 949 test \$`1<<5`,%ecx 950 jnz _avx2_shortcut 951 jmp .Lavx 952.align 32 953.Lavx: 954___ 955$code.=<<___; 956 mov %rsp,%rax 957.cfi_def_cfa_register %rax 958 push %rbx 959.cfi_push %rbx 960 push %rbp 961.cfi_push %rbp 962___ 963$code.=<<___ if ($win64); 964 lea -0xa8(%rsp),%rsp 965 movaps %xmm6,(%rsp) 966 movaps %xmm7,0x10(%rsp) 967 movaps %xmm8,0x20(%rsp) 968 movaps %xmm9,0x30(%rsp) 969 movaps %xmm10,-0x78(%rax) 970 movaps %xmm11,-0x68(%rax) 971 movaps %xmm12,-0x58(%rax) 972 movaps %xmm13,-0x48(%rax) 973 movaps %xmm14,-0x38(%rax) 974 movaps %xmm15,-0x28(%rax) 975___ 976$code.=<<___; 977 sub \$`$REG_SZ*18`, %rsp 978 and \$-256,%rsp 979 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 980.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 981.Lbody_avx: 982 lea K256+128(%rip),$Tbl 983 lea `$REG_SZ*16`(%rsp),%rbx 984 lea 0x80($ctx),$ctx # size optimization 985 986.Loop_grande_avx: 987 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 988 xor $num,$num 989___ 990for($i=0;$i<4;$i++) { 991 $code.=<<___; 992 mov `16*$i+0`($inp),@ptr[$i] # input pointer 993 mov `16*$i+8`($inp),%ecx # number of blocks 994 cmp $num,%ecx 995 cmovg %ecx,$num # find maximum 996 test %ecx,%ecx 997 mov %ecx,`4*$i`(%rbx) # initialize counters 998 cmovle $Tbl,@ptr[$i] # cancel input 999___ 1000} 1001$code.=<<___; 1002 test $num,$num 1003 jz .Ldone_avx 1004 1005 vmovdqu 0x00-0x80($ctx),$A # load context 1006 lea 128(%rsp),%rax 1007 vmovdqu 0x20-0x80($ctx),$B 1008 vmovdqu 0x40-0x80($ctx),$C 1009 vmovdqu 0x60-0x80($ctx),$D 1010 vmovdqu 0x80-0x80($ctx),$E 1011 vmovdqu 0xa0-0x80($ctx),$F 1012 vmovdqu 0xc0-0x80($ctx),$G 1013 vmovdqu 0xe0-0x80($ctx),$H 1014 vmovdqu .Lpbswap(%rip),$Xn 1015 jmp .Loop_avx 1016 1017.align 32 1018.Loop_avx: 1019 vpxor $B,$C,$bxc # magic seed 1020___ 1021for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } 1022$code.=<<___; 1023 vmovdqu `&Xi_off($i)`,$Xi 1024 mov \$3,%ecx 1025 jmp .Loop_16_xx_avx 1026.align 32 1027.Loop_16_xx_avx: 1028___ 1029for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } 1030$code.=<<___; 1031 dec %ecx 1032 jnz .Loop_16_xx_avx 1033 1034 mov \$1,%ecx 1035 lea K256+128(%rip),$Tbl 1036___ 1037for($i=0;$i<4;$i++) { 1038 $code.=<<___; 1039 cmp `4*$i`(%rbx),%ecx # examine counters 1040 cmovge $Tbl,@ptr[$i] # cancel input 1041___ 1042} 1043$code.=<<___; 1044 vmovdqa (%rbx),$sigma # pull counters 1045 vpxor $t1,$t1,$t1 1046 vmovdqa $sigma,$Xn 1047 vpcmpgtd $t1,$Xn,$Xn # mask value 1048 vpaddd $Xn,$sigma,$sigma # counters-- 1049 1050 vmovdqu 0x00-0x80($ctx),$t1 1051 vpand $Xn,$A,$A 1052 vmovdqu 0x20-0x80($ctx),$t2 1053 vpand $Xn,$B,$B 1054 vmovdqu 0x40-0x80($ctx),$t3 1055 vpand $Xn,$C,$C 1056 vmovdqu 0x60-0x80($ctx),$Xi 1057 vpand $Xn,$D,$D 1058 vpaddd $t1,$A,$A 1059 vmovdqu 0x80-0x80($ctx),$t1 1060 vpand $Xn,$E,$E 1061 vpaddd $t2,$B,$B 1062 vmovdqu 0xa0-0x80($ctx),$t2 1063 vpand $Xn,$F,$F 1064 vpaddd $t3,$C,$C 1065 vmovdqu 0xc0-0x80($ctx),$t3 1066 vpand $Xn,$G,$G 1067 vpaddd $Xi,$D,$D 1068 vmovdqu 0xe0-0x80($ctx),$Xi 1069 vpand $Xn,$H,$H 1070 vpaddd $t1,$E,$E 1071 vpaddd $t2,$F,$F 1072 vmovdqu $A,0x00-0x80($ctx) 1073 vpaddd $t3,$G,$G 1074 vmovdqu $B,0x20-0x80($ctx) 1075 vpaddd $Xi,$H,$H 1076 vmovdqu $C,0x40-0x80($ctx) 1077 vmovdqu $D,0x60-0x80($ctx) 1078 vmovdqu $E,0x80-0x80($ctx) 1079 vmovdqu $F,0xa0-0x80($ctx) 1080 vmovdqu $G,0xc0-0x80($ctx) 1081 vmovdqu $H,0xe0-0x80($ctx) 1082 1083 vmovdqu $sigma,(%rbx) # save counters 1084 vmovdqu .Lpbswap(%rip),$Xn 1085 dec $num 1086 jnz .Loop_avx 1087 1088 mov `$REG_SZ*17+8`(%rsp),$num 1089 lea $REG_SZ($ctx),$ctx 1090 lea `16*$REG_SZ/4`($inp),$inp 1091 dec $num 1092 jnz .Loop_grande_avx 1093 1094.Ldone_avx: 1095 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1096.cfi_def_cfa %rax,8 1097 vzeroupper 1098___ 1099$code.=<<___ if ($win64); 1100 movaps -0xb8(%rax),%xmm6 1101 movaps -0xa8(%rax),%xmm7 1102 movaps -0x98(%rax),%xmm8 1103 movaps -0x88(%rax),%xmm9 1104 movaps -0x78(%rax),%xmm10 1105 movaps -0x68(%rax),%xmm11 1106 movaps -0x58(%rax),%xmm12 1107 movaps -0x48(%rax),%xmm13 1108 movaps -0x38(%rax),%xmm14 1109 movaps -0x28(%rax),%xmm15 1110___ 1111$code.=<<___; 1112 mov -16(%rax),%rbp 1113.cfi_restore %rbp 1114 mov -8(%rax),%rbx 1115.cfi_restore %rbx 1116 lea (%rax),%rsp 1117.cfi_def_cfa_register %rsp 1118.Lepilogue_avx: 1119 ret 1120.cfi_endproc 1121.size sha256_multi_block_avx,.-sha256_multi_block_avx 1122___ 1123 if ($avx>1) { 1124$code =~ s/\`([^\`]*)\`/eval $1/gem; 1125 1126$REG_SZ=32; 1127@ptr=map("%r$_",(12..15,8..11)); 1128 1129@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); 1130($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); 1131 1132$code.=<<___; 1133.type sha256_multi_block_avx2,\@function,3 1134.align 32 1135sha256_multi_block_avx2: 1136.cfi_startproc 1137_avx2_shortcut: 1138 mov %rsp,%rax 1139.cfi_def_cfa_register %rax 1140 push %rbx 1141.cfi_push %rbx 1142 push %rbp 1143.cfi_push %rbp 1144 push %r12 1145.cfi_push %r12 1146 push %r13 1147.cfi_push %r13 1148 push %r14 1149.cfi_push %r14 1150 push %r15 1151.cfi_push %r15 1152___ 1153$code.=<<___ if ($win64); 1154 lea -0xa8(%rsp),%rsp 1155 movaps %xmm6,(%rsp) 1156 movaps %xmm7,0x10(%rsp) 1157 movaps %xmm8,0x20(%rsp) 1158 movaps %xmm9,0x30(%rsp) 1159 movaps %xmm10,0x40(%rsp) 1160 movaps %xmm11,0x50(%rsp) 1161 movaps %xmm12,-0x78(%rax) 1162 movaps %xmm13,-0x68(%rax) 1163 movaps %xmm14,-0x58(%rax) 1164 movaps %xmm15,-0x48(%rax) 1165___ 1166$code.=<<___; 1167 sub \$`$REG_SZ*18`, %rsp 1168 and \$-256,%rsp 1169 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 1170.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 1171.Lbody_avx2: 1172 lea K256+128(%rip),$Tbl 1173 lea 0x80($ctx),$ctx # size optimization 1174 1175.Loop_grande_avx2: 1176 mov $num,`$REG_SZ*17+8`(%rsp) # original $num 1177 xor $num,$num 1178 lea `$REG_SZ*16`(%rsp),%rbx 1179___ 1180for($i=0;$i<8;$i++) { 1181 $code.=<<___; 1182 mov `16*$i+0`($inp),@ptr[$i] # input pointer 1183 mov `16*$i+8`($inp),%ecx # number of blocks 1184 cmp $num,%ecx 1185 cmovg %ecx,$num # find maximum 1186 test %ecx,%ecx 1187 mov %ecx,`4*$i`(%rbx) # initialize counters 1188 cmovle $Tbl,@ptr[$i] # cancel input 1189___ 1190} 1191$code.=<<___; 1192 vmovdqu 0x00-0x80($ctx),$A # load context 1193 lea 128(%rsp),%rax 1194 vmovdqu 0x20-0x80($ctx),$B 1195 lea 256+128(%rsp),%rbx 1196 vmovdqu 0x40-0x80($ctx),$C 1197 vmovdqu 0x60-0x80($ctx),$D 1198 vmovdqu 0x80-0x80($ctx),$E 1199 vmovdqu 0xa0-0x80($ctx),$F 1200 vmovdqu 0xc0-0x80($ctx),$G 1201 vmovdqu 0xe0-0x80($ctx),$H 1202 vmovdqu .Lpbswap(%rip),$Xn 1203 jmp .Loop_avx2 1204 1205.align 32 1206.Loop_avx2: 1207 vpxor $B,$C,$bxc # magic seed 1208___ 1209for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } 1210$code.=<<___; 1211 vmovdqu `&Xi_off($i)`,$Xi 1212 mov \$3,%ecx 1213 jmp .Loop_16_xx_avx2 1214.align 32 1215.Loop_16_xx_avx2: 1216___ 1217for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } 1218$code.=<<___; 1219 dec %ecx 1220 jnz .Loop_16_xx_avx2 1221 1222 mov \$1,%ecx 1223 lea `$REG_SZ*16`(%rsp),%rbx 1224 lea K256+128(%rip),$Tbl 1225___ 1226for($i=0;$i<8;$i++) { 1227 $code.=<<___; 1228 cmp `4*$i`(%rbx),%ecx # examine counters 1229 cmovge $Tbl,@ptr[$i] # cancel input 1230___ 1231} 1232$code.=<<___; 1233 vmovdqa (%rbx),$sigma # pull counters 1234 vpxor $t1,$t1,$t1 1235 vmovdqa $sigma,$Xn 1236 vpcmpgtd $t1,$Xn,$Xn # mask value 1237 vpaddd $Xn,$sigma,$sigma # counters-- 1238 1239 vmovdqu 0x00-0x80($ctx),$t1 1240 vpand $Xn,$A,$A 1241 vmovdqu 0x20-0x80($ctx),$t2 1242 vpand $Xn,$B,$B 1243 vmovdqu 0x40-0x80($ctx),$t3 1244 vpand $Xn,$C,$C 1245 vmovdqu 0x60-0x80($ctx),$Xi 1246 vpand $Xn,$D,$D 1247 vpaddd $t1,$A,$A 1248 vmovdqu 0x80-0x80($ctx),$t1 1249 vpand $Xn,$E,$E 1250 vpaddd $t2,$B,$B 1251 vmovdqu 0xa0-0x80($ctx),$t2 1252 vpand $Xn,$F,$F 1253 vpaddd $t3,$C,$C 1254 vmovdqu 0xc0-0x80($ctx),$t3 1255 vpand $Xn,$G,$G 1256 vpaddd $Xi,$D,$D 1257 vmovdqu 0xe0-0x80($ctx),$Xi 1258 vpand $Xn,$H,$H 1259 vpaddd $t1,$E,$E 1260 vpaddd $t2,$F,$F 1261 vmovdqu $A,0x00-0x80($ctx) 1262 vpaddd $t3,$G,$G 1263 vmovdqu $B,0x20-0x80($ctx) 1264 vpaddd $Xi,$H,$H 1265 vmovdqu $C,0x40-0x80($ctx) 1266 vmovdqu $D,0x60-0x80($ctx) 1267 vmovdqu $E,0x80-0x80($ctx) 1268 vmovdqu $F,0xa0-0x80($ctx) 1269 vmovdqu $G,0xc0-0x80($ctx) 1270 vmovdqu $H,0xe0-0x80($ctx) 1271 1272 vmovdqu $sigma,(%rbx) # save counters 1273 lea 256+128(%rsp),%rbx 1274 vmovdqu .Lpbswap(%rip),$Xn 1275 dec $num 1276 jnz .Loop_avx2 1277 1278 #mov `$REG_SZ*17+8`(%rsp),$num 1279 #lea $REG_SZ($ctx),$ctx 1280 #lea `16*$REG_SZ/4`($inp),$inp 1281 #dec $num 1282 #jnz .Loop_grande_avx2 1283 1284.Ldone_avx2: 1285 mov `$REG_SZ*17`(%rsp),%rax # original %rsp 1286.cfi_def_cfa %rax,8 1287 vzeroupper 1288___ 1289$code.=<<___ if ($win64); 1290 movaps -0xd8(%rax),%xmm6 1291 movaps -0xc8(%rax),%xmm7 1292 movaps -0xb8(%rax),%xmm8 1293 movaps -0xa8(%rax),%xmm9 1294 movaps -0x98(%rax),%xmm10 1295 movaps -0x88(%rax),%xmm11 1296 movaps -0x78(%rax),%xmm12 1297 movaps -0x68(%rax),%xmm13 1298 movaps -0x58(%rax),%xmm14 1299 movaps -0x48(%rax),%xmm15 1300___ 1301$code.=<<___; 1302 mov -48(%rax),%r15 1303.cfi_restore %r15 1304 mov -40(%rax),%r14 1305.cfi_restore %r14 1306 mov -32(%rax),%r13 1307.cfi_restore %r13 1308 mov -24(%rax),%r12 1309.cfi_restore %r12 1310 mov -16(%rax),%rbp 1311.cfi_restore %rbp 1312 mov -8(%rax),%rbx 1313.cfi_restore %rbx 1314 lea (%rax),%rsp 1315.cfi_def_cfa_register %rsp 1316.Lepilogue_avx2: 1317 ret 1318.cfi_endproc 1319.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 1320___ 1321 } }}} 1322$code.=<<___; 1323.align 256 1324K256: 1325___ 1326sub TABLE { 1327 foreach (@_) { 1328 $code.=<<___; 1329 .long $_,$_,$_,$_ 1330 .long $_,$_,$_,$_ 1331___ 1332 } 1333} 1334&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 1335 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 1336 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 1337 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 1338 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 1339 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 1340 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 1341 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 1342 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 1343 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 1344 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 1345 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 1346 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 1347 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 1348 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 1349 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 1350$code.=<<___; 1351.Lpbswap: 1352 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1353 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1354K256_shaext: 1355 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 1356 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 1357 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 1358 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 1359 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 1360 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 1361 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 1362 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 1363 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 1364 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 1365 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 1366 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 1367 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 1368 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 1369 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 1370 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 1371 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1372___ 1373 1374if ($win64) { 1375# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1376# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1377$rec="%rcx"; 1378$frame="%rdx"; 1379$context="%r8"; 1380$disp="%r9"; 1381 1382$code.=<<___; 1383.extern __imp_RtlVirtualUnwind 1384.type se_handler,\@abi-omnipotent 1385.align 16 1386se_handler: 1387 push %rsi 1388 push %rdi 1389 push %rbx 1390 push %rbp 1391 push %r12 1392 push %r13 1393 push %r14 1394 push %r15 1395 pushfq 1396 sub \$64,%rsp 1397 1398 mov 120($context),%rax # pull context->Rax 1399 mov 248($context),%rbx # pull context->Rip 1400 1401 mov 8($disp),%rsi # disp->ImageBase 1402 mov 56($disp),%r11 # disp->HandlerData 1403 1404 mov 0(%r11),%r10d # HandlerData[0] 1405 lea (%rsi,%r10),%r10 # end of prologue label 1406 cmp %r10,%rbx # context->Rip<.Lbody 1407 jb .Lin_prologue 1408 1409 mov 152($context),%rax # pull context->Rsp 1410 1411 mov 4(%r11),%r10d # HandlerData[1] 1412 lea (%rsi,%r10),%r10 # epilogue label 1413 cmp %r10,%rbx # context->Rip>=.Lepilogue 1414 jae .Lin_prologue 1415 1416 mov `16*17`(%rax),%rax # pull saved stack pointer 1417 1418 mov -8(%rax),%rbx 1419 mov -16(%rax),%rbp 1420 mov %rbx,144($context) # restore context->Rbx 1421 mov %rbp,160($context) # restore context->Rbp 1422 1423 lea -24-10*16(%rax),%rsi 1424 lea 512($context),%rdi # &context.Xmm6 1425 mov \$20,%ecx 1426 .long 0xa548f3fc # cld; rep movsq 1427 1428.Lin_prologue: 1429 mov 8(%rax),%rdi 1430 mov 16(%rax),%rsi 1431 mov %rax,152($context) # restore context->Rsp 1432 mov %rsi,168($context) # restore context->Rsi 1433 mov %rdi,176($context) # restore context->Rdi 1434 1435 mov 40($disp),%rdi # disp->ContextRecord 1436 mov $context,%rsi # context 1437 mov \$154,%ecx # sizeof(CONTEXT) 1438 .long 0xa548f3fc # cld; rep movsq 1439 1440 mov $disp,%rsi 1441 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1442 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1443 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1444 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1445 mov 40(%rsi),%r10 # disp->ContextRecord 1446 lea 56(%rsi),%r11 # &disp->HandlerData 1447 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1448 mov %r10,32(%rsp) # arg5 1449 mov %r11,40(%rsp) # arg6 1450 mov %r12,48(%rsp) # arg7 1451 mov %rcx,56(%rsp) # arg8, (NULL) 1452 call *__imp_RtlVirtualUnwind(%rip) 1453 1454 mov \$1,%eax # ExceptionContinueSearch 1455 add \$64,%rsp 1456 popfq 1457 pop %r15 1458 pop %r14 1459 pop %r13 1460 pop %r12 1461 pop %rbp 1462 pop %rbx 1463 pop %rdi 1464 pop %rsi 1465 ret 1466.size se_handler,.-se_handler 1467___ 1468$code.=<<___ if ($avx>1); 1469.type avx2_handler,\@abi-omnipotent 1470.align 16 1471avx2_handler: 1472 push %rsi 1473 push %rdi 1474 push %rbx 1475 push %rbp 1476 push %r12 1477 push %r13 1478 push %r14 1479 push %r15 1480 pushfq 1481 sub \$64,%rsp 1482 1483 mov 120($context),%rax # pull context->Rax 1484 mov 248($context),%rbx # pull context->Rip 1485 1486 mov 8($disp),%rsi # disp->ImageBase 1487 mov 56($disp),%r11 # disp->HandlerData 1488 1489 mov 0(%r11),%r10d # HandlerData[0] 1490 lea (%rsi,%r10),%r10 # end of prologue label 1491 cmp %r10,%rbx # context->Rip<body label 1492 jb .Lin_prologue 1493 1494 mov 152($context),%rax # pull context->Rsp 1495 1496 mov 4(%r11),%r10d # HandlerData[1] 1497 lea (%rsi,%r10),%r10 # epilogue label 1498 cmp %r10,%rbx # context->Rip>=epilogue label 1499 jae .Lin_prologue 1500 1501 mov `32*17`($context),%rax # pull saved stack pointer 1502 1503 mov -8(%rax),%rbx 1504 mov -16(%rax),%rbp 1505 mov -24(%rax),%r12 1506 mov -32(%rax),%r13 1507 mov -40(%rax),%r14 1508 mov -48(%rax),%r15 1509 mov %rbx,144($context) # restore context->Rbx 1510 mov %rbp,160($context) # restore context->Rbp 1511 mov %r12,216($context) # restore context->R12 1512 mov %r13,224($context) # restore context->R13 1513 mov %r14,232($context) # restore context->R14 1514 mov %r15,240($context) # restore context->R15 1515 1516 lea -56-10*16(%rax),%rsi 1517 lea 512($context),%rdi # &context.Xmm6 1518 mov \$20,%ecx 1519 .long 0xa548f3fc # cld; rep movsq 1520 1521 jmp .Lin_prologue 1522.size avx2_handler,.-avx2_handler 1523___ 1524$code.=<<___; 1525.section .pdata 1526.align 4 1527 .rva .LSEH_begin_sha256_multi_block 1528 .rva .LSEH_end_sha256_multi_block 1529 .rva .LSEH_info_sha256_multi_block 1530 .rva .LSEH_begin_sha256_multi_block_shaext 1531 .rva .LSEH_end_sha256_multi_block_shaext 1532 .rva .LSEH_info_sha256_multi_block_shaext 1533___ 1534$code.=<<___ if ($avx); 1535 .rva .LSEH_begin_sha256_multi_block_avx 1536 .rva .LSEH_end_sha256_multi_block_avx 1537 .rva .LSEH_info_sha256_multi_block_avx 1538___ 1539$code.=<<___ if ($avx>1); 1540 .rva .LSEH_begin_sha256_multi_block_avx2 1541 .rva .LSEH_end_sha256_multi_block_avx2 1542 .rva .LSEH_info_sha256_multi_block_avx2 1543___ 1544$code.=<<___; 1545.section .xdata 1546.align 8 1547.LSEH_info_sha256_multi_block: 1548 .byte 9,0,0,0 1549 .rva se_handler 1550 .rva .Lbody,.Lepilogue # HandlerData[] 1551.LSEH_info_sha256_multi_block_shaext: 1552 .byte 9,0,0,0 1553 .rva se_handler 1554 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] 1555___ 1556$code.=<<___ if ($avx); 1557.LSEH_info_sha256_multi_block_avx: 1558 .byte 9,0,0,0 1559 .rva se_handler 1560 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] 1561___ 1562$code.=<<___ if ($avx>1); 1563.LSEH_info_sha256_multi_block_avx2: 1564 .byte 9,0,0,0 1565 .rva avx2_handler 1566 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] 1567___ 1568} 1569#################################################################### 1570 1571sub rex { 1572 local *opcode=shift; 1573 my ($dst,$src)=@_; 1574 my $rex=0; 1575 1576 $rex|=0x04 if ($dst>=8); 1577 $rex|=0x01 if ($src>=8); 1578 unshift @opcode,$rex|0x40 if ($rex); 1579} 1580 1581sub sha256op38 { 1582 my $instr = shift; 1583 my %opcodelet = ( 1584 "sha256rnds2" => 0xcb, 1585 "sha256msg1" => 0xcc, 1586 "sha256msg2" => 0xcd ); 1587 1588 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1589 my @opcode=(0x0f,0x38); 1590 rex(\@opcode,$2,$1); 1591 push @opcode,$opcodelet{$instr}; 1592 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1593 return ".byte\t".join(',',@opcode); 1594 } else { 1595 return $instr."\t".@_[0]; 1596 } 1597} 1598 1599foreach (split("\n",$code)) { 1600 s/\`([^\`]*)\`/eval($1)/ge; 1601 1602 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or 1603 1604 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1605 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1606 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or 1607 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1608 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or 1609 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1610 1611 print $_,"\n"; 1612} 1613 1614close STDOUT or die "error closing STDOUT: $!"; 1615