#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with exclusion
# for VIA Nano, but it has SHA512 instruction that is faster and
# should be used instead.] For reference, corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)	    7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)	    12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
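	# The @Sigma0/@Sigma1 and @sigma0/@sigma1 arrays (the last one just
	# below) hold the FIPS 180-4 rotate/shift amounts: big-Sigma feeds the
	# round function, small-sigma feeds the message schedule, and the third
	# small-sigma element is a plain shift rather than a rotate.  The scalar
	# macros later in this file compute each Sigma as a chain of ror
	# instructions by the *differences* of these amounts, which is why they
	# use expressions such as $Sigma1[2]-$Sigma1[1].  As a reference sketch
	# only (not used by this module; rotr32() is a hypothetical helper),
	# the SHA-256 flavour of Sigma1 in pure Perl would look like:
	#
	#   sub rotr32     { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
	#   sub Sigma1_256 { my $e=shift; rotr32($e,6)^rotr32($e,11)^rotr32($e,25) }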
@sigma1=(19,61, 6); 159 $rounds=80; 160} else { 161 $func="sha256_block_data_order"; 162 $TABLE="K256"; 163 $SZ=4; 164 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 165 "%r8d","%r9d","%r10d","%r11d"); 166 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 167 @Sigma0=( 2,13,22); 168 @Sigma1=( 6,11,25); 169 @sigma0=( 7,18, 3); 170 @sigma1=(17,19,10); 171 $rounds=64; 172} 173 174$ctx="%rdi"; # 1st arg, zapped by $a3 175$inp="%rsi"; # 2nd arg 176$Tbl="%rbp"; 177 178$_ctx="16*$SZ+0*8(%rsp)"; 179$_inp="16*$SZ+1*8(%rsp)"; 180$_end="16*$SZ+2*8(%rsp)"; 181$_rsp="`16*$SZ+3*8`(%rsp)"; 182$framesz="16*$SZ+4*8"; 183 184 185sub ROUND_00_15() 186{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 187 my $STRIDE=$SZ; 188 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 189 190$code.=<<___; 191 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 192 mov $f,$a2 193 194 xor $e,$a0 195 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 196 xor $g,$a2 # f^g 197 198 mov $T1,`$SZ*($i&0xf)`(%rsp) 199 xor $a,$a1 200 and $e,$a2 # (f^g)&e 201 202 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 203 add $h,$T1 # T1+=h 204 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 205 206 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 207 xor $e,$a0 208 add $a2,$T1 # T1+=Ch(e,f,g) 209 210 mov $a,$a2 211 add ($Tbl),$T1 # T1+=K[round] 212 xor $a,$a1 213 214 xor $b,$a2 # a^b, b^c in next round 215 ror \$$Sigma1[0],$a0 # Sigma1(e) 216 mov $b,$h 217 218 and $a2,$a3 219 ror \$$Sigma0[0],$a1 # Sigma0(a) 220 add $a0,$T1 # T1+=Sigma1(e) 221 222 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 223 add $T1,$d # d+=T1 224 add $T1,$h # h+=T1 225 226 lea $STRIDE($Tbl),$Tbl # round++ 227___ 228$code.=<<___ if ($i<15); 229 add $a1,$h # h+=Sigma0(a) 230___ 231 ($a2,$a3) = ($a3,$a2); 232} 233 234sub ROUND_16_XX() 235{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 236 237$code.=<<___; 238 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 239 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 240 241 mov $a0,$T1 242 ror \$`$sigma0[1]-$sigma0[0]`,$a0 243 add $a1,$a # modulo-scheduled h+=Sigma0(a) 244 mov $a2,$a1 245 ror \$`$sigma1[1]-$sigma1[0]`,$a2 246 247 xor $T1,$a0 248 shr \$$sigma0[2],$T1 249 ror \$$sigma0[0],$a0 250 xor $a1,$a2 251 shr \$$sigma1[2],$a1 252 253 ror \$$sigma1[0],$a2 254 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 255 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 256 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 257 258 add `$SZ*($i&0xf)`(%rsp),$T1 259 mov $e,$a0 260 add $a2,$T1 261 mov $a,$a1 262___ 263 &ROUND_00_15(@_); 264} 265 266$code=<<___; 267.text 268 269.extern OPENSSL_ia32cap_P 270.globl $func 271.type $func,\@function,3 272.align 16 273$func: 274.cfi_startproc 275___ 276$code.=<<___ if ($SZ==4 || $avx); 277 lea OPENSSL_ia32cap_P(%rip),%r11 278 mov 0(%r11),%r9d 279 mov 4(%r11),%r10d 280 mov 8(%r11),%r11d 281___ 282$code.=<<___ if ($SZ==4 && $shaext); 283 test \$`1<<29`,%r11d # check for SHA 284 jnz _shaext_shortcut 285___ 286$code.=<<___ if ($avx && $SZ==8); 287 test \$`1<<11`,%r10d # check for XOP 288 jnz .Lxop_shortcut 289___ 290$code.=<<___ if ($avx>1); 291 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 292 cmp \$`1<<8|1<<5|1<<3`,%r11d 293 je .Lavx2_shortcut 294___ 295$code.=<<___ if ($avx); 296 and \$`1<<30`,%r9d # mask "Intel CPU" bit 297 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 298 or %r9d,%r10d 299 cmp \$`1<<28|1<<9|1<<30`,%r10d 300 je .Lavx_shortcut 301___ 302$code.=<<___ if ($SZ==4); 303 test \$`1<<9`,%r10d 304 jnz .Lssse3_shortcut 305___ 306$code.=<<___; 307 mov %rsp,%rax # copy %rsp 308.cfi_def_cfa_register %rax 309 push %rbx 310.cfi_push %rbx 311 push %rbp 312.cfi_push %rbp 313 push %r12 314.cfi_push %r12 315 push %r13 316.cfi_push 
%r13 317 push %r14 318.cfi_push %r14 319 push %r15 320.cfi_push %r15 321 shl \$4,%rdx # num*16 322 sub \$$framesz,%rsp 323 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 324 and \$-64,%rsp # align stack frame 325 mov $ctx,$_ctx # save ctx, 1st arg 326 mov $inp,$_inp # save inp, 2nd arh 327 mov %rdx,$_end # save end pointer, "3rd" arg 328 mov %rax,$_rsp # save copy of %rsp 329.cfi_cfa_expression $_rsp,deref,+8 330.Lprologue: 331 332 mov $SZ*0($ctx),$A 333 mov $SZ*1($ctx),$B 334 mov $SZ*2($ctx),$C 335 mov $SZ*3($ctx),$D 336 mov $SZ*4($ctx),$E 337 mov $SZ*5($ctx),$F 338 mov $SZ*6($ctx),$G 339 mov $SZ*7($ctx),$H 340 jmp .Lloop 341 342.align 16 343.Lloop: 344 mov $B,$a3 345 lea $TABLE(%rip),$Tbl 346 xor $C,$a3 # magic 347___ 348 for($i=0;$i<16;$i++) { 349 $code.=" mov $SZ*$i($inp),$T1\n"; 350 $code.=" mov @ROT[4],$a0\n"; 351 $code.=" mov @ROT[0],$a1\n"; 352 $code.=" bswap $T1\n"; 353 &ROUND_00_15($i,@ROT); 354 unshift(@ROT,pop(@ROT)); 355 } 356$code.=<<___; 357 jmp .Lrounds_16_xx 358.align 16 359.Lrounds_16_xx: 360___ 361 for(;$i<32;$i++) { 362 &ROUND_16_XX($i,@ROT); 363 unshift(@ROT,pop(@ROT)); 364 } 365 366$code.=<<___; 367 cmpb \$0,`$SZ-1`($Tbl) 368 jnz .Lrounds_16_xx 369 370 mov $_ctx,$ctx 371 add $a1,$A # modulo-scheduled h+=Sigma0(a) 372 lea 16*$SZ($inp),$inp 373 374 add $SZ*0($ctx),$A 375 add $SZ*1($ctx),$B 376 add $SZ*2($ctx),$C 377 add $SZ*3($ctx),$D 378 add $SZ*4($ctx),$E 379 add $SZ*5($ctx),$F 380 add $SZ*6($ctx),$G 381 add $SZ*7($ctx),$H 382 383 cmp $_end,$inp 384 385 mov $A,$SZ*0($ctx) 386 mov $B,$SZ*1($ctx) 387 mov $C,$SZ*2($ctx) 388 mov $D,$SZ*3($ctx) 389 mov $E,$SZ*4($ctx) 390 mov $F,$SZ*5($ctx) 391 mov $G,$SZ*6($ctx) 392 mov $H,$SZ*7($ctx) 393 jb .Lloop 394 395 mov $_rsp,%rsi 396.cfi_def_cfa %rsi,8 397 mov -48(%rsi),%r15 398.cfi_restore %r15 399 mov -40(%rsi),%r14 400.cfi_restore %r14 401 mov -32(%rsi),%r13 402.cfi_restore %r13 403 mov -24(%rsi),%r12 404.cfi_restore %r12 405 mov -16(%rsi),%rbp 406.cfi_restore %rbp 407 mov -8(%rsi),%rbx 408.cfi_restore %rbx 409 lea (%rsi),%rsp 410.cfi_def_cfa_register %rsp 411.Lepilogue: 412 ret 413.cfi_endproc 414.size $func,.-$func 415___ 416 417if ($SZ==4) { 418$code.=<<___; 419.align 64 420.type $TABLE,\@object 421$TABLE: 422 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 423 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 424 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 425 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 427 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 428 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 429 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 430 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 431 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 432 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 433 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 434 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 435 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 436 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 437 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 438 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 439 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 440 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 441 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 442 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 443 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 444 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 445 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 446 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 
447 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 448 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 449 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 450 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 451 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 452 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 453 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 454 455 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 456 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 457 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 458 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 459 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 460 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 461 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 462___ 463} else { 464$code.=<<___; 465.align 64 466.type $TABLE,\@object 467$TABLE: 468 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 469 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 470 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 471 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 472 .quad 0x3956c25bf348b538,0x59f111f1b605d019 473 .quad 0x3956c25bf348b538,0x59f111f1b605d019 474 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 475 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 476 .quad 0xd807aa98a3030242,0x12835b0145706fbe 477 .quad 0xd807aa98a3030242,0x12835b0145706fbe 478 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 479 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 480 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 481 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 482 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 483 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 484 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 485 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 486 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 487 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 488 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 489 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 490 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 491 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 492 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 493 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 494 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 495 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 496 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 497 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 498 .quad 0x06ca6351e003826f,0x142929670a0e6e70 499 .quad 0x06ca6351e003826f,0x142929670a0e6e70 500 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 501 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 502 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 503 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 504 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 505 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 506 .quad 0x81c2c92e47edaee6,0x92722c851482353b 507 .quad 0x81c2c92e47edaee6,0x92722c851482353b 508 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 509 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 510 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 511 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 512 .quad 0xd192e819d6ef5218,0xd69906245565a910 513 .quad 0xd192e819d6ef5218,0xd69906245565a910 514 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 515 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 516 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 517 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 518 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 519 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 520 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 521 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 522 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 523 .quad 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 524 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 525 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 526 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 527 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 528 .quad 0x90befffa23631e28,0xa4506cebde82bde9 529 .quad 0x90befffa23631e28,0xa4506cebde82bde9 530 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 531 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 532 .quad 0xca273eceea26619c,0xd186b8c721c0c207 533 .quad 0xca273eceea26619c,0xd186b8c721c0c207 534 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 535 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 536 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 537 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 538 .quad 0x113f9804bef90dae,0x1b710b35131c471b 539 .quad 0x113f9804bef90dae,0x1b710b35131c471b 540 .quad 0x28db77f523047d84,0x32caab7b40c72493 541 .quad 0x28db77f523047d84,0x32caab7b40c72493 542 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 543 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 544 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 545 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 546 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 547 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 548 549 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 550 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 551 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 552___ 553} 554 555###################################################################### 556# SIMD code paths 557# 558if ($SZ==4 && $shaext) {{{ 559###################################################################### 560# Intel SHA Extensions implementation of SHA256 update function. 561# 562my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 563 564my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 565my @MSG=map("%xmm$_",(3..6)); 566 567$code.=<<___; 568.type sha256_block_data_order_shaext,\@function,3 569.align 64 570sha256_block_data_order_shaext: 571_shaext_shortcut: 572.cfi_startproc 573___ 574$code.=<<___ if ($win64); 575 lea `-8-5*16`(%rsp),%rsp 576 movaps %xmm6,-8-5*16(%rax) 577 movaps %xmm7,-8-4*16(%rax) 578 movaps %xmm8,-8-3*16(%rax) 579 movaps %xmm9,-8-2*16(%rax) 580 movaps %xmm10,-8-1*16(%rax) 581.Lprologue_shaext: 582___ 583$code.=<<___; 584 lea K256+0x80(%rip),$Tbl 585 movdqu ($ctx),$ABEF # DCBA 586 movdqu 16($ctx),$CDGH # HGFE 587 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 588 589 pshufd \$0x1b,$ABEF,$Wi # ABCD 590 pshufd \$0xb1,$ABEF,$ABEF # CDAB 591 pshufd \$0x1b,$CDGH,$CDGH # EFGH 592 movdqa $TMP,$BSWAP # offload 593 palignr \$8,$CDGH,$ABEF # ABEF 594 punpcklqdq $Wi,$CDGH # CDGH 595 jmp .Loop_shaext 596 597.align 16 598.Loop_shaext: 599 movdqu ($inp),@MSG[0] 600 movdqu 0x10($inp),@MSG[1] 601 movdqu 0x20($inp),@MSG[2] 602 pshufb $TMP,@MSG[0] 603 movdqu 0x30($inp),@MSG[3] 604 605 movdqa 0*32-0x80($Tbl),$Wi 606 paddd @MSG[0],$Wi 607 pshufb $TMP,@MSG[1] 608 movdqa $CDGH,$CDGH_SAVE # offload 609 sha256rnds2 $ABEF,$CDGH # 0-3 610 pshufd \$0x0e,$Wi,$Wi 611 nop 612 movdqa $ABEF,$ABEF_SAVE # offload 613 sha256rnds2 $CDGH,$ABEF 614 615 movdqa 1*32-0x80($Tbl),$Wi 616 paddd @MSG[1],$Wi 617 pshufb $TMP,@MSG[2] 618 sha256rnds2 $ABEF,$CDGH # 4-7 619 pshufd \$0x0e,$Wi,$Wi 620 lea 0x40($inp),$inp 621 sha256msg1 @MSG[1],@MSG[0] 622 sha256rnds2 $CDGH,$ABEF 623 624 movdqa 2*32-0x80($Tbl),$Wi 625 paddd @MSG[2],$Wi 626 pshufb $TMP,@MSG[3] 627 sha256rnds2 $ABEF,$CDGH # 8-11 628 pshufd \$0x0e,$Wi,$Wi 629 movdqa @MSG[3],$TMP 630 palignr \$4,@MSG[2],$TMP 631 nop 632 paddd $TMP,@MSG[0] 633 sha256msg1 
@MSG[2],@MSG[1] 634 sha256rnds2 $CDGH,$ABEF 635 636 movdqa 3*32-0x80($Tbl),$Wi 637 paddd @MSG[3],$Wi 638 sha256msg2 @MSG[3],@MSG[0] 639 sha256rnds2 $ABEF,$CDGH # 12-15 640 pshufd \$0x0e,$Wi,$Wi 641 movdqa @MSG[0],$TMP 642 palignr \$4,@MSG[3],$TMP 643 nop 644 paddd $TMP,@MSG[1] 645 sha256msg1 @MSG[3],@MSG[2] 646 sha256rnds2 $CDGH,$ABEF 647___ 648for($i=4;$i<16-3;$i++) { 649$code.=<<___; 650 movdqa $i*32-0x80($Tbl),$Wi 651 paddd @MSG[0],$Wi 652 sha256msg2 @MSG[0],@MSG[1] 653 sha256rnds2 $ABEF,$CDGH # 16-19... 654 pshufd \$0x0e,$Wi,$Wi 655 movdqa @MSG[1],$TMP 656 palignr \$4,@MSG[0],$TMP 657 nop 658 paddd $TMP,@MSG[2] 659 sha256msg1 @MSG[0],@MSG[3] 660 sha256rnds2 $CDGH,$ABEF 661___ 662 push(@MSG,shift(@MSG)); 663} 664$code.=<<___; 665 movdqa 13*32-0x80($Tbl),$Wi 666 paddd @MSG[0],$Wi 667 sha256msg2 @MSG[0],@MSG[1] 668 sha256rnds2 $ABEF,$CDGH # 52-55 669 pshufd \$0x0e,$Wi,$Wi 670 movdqa @MSG[1],$TMP 671 palignr \$4,@MSG[0],$TMP 672 sha256rnds2 $CDGH,$ABEF 673 paddd $TMP,@MSG[2] 674 675 movdqa 14*32-0x80($Tbl),$Wi 676 paddd @MSG[1],$Wi 677 sha256rnds2 $ABEF,$CDGH # 56-59 678 pshufd \$0x0e,$Wi,$Wi 679 sha256msg2 @MSG[1],@MSG[2] 680 movdqa $BSWAP,$TMP 681 sha256rnds2 $CDGH,$ABEF 682 683 movdqa 15*32-0x80($Tbl),$Wi 684 paddd @MSG[2],$Wi 685 nop 686 sha256rnds2 $ABEF,$CDGH # 60-63 687 pshufd \$0x0e,$Wi,$Wi 688 dec $num 689 nop 690 sha256rnds2 $CDGH,$ABEF 691 692 paddd $CDGH_SAVE,$CDGH 693 paddd $ABEF_SAVE,$ABEF 694 jnz .Loop_shaext 695 696 pshufd \$0xb1,$CDGH,$CDGH # DCHG 697 pshufd \$0x1b,$ABEF,$TMP # FEBA 698 pshufd \$0xb1,$ABEF,$ABEF # BAFE 699 punpckhqdq $CDGH,$ABEF # DCBA 700 palignr \$8,$TMP,$CDGH # HGFE 701 702 movdqu $ABEF,($ctx) 703 movdqu $CDGH,16($ctx) 704___ 705$code.=<<___ if ($win64); 706 movaps -8-5*16(%rax),%xmm6 707 movaps -8-4*16(%rax),%xmm7 708 movaps -8-3*16(%rax),%xmm8 709 movaps -8-2*16(%rax),%xmm9 710 movaps -8-1*16(%rax),%xmm10 711 mov %rax,%rsp 712.Lepilogue_shaext: 713___ 714$code.=<<___; 715 ret 716.cfi_endproc 717.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 718___ 719}}} 720{{{ 721 722my $a4=$T1; 723my ($a,$b,$c,$d,$e,$f,$g,$h); 724 725sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 726{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 727 my $arg = pop; 728 $arg = "\$$arg" if ($arg*1 eq $arg); 729 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 730} 731 732sub body_00_15 () { 733 ( 734 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 735 736 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 737 '&mov ($a,$a1)', 738 '&mov ($a4,$f)', 739 740 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 741 '&xor ($a0,$e)', 742 '&xor ($a4,$g)', # f^g 743 744 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 745 '&xor ($a1,$a)', 746 '&and ($a4,$e)', # (f^g)&e 747 748 '&xor ($a0,$e)', 749 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 750 '&mov ($a2,$a)', 751 752 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 753 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 754 '&xor ($a2,$b)', # a^b, b^c in next round 755 756 '&add ($h,$a4)', # h+=Ch(e,f,g) 757 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 758 '&and ($a3,$a2)', # (b^c)&(a^b) 759 760 '&xor ($a1,$a)', 761 '&add ($h,$a0)', # h+=Sigma1(e) 762 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 763 764 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 765 '&add ($d,$h)', # d+=h 766 '&add ($h,$a3)', # h+=Maj(a,b,c) 767 768 '&mov ($a0,$d)', 769 '&add ($a1,$h);'. 
# h+=Sigma0(a) 770 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 771 ); 772} 773 774###################################################################### 775# SSSE3 code path 776# 777if ($SZ==4) { # SHA256 only 778my @X = map("%xmm$_",(0..3)); 779my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 780 781$code.=<<___; 782.type ${func}_ssse3,\@function,3 783.align 64 784${func}_ssse3: 785.cfi_startproc 786.Lssse3_shortcut: 787 mov %rsp,%rax # copy %rsp 788.cfi_def_cfa_register %rax 789 push %rbx 790.cfi_push %rbx 791 push %rbp 792.cfi_push %rbp 793 push %r12 794.cfi_push %r12 795 push %r13 796.cfi_push %r13 797 push %r14 798.cfi_push %r14 799 push %r15 800.cfi_push %r15 801 shl \$4,%rdx # num*16 802 sub \$`$framesz+$win64*16*4`,%rsp 803 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 804 and \$-64,%rsp # align stack frame 805 mov $ctx,$_ctx # save ctx, 1st arg 806 mov $inp,$_inp # save inp, 2nd arh 807 mov %rdx,$_end # save end pointer, "3rd" arg 808 mov %rax,$_rsp # save copy of %rsp 809.cfi_cfa_expression $_rsp,deref,+8 810___ 811$code.=<<___ if ($win64); 812 movaps %xmm6,16*$SZ+32(%rsp) 813 movaps %xmm7,16*$SZ+48(%rsp) 814 movaps %xmm8,16*$SZ+64(%rsp) 815 movaps %xmm9,16*$SZ+80(%rsp) 816___ 817$code.=<<___; 818.Lprologue_ssse3: 819 820 mov $SZ*0($ctx),$A 821 mov $SZ*1($ctx),$B 822 mov $SZ*2($ctx),$C 823 mov $SZ*3($ctx),$D 824 mov $SZ*4($ctx),$E 825 mov $SZ*5($ctx),$F 826 mov $SZ*6($ctx),$G 827 mov $SZ*7($ctx),$H 828___ 829 830$code.=<<___; 831 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 832 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 833 jmp .Lloop_ssse3 834.align 16 835.Lloop_ssse3: 836 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 837 movdqu 0x00($inp),@X[0] 838 movdqu 0x10($inp),@X[1] 839 movdqu 0x20($inp),@X[2] 840 pshufb $t3,@X[0] 841 movdqu 0x30($inp),@X[3] 842 lea $TABLE(%rip),$Tbl 843 pshufb $t3,@X[1] 844 movdqa 0x00($Tbl),$t0 845 movdqa 0x20($Tbl),$t1 846 pshufb $t3,@X[2] 847 paddd @X[0],$t0 848 movdqa 0x40($Tbl),$t2 849 pshufb $t3,@X[3] 850 movdqa 0x60($Tbl),$t3 851 paddd @X[1],$t1 852 paddd @X[2],$t2 853 paddd @X[3],$t3 854 movdqa $t0,0x00(%rsp) 855 mov $A,$a1 856 movdqa $t1,0x10(%rsp) 857 mov $B,$a3 858 movdqa $t2,0x20(%rsp) 859 xor $C,$a3 # magic 860 movdqa $t3,0x30(%rsp) 861 mov $E,$a0 862 jmp .Lssse3_00_47 863 864.align 16 865.Lssse3_00_47: 866 sub \$`-16*2*$SZ`,$Tbl # size optimization 867___ 868sub Xupdate_256_SSSE3 () { 869 ( 870 '&movdqa ($t0,@X[1]);', 871 '&movdqa ($t3,@X[3])', 872 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 873 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 874 '&movdqa ($t1,$t0)', 875 '&movdqa ($t2,$t0);', 876 '&psrld ($t0,$sigma0[2])', 877 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 878 '&psrld ($t2,$sigma0[0])', 879 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 880 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 881 '&pxor ($t0,$t2)', 882 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 883 '&pxor ($t0,$t1)', 884 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
885 '&pxor ($t0,$t2);', 886 '&movdqa ($t2,$t3)', 887 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 888 '&psrld ($t3,$sigma1[2])', 889 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 890 '&psrlq ($t2,$sigma1[0])', 891 '&pxor ($t3,$t2);', 892 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 893 '&pxor ($t3,$t2)', 894 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 895 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 896 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 897 '&movdqa ($t2,$t3);', 898 '&psrld ($t3,$sigma1[2])', 899 '&psrlq ($t2,$sigma1[0])', 900 '&pxor ($t3,$t2);', 901 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 902 '&pxor ($t3,$t2);', 903 '&movdqa ($t2,16*2*$j."($Tbl)")', 904 '&pshufb ($t3,$t5)', 905 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 906 ); 907} 908 909sub SSSE3_256_00_47 () { 910my $j = shift; 911my $body = shift; 912my @X = @_; 913my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 914 915 if (0) { 916 foreach (Xupdate_256_SSSE3()) { # 36 instructions 917 eval; 918 eval(shift(@insns)); 919 eval(shift(@insns)); 920 eval(shift(@insns)); 921 } 922 } else { # squeeze extra 4% on Westmere and 19% on Atom 923 eval(shift(@insns)); #@ 924 &movdqa ($t0,@X[1]); 925 eval(shift(@insns)); 926 eval(shift(@insns)); 927 &movdqa ($t3,@X[3]); 928 eval(shift(@insns)); #@ 929 eval(shift(@insns)); 930 eval(shift(@insns)); 931 eval(shift(@insns)); #@ 932 eval(shift(@insns)); 933 &palignr ($t0,@X[0],$SZ); # X[1..4] 934 eval(shift(@insns)); 935 eval(shift(@insns)); 936 &palignr ($t3,@X[2],$SZ); # X[9..12] 937 eval(shift(@insns)); 938 eval(shift(@insns)); 939 eval(shift(@insns)); 940 eval(shift(@insns)); #@ 941 &movdqa ($t1,$t0); 942 eval(shift(@insns)); 943 eval(shift(@insns)); 944 &movdqa ($t2,$t0); 945 eval(shift(@insns)); #@ 946 eval(shift(@insns)); 947 &psrld ($t0,$sigma0[2]); 948 eval(shift(@insns)); 949 eval(shift(@insns)); 950 eval(shift(@insns)); 951 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 952 eval(shift(@insns)); #@ 953 eval(shift(@insns)); 954 &psrld ($t2,$sigma0[0]); 955 eval(shift(@insns)); 956 eval(shift(@insns)); 957 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 958 eval(shift(@insns)); 959 eval(shift(@insns)); #@ 960 &pslld ($t1,8*$SZ-$sigma0[1]); 961 eval(shift(@insns)); 962 eval(shift(@insns)); 963 &pxor ($t0,$t2); 964 eval(shift(@insns)); #@ 965 eval(shift(@insns)); 966 eval(shift(@insns)); 967 eval(shift(@insns)); #@ 968 &psrld ($t2,$sigma0[1]-$sigma0[0]); 969 eval(shift(@insns)); 970 &pxor ($t0,$t1); 971 eval(shift(@insns)); 972 eval(shift(@insns)); 973 &pslld ($t1,$sigma0[1]-$sigma0[0]); 974 eval(shift(@insns)); 975 eval(shift(@insns)); 976 &pxor ($t0,$t2); 977 eval(shift(@insns)); 978 eval(shift(@insns)); #@ 979 &movdqa ($t2,$t3); 980 eval(shift(@insns)); 981 eval(shift(@insns)); 982 &pxor ($t0,$t1); # sigma0(X[1..4]) 983 eval(shift(@insns)); #@ 984 eval(shift(@insns)); 985 eval(shift(@insns)); 986 &psrld ($t3,$sigma1[2]); 987 eval(shift(@insns)); 988 eval(shift(@insns)); 989 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 990 eval(shift(@insns)); #@ 991 eval(shift(@insns)); 992 &psrlq ($t2,$sigma1[0]); 993 eval(shift(@insns)); 994 eval(shift(@insns)); 995 eval(shift(@insns)); 996 &pxor ($t3,$t2); 997 eval(shift(@insns)); #@ 998 eval(shift(@insns)); 999 eval(shift(@insns)); 1000 eval(shift(@insns)); #@ 1001 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1002 eval(shift(@insns)); 1003 eval(shift(@insns)); 1004 &pxor ($t3,$t2); 1005 eval(shift(@insns)); #@ 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 1009 &pshufd ($t3,$t3,0b10000000); 1010 
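	# Note: the pshufd above together with the psrldq a few instructions
	# below reproduces the effect of the masked pshufb that is commented
	# out: dwords 0 and 2 of $t3 end up in its low 64 bits with the upper
	# half zeroed, so the $t4 mask register is not touched on this path.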
eval(shift(@insns)); 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); 1013 &psrldq ($t3,8); 1014 eval(shift(@insns)); 1015 eval(shift(@insns)); #@ 1016 eval(shift(@insns)); 1017 eval(shift(@insns)); 1018 eval(shift(@insns)); #@ 1019 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1020 eval(shift(@insns)); 1021 eval(shift(@insns)); 1022 eval(shift(@insns)); 1023 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1024 eval(shift(@insns)); 1025 eval(shift(@insns)); #@ 1026 eval(shift(@insns)); 1027 &movdqa ($t2,$t3); 1028 eval(shift(@insns)); 1029 eval(shift(@insns)); 1030 &psrld ($t3,$sigma1[2]); 1031 eval(shift(@insns)); 1032 eval(shift(@insns)); #@ 1033 &psrlq ($t2,$sigma1[0]); 1034 eval(shift(@insns)); 1035 eval(shift(@insns)); 1036 &pxor ($t3,$t2); 1037 eval(shift(@insns)); #@ 1038 eval(shift(@insns)); 1039 eval(shift(@insns)); 1040 eval(shift(@insns)); #@ 1041 eval(shift(@insns)); 1042 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1043 eval(shift(@insns)); 1044 eval(shift(@insns)); 1045 eval(shift(@insns)); 1046 &pxor ($t3,$t2); 1047 eval(shift(@insns)); 1048 eval(shift(@insns)); 1049 eval(shift(@insns)); #@ 1050 #&pshufb ($t3,$t5); 1051 &pshufd ($t3,$t3,0b00001000); 1052 eval(shift(@insns)); 1053 eval(shift(@insns)); 1054 &movdqa ($t2,16*2*$j."($Tbl)"); 1055 eval(shift(@insns)); #@ 1056 eval(shift(@insns)); 1057 &pslldq ($t3,8); 1058 eval(shift(@insns)); 1059 eval(shift(@insns)); 1060 eval(shift(@insns)); 1061 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1062 eval(shift(@insns)); #@ 1063 eval(shift(@insns)); 1064 eval(shift(@insns)); 1065 } 1066 &paddd ($t2,@X[0]); 1067 foreach (@insns) { eval; } # remaining instructions 1068 &movdqa (16*$j."(%rsp)",$t2); 1069} 1070 1071 for ($i=0,$j=0; $j<4; $j++) { 1072 &SSSE3_256_00_47($j,\&body_00_15,@X); 1073 push(@X,shift(@X)); # rotate(@X) 1074 } 1075 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1076 &jne (".Lssse3_00_47"); 1077 1078 for ($i=0; $i<16; ) { 1079 foreach(body_00_15()) { eval; } 1080 } 1081$code.=<<___; 1082 mov $_ctx,$ctx 1083 mov $a1,$A 1084 1085 add $SZ*0($ctx),$A 1086 lea 16*$SZ($inp),$inp 1087 add $SZ*1($ctx),$B 1088 add $SZ*2($ctx),$C 1089 add $SZ*3($ctx),$D 1090 add $SZ*4($ctx),$E 1091 add $SZ*5($ctx),$F 1092 add $SZ*6($ctx),$G 1093 add $SZ*7($ctx),$H 1094 1095 cmp $_end,$inp 1096 1097 mov $A,$SZ*0($ctx) 1098 mov $B,$SZ*1($ctx) 1099 mov $C,$SZ*2($ctx) 1100 mov $D,$SZ*3($ctx) 1101 mov $E,$SZ*4($ctx) 1102 mov $F,$SZ*5($ctx) 1103 mov $G,$SZ*6($ctx) 1104 mov $H,$SZ*7($ctx) 1105 jb .Lloop_ssse3 1106 1107 mov $_rsp,%rsi 1108.cfi_def_cfa %rsi,8 1109___ 1110$code.=<<___ if ($win64); 1111 movaps 16*$SZ+32(%rsp),%xmm6 1112 movaps 16*$SZ+48(%rsp),%xmm7 1113 movaps 16*$SZ+64(%rsp),%xmm8 1114 movaps 16*$SZ+80(%rsp),%xmm9 1115___ 1116$code.=<<___; 1117 mov -48(%rsi),%r15 1118.cfi_restore %r15 1119 mov -40(%rsi),%r14 1120.cfi_restore %r14 1121 mov -32(%rsi),%r13 1122.cfi_restore %r13 1123 mov -24(%rsi),%r12 1124.cfi_restore %r12 1125 mov -16(%rsi),%rbp 1126.cfi_restore %rbp 1127 mov -8(%rsi),%rbx 1128.cfi_restore %rbx 1129 lea (%rsi),%rsp 1130.cfi_def_cfa_register %rsp 1131.Lepilogue_ssse3: 1132 ret 1133.cfi_endproc 1134.size ${func}_ssse3,.-${func}_ssse3 1135___ 1136} 1137 1138if ($avx) {{ 1139###################################################################### 1140# XOP code path 1141# 1142if ($SZ==8) { # SHA512 only 1143$code.=<<___; 1144.type ${func}_xop,\@function,3 1145.align 64 1146${func}_xop: 1147.cfi_startproc 1148.Lxop_shortcut: 1149 mov %rsp,%rax # copy %rsp 1150.cfi_def_cfa_register %rax 1151 push %rbx 1152.cfi_push %rbx 1153 push %rbp 
1154.cfi_push %rbp 1155 push %r12 1156.cfi_push %r12 1157 push %r13 1158.cfi_push %r13 1159 push %r14 1160.cfi_push %r14 1161 push %r15 1162.cfi_push %r15 1163 shl \$4,%rdx # num*16 1164 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1165 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1166 and \$-64,%rsp # align stack frame 1167 mov $ctx,$_ctx # save ctx, 1st arg 1168 mov $inp,$_inp # save inp, 2nd arh 1169 mov %rdx,$_end # save end pointer, "3rd" arg 1170 mov %rax,$_rsp # save copy of %rsp 1171.cfi_cfa_expression $_rsp,deref,+8 1172___ 1173$code.=<<___ if ($win64); 1174 movaps %xmm6,16*$SZ+32(%rsp) 1175 movaps %xmm7,16*$SZ+48(%rsp) 1176 movaps %xmm8,16*$SZ+64(%rsp) 1177 movaps %xmm9,16*$SZ+80(%rsp) 1178___ 1179$code.=<<___ if ($win64 && $SZ>4); 1180 movaps %xmm10,16*$SZ+96(%rsp) 1181 movaps %xmm11,16*$SZ+112(%rsp) 1182___ 1183$code.=<<___; 1184.Lprologue_xop: 1185 1186 vzeroupper 1187 mov $SZ*0($ctx),$A 1188 mov $SZ*1($ctx),$B 1189 mov $SZ*2($ctx),$C 1190 mov $SZ*3($ctx),$D 1191 mov $SZ*4($ctx),$E 1192 mov $SZ*5($ctx),$F 1193 mov $SZ*6($ctx),$G 1194 mov $SZ*7($ctx),$H 1195 jmp .Lloop_xop 1196___ 1197 if ($SZ==4) { # SHA256 1198 my @X = map("%xmm$_",(0..3)); 1199 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1200 1201$code.=<<___; 1202.align 16 1203.Lloop_xop: 1204 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1205 vmovdqu 0x00($inp),@X[0] 1206 vmovdqu 0x10($inp),@X[1] 1207 vmovdqu 0x20($inp),@X[2] 1208 vmovdqu 0x30($inp),@X[3] 1209 vpshufb $t3,@X[0],@X[0] 1210 lea $TABLE(%rip),$Tbl 1211 vpshufb $t3,@X[1],@X[1] 1212 vpshufb $t3,@X[2],@X[2] 1213 vpaddd 0x00($Tbl),@X[0],$t0 1214 vpshufb $t3,@X[3],@X[3] 1215 vpaddd 0x20($Tbl),@X[1],$t1 1216 vpaddd 0x40($Tbl),@X[2],$t2 1217 vpaddd 0x60($Tbl),@X[3],$t3 1218 vmovdqa $t0,0x00(%rsp) 1219 mov $A,$a1 1220 vmovdqa $t1,0x10(%rsp) 1221 mov $B,$a3 1222 vmovdqa $t2,0x20(%rsp) 1223 xor $C,$a3 # magic 1224 vmovdqa $t3,0x30(%rsp) 1225 mov $E,$a0 1226 jmp .Lxop_00_47 1227 1228.align 16 1229.Lxop_00_47: 1230 sub \$`-16*2*$SZ`,$Tbl # size optimization 1231___ 1232sub XOP_256_00_47 () { 1233my $j = shift; 1234my $body = shift; 1235my @X = @_; 1236my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1237 1238 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1239 eval(shift(@insns)); 1240 eval(shift(@insns)); 1241 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1242 eval(shift(@insns)); 1243 eval(shift(@insns)); 1244 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1245 eval(shift(@insns)); 1246 eval(shift(@insns)); 1247 &vpsrld ($t0,$t0,$sigma0[2]); 1248 eval(shift(@insns)); 1249 eval(shift(@insns)); 1250 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 eval(shift(@insns)); 1254 eval(shift(@insns)); 1255 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1256 eval(shift(@insns)); 1257 eval(shift(@insns)); 1258 &vpxor ($t0,$t0,$t1); 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1264 eval(shift(@insns)); 1265 eval(shift(@insns)); 1266 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1267 eval(shift(@insns)); 1268 eval(shift(@insns)); 1269 &vpsrld ($t2,@X[3],$sigma1[2]); 1270 eval(shift(@insns)); 1271 eval(shift(@insns)); 1272 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1273 eval(shift(@insns)); 1274 eval(shift(@insns)); 1275 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1276 eval(shift(@insns)); 1277 eval(shift(@insns)); 1278 &vpxor ($t3,$t3,$t2); 1279 eval(shift(@insns)); 1280 eval(shift(@insns)); 1281 eval(shift(@insns)); 1282 
eval(shift(@insns)); 1283 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1284 eval(shift(@insns)); 1285 eval(shift(@insns)); 1286 eval(shift(@insns)); 1287 eval(shift(@insns)); 1288 &vpsrldq ($t3,$t3,8); 1289 eval(shift(@insns)); 1290 eval(shift(@insns)); 1291 eval(shift(@insns)); 1292 eval(shift(@insns)); 1293 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1294 eval(shift(@insns)); 1295 eval(shift(@insns)); 1296 eval(shift(@insns)); 1297 eval(shift(@insns)); 1298 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1299 eval(shift(@insns)); 1300 eval(shift(@insns)); 1301 &vpsrld ($t2,@X[0],$sigma1[2]); 1302 eval(shift(@insns)); 1303 eval(shift(@insns)); 1304 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1305 eval(shift(@insns)); 1306 eval(shift(@insns)); 1307 &vpxor ($t3,$t3,$t2); 1308 eval(shift(@insns)); 1309 eval(shift(@insns)); 1310 eval(shift(@insns)); 1311 eval(shift(@insns)); 1312 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1313 eval(shift(@insns)); 1314 eval(shift(@insns)); 1315 eval(shift(@insns)); 1316 eval(shift(@insns)); 1317 &vpslldq ($t3,$t3,8); # 22 instructions 1318 eval(shift(@insns)); 1319 eval(shift(@insns)); 1320 eval(shift(@insns)); 1321 eval(shift(@insns)); 1322 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1323 eval(shift(@insns)); 1324 eval(shift(@insns)); 1325 eval(shift(@insns)); 1326 eval(shift(@insns)); 1327 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1328 foreach (@insns) { eval; } # remaining instructions 1329 &vmovdqa (16*$j."(%rsp)",$t2); 1330} 1331 1332 for ($i=0,$j=0; $j<4; $j++) { 1333 &XOP_256_00_47($j,\&body_00_15,@X); 1334 push(@X,shift(@X)); # rotate(@X) 1335 } 1336 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1337 &jne (".Lxop_00_47"); 1338 1339 for ($i=0; $i<16; ) { 1340 foreach(body_00_15()) { eval; } 1341 } 1342 1343 } else { # SHA512 1344 my @X = map("%xmm$_",(0..7)); 1345 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1346 1347$code.=<<___; 1348.align 16 1349.Lloop_xop: 1350 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1351 vmovdqu 0x00($inp),@X[0] 1352 lea $TABLE+0x80(%rip),$Tbl # size optimization 1353 vmovdqu 0x10($inp),@X[1] 1354 vmovdqu 0x20($inp),@X[2] 1355 vpshufb $t3,@X[0],@X[0] 1356 vmovdqu 0x30($inp),@X[3] 1357 vpshufb $t3,@X[1],@X[1] 1358 vmovdqu 0x40($inp),@X[4] 1359 vpshufb $t3,@X[2],@X[2] 1360 vmovdqu 0x50($inp),@X[5] 1361 vpshufb $t3,@X[3],@X[3] 1362 vmovdqu 0x60($inp),@X[6] 1363 vpshufb $t3,@X[4],@X[4] 1364 vmovdqu 0x70($inp),@X[7] 1365 vpshufb $t3,@X[5],@X[5] 1366 vpaddq -0x80($Tbl),@X[0],$t0 1367 vpshufb $t3,@X[6],@X[6] 1368 vpaddq -0x60($Tbl),@X[1],$t1 1369 vpshufb $t3,@X[7],@X[7] 1370 vpaddq -0x40($Tbl),@X[2],$t2 1371 vpaddq -0x20($Tbl),@X[3],$t3 1372 vmovdqa $t0,0x00(%rsp) 1373 vpaddq 0x00($Tbl),@X[4],$t0 1374 vmovdqa $t1,0x10(%rsp) 1375 vpaddq 0x20($Tbl),@X[5],$t1 1376 vmovdqa $t2,0x20(%rsp) 1377 vpaddq 0x40($Tbl),@X[6],$t2 1378 vmovdqa $t3,0x30(%rsp) 1379 vpaddq 0x60($Tbl),@X[7],$t3 1380 vmovdqa $t0,0x40(%rsp) 1381 mov $A,$a1 1382 vmovdqa $t1,0x50(%rsp) 1383 mov $B,$a3 1384 vmovdqa $t2,0x60(%rsp) 1385 xor $C,$a3 # magic 1386 vmovdqa $t3,0x70(%rsp) 1387 mov $E,$a0 1388 jmp .Lxop_00_47 1389 1390.align 16 1391.Lxop_00_47: 1392 add \$`16*2*$SZ`,$Tbl 1393___ 1394sub XOP_512_00_47 () { 1395my $j = shift; 1396my $body = shift; 1397my @X = @_; 1398my @insns = (&$body,&$body); # 52 instructions 1399 1400 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1401 eval(shift(@insns)); 1402 eval(shift(@insns)); 1403 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1404 eval(shift(@insns)); 1405 eval(shift(@insns)); 1406 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1407 
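	# vprotq (and vprotd in the SHA-256 variant) is the XOP vector rotate;
	# having a true SIMD rotate is what lets this code path skip the
	# shift/shift/xor emulation that the SSSE3 and AVX Xupdate sequences
	# need for every rotation.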
eval(shift(@insns)); 1408 eval(shift(@insns)); 1409 &vpsrlq ($t0,$t0,$sigma0[2]); 1410 eval(shift(@insns)); 1411 eval(shift(@insns)); 1412 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1413 eval(shift(@insns)); 1414 eval(shift(@insns)); 1415 eval(shift(@insns)); 1416 eval(shift(@insns)); 1417 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1418 eval(shift(@insns)); 1419 eval(shift(@insns)); 1420 &vpxor ($t0,$t0,$t1); 1421 eval(shift(@insns)); 1422 eval(shift(@insns)); 1423 eval(shift(@insns)); 1424 eval(shift(@insns)); 1425 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1426 eval(shift(@insns)); 1427 eval(shift(@insns)); 1428 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1429 eval(shift(@insns)); 1430 eval(shift(@insns)); 1431 &vpsrlq ($t2,@X[7],$sigma1[2]); 1432 eval(shift(@insns)); 1433 eval(shift(@insns)); 1434 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1435 eval(shift(@insns)); 1436 eval(shift(@insns)); 1437 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1438 eval(shift(@insns)); 1439 eval(shift(@insns)); 1440 &vpxor ($t3,$t3,$t2); 1441 eval(shift(@insns)); 1442 eval(shift(@insns)); 1443 eval(shift(@insns)); 1444 eval(shift(@insns)); 1445 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1446 eval(shift(@insns)); 1447 eval(shift(@insns)); 1448 eval(shift(@insns)); 1449 eval(shift(@insns)); 1450 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1451 eval(shift(@insns)); 1452 eval(shift(@insns)); 1453 eval(shift(@insns)); 1454 eval(shift(@insns)); 1455 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1456 foreach (@insns) { eval; } # remaining instructions 1457 &vmovdqa (16*$j."(%rsp)",$t2); 1458} 1459 1460 for ($i=0,$j=0; $j<8; $j++) { 1461 &XOP_512_00_47($j,\&body_00_15,@X); 1462 push(@X,shift(@X)); # rotate(@X) 1463 } 1464 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1465 &jne (".Lxop_00_47"); 1466 1467 for ($i=0; $i<16; ) { 1468 foreach(body_00_15()) { eval; } 1469 } 1470} 1471$code.=<<___; 1472 mov $_ctx,$ctx 1473 mov $a1,$A 1474 1475 add $SZ*0($ctx),$A 1476 lea 16*$SZ($inp),$inp 1477 add $SZ*1($ctx),$B 1478 add $SZ*2($ctx),$C 1479 add $SZ*3($ctx),$D 1480 add $SZ*4($ctx),$E 1481 add $SZ*5($ctx),$F 1482 add $SZ*6($ctx),$G 1483 add $SZ*7($ctx),$H 1484 1485 cmp $_end,$inp 1486 1487 mov $A,$SZ*0($ctx) 1488 mov $B,$SZ*1($ctx) 1489 mov $C,$SZ*2($ctx) 1490 mov $D,$SZ*3($ctx) 1491 mov $E,$SZ*4($ctx) 1492 mov $F,$SZ*5($ctx) 1493 mov $G,$SZ*6($ctx) 1494 mov $H,$SZ*7($ctx) 1495 jb .Lloop_xop 1496 1497 mov $_rsp,%rsi 1498.cfi_def_cfa %rsi,8 1499 vzeroupper 1500___ 1501$code.=<<___ if ($win64); 1502 movaps 16*$SZ+32(%rsp),%xmm6 1503 movaps 16*$SZ+48(%rsp),%xmm7 1504 movaps 16*$SZ+64(%rsp),%xmm8 1505 movaps 16*$SZ+80(%rsp),%xmm9 1506___ 1507$code.=<<___ if ($win64 && $SZ>4); 1508 movaps 16*$SZ+96(%rsp),%xmm10 1509 movaps 16*$SZ+112(%rsp),%xmm11 1510___ 1511$code.=<<___; 1512 mov -48(%rsi),%r15 1513.cfi_restore %r15 1514 mov -40(%rsi),%r14 1515.cfi_restore %r14 1516 mov -32(%rsi),%r13 1517.cfi_restore %r13 1518 mov -24(%rsi),%r12 1519.cfi_restore %r12 1520 mov -16(%rsi),%rbp 1521.cfi_restore %rbp 1522 mov -8(%rsi),%rbx 1523.cfi_restore %rbx 1524 lea (%rsi),%rsp 1525.cfi_def_cfa_register %rsp 1526.Lepilogue_xop: 1527 ret 1528.cfi_endproc 1529.size ${func}_xop,.-${func}_xop 1530___ 1531} 1532###################################################################### 1533# AVX+shrd code path 1534# 1535local *ror = sub { &shrd(@_[0],@_) }; 1536 1537$code.=<<___; 1538.type ${func}_avx,\@function,3 1539.align 64 1540${func}_avx: 1541.cfi_startproc 1542.Lavx_shortcut: 1543 mov %rsp,%rax # copy %rsp 1544.cfi_def_cfa_register %rax 
1545 push %rbx 1546.cfi_push %rbx 1547 push %rbp 1548.cfi_push %rbp 1549 push %r12 1550.cfi_push %r12 1551 push %r13 1552.cfi_push %r13 1553 push %r14 1554.cfi_push %r14 1555 push %r15 1556.cfi_push %r15 1557 shl \$4,%rdx # num*16 1558 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1559 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1560 and \$-64,%rsp # align stack frame 1561 mov $ctx,$_ctx # save ctx, 1st arg 1562 mov $inp,$_inp # save inp, 2nd arh 1563 mov %rdx,$_end # save end pointer, "3rd" arg 1564 mov %rax,$_rsp # save copy of %rsp 1565.cfi_cfa_expression $_rsp,deref,+8 1566___ 1567$code.=<<___ if ($win64); 1568 movaps %xmm6,16*$SZ+32(%rsp) 1569 movaps %xmm7,16*$SZ+48(%rsp) 1570 movaps %xmm8,16*$SZ+64(%rsp) 1571 movaps %xmm9,16*$SZ+80(%rsp) 1572___ 1573$code.=<<___ if ($win64 && $SZ>4); 1574 movaps %xmm10,16*$SZ+96(%rsp) 1575 movaps %xmm11,16*$SZ+112(%rsp) 1576___ 1577$code.=<<___; 1578.Lprologue_avx: 1579 1580 vzeroupper 1581 mov $SZ*0($ctx),$A 1582 mov $SZ*1($ctx),$B 1583 mov $SZ*2($ctx),$C 1584 mov $SZ*3($ctx),$D 1585 mov $SZ*4($ctx),$E 1586 mov $SZ*5($ctx),$F 1587 mov $SZ*6($ctx),$G 1588 mov $SZ*7($ctx),$H 1589___ 1590 if ($SZ==4) { # SHA256 1591 my @X = map("%xmm$_",(0..3)); 1592 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1593 1594$code.=<<___; 1595 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1596 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1597 jmp .Lloop_avx 1598.align 16 1599.Lloop_avx: 1600 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1601 vmovdqu 0x00($inp),@X[0] 1602 vmovdqu 0x10($inp),@X[1] 1603 vmovdqu 0x20($inp),@X[2] 1604 vmovdqu 0x30($inp),@X[3] 1605 vpshufb $t3,@X[0],@X[0] 1606 lea $TABLE(%rip),$Tbl 1607 vpshufb $t3,@X[1],@X[1] 1608 vpshufb $t3,@X[2],@X[2] 1609 vpaddd 0x00($Tbl),@X[0],$t0 1610 vpshufb $t3,@X[3],@X[3] 1611 vpaddd 0x20($Tbl),@X[1],$t1 1612 vpaddd 0x40($Tbl),@X[2],$t2 1613 vpaddd 0x60($Tbl),@X[3],$t3 1614 vmovdqa $t0,0x00(%rsp) 1615 mov $A,$a1 1616 vmovdqa $t1,0x10(%rsp) 1617 mov $B,$a3 1618 vmovdqa $t2,0x20(%rsp) 1619 xor $C,$a3 # magic 1620 vmovdqa $t3,0x30(%rsp) 1621 mov $E,$a0 1622 jmp .Lavx_00_47 1623 1624.align 16 1625.Lavx_00_47: 1626 sub \$`-16*2*$SZ`,$Tbl # size optimization 1627___ 1628sub Xupdate_256_AVX () { 1629 ( 1630 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1631 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1632 '&vpsrld ($t2,$t0,$sigma0[0]);', 1633 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1634 '&vpsrld ($t3,$t0,$sigma0[2])', 1635 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1636 '&vpxor ($t0,$t3,$t2)', 1637 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1638 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1639 '&vpxor ($t0,$t0,$t1)', 1640 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1641 '&vpxor ($t0,$t0,$t2)', 1642 '&vpsrld ($t2,$t3,$sigma1[2]);', 1643 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1644 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1645 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1646 '&vpxor ($t2,$t2,$t3);', 1647 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1648 '&vpxor ($t2,$t2,$t3)', 1649 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1650 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1651 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1652 '&vpsrld ($t2,$t3,$sigma1[2])', 1653 '&vpsrlq ($t3,$t3,$sigma1[0])', 1654 '&vpxor ($t2,$t2,$t3);', 1655 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1656 '&vpxor ($t2,$t2,$t3)', 1657 '&vpshufb ($t2,$t2,$t5)', 1658 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1659 ); 1660} 1661 1662sub AVX_256_00_47 () { 1663my $j = shift; 1664my $body = shift; 1665my @X = @_; 
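	# @insns below collects four rounds' worth of scalar body_00_15 code;
	# the loop that follows interleaves roughly three of those scalar
	# instructions after every SIMD instruction of Xupdate_256_AVX, so the
	# integer rounds and the message-schedule vector work can overlap.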
1666my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1667 1668 foreach (Xupdate_256_AVX()) { # 29 instructions 1669 eval; 1670 eval(shift(@insns)); 1671 eval(shift(@insns)); 1672 eval(shift(@insns)); 1673 } 1674 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1675 foreach (@insns) { eval; } # remaining instructions 1676 &vmovdqa (16*$j."(%rsp)",$t2); 1677} 1678 1679 for ($i=0,$j=0; $j<4; $j++) { 1680 &AVX_256_00_47($j,\&body_00_15,@X); 1681 push(@X,shift(@X)); # rotate(@X) 1682 } 1683 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1684 &jne (".Lavx_00_47"); 1685 1686 for ($i=0; $i<16; ) { 1687 foreach(body_00_15()) { eval; } 1688 } 1689 1690 } else { # SHA512 1691 my @X = map("%xmm$_",(0..7)); 1692 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1693 1694$code.=<<___; 1695 jmp .Lloop_avx 1696.align 16 1697.Lloop_avx: 1698 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1699 vmovdqu 0x00($inp),@X[0] 1700 lea $TABLE+0x80(%rip),$Tbl # size optimization 1701 vmovdqu 0x10($inp),@X[1] 1702 vmovdqu 0x20($inp),@X[2] 1703 vpshufb $t3,@X[0],@X[0] 1704 vmovdqu 0x30($inp),@X[3] 1705 vpshufb $t3,@X[1],@X[1] 1706 vmovdqu 0x40($inp),@X[4] 1707 vpshufb $t3,@X[2],@X[2] 1708 vmovdqu 0x50($inp),@X[5] 1709 vpshufb $t3,@X[3],@X[3] 1710 vmovdqu 0x60($inp),@X[6] 1711 vpshufb $t3,@X[4],@X[4] 1712 vmovdqu 0x70($inp),@X[7] 1713 vpshufb $t3,@X[5],@X[5] 1714 vpaddq -0x80($Tbl),@X[0],$t0 1715 vpshufb $t3,@X[6],@X[6] 1716 vpaddq -0x60($Tbl),@X[1],$t1 1717 vpshufb $t3,@X[7],@X[7] 1718 vpaddq -0x40($Tbl),@X[2],$t2 1719 vpaddq -0x20($Tbl),@X[3],$t3 1720 vmovdqa $t0,0x00(%rsp) 1721 vpaddq 0x00($Tbl),@X[4],$t0 1722 vmovdqa $t1,0x10(%rsp) 1723 vpaddq 0x20($Tbl),@X[5],$t1 1724 vmovdqa $t2,0x20(%rsp) 1725 vpaddq 0x40($Tbl),@X[6],$t2 1726 vmovdqa $t3,0x30(%rsp) 1727 vpaddq 0x60($Tbl),@X[7],$t3 1728 vmovdqa $t0,0x40(%rsp) 1729 mov $A,$a1 1730 vmovdqa $t1,0x50(%rsp) 1731 mov $B,$a3 1732 vmovdqa $t2,0x60(%rsp) 1733 xor $C,$a3 # magic 1734 vmovdqa $t3,0x70(%rsp) 1735 mov $E,$a0 1736 jmp .Lavx_00_47 1737 1738.align 16 1739.Lavx_00_47: 1740 add \$`16*2*$SZ`,$Tbl 1741___ 1742sub Xupdate_512_AVX () { 1743 ( 1744 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1745 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1746 '&vpsrlq ($t2,$t0,$sigma0[0])', 1747 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1748 '&vpsrlq ($t3,$t0,$sigma0[2])', 1749 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1750 '&vpxor ($t0,$t3,$t2)', 1751 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1752 '&vpxor ($t0,$t0,$t1)', 1753 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1754 '&vpxor ($t0,$t0,$t2)', 1755 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1756 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1757 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1758 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1759 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1760 '&vpxor ($t3,$t3,$t2)', 1761 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1762 '&vpxor ($t3,$t3,$t1)', 1763 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1764 '&vpxor ($t3,$t3,$t2)', 1765 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1766 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1767 ); 1768} 1769 1770sub AVX_512_00_47 () { 1771my $j = shift; 1772my $body = shift; 1773my @X = @_; 1774my @insns = (&$body,&$body); # 52 instructions 1775 1776 foreach (Xupdate_512_AVX()) { # 23 instructions 1777 eval; 1778 eval(shift(@insns)); 1779 eval(shift(@insns)); 1780 } 1781 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1782 foreach (@insns) { eval; } # remaining instructions 1783 &vmovdqa (16*$j."(%rsp)",$t2); 1784} 1785 1786 for ($i=0,$j=0; $j<8; $j++) { 1787 
&AVX_512_00_47($j,\&body_00_15,@X); 1788 push(@X,shift(@X)); # rotate(@X) 1789 } 1790 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1791 &jne (".Lavx_00_47"); 1792 1793 for ($i=0; $i<16; ) { 1794 foreach(body_00_15()) { eval; } 1795 } 1796} 1797$code.=<<___; 1798 mov $_ctx,$ctx 1799 mov $a1,$A 1800 1801 add $SZ*0($ctx),$A 1802 lea 16*$SZ($inp),$inp 1803 add $SZ*1($ctx),$B 1804 add $SZ*2($ctx),$C 1805 add $SZ*3($ctx),$D 1806 add $SZ*4($ctx),$E 1807 add $SZ*5($ctx),$F 1808 add $SZ*6($ctx),$G 1809 add $SZ*7($ctx),$H 1810 1811 cmp $_end,$inp 1812 1813 mov $A,$SZ*0($ctx) 1814 mov $B,$SZ*1($ctx) 1815 mov $C,$SZ*2($ctx) 1816 mov $D,$SZ*3($ctx) 1817 mov $E,$SZ*4($ctx) 1818 mov $F,$SZ*5($ctx) 1819 mov $G,$SZ*6($ctx) 1820 mov $H,$SZ*7($ctx) 1821 jb .Lloop_avx 1822 1823 mov $_rsp,%rsi 1824.cfi_def_cfa %rsi,8 1825 vzeroupper 1826___ 1827$code.=<<___ if ($win64); 1828 movaps 16*$SZ+32(%rsp),%xmm6 1829 movaps 16*$SZ+48(%rsp),%xmm7 1830 movaps 16*$SZ+64(%rsp),%xmm8 1831 movaps 16*$SZ+80(%rsp),%xmm9 1832___ 1833$code.=<<___ if ($win64 && $SZ>4); 1834 movaps 16*$SZ+96(%rsp),%xmm10 1835 movaps 16*$SZ+112(%rsp),%xmm11 1836___ 1837$code.=<<___; 1838 mov -48(%rsi),%r15 1839.cfi_restore %r15 1840 mov -40(%rsi),%r14 1841.cfi_restore %r14 1842 mov -32(%rsi),%r13 1843.cfi_restore %r13 1844 mov -24(%rsi),%r12 1845.cfi_restore %r12 1846 mov -16(%rsi),%rbp 1847.cfi_restore %rbp 1848 mov -8(%rsi),%rbx 1849.cfi_restore %rbx 1850 lea (%rsi),%rsp 1851.cfi_def_cfa_register %rsp 1852.Lepilogue_avx: 1853 ret 1854.cfi_endproc 1855.size ${func}_avx,.-${func}_avx 1856___ 1857 1858if ($avx>1) {{ 1859###################################################################### 1860# AVX2+BMI code path 1861# 1862my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1863my $PUSH8=8*2*$SZ; 1864use integer; 1865 1866sub bodyx_00_15 () { 1867 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1868 ( 1869 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1870 1871 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1872 '&and ($a4,$e)', # f&e 1873 '&rorx ($a0,$e,$Sigma1[2])', 1874 '&rorx ($a2,$e,$Sigma1[1])', 1875 1876 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1877 '&lea ($h,"($h,$a4)")', 1878 '&andn ($a4,$e,$g)', # ~e&g 1879 '&xor ($a0,$a2)', 1880 1881 '&rorx ($a1,$e,$Sigma1[0])', 1882 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1883 '&xor ($a0,$a1)', # Sigma1(e) 1884 '&mov ($a2,$a)', 1885 1886 '&rorx ($a4,$a,$Sigma0[2])', 1887 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1888 '&xor ($a2,$b)', # a^b, b^c in next round 1889 '&rorx ($a1,$a,$Sigma0[1])', 1890 1891 '&rorx ($a0,$a,$Sigma0[0])', 1892 '&lea ($d,"($d,$h)")', # d+=h 1893 '&and ($a3,$a2)', # (b^c)&(a^b) 1894 '&xor ($a1,$a4)', 1895 1896 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1897 '&xor ($a1,$a0)', # Sigma0(a) 1898 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1899 '&mov ($a4,$e)', # copy of f in future 1900 1901 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1902 ); 1903 # and at the finish one has to $a+=$a1 1904} 1905 1906$code.=<<___; 1907.type ${func}_avx2,\@function,3 1908.align 64 1909${func}_avx2: 1910.cfi_startproc 1911.Lavx2_shortcut: 1912 mov %rsp,%rax # copy %rsp 1913.cfi_def_cfa_register %rax 1914 push %rbx 1915.cfi_push %rbx 1916 push %rbp 1917.cfi_push %rbp 1918 push %r12 1919.cfi_push %r12 1920 push %r13 1921.cfi_push %r13 1922 push %r14 1923.cfi_push %r14 1924 push %r15 1925.cfi_push %r15 1926 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1927 shl \$4,%rdx # num*16 1928 and \$-256*$SZ,%rsp # align stack frame 1929 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1930 add \$`2*$SZ*($rounds-8)`,%rsp 1931 mov $ctx,$_ctx # save ctx, 1st arg 1932 mov $inp,$_inp # save inp, 2nd arh 1933 mov %rdx,$_end # save end pointer, "3rd" arg 1934 mov %rax,$_rsp # save copy of %rsp 1935.cfi_cfa_expression $_rsp,deref,+8 1936___ 1937$code.=<<___ if ($win64); 1938 movaps %xmm6,16*$SZ+32(%rsp) 1939 movaps %xmm7,16*$SZ+48(%rsp) 1940 movaps %xmm8,16*$SZ+64(%rsp) 1941 movaps %xmm9,16*$SZ+80(%rsp) 1942___ 1943$code.=<<___ if ($win64 && $SZ>4); 1944 movaps %xmm10,16*$SZ+96(%rsp) 1945 movaps %xmm11,16*$SZ+112(%rsp) 1946___ 1947$code.=<<___; 1948.Lprologue_avx2: 1949 1950 vzeroupper 1951 sub \$-16*$SZ,$inp # inp++, size optimization 1952 mov $SZ*0($ctx),$A 1953 mov $inp,%r12 # borrow $T1 1954 mov $SZ*1($ctx),$B 1955 cmp %rdx,$inp # $_end 1956 mov $SZ*2($ctx),$C 1957 cmove %rsp,%r12 # next block or random data 1958 mov $SZ*3($ctx),$D 1959 mov $SZ*4($ctx),$E 1960 mov $SZ*5($ctx),$F 1961 mov $SZ*6($ctx),$G 1962 mov $SZ*7($ctx),$H 1963___ 1964 if ($SZ==4) { # SHA256 1965 my @X = map("%ymm$_",(0..3)); 1966 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1967 1968$code.=<<___; 1969 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1970 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1971 jmp .Loop_avx2 1972.align 16 1973.Loop_avx2: 1974 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1975 vmovdqu -16*$SZ+0($inp),%xmm0 1976 vmovdqu -16*$SZ+16($inp),%xmm1 1977 vmovdqu -16*$SZ+32($inp),%xmm2 1978 vmovdqu -16*$SZ+48($inp),%xmm3 1979 #mov $inp,$_inp # offload $inp 1980 vinserti128 \$1,(%r12),@X[0],@X[0] 1981 vinserti128 \$1,16(%r12),@X[1],@X[1] 1982 vpshufb $t3,@X[0],@X[0] 1983 vinserti128 \$1,32(%r12),@X[2],@X[2] 1984 vpshufb $t3,@X[1],@X[1] 1985 vinserti128 \$1,48(%r12),@X[3],@X[3] 1986 1987 lea $TABLE(%rip),$Tbl 1988 vpshufb $t3,@X[2],@X[2] 1989 vpaddd 0x00($Tbl),@X[0],$t0 1990 vpshufb $t3,@X[3],@X[3] 1991 vpaddd 0x20($Tbl),@X[1],$t1 1992 vpaddd 0x40($Tbl),@X[2],$t2 1993 vpaddd 0x60($Tbl),@X[3],$t3 1994 vmovdqa $t0,0x00(%rsp) 1995 xor $a1,$a1 1996 vmovdqa $t1,0x20(%rsp) 1997___ 1998$code.=<<___ if (!$win64); 1999# temporarily use %rdi as frame pointer 2000 mov $_rsp,%rdi 2001.cfi_def_cfa %rdi,8 2002___ 2003$code.=<<___; 2004 lea -$PUSH8(%rsp),%rsp 2005___ 2006$code.=<<___ if (!$win64); 2007# the frame info is at $_rsp, but the stack is moving... 
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	mov	$_rsp,%rdi
.cfi_def_cfa	%rdi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___
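# Note (added commentary): AVX2_512_00_47() below is the SHA512 counterpart of
# AVX2_256_00_47(): only two bodyx_00_15 rounds (48 instructions) are woven
# into each 23-instruction Xupdate_512_AVX pass, and the %rsp slide happens on
# every fourth call instead of every other one. $Tbl was pre-biased by 0x80
# above, presumably so that the K512 offsets stay within disp8 range (the
# "size optimization" comment), which is why the vpaddq offsets here run from
# -0x80 upwards.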
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%4)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
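# Note (added commentary): the tail below first folds the freshly computed
# state of one of the two interleaved blocks back into $ctx, then, unless the
# end of the input was reached, the .Lower_avx2 loop replays all rounds for
# the other block directly from the X[i]+K[i] values already stashed on the
# stack, reading at "+16($Tbl)", i.e. the upper 128-bit halves of each 32-byte
# slot, while $Tbl walks back down the frame until it meets %rsp.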
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression	$_rsp,deref,+8

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8

.Ldone_avx2:
	mov	`16*$SZ+3*8`($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32($Tbl),%xmm6
	movaps	16*$SZ+48($Tbl),%xmm7
	movaps	16*$SZ+64($Tbl),%xmm8
	movaps	16*$SZ+80($Tbl),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96($Tbl),%xmm10
	movaps	16*$SZ+112($Tbl),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

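# Note (added commentary): se_handler below follows the usual perlasm SEH
# convention: HandlerData[] carries the .rva pair of the prologue and epilogue
# labels (see the .xdata entries further down). Between those labels it pulls
# the caller's stack pointer out of $_rsp, restores the six pushed GPRs from
# just below it, and for the SIMD code paths also copies the saved %xmm6+
# area back into the CONTEXT record before handing off to RtlVirtualUnwind.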
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
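# Note (added commentary): sha256op38() lets the module build with assemblers
# that predate the SHA extensions by hand-encoding the three sha256*
# instructions as raw bytes (0F 38 CB/CC/CD with a register-register ModR/M).
# For example, "sha256rnds2 %xmm0,%xmm1" would come out as
# ".byte 0x0f,0x38,0xcb,0xc8". The output loop below expands the backticked
# expressions with eval and routes any sha256* mnemonic through this helper.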

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";