1#! /usr/bin/env perl 2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. Rights for redistribution and usage in source and binary 13# forms are granted according to the OpenSSL license. 14# ==================================================================== 15# 16# sha256/512_block procedure for x86_64. 17# 18# 40% improvement over compiler-generated code on Opteron. On EM64T 19# sha256 was observed to run >80% faster and sha512 - >40%. No magical 20# tricks, just straight implementation... I really wonder why gcc 21# [being armed with inline assembler] fails to generate as fast code. 22# The only thing which is cool about this module is that it's very 23# same instruction sequence used for both SHA-256 and SHA-512. In 24# former case the instructions operate on 32-bit operands, while in 25# latter - on 64-bit ones. All I had to do is to get one flavor right, 26# the other one passed the test right away:-) 27# 28# sha256_block runs in ~1005 cycles on Opteron, which gives you 29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock 30# frequency in GHz. sha512_block runs in ~1275 cycles, which results 31# in 128*1000/1275=100MBps per GHz. Is there room for improvement? 32# Well, if you compare it to IA-64 implementation, which maintains 33# X[16] in register bank[!], tends to 4 instructions per CPU clock 34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way 35# issue Opteron pipeline and X[16] maintained in memory. So that *if* 36# there is a way to improve it, *then* the only way would be to try to 37# offload X[16] updates to SSE unit, but that would require "deeper" 38# loop unroll, which in turn would naturally cause size blow-up, not 39# to mention increased complexity! And once again, only *if* it's 40# actually possible to noticeably improve overall ILP, instruction 41# level parallelism, on a given CPU implementation in this case. 42# 43# Special note on Intel EM64T. While Opteron CPU exhibits perfect 44# performance ratio of 1.5 between 64- and 32-bit flavors [see above], 45# [currently available] EM64T CPUs apparently are far from it. On the 46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit 47# sha256_block:-( This is presumably because 64-bit shifts/rotates 48# apparently are not atomic instructions, but implemented in microcode. 49# 50# May 2012. 51# 52# Optimization including one of Pavel Semjanov's ideas, alternative 53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and 54# unfortunately -2% SHA512 on P4 [which nobody should care about 55# that much]. 56# 57# June 2012. 58# 59# Add SIMD code paths, see below for improvement coefficients. SSSE3 60# code path was not attempted for SHA512, because improvement is not 61# estimated to be high enough, noticeably less than 9%, to justify 62# the effort, not on pre-AVX processors. [Obviously with exclusion 63# for VIA Nano, but it has SHA512 instruction that is faster and 64# should be used instead.] For reference, corresponding estimated 65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that 66# higher coefficients are observed on VIA Nano and Bulldozer has more 67# to do with specifics of their architecture [which is topic for 68# separate discussion]. 69# 70# November 2012. 71# 72# Add AVX2 code path. Two consecutive input blocks are loaded to 73# 256-bit %ymm registers, with data from first block to least 74# significant 128-bit halves and data from second to most significant. 75# The data is then processed with same SIMD instruction sequence as 76# for AVX, but with %ymm as operands. Side effect is increased stack 77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB 78# code size increase. 79# 80# March 2014. 81# 82# Add support for Intel SHA Extensions. 83 84###################################################################### 85# Current performance in cycles per processed byte (less is better): 86# 87# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) 88# 89# AMD K8 14.9 - - 9.57 - 90# P4 17.3 - - 30.8 - 91# Core 2 15.6 13.8(+13%) - 9.97 - 92# Westmere 14.8 12.3(+19%) - 9.58 - 93# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) 94# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) 95# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) 96# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) 97# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) 98# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) 99# VIA Nano 23.0 16.5(+39%) - 14.7 - 100# Atom 23.0 18.9(+22%) - 14.7 - 101# Silvermont 27.4 20.6(+33%) - 17.5 - 102# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) 103# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - 104# 105# (*) whichever best applicable, including SHAEXT; 106# (**) switch from ror to shrd stands for fair share of improvement; 107# (***) execution time is fully determined by remaining integer-only 108# part, body_00_15; reducing the amount of SIMD instructions 109# below certain limit makes no difference/sense; to conserve 110# space SHA256 XOP code path is therefore omitted; 111 112$flavour = shift; 113$output = shift; 114if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 115 116$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 117 118$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 119( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 120( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 121die "can't locate x86_64-xlate.pl"; 122 123if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 124 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 125 $avx = ($1>=2.19) + ($1>=2.22); 126} 127 128if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 129 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 130 $avx = ($1>=2.09) + ($1>=2.10); 131} 132 133if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 134 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 135 $avx = ($1>=10) + ($1>=11); 136} 137 138if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 139 $avx = ($2>=3.0) + ($2>3.0); 140} 141 142$shaext=1; ### set to zero if compiling for 1.0.1 143$avx=1 if (!$shaext && $avx); 144 145open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 146*STDOUT=*OUT; 147 148if ($output =~ /512/) { 149 $func="sha512_block_data_order"; 150 $TABLE="K512"; 151 $SZ=8; 152 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", 153 "%r8", "%r9", "%r10","%r11"); 154 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); 155 @Sigma0=(28,34,39); 156 @Sigma1=(14,18,41); 157 @sigma0=(1, 8, 7); 158 @sigma1=(19,61, 6); 159 $rounds=80; 160} else { 161 $func="sha256_block_data_order"; 162 $TABLE="K256"; 163 $SZ=4; 164 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 165 "%r8d","%r9d","%r10d","%r11d"); 166 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 167 @Sigma0=( 2,13,22); 168 @Sigma1=( 6,11,25); 169 @sigma0=( 7,18, 3); 170 @sigma1=(17,19,10); 171 $rounds=64; 172} 173 174$ctx="%rdi"; # 1st arg, zapped by $a3 175$inp="%rsi"; # 2nd arg 176$Tbl="%rbp"; 177 178$_ctx="16*$SZ+0*8(%rsp)"; 179$_inp="16*$SZ+1*8(%rsp)"; 180$_end="16*$SZ+2*8(%rsp)"; 181$_rsp="`16*$SZ+3*8`(%rsp)"; 182$framesz="16*$SZ+4*8"; 183 184 185sub ROUND_00_15() 186{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 187 my $STRIDE=$SZ; 188 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 189 190$code.=<<___; 191 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 192 mov $f,$a2 193 194 xor $e,$a0 195 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 196 xor $g,$a2 # f^g 197 198 mov $T1,`$SZ*($i&0xf)`(%rsp) 199 xor $a,$a1 200 and $e,$a2 # (f^g)&e 201 202 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 203 add $h,$T1 # T1+=h 204 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 205 206 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 207 xor $e,$a0 208 add $a2,$T1 # T1+=Ch(e,f,g) 209 210 mov $a,$a2 211 add ($Tbl),$T1 # T1+=K[round] 212 xor $a,$a1 213 214 xor $b,$a2 # a^b, b^c in next round 215 ror \$$Sigma1[0],$a0 # Sigma1(e) 216 mov $b,$h 217 218 and $a2,$a3 219 ror \$$Sigma0[0],$a1 # Sigma0(a) 220 add $a0,$T1 # T1+=Sigma1(e) 221 222 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 223 add $T1,$d # d+=T1 224 add $T1,$h # h+=T1 225 226 lea $STRIDE($Tbl),$Tbl # round++ 227___ 228$code.=<<___ if ($i<15); 229 add $a1,$h # h+=Sigma0(a) 230___ 231 ($a2,$a3) = ($a3,$a2); 232} 233 234sub ROUND_16_XX() 235{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 236 237$code.=<<___; 238 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 239 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 240 241 mov $a0,$T1 242 ror \$`$sigma0[1]-$sigma0[0]`,$a0 243 add $a1,$a # modulo-scheduled h+=Sigma0(a) 244 mov $a2,$a1 245 ror \$`$sigma1[1]-$sigma1[0]`,$a2 246 247 xor $T1,$a0 248 shr \$$sigma0[2],$T1 249 ror \$$sigma0[0],$a0 250 xor $a1,$a2 251 shr \$$sigma1[2],$a1 252 253 ror \$$sigma1[0],$a2 254 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 255 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 256 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 257 258 add `$SZ*($i&0xf)`(%rsp),$T1 259 mov $e,$a0 260 add $a2,$T1 261 mov $a,$a1 262___ 263 &ROUND_00_15(@_); 264} 265 266$code=<<___; 267.text 268 269.extern OPENSSL_ia32cap_P 270.globl $func 271.type $func,\@function,3 272.align 16 273$func: 274.cfi_startproc 275___ 276$code.=<<___ if ($SZ==4 || $avx); 277 lea OPENSSL_ia32cap_P(%rip),%r11 278 mov 0(%r11),%r9d 279 mov 4(%r11),%r10d 280 mov 8(%r11),%r11d 281___ 282$code.=<<___ if ($SZ==4 && $shaext); 283 test \$`1<<29`,%r11d # check for SHA 284 jnz _shaext_shortcut 285___ 286$code.=<<___ if ($avx && $SZ==8); 287 test \$`1<<11`,%r10d # check for XOP 288 jnz .Lxop_shortcut 289___ 290$code.=<<___ if ($avx>1); 291 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 292 cmp \$`1<<8|1<<5|1<<3`,%r11d 293 je .Lavx2_shortcut 294___ 295$code.=<<___ if ($avx); 296 and \$`1<<30`,%r9d # mask "Intel CPU" bit 297 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 298 or %r9d,%r10d 299 cmp \$`1<<28|1<<9|1<<30`,%r10d 300 je .Lavx_shortcut 301___ 302$code.=<<___ if ($SZ==4); 303 test \$`1<<9`,%r10d 304 jnz .Lssse3_shortcut 305___ 306$code.=<<___; 307 mov %rsp,%rax # copy %rsp 308.cfi_def_cfa_register %rax 309 push %rbx 310.cfi_push %rbx 311 push %rbp 312.cfi_push %rbp 313 push %r12 314.cfi_push %r12 315 push %r13 316.cfi_push %r13 317 push %r14 318.cfi_push %r14 319 push %r15 320.cfi_push %r15 321 shl \$4,%rdx # num*16 322 sub \$$framesz,%rsp 323 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 324 and \$-64,%rsp # align stack frame 325 mov $ctx,$_ctx # save ctx, 1st arg 326 mov $inp,$_inp # save inp, 2nd arh 327 mov %rdx,$_end # save end pointer, "3rd" arg 328 mov %rax,$_rsp # save copy of %rsp 329.cfi_cfa_expression $_rsp,deref,+8 330.Lprologue: 331 332 mov $SZ*0($ctx),$A 333 mov $SZ*1($ctx),$B 334 mov $SZ*2($ctx),$C 335 mov $SZ*3($ctx),$D 336 mov $SZ*4($ctx),$E 337 mov $SZ*5($ctx),$F 338 mov $SZ*6($ctx),$G 339 mov $SZ*7($ctx),$H 340 jmp .Lloop 341 342.align 16 343.Lloop: 344 mov $B,$a3 345 lea $TABLE(%rip),$Tbl 346 xor $C,$a3 # magic 347___ 348 for($i=0;$i<16;$i++) { 349 $code.=" mov $SZ*$i($inp),$T1\n"; 350 $code.=" mov @ROT[4],$a0\n"; 351 $code.=" mov @ROT[0],$a1\n"; 352 $code.=" bswap $T1\n"; 353 &ROUND_00_15($i,@ROT); 354 unshift(@ROT,pop(@ROT)); 355 } 356$code.=<<___; 357 jmp .Lrounds_16_xx 358.align 16 359.Lrounds_16_xx: 360___ 361 for(;$i<32;$i++) { 362 &ROUND_16_XX($i,@ROT); 363 unshift(@ROT,pop(@ROT)); 364 } 365 366$code.=<<___; 367 cmpb \$0,`$SZ-1`($Tbl) 368 jnz .Lrounds_16_xx 369 370 mov $_ctx,$ctx 371 add $a1,$A # modulo-scheduled h+=Sigma0(a) 372 lea 16*$SZ($inp),$inp 373 374 add $SZ*0($ctx),$A 375 add $SZ*1($ctx),$B 376 add $SZ*2($ctx),$C 377 add $SZ*3($ctx),$D 378 add $SZ*4($ctx),$E 379 add $SZ*5($ctx),$F 380 add $SZ*6($ctx),$G 381 add $SZ*7($ctx),$H 382 383 cmp $_end,$inp 384 385 mov $A,$SZ*0($ctx) 386 mov $B,$SZ*1($ctx) 387 mov $C,$SZ*2($ctx) 388 mov $D,$SZ*3($ctx) 389 mov $E,$SZ*4($ctx) 390 mov $F,$SZ*5($ctx) 391 mov $G,$SZ*6($ctx) 392 mov $H,$SZ*7($ctx) 393 jb .Lloop 394 395 mov $_rsp,%rsi 396.cfi_def_cfa %rsi,8 397 mov -48(%rsi),%r15 398.cfi_restore %r15 399 mov -40(%rsi),%r14 400.cfi_restore %r14 401 mov -32(%rsi),%r13 402.cfi_restore %r13 403 mov -24(%rsi),%r12 404.cfi_restore %r12 405 mov -16(%rsi),%rbp 406.cfi_restore %rbp 407 mov -8(%rsi),%rbx 408.cfi_restore %rbx 409 lea (%rsi),%rsp 410.cfi_def_cfa_register %rsp 411.Lepilogue: 412 ret 413.cfi_endproc 414.size $func,.-$func 415___ 416 417if ($SZ==4) { 418$code.=<<___; 419.align 64 420.type $TABLE,\@object 421$TABLE: 422 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 423 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 424 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 425 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 427 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 428 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 429 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 430 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 431 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 432 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 433 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 434 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 435 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 436 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 437 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 438 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 439 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 440 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 441 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 442 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 443 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 444 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 445 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 446 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 447 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 448 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 449 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 450 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 451 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 452 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 453 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 454 455 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 456 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 457 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 458 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 459 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 460 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 461 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 462___ 463} else { 464$code.=<<___; 465.align 64 466.type $TABLE,\@object 467$TABLE: 468 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 469 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 470 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 471 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 472 .quad 0x3956c25bf348b538,0x59f111f1b605d019 473 .quad 0x3956c25bf348b538,0x59f111f1b605d019 474 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 475 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 476 .quad 0xd807aa98a3030242,0x12835b0145706fbe 477 .quad 0xd807aa98a3030242,0x12835b0145706fbe 478 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 479 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 480 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 481 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 482 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 483 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 484 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 485 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 486 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 487 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 488 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 489 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 490 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 491 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 492 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 493 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 494 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 495 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 496 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 497 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 498 .quad 0x06ca6351e003826f,0x142929670a0e6e70 499 .quad 0x06ca6351e003826f,0x142929670a0e6e70 500 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 501 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 502 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 503 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 504 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 505 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 506 .quad 0x81c2c92e47edaee6,0x92722c851482353b 507 .quad 0x81c2c92e47edaee6,0x92722c851482353b 508 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 509 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 510 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 511 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 512 .quad 0xd192e819d6ef5218,0xd69906245565a910 513 .quad 0xd192e819d6ef5218,0xd69906245565a910 514 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 515 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 516 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 517 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 518 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 519 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 520 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 521 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 522 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 523 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 524 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 525 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 526 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 527 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 528 .quad 0x90befffa23631e28,0xa4506cebde82bde9 529 .quad 0x90befffa23631e28,0xa4506cebde82bde9 530 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 531 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 532 .quad 0xca273eceea26619c,0xd186b8c721c0c207 533 .quad 0xca273eceea26619c,0xd186b8c721c0c207 534 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 535 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 536 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 537 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 538 .quad 0x113f9804bef90dae,0x1b710b35131c471b 539 .quad 0x113f9804bef90dae,0x1b710b35131c471b 540 .quad 0x28db77f523047d84,0x32caab7b40c72493 541 .quad 0x28db77f523047d84,0x32caab7b40c72493 542 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 543 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 544 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 545 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 546 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 547 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 548 549 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 550 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 551 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 552___ 553} 554 555###################################################################### 556# SIMD code paths 557# 558if ($SZ==4 && $shaext) {{{ 559###################################################################### 560# Intel SHA Extensions implementation of SHA256 update function. 561# 562my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 563 564my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 565my @MSG=map("%xmm$_",(3..6)); 566 567$code.=<<___; 568.type sha256_block_data_order_shaext,\@function,3 569.align 64 570sha256_block_data_order_shaext: 571_shaext_shortcut: 572___ 573$code.=<<___ if ($win64); 574 lea `-8-5*16`(%rsp),%rsp 575 movaps %xmm6,-8-5*16(%rax) 576 movaps %xmm7,-8-4*16(%rax) 577 movaps %xmm8,-8-3*16(%rax) 578 movaps %xmm9,-8-2*16(%rax) 579 movaps %xmm10,-8-1*16(%rax) 580.Lprologue_shaext: 581___ 582$code.=<<___; 583 lea K256+0x80(%rip),$Tbl 584 movdqu ($ctx),$ABEF # DCBA 585 movdqu 16($ctx),$CDGH # HGFE 586 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 587 588 pshufd \$0x1b,$ABEF,$Wi # ABCD 589 pshufd \$0xb1,$ABEF,$ABEF # CDAB 590 pshufd \$0x1b,$CDGH,$CDGH # EFGH 591 movdqa $TMP,$BSWAP # offload 592 palignr \$8,$CDGH,$ABEF # ABEF 593 punpcklqdq $Wi,$CDGH # CDGH 594 jmp .Loop_shaext 595 596.align 16 597.Loop_shaext: 598 movdqu ($inp),@MSG[0] 599 movdqu 0x10($inp),@MSG[1] 600 movdqu 0x20($inp),@MSG[2] 601 pshufb $TMP,@MSG[0] 602 movdqu 0x30($inp),@MSG[3] 603 604 movdqa 0*32-0x80($Tbl),$Wi 605 paddd @MSG[0],$Wi 606 pshufb $TMP,@MSG[1] 607 movdqa $CDGH,$CDGH_SAVE # offload 608 sha256rnds2 $ABEF,$CDGH # 0-3 609 pshufd \$0x0e,$Wi,$Wi 610 nop 611 movdqa $ABEF,$ABEF_SAVE # offload 612 sha256rnds2 $CDGH,$ABEF 613 614 movdqa 1*32-0x80($Tbl),$Wi 615 paddd @MSG[1],$Wi 616 pshufb $TMP,@MSG[2] 617 sha256rnds2 $ABEF,$CDGH # 4-7 618 pshufd \$0x0e,$Wi,$Wi 619 lea 0x40($inp),$inp 620 sha256msg1 @MSG[1],@MSG[0] 621 sha256rnds2 $CDGH,$ABEF 622 623 movdqa 2*32-0x80($Tbl),$Wi 624 paddd @MSG[2],$Wi 625 pshufb $TMP,@MSG[3] 626 sha256rnds2 $ABEF,$CDGH # 8-11 627 pshufd \$0x0e,$Wi,$Wi 628 movdqa @MSG[3],$TMP 629 palignr \$4,@MSG[2],$TMP 630 nop 631 paddd $TMP,@MSG[0] 632 sha256msg1 @MSG[2],@MSG[1] 633 sha256rnds2 $CDGH,$ABEF 634 635 movdqa 3*32-0x80($Tbl),$Wi 636 paddd @MSG[3],$Wi 637 sha256msg2 @MSG[3],@MSG[0] 638 sha256rnds2 $ABEF,$CDGH # 12-15 639 pshufd \$0x0e,$Wi,$Wi 640 movdqa @MSG[0],$TMP 641 palignr \$4,@MSG[3],$TMP 642 nop 643 paddd $TMP,@MSG[1] 644 sha256msg1 @MSG[3],@MSG[2] 645 sha256rnds2 $CDGH,$ABEF 646___ 647for($i=4;$i<16-3;$i++) { 648$code.=<<___; 649 movdqa $i*32-0x80($Tbl),$Wi 650 paddd @MSG[0],$Wi 651 sha256msg2 @MSG[0],@MSG[1] 652 sha256rnds2 $ABEF,$CDGH # 16-19... 653 pshufd \$0x0e,$Wi,$Wi 654 movdqa @MSG[1],$TMP 655 palignr \$4,@MSG[0],$TMP 656 nop 657 paddd $TMP,@MSG[2] 658 sha256msg1 @MSG[0],@MSG[3] 659 sha256rnds2 $CDGH,$ABEF 660___ 661 push(@MSG,shift(@MSG)); 662} 663$code.=<<___; 664 movdqa 13*32-0x80($Tbl),$Wi 665 paddd @MSG[0],$Wi 666 sha256msg2 @MSG[0],@MSG[1] 667 sha256rnds2 $ABEF,$CDGH # 52-55 668 pshufd \$0x0e,$Wi,$Wi 669 movdqa @MSG[1],$TMP 670 palignr \$4,@MSG[0],$TMP 671 sha256rnds2 $CDGH,$ABEF 672 paddd $TMP,@MSG[2] 673 674 movdqa 14*32-0x80($Tbl),$Wi 675 paddd @MSG[1],$Wi 676 sha256rnds2 $ABEF,$CDGH # 56-59 677 pshufd \$0x0e,$Wi,$Wi 678 sha256msg2 @MSG[1],@MSG[2] 679 movdqa $BSWAP,$TMP 680 sha256rnds2 $CDGH,$ABEF 681 682 movdqa 15*32-0x80($Tbl),$Wi 683 paddd @MSG[2],$Wi 684 nop 685 sha256rnds2 $ABEF,$CDGH # 60-63 686 pshufd \$0x0e,$Wi,$Wi 687 dec $num 688 nop 689 sha256rnds2 $CDGH,$ABEF 690 691 paddd $CDGH_SAVE,$CDGH 692 paddd $ABEF_SAVE,$ABEF 693 jnz .Loop_shaext 694 695 pshufd \$0xb1,$CDGH,$CDGH # DCHG 696 pshufd \$0x1b,$ABEF,$TMP # FEBA 697 pshufd \$0xb1,$ABEF,$ABEF # BAFE 698 punpckhqdq $CDGH,$ABEF # DCBA 699 palignr \$8,$TMP,$CDGH # HGFE 700 701 movdqu $ABEF,($ctx) 702 movdqu $CDGH,16($ctx) 703___ 704$code.=<<___ if ($win64); 705 movaps -8-5*16(%rax),%xmm6 706 movaps -8-4*16(%rax),%xmm7 707 movaps -8-3*16(%rax),%xmm8 708 movaps -8-2*16(%rax),%xmm9 709 movaps -8-1*16(%rax),%xmm10 710 mov %rax,%rsp 711.Lepilogue_shaext: 712___ 713$code.=<<___; 714 ret 715.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 716___ 717}}} 718{{{ 719 720my $a4=$T1; 721my ($a,$b,$c,$d,$e,$f,$g,$h); 722 723sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 724{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 725 my $arg = pop; 726 $arg = "\$$arg" if ($arg*1 eq $arg); 727 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 728} 729 730sub body_00_15 () { 731 ( 732 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 733 734 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 735 '&mov ($a,$a1)', 736 '&mov ($a4,$f)', 737 738 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 739 '&xor ($a0,$e)', 740 '&xor ($a4,$g)', # f^g 741 742 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 743 '&xor ($a1,$a)', 744 '&and ($a4,$e)', # (f^g)&e 745 746 '&xor ($a0,$e)', 747 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 748 '&mov ($a2,$a)', 749 750 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 751 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 752 '&xor ($a2,$b)', # a^b, b^c in next round 753 754 '&add ($h,$a4)', # h+=Ch(e,f,g) 755 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 756 '&and ($a3,$a2)', # (b^c)&(a^b) 757 758 '&xor ($a1,$a)', 759 '&add ($h,$a0)', # h+=Sigma1(e) 760 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 761 762 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 763 '&add ($d,$h)', # d+=h 764 '&add ($h,$a3)', # h+=Maj(a,b,c) 765 766 '&mov ($a0,$d)', 767 '&add ($a1,$h);'. # h+=Sigma0(a) 768 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 769 ); 770} 771 772###################################################################### 773# SSSE3 code path 774# 775if ($SZ==4) { # SHA256 only 776my @X = map("%xmm$_",(0..3)); 777my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 778 779$code.=<<___; 780.type ${func}_ssse3,\@function,3 781.align 64 782${func}_ssse3: 783.cfi_startproc 784.Lssse3_shortcut: 785 mov %rsp,%rax # copy %rsp 786.cfi_def_cfa_register %rax 787 push %rbx 788.cfi_push %rbx 789 push %rbp 790.cfi_push %rbp 791 push %r12 792.cfi_push %r12 793 push %r13 794.cfi_push %r13 795 push %r14 796.cfi_push %r14 797 push %r15 798.cfi_push %r15 799 shl \$4,%rdx # num*16 800 sub \$`$framesz+$win64*16*4`,%rsp 801 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 802 and \$-64,%rsp # align stack frame 803 mov $ctx,$_ctx # save ctx, 1st arg 804 mov $inp,$_inp # save inp, 2nd arh 805 mov %rdx,$_end # save end pointer, "3rd" arg 806 mov %rax,$_rsp # save copy of %rsp 807.cfi_cfa_expression $_rsp,deref,+8 808___ 809$code.=<<___ if ($win64); 810 movaps %xmm6,16*$SZ+32(%rsp) 811 movaps %xmm7,16*$SZ+48(%rsp) 812 movaps %xmm8,16*$SZ+64(%rsp) 813 movaps %xmm9,16*$SZ+80(%rsp) 814___ 815$code.=<<___; 816.Lprologue_ssse3: 817 818 mov $SZ*0($ctx),$A 819 mov $SZ*1($ctx),$B 820 mov $SZ*2($ctx),$C 821 mov $SZ*3($ctx),$D 822 mov $SZ*4($ctx),$E 823 mov $SZ*5($ctx),$F 824 mov $SZ*6($ctx),$G 825 mov $SZ*7($ctx),$H 826___ 827 828$code.=<<___; 829 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 830 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 831 jmp .Lloop_ssse3 832.align 16 833.Lloop_ssse3: 834 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 835 movdqu 0x00($inp),@X[0] 836 movdqu 0x10($inp),@X[1] 837 movdqu 0x20($inp),@X[2] 838 pshufb $t3,@X[0] 839 movdqu 0x30($inp),@X[3] 840 lea $TABLE(%rip),$Tbl 841 pshufb $t3,@X[1] 842 movdqa 0x00($Tbl),$t0 843 movdqa 0x20($Tbl),$t1 844 pshufb $t3,@X[2] 845 paddd @X[0],$t0 846 movdqa 0x40($Tbl),$t2 847 pshufb $t3,@X[3] 848 movdqa 0x60($Tbl),$t3 849 paddd @X[1],$t1 850 paddd @X[2],$t2 851 paddd @X[3],$t3 852 movdqa $t0,0x00(%rsp) 853 mov $A,$a1 854 movdqa $t1,0x10(%rsp) 855 mov $B,$a3 856 movdqa $t2,0x20(%rsp) 857 xor $C,$a3 # magic 858 movdqa $t3,0x30(%rsp) 859 mov $E,$a0 860 jmp .Lssse3_00_47 861 862.align 16 863.Lssse3_00_47: 864 sub \$`-16*2*$SZ`,$Tbl # size optimization 865___ 866sub Xupdate_256_SSSE3 () { 867 ( 868 '&movdqa ($t0,@X[1]);', 869 '&movdqa ($t3,@X[3])', 870 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 871 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 872 '&movdqa ($t1,$t0)', 873 '&movdqa ($t2,$t0);', 874 '&psrld ($t0,$sigma0[2])', 875 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 876 '&psrld ($t2,$sigma0[0])', 877 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 878 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 879 '&pxor ($t0,$t2)', 880 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 881 '&pxor ($t0,$t1)', 882 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 883 '&pxor ($t0,$t2);', 884 '&movdqa ($t2,$t3)', 885 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 886 '&psrld ($t3,$sigma1[2])', 887 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 888 '&psrlq ($t2,$sigma1[0])', 889 '&pxor ($t3,$t2);', 890 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 891 '&pxor ($t3,$t2)', 892 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 893 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 894 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 895 '&movdqa ($t2,$t3);', 896 '&psrld ($t3,$sigma1[2])', 897 '&psrlq ($t2,$sigma1[0])', 898 '&pxor ($t3,$t2);', 899 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 900 '&pxor ($t3,$t2);', 901 '&movdqa ($t2,16*2*$j."($Tbl)")', 902 '&pshufb ($t3,$t5)', 903 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 904 ); 905} 906 907sub SSSE3_256_00_47 () { 908my $j = shift; 909my $body = shift; 910my @X = @_; 911my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 912 913 if (0) { 914 foreach (Xupdate_256_SSSE3()) { # 36 instructions 915 eval; 916 eval(shift(@insns)); 917 eval(shift(@insns)); 918 eval(shift(@insns)); 919 } 920 } else { # squeeze extra 4% on Westmere and 19% on Atom 921 eval(shift(@insns)); #@ 922 &movdqa ($t0,@X[1]); 923 eval(shift(@insns)); 924 eval(shift(@insns)); 925 &movdqa ($t3,@X[3]); 926 eval(shift(@insns)); #@ 927 eval(shift(@insns)); 928 eval(shift(@insns)); 929 eval(shift(@insns)); #@ 930 eval(shift(@insns)); 931 &palignr ($t0,@X[0],$SZ); # X[1..4] 932 eval(shift(@insns)); 933 eval(shift(@insns)); 934 &palignr ($t3,@X[2],$SZ); # X[9..12] 935 eval(shift(@insns)); 936 eval(shift(@insns)); 937 eval(shift(@insns)); 938 eval(shift(@insns)); #@ 939 &movdqa ($t1,$t0); 940 eval(shift(@insns)); 941 eval(shift(@insns)); 942 &movdqa ($t2,$t0); 943 eval(shift(@insns)); #@ 944 eval(shift(@insns)); 945 &psrld ($t0,$sigma0[2]); 946 eval(shift(@insns)); 947 eval(shift(@insns)); 948 eval(shift(@insns)); 949 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 950 eval(shift(@insns)); #@ 951 eval(shift(@insns)); 952 &psrld ($t2,$sigma0[0]); 953 eval(shift(@insns)); 954 eval(shift(@insns)); 955 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 956 eval(shift(@insns)); 957 eval(shift(@insns)); #@ 958 &pslld ($t1,8*$SZ-$sigma0[1]); 959 eval(shift(@insns)); 960 eval(shift(@insns)); 961 &pxor ($t0,$t2); 962 eval(shift(@insns)); #@ 963 eval(shift(@insns)); 964 eval(shift(@insns)); 965 eval(shift(@insns)); #@ 966 &psrld ($t2,$sigma0[1]-$sigma0[0]); 967 eval(shift(@insns)); 968 &pxor ($t0,$t1); 969 eval(shift(@insns)); 970 eval(shift(@insns)); 971 &pslld ($t1,$sigma0[1]-$sigma0[0]); 972 eval(shift(@insns)); 973 eval(shift(@insns)); 974 &pxor ($t0,$t2); 975 eval(shift(@insns)); 976 eval(shift(@insns)); #@ 977 &movdqa ($t2,$t3); 978 eval(shift(@insns)); 979 eval(shift(@insns)); 980 &pxor ($t0,$t1); # sigma0(X[1..4]) 981 eval(shift(@insns)); #@ 982 eval(shift(@insns)); 983 eval(shift(@insns)); 984 &psrld ($t3,$sigma1[2]); 985 eval(shift(@insns)); 986 eval(shift(@insns)); 987 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 988 eval(shift(@insns)); #@ 989 eval(shift(@insns)); 990 &psrlq ($t2,$sigma1[0]); 991 eval(shift(@insns)); 992 eval(shift(@insns)); 993 eval(shift(@insns)); 994 &pxor ($t3,$t2); 995 eval(shift(@insns)); #@ 996 eval(shift(@insns)); 997 eval(shift(@insns)); 998 eval(shift(@insns)); #@ 999 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1000 eval(shift(@insns)); 1001 eval(shift(@insns)); 1002 &pxor ($t3,$t2); 1003 eval(shift(@insns)); #@ 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); 1006 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 1007 &pshufd ($t3,$t3,0b10000000); 1008 eval(shift(@insns)); 1009 eval(shift(@insns)); 1010 eval(shift(@insns)); 1011 &psrldq ($t3,8); 1012 eval(shift(@insns)); 1013 eval(shift(@insns)); #@ 1014 eval(shift(@insns)); 1015 eval(shift(@insns)); 1016 eval(shift(@insns)); #@ 1017 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1018 eval(shift(@insns)); 1019 eval(shift(@insns)); 1020 eval(shift(@insns)); 1021 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1022 eval(shift(@insns)); 1023 eval(shift(@insns)); #@ 1024 eval(shift(@insns)); 1025 &movdqa ($t2,$t3); 1026 eval(shift(@insns)); 1027 eval(shift(@insns)); 1028 &psrld ($t3,$sigma1[2]); 1029 eval(shift(@insns)); 1030 eval(shift(@insns)); #@ 1031 &psrlq ($t2,$sigma1[0]); 1032 eval(shift(@insns)); 1033 eval(shift(@insns)); 1034 &pxor ($t3,$t2); 1035 eval(shift(@insns)); #@ 1036 eval(shift(@insns)); 1037 eval(shift(@insns)); 1038 eval(shift(@insns)); #@ 1039 eval(shift(@insns)); 1040 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1041 eval(shift(@insns)); 1042 eval(shift(@insns)); 1043 eval(shift(@insns)); 1044 &pxor ($t3,$t2); 1045 eval(shift(@insns)); 1046 eval(shift(@insns)); 1047 eval(shift(@insns)); #@ 1048 #&pshufb ($t3,$t5); 1049 &pshufd ($t3,$t3,0b00001000); 1050 eval(shift(@insns)); 1051 eval(shift(@insns)); 1052 &movdqa ($t2,16*2*$j."($Tbl)"); 1053 eval(shift(@insns)); #@ 1054 eval(shift(@insns)); 1055 &pslldq ($t3,8); 1056 eval(shift(@insns)); 1057 eval(shift(@insns)); 1058 eval(shift(@insns)); 1059 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1060 eval(shift(@insns)); #@ 1061 eval(shift(@insns)); 1062 eval(shift(@insns)); 1063 } 1064 &paddd ($t2,@X[0]); 1065 foreach (@insns) { eval; } # remaining instructions 1066 &movdqa (16*$j."(%rsp)",$t2); 1067} 1068 1069 for ($i=0,$j=0; $j<4; $j++) { 1070 &SSSE3_256_00_47($j,\&body_00_15,@X); 1071 push(@X,shift(@X)); # rotate(@X) 1072 } 1073 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1074 &jne (".Lssse3_00_47"); 1075 1076 for ($i=0; $i<16; ) { 1077 foreach(body_00_15()) { eval; } 1078 } 1079$code.=<<___; 1080 mov $_ctx,$ctx 1081 mov $a1,$A 1082 1083 add $SZ*0($ctx),$A 1084 lea 16*$SZ($inp),$inp 1085 add $SZ*1($ctx),$B 1086 add $SZ*2($ctx),$C 1087 add $SZ*3($ctx),$D 1088 add $SZ*4($ctx),$E 1089 add $SZ*5($ctx),$F 1090 add $SZ*6($ctx),$G 1091 add $SZ*7($ctx),$H 1092 1093 cmp $_end,$inp 1094 1095 mov $A,$SZ*0($ctx) 1096 mov $B,$SZ*1($ctx) 1097 mov $C,$SZ*2($ctx) 1098 mov $D,$SZ*3($ctx) 1099 mov $E,$SZ*4($ctx) 1100 mov $F,$SZ*5($ctx) 1101 mov $G,$SZ*6($ctx) 1102 mov $H,$SZ*7($ctx) 1103 jb .Lloop_ssse3 1104 1105 mov $_rsp,%rsi 1106.cfi_def_cfa %rsi,8 1107___ 1108$code.=<<___ if ($win64); 1109 movaps 16*$SZ+32(%rsp),%xmm6 1110 movaps 16*$SZ+48(%rsp),%xmm7 1111 movaps 16*$SZ+64(%rsp),%xmm8 1112 movaps 16*$SZ+80(%rsp),%xmm9 1113___ 1114$code.=<<___; 1115 mov -48(%rsi),%r15 1116.cfi_restore %r15 1117 mov -40(%rsi),%r14 1118.cfi_restore %r14 1119 mov -32(%rsi),%r13 1120.cfi_restore %r13 1121 mov -24(%rsi),%r12 1122.cfi_restore %r12 1123 mov -16(%rsi),%rbp 1124.cfi_restore %rbp 1125 mov -8(%rsi),%rbx 1126.cfi_restore %rbx 1127 lea (%rsi),%rsp 1128.cfi_def_cfa_register %rsp 1129.Lepilogue_ssse3: 1130 ret 1131.cfi_endproc 1132.size ${func}_ssse3,.-${func}_ssse3 1133___ 1134} 1135 1136if ($avx) {{ 1137###################################################################### 1138# XOP code path 1139# 1140if ($SZ==8) { # SHA512 only 1141$code.=<<___; 1142.type ${func}_xop,\@function,3 1143.align 64 1144${func}_xop: 1145.cfi_startproc 1146.Lxop_shortcut: 1147 mov %rsp,%rax # copy %rsp 1148.cfi_def_cfa_register %rax 1149 push %rbx 1150.cfi_push %rbx 1151 push %rbp 1152.cfi_push %rbp 1153 push %r12 1154.cfi_push %r12 1155 push %r13 1156.cfi_push %r13 1157 push %r14 1158.cfi_push %r14 1159 push %r15 1160.cfi_push %r15 1161 shl \$4,%rdx # num*16 1162 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1163 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1164 and \$-64,%rsp # align stack frame 1165 mov $ctx,$_ctx # save ctx, 1st arg 1166 mov $inp,$_inp # save inp, 2nd arh 1167 mov %rdx,$_end # save end pointer, "3rd" arg 1168 mov %rax,$_rsp # save copy of %rsp 1169.cfi_cfa_expression $_rsp,deref,+8 1170___ 1171$code.=<<___ if ($win64); 1172 movaps %xmm6,16*$SZ+32(%rsp) 1173 movaps %xmm7,16*$SZ+48(%rsp) 1174 movaps %xmm8,16*$SZ+64(%rsp) 1175 movaps %xmm9,16*$SZ+80(%rsp) 1176___ 1177$code.=<<___ if ($win64 && $SZ>4); 1178 movaps %xmm10,16*$SZ+96(%rsp) 1179 movaps %xmm11,16*$SZ+112(%rsp) 1180___ 1181$code.=<<___; 1182.Lprologue_xop: 1183 1184 vzeroupper 1185 mov $SZ*0($ctx),$A 1186 mov $SZ*1($ctx),$B 1187 mov $SZ*2($ctx),$C 1188 mov $SZ*3($ctx),$D 1189 mov $SZ*4($ctx),$E 1190 mov $SZ*5($ctx),$F 1191 mov $SZ*6($ctx),$G 1192 mov $SZ*7($ctx),$H 1193 jmp .Lloop_xop 1194___ 1195 if ($SZ==4) { # SHA256 1196 my @X = map("%xmm$_",(0..3)); 1197 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1198 1199$code.=<<___; 1200.align 16 1201.Lloop_xop: 1202 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1203 vmovdqu 0x00($inp),@X[0] 1204 vmovdqu 0x10($inp),@X[1] 1205 vmovdqu 0x20($inp),@X[2] 1206 vmovdqu 0x30($inp),@X[3] 1207 vpshufb $t3,@X[0],@X[0] 1208 lea $TABLE(%rip),$Tbl 1209 vpshufb $t3,@X[1],@X[1] 1210 vpshufb $t3,@X[2],@X[2] 1211 vpaddd 0x00($Tbl),@X[0],$t0 1212 vpshufb $t3,@X[3],@X[3] 1213 vpaddd 0x20($Tbl),@X[1],$t1 1214 vpaddd 0x40($Tbl),@X[2],$t2 1215 vpaddd 0x60($Tbl),@X[3],$t3 1216 vmovdqa $t0,0x00(%rsp) 1217 mov $A,$a1 1218 vmovdqa $t1,0x10(%rsp) 1219 mov $B,$a3 1220 vmovdqa $t2,0x20(%rsp) 1221 xor $C,$a3 # magic 1222 vmovdqa $t3,0x30(%rsp) 1223 mov $E,$a0 1224 jmp .Lxop_00_47 1225 1226.align 16 1227.Lxop_00_47: 1228 sub \$`-16*2*$SZ`,$Tbl # size optimization 1229___ 1230sub XOP_256_00_47 () { 1231my $j = shift; 1232my $body = shift; 1233my @X = @_; 1234my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1235 1236 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1237 eval(shift(@insns)); 1238 eval(shift(@insns)); 1239 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1240 eval(shift(@insns)); 1241 eval(shift(@insns)); 1242 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1243 eval(shift(@insns)); 1244 eval(shift(@insns)); 1245 &vpsrld ($t0,$t0,$sigma0[2]); 1246 eval(shift(@insns)); 1247 eval(shift(@insns)); 1248 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1249 eval(shift(@insns)); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 &vpxor ($t0,$t0,$t1); 1257 eval(shift(@insns)); 1258 eval(shift(@insns)); 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1262 eval(shift(@insns)); 1263 eval(shift(@insns)); 1264 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1265 eval(shift(@insns)); 1266 eval(shift(@insns)); 1267 &vpsrld ($t2,@X[3],$sigma1[2]); 1268 eval(shift(@insns)); 1269 eval(shift(@insns)); 1270 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1271 eval(shift(@insns)); 1272 eval(shift(@insns)); 1273 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1274 eval(shift(@insns)); 1275 eval(shift(@insns)); 1276 &vpxor ($t3,$t3,$t2); 1277 eval(shift(@insns)); 1278 eval(shift(@insns)); 1279 eval(shift(@insns)); 1280 eval(shift(@insns)); 1281 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1282 eval(shift(@insns)); 1283 eval(shift(@insns)); 1284 eval(shift(@insns)); 1285 eval(shift(@insns)); 1286 &vpsrldq ($t3,$t3,8); 1287 eval(shift(@insns)); 1288 eval(shift(@insns)); 1289 eval(shift(@insns)); 1290 eval(shift(@insns)); 1291 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1292 eval(shift(@insns)); 1293 eval(shift(@insns)); 1294 eval(shift(@insns)); 1295 eval(shift(@insns)); 1296 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1297 eval(shift(@insns)); 1298 eval(shift(@insns)); 1299 &vpsrld ($t2,@X[0],$sigma1[2]); 1300 eval(shift(@insns)); 1301 eval(shift(@insns)); 1302 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1303 eval(shift(@insns)); 1304 eval(shift(@insns)); 1305 &vpxor ($t3,$t3,$t2); 1306 eval(shift(@insns)); 1307 eval(shift(@insns)); 1308 eval(shift(@insns)); 1309 eval(shift(@insns)); 1310 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1311 eval(shift(@insns)); 1312 eval(shift(@insns)); 1313 eval(shift(@insns)); 1314 eval(shift(@insns)); 1315 &vpslldq ($t3,$t3,8); # 22 instructions 1316 eval(shift(@insns)); 1317 eval(shift(@insns)); 1318 eval(shift(@insns)); 1319 eval(shift(@insns)); 1320 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1321 eval(shift(@insns)); 1322 eval(shift(@insns)); 1323 eval(shift(@insns)); 1324 eval(shift(@insns)); 1325 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1326 foreach (@insns) { eval; } # remaining instructions 1327 &vmovdqa (16*$j."(%rsp)",$t2); 1328} 1329 1330 for ($i=0,$j=0; $j<4; $j++) { 1331 &XOP_256_00_47($j,\&body_00_15,@X); 1332 push(@X,shift(@X)); # rotate(@X) 1333 } 1334 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1335 &jne (".Lxop_00_47"); 1336 1337 for ($i=0; $i<16; ) { 1338 foreach(body_00_15()) { eval; } 1339 } 1340 1341 } else { # SHA512 1342 my @X = map("%xmm$_",(0..7)); 1343 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1344 1345$code.=<<___; 1346.align 16 1347.Lloop_xop: 1348 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1349 vmovdqu 0x00($inp),@X[0] 1350 lea $TABLE+0x80(%rip),$Tbl # size optimization 1351 vmovdqu 0x10($inp),@X[1] 1352 vmovdqu 0x20($inp),@X[2] 1353 vpshufb $t3,@X[0],@X[0] 1354 vmovdqu 0x30($inp),@X[3] 1355 vpshufb $t3,@X[1],@X[1] 1356 vmovdqu 0x40($inp),@X[4] 1357 vpshufb $t3,@X[2],@X[2] 1358 vmovdqu 0x50($inp),@X[5] 1359 vpshufb $t3,@X[3],@X[3] 1360 vmovdqu 0x60($inp),@X[6] 1361 vpshufb $t3,@X[4],@X[4] 1362 vmovdqu 0x70($inp),@X[7] 1363 vpshufb $t3,@X[5],@X[5] 1364 vpaddq -0x80($Tbl),@X[0],$t0 1365 vpshufb $t3,@X[6],@X[6] 1366 vpaddq -0x60($Tbl),@X[1],$t1 1367 vpshufb $t3,@X[7],@X[7] 1368 vpaddq -0x40($Tbl),@X[2],$t2 1369 vpaddq -0x20($Tbl),@X[3],$t3 1370 vmovdqa $t0,0x00(%rsp) 1371 vpaddq 0x00($Tbl),@X[4],$t0 1372 vmovdqa $t1,0x10(%rsp) 1373 vpaddq 0x20($Tbl),@X[5],$t1 1374 vmovdqa $t2,0x20(%rsp) 1375 vpaddq 0x40($Tbl),@X[6],$t2 1376 vmovdqa $t3,0x30(%rsp) 1377 vpaddq 0x60($Tbl),@X[7],$t3 1378 vmovdqa $t0,0x40(%rsp) 1379 mov $A,$a1 1380 vmovdqa $t1,0x50(%rsp) 1381 mov $B,$a3 1382 vmovdqa $t2,0x60(%rsp) 1383 xor $C,$a3 # magic 1384 vmovdqa $t3,0x70(%rsp) 1385 mov $E,$a0 1386 jmp .Lxop_00_47 1387 1388.align 16 1389.Lxop_00_47: 1390 add \$`16*2*$SZ`,$Tbl 1391___ 1392sub XOP_512_00_47 () { 1393my $j = shift; 1394my $body = shift; 1395my @X = @_; 1396my @insns = (&$body,&$body); # 52 instructions 1397 1398 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1399 eval(shift(@insns)); 1400 eval(shift(@insns)); 1401 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1402 eval(shift(@insns)); 1403 eval(shift(@insns)); 1404 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1405 eval(shift(@insns)); 1406 eval(shift(@insns)); 1407 &vpsrlq ($t0,$t0,$sigma0[2]); 1408 eval(shift(@insns)); 1409 eval(shift(@insns)); 1410 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1411 eval(shift(@insns)); 1412 eval(shift(@insns)); 1413 eval(shift(@insns)); 1414 eval(shift(@insns)); 1415 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1416 eval(shift(@insns)); 1417 eval(shift(@insns)); 1418 &vpxor ($t0,$t0,$t1); 1419 eval(shift(@insns)); 1420 eval(shift(@insns)); 1421 eval(shift(@insns)); 1422 eval(shift(@insns)); 1423 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1424 eval(shift(@insns)); 1425 eval(shift(@insns)); 1426 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1427 eval(shift(@insns)); 1428 eval(shift(@insns)); 1429 &vpsrlq ($t2,@X[7],$sigma1[2]); 1430 eval(shift(@insns)); 1431 eval(shift(@insns)); 1432 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1433 eval(shift(@insns)); 1434 eval(shift(@insns)); 1435 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1436 eval(shift(@insns)); 1437 eval(shift(@insns)); 1438 &vpxor ($t3,$t3,$t2); 1439 eval(shift(@insns)); 1440 eval(shift(@insns)); 1441 eval(shift(@insns)); 1442 eval(shift(@insns)); 1443 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1444 eval(shift(@insns)); 1445 eval(shift(@insns)); 1446 eval(shift(@insns)); 1447 eval(shift(@insns)); 1448 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1449 eval(shift(@insns)); 1450 eval(shift(@insns)); 1451 eval(shift(@insns)); 1452 eval(shift(@insns)); 1453 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1454 foreach (@insns) { eval; } # remaining instructions 1455 &vmovdqa (16*$j."(%rsp)",$t2); 1456} 1457 1458 for ($i=0,$j=0; $j<8; $j++) { 1459 &XOP_512_00_47($j,\&body_00_15,@X); 1460 push(@X,shift(@X)); # rotate(@X) 1461 } 1462 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1463 &jne (".Lxop_00_47"); 1464 1465 for ($i=0; $i<16; ) { 1466 foreach(body_00_15()) { eval; } 1467 } 1468} 1469$code.=<<___; 1470 mov $_ctx,$ctx 1471 mov $a1,$A 1472 1473 add $SZ*0($ctx),$A 1474 lea 16*$SZ($inp),$inp 1475 add $SZ*1($ctx),$B 1476 add $SZ*2($ctx),$C 1477 add $SZ*3($ctx),$D 1478 add $SZ*4($ctx),$E 1479 add $SZ*5($ctx),$F 1480 add $SZ*6($ctx),$G 1481 add $SZ*7($ctx),$H 1482 1483 cmp $_end,$inp 1484 1485 mov $A,$SZ*0($ctx) 1486 mov $B,$SZ*1($ctx) 1487 mov $C,$SZ*2($ctx) 1488 mov $D,$SZ*3($ctx) 1489 mov $E,$SZ*4($ctx) 1490 mov $F,$SZ*5($ctx) 1491 mov $G,$SZ*6($ctx) 1492 mov $H,$SZ*7($ctx) 1493 jb .Lloop_xop 1494 1495 mov $_rsp,%rsi 1496.cfi_def_cfa %rsi,8 1497 vzeroupper 1498___ 1499$code.=<<___ if ($win64); 1500 movaps 16*$SZ+32(%rsp),%xmm6 1501 movaps 16*$SZ+48(%rsp),%xmm7 1502 movaps 16*$SZ+64(%rsp),%xmm8 1503 movaps 16*$SZ+80(%rsp),%xmm9 1504___ 1505$code.=<<___ if ($win64 && $SZ>4); 1506 movaps 16*$SZ+96(%rsp),%xmm10 1507 movaps 16*$SZ+112(%rsp),%xmm11 1508___ 1509$code.=<<___; 1510 mov -48(%rsi),%r15 1511.cfi_restore %r15 1512 mov -40(%rsi),%r14 1513.cfi_restore %r14 1514 mov -32(%rsi),%r13 1515.cfi_restore %r13 1516 mov -24(%rsi),%r12 1517.cfi_restore %r12 1518 mov -16(%rsi),%rbp 1519.cfi_restore %rbp 1520 mov -8(%rsi),%rbx 1521.cfi_restore %rbx 1522 lea (%rsi),%rsp 1523.cfi_def_cfa_register %rsp 1524.Lepilogue_xop: 1525 ret 1526.cfi_endproc 1527.size ${func}_xop,.-${func}_xop 1528___ 1529} 1530###################################################################### 1531# AVX+shrd code path 1532# 1533local *ror = sub { &shrd(@_[0],@_) }; 1534 1535$code.=<<___; 1536.type ${func}_avx,\@function,3 1537.align 64 1538${func}_avx: 1539.cfi_startproc 1540.Lavx_shortcut: 1541 mov %rsp,%rax # copy %rsp 1542.cfi_def_cfa_register %rax 1543 push %rbx 1544.cfi_push %rbx 1545 push %rbp 1546.cfi_push %rbp 1547 push %r12 1548.cfi_push %r12 1549 push %r13 1550.cfi_push %r13 1551 push %r14 1552.cfi_push %r14 1553 push %r15 1554.cfi_push %r15 1555 shl \$4,%rdx # num*16 1556 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1557 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1558 and \$-64,%rsp # align stack frame 1559 mov $ctx,$_ctx # save ctx, 1st arg 1560 mov $inp,$_inp # save inp, 2nd arh 1561 mov %rdx,$_end # save end pointer, "3rd" arg 1562 mov %rax,$_rsp # save copy of %rsp 1563.cfi_cfa_expression $_rsp,deref,+8 1564___ 1565$code.=<<___ if ($win64); 1566 movaps %xmm6,16*$SZ+32(%rsp) 1567 movaps %xmm7,16*$SZ+48(%rsp) 1568 movaps %xmm8,16*$SZ+64(%rsp) 1569 movaps %xmm9,16*$SZ+80(%rsp) 1570___ 1571$code.=<<___ if ($win64 && $SZ>4); 1572 movaps %xmm10,16*$SZ+96(%rsp) 1573 movaps %xmm11,16*$SZ+112(%rsp) 1574___ 1575$code.=<<___; 1576.Lprologue_avx: 1577 1578 vzeroupper 1579 mov $SZ*0($ctx),$A 1580 mov $SZ*1($ctx),$B 1581 mov $SZ*2($ctx),$C 1582 mov $SZ*3($ctx),$D 1583 mov $SZ*4($ctx),$E 1584 mov $SZ*5($ctx),$F 1585 mov $SZ*6($ctx),$G 1586 mov $SZ*7($ctx),$H 1587___ 1588 if ($SZ==4) { # SHA256 1589 my @X = map("%xmm$_",(0..3)); 1590 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1591 1592$code.=<<___; 1593 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1594 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1595 jmp .Lloop_avx 1596.align 16 1597.Lloop_avx: 1598 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1599 vmovdqu 0x00($inp),@X[0] 1600 vmovdqu 0x10($inp),@X[1] 1601 vmovdqu 0x20($inp),@X[2] 1602 vmovdqu 0x30($inp),@X[3] 1603 vpshufb $t3,@X[0],@X[0] 1604 lea $TABLE(%rip),$Tbl 1605 vpshufb $t3,@X[1],@X[1] 1606 vpshufb $t3,@X[2],@X[2] 1607 vpaddd 0x00($Tbl),@X[0],$t0 1608 vpshufb $t3,@X[3],@X[3] 1609 vpaddd 0x20($Tbl),@X[1],$t1 1610 vpaddd 0x40($Tbl),@X[2],$t2 1611 vpaddd 0x60($Tbl),@X[3],$t3 1612 vmovdqa $t0,0x00(%rsp) 1613 mov $A,$a1 1614 vmovdqa $t1,0x10(%rsp) 1615 mov $B,$a3 1616 vmovdqa $t2,0x20(%rsp) 1617 xor $C,$a3 # magic 1618 vmovdqa $t3,0x30(%rsp) 1619 mov $E,$a0 1620 jmp .Lavx_00_47 1621 1622.align 16 1623.Lavx_00_47: 1624 sub \$`-16*2*$SZ`,$Tbl # size optimization 1625___ 1626sub Xupdate_256_AVX () { 1627 ( 1628 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1629 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1630 '&vpsrld ($t2,$t0,$sigma0[0]);', 1631 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1632 '&vpsrld ($t3,$t0,$sigma0[2])', 1633 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1634 '&vpxor ($t0,$t3,$t2)', 1635 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1636 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1637 '&vpxor ($t0,$t0,$t1)', 1638 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1639 '&vpxor ($t0,$t0,$t2)', 1640 '&vpsrld ($t2,$t3,$sigma1[2]);', 1641 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1642 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1643 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1644 '&vpxor ($t2,$t2,$t3);', 1645 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1646 '&vpxor ($t2,$t2,$t3)', 1647 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1648 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1649 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1650 '&vpsrld ($t2,$t3,$sigma1[2])', 1651 '&vpsrlq ($t3,$t3,$sigma1[0])', 1652 '&vpxor ($t2,$t2,$t3);', 1653 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1654 '&vpxor ($t2,$t2,$t3)', 1655 '&vpshufb ($t2,$t2,$t5)', 1656 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1657 ); 1658} 1659 1660sub AVX_256_00_47 () { 1661my $j = shift; 1662my $body = shift; 1663my @X = @_; 1664my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1665 1666 foreach (Xupdate_256_AVX()) { # 29 instructions 1667 eval; 1668 eval(shift(@insns)); 1669 eval(shift(@insns)); 1670 eval(shift(@insns)); 1671 } 1672 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1673 foreach (@insns) { eval; } # remaining instructions 1674 &vmovdqa (16*$j."(%rsp)",$t2); 1675} 1676 1677 for ($i=0,$j=0; $j<4; $j++) { 1678 &AVX_256_00_47($j,\&body_00_15,@X); 1679 push(@X,shift(@X)); # rotate(@X) 1680 } 1681 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1682 &jne (".Lavx_00_47"); 1683 1684 for ($i=0; $i<16; ) { 1685 foreach(body_00_15()) { eval; } 1686 } 1687 1688 } else { # SHA512 1689 my @X = map("%xmm$_",(0..7)); 1690 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1691 1692$code.=<<___; 1693 jmp .Lloop_avx 1694.align 16 1695.Lloop_avx: 1696 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1697 vmovdqu 0x00($inp),@X[0] 1698 lea $TABLE+0x80(%rip),$Tbl # size optimization 1699 vmovdqu 0x10($inp),@X[1] 1700 vmovdqu 0x20($inp),@X[2] 1701 vpshufb $t3,@X[0],@X[0] 1702 vmovdqu 0x30($inp),@X[3] 1703 vpshufb $t3,@X[1],@X[1] 1704 vmovdqu 0x40($inp),@X[4] 1705 vpshufb $t3,@X[2],@X[2] 1706 vmovdqu 0x50($inp),@X[5] 1707 vpshufb $t3,@X[3],@X[3] 1708 vmovdqu 0x60($inp),@X[6] 1709 vpshufb $t3,@X[4],@X[4] 1710 vmovdqu 0x70($inp),@X[7] 1711 vpshufb $t3,@X[5],@X[5] 1712 vpaddq -0x80($Tbl),@X[0],$t0 1713 vpshufb $t3,@X[6],@X[6] 1714 vpaddq -0x60($Tbl),@X[1],$t1 1715 vpshufb $t3,@X[7],@X[7] 1716 vpaddq -0x40($Tbl),@X[2],$t2 1717 vpaddq -0x20($Tbl),@X[3],$t3 1718 vmovdqa $t0,0x00(%rsp) 1719 vpaddq 0x00($Tbl),@X[4],$t0 1720 vmovdqa $t1,0x10(%rsp) 1721 vpaddq 0x20($Tbl),@X[5],$t1 1722 vmovdqa $t2,0x20(%rsp) 1723 vpaddq 0x40($Tbl),@X[6],$t2 1724 vmovdqa $t3,0x30(%rsp) 1725 vpaddq 0x60($Tbl),@X[7],$t3 1726 vmovdqa $t0,0x40(%rsp) 1727 mov $A,$a1 1728 vmovdqa $t1,0x50(%rsp) 1729 mov $B,$a3 1730 vmovdqa $t2,0x60(%rsp) 1731 xor $C,$a3 # magic 1732 vmovdqa $t3,0x70(%rsp) 1733 mov $E,$a0 1734 jmp .Lavx_00_47 1735 1736.align 16 1737.Lavx_00_47: 1738 add \$`16*2*$SZ`,$Tbl 1739___ 1740sub Xupdate_512_AVX () { 1741 ( 1742 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1743 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1744 '&vpsrlq ($t2,$t0,$sigma0[0])', 1745 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1746 '&vpsrlq ($t3,$t0,$sigma0[2])', 1747 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1748 '&vpxor ($t0,$t3,$t2)', 1749 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1750 '&vpxor ($t0,$t0,$t1)', 1751 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1752 '&vpxor ($t0,$t0,$t2)', 1753 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1754 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1755 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1756 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1757 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1758 '&vpxor ($t3,$t3,$t2)', 1759 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1760 '&vpxor ($t3,$t3,$t1)', 1761 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1762 '&vpxor ($t3,$t3,$t2)', 1763 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1764 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1765 ); 1766} 1767 1768sub AVX_512_00_47 () { 1769my $j = shift; 1770my $body = shift; 1771my @X = @_; 1772my @insns = (&$body,&$body); # 52 instructions 1773 1774 foreach (Xupdate_512_AVX()) { # 23 instructions 1775 eval; 1776 eval(shift(@insns)); 1777 eval(shift(@insns)); 1778 } 1779 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1780 foreach (@insns) { eval; } # remaining instructions 1781 &vmovdqa (16*$j."(%rsp)",$t2); 1782} 1783 1784 for ($i=0,$j=0; $j<8; $j++) { 1785 &AVX_512_00_47($j,\&body_00_15,@X); 1786 push(@X,shift(@X)); # rotate(@X) 1787 } 1788 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1789 &jne (".Lavx_00_47"); 1790 1791 for ($i=0; $i<16; ) { 1792 foreach(body_00_15()) { eval; } 1793 } 1794} 1795$code.=<<___; 1796 mov $_ctx,$ctx 1797 mov $a1,$A 1798 1799 add $SZ*0($ctx),$A 1800 lea 16*$SZ($inp),$inp 1801 add $SZ*1($ctx),$B 1802 add $SZ*2($ctx),$C 1803 add $SZ*3($ctx),$D 1804 add $SZ*4($ctx),$E 1805 add $SZ*5($ctx),$F 1806 add $SZ*6($ctx),$G 1807 add $SZ*7($ctx),$H 1808 1809 cmp $_end,$inp 1810 1811 mov $A,$SZ*0($ctx) 1812 mov $B,$SZ*1($ctx) 1813 mov $C,$SZ*2($ctx) 1814 mov $D,$SZ*3($ctx) 1815 mov $E,$SZ*4($ctx) 1816 mov $F,$SZ*5($ctx) 1817 mov $G,$SZ*6($ctx) 1818 mov $H,$SZ*7($ctx) 1819 jb .Lloop_avx 1820 1821 mov $_rsp,%rsi 1822.cfi_def_cfa %rsi,8 1823 vzeroupper 1824___ 1825$code.=<<___ if ($win64); 1826 movaps 16*$SZ+32(%rsp),%xmm6 1827 movaps 16*$SZ+48(%rsp),%xmm7 1828 movaps 16*$SZ+64(%rsp),%xmm8 1829 movaps 16*$SZ+80(%rsp),%xmm9 1830___ 1831$code.=<<___ if ($win64 && $SZ>4); 1832 movaps 16*$SZ+96(%rsp),%xmm10 1833 movaps 16*$SZ+112(%rsp),%xmm11 1834___ 1835$code.=<<___; 1836 mov -48(%rsi),%r15 1837.cfi_restore %r15 1838 mov -40(%rsi),%r14 1839.cfi_restore %r14 1840 mov -32(%rsi),%r13 1841.cfi_restore %r13 1842 mov -24(%rsi),%r12 1843.cfi_restore %r12 1844 mov -16(%rsi),%rbp 1845.cfi_restore %rbp 1846 mov -8(%rsi),%rbx 1847.cfi_restore %rbx 1848 lea (%rsi),%rsp 1849.cfi_def_cfa_register %rsp 1850.Lepilogue_avx: 1851 ret 1852.cfi_endproc 1853.size ${func}_avx,.-${func}_avx 1854___ 1855 1856if ($avx>1) {{ 1857###################################################################### 1858# AVX2+BMI code path 1859# 1860my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1861my $PUSH8=8*2*$SZ; 1862use integer; 1863 1864sub bodyx_00_15 () { 1865 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1866 ( 1867 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1868 1869 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1870 '&and ($a4,$e)', # f&e 1871 '&rorx ($a0,$e,$Sigma1[2])', 1872 '&rorx ($a2,$e,$Sigma1[1])', 1873 1874 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1875 '&lea ($h,"($h,$a4)")', 1876 '&andn ($a4,$e,$g)', # ~e&g 1877 '&xor ($a0,$a2)', 1878 1879 '&rorx ($a1,$e,$Sigma1[0])', 1880 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1881 '&xor ($a0,$a1)', # Sigma1(e) 1882 '&mov ($a2,$a)', 1883 1884 '&rorx ($a4,$a,$Sigma0[2])', 1885 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1886 '&xor ($a2,$b)', # a^b, b^c in next round 1887 '&rorx ($a1,$a,$Sigma0[1])', 1888 1889 '&rorx ($a0,$a,$Sigma0[0])', 1890 '&lea ($d,"($d,$h)")', # d+=h 1891 '&and ($a3,$a2)', # (b^c)&(a^b) 1892 '&xor ($a1,$a4)', 1893 1894 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1895 '&xor ($a1,$a0)', # Sigma0(a) 1896 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 1897 '&mov ($a4,$e)', # copy of f in future 1898 1899 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1900 ); 1901 # and at the finish one has to $a+=$a1 1902} 1903 1904$code.=<<___; 1905.type ${func}_avx2,\@function,3 1906.align 64 1907${func}_avx2: 1908.cfi_startproc 1909.Lavx2_shortcut: 1910 mov %rsp,%rax # copy %rsp 1911.cfi_def_cfa_register %rax 1912 push %rbx 1913.cfi_push %rbx 1914 push %rbp 1915.cfi_push %rbp 1916 push %r12 1917.cfi_push %r12 1918 push %r13 1919.cfi_push %r13 1920 push %r14 1921.cfi_push %r14 1922 push %r15 1923.cfi_push %r15 1924 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1925 shl \$4,%rdx # num*16 1926 and \$-256*$SZ,%rsp # align stack frame 1927 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1928 add \$`2*$SZ*($rounds-8)`,%rsp 1929 mov $ctx,$_ctx # save ctx, 1st arg 1930 mov $inp,$_inp # save inp, 2nd arh 1931 mov %rdx,$_end # save end pointer, "3rd" arg 1932 mov %rax,$_rsp # save copy of %rsp 1933.cfi_cfa_expression $_rsp,deref,+8 1934___ 1935$code.=<<___ if ($win64); 1936 movaps %xmm6,16*$SZ+32(%rsp) 1937 movaps %xmm7,16*$SZ+48(%rsp) 1938 movaps %xmm8,16*$SZ+64(%rsp) 1939 movaps %xmm9,16*$SZ+80(%rsp) 1940___ 1941$code.=<<___ if ($win64 && $SZ>4); 1942 movaps %xmm10,16*$SZ+96(%rsp) 1943 movaps %xmm11,16*$SZ+112(%rsp) 1944___ 1945$code.=<<___; 1946.Lprologue_avx2: 1947 1948 vzeroupper 1949 sub \$-16*$SZ,$inp # inp++, size optimization 1950 mov $SZ*0($ctx),$A 1951 mov $inp,%r12 # borrow $T1 1952 mov $SZ*1($ctx),$B 1953 cmp %rdx,$inp # $_end 1954 mov $SZ*2($ctx),$C 1955 cmove %rsp,%r12 # next block or random data 1956 mov $SZ*3($ctx),$D 1957 mov $SZ*4($ctx),$E 1958 mov $SZ*5($ctx),$F 1959 mov $SZ*6($ctx),$G 1960 mov $SZ*7($ctx),$H 1961___ 1962 if ($SZ==4) { # SHA256 1963 my @X = map("%ymm$_",(0..3)); 1964 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1965 1966$code.=<<___; 1967 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1968 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1969 jmp .Loop_avx2 1970.align 16 1971.Loop_avx2: 1972 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1973 vmovdqu -16*$SZ+0($inp),%xmm0 1974 vmovdqu -16*$SZ+16($inp),%xmm1 1975 vmovdqu -16*$SZ+32($inp),%xmm2 1976 vmovdqu -16*$SZ+48($inp),%xmm3 1977 #mov $inp,$_inp # offload $inp 1978 vinserti128 \$1,(%r12),@X[0],@X[0] 1979 vinserti128 \$1,16(%r12),@X[1],@X[1] 1980 vpshufb $t3,@X[0],@X[0] 1981 vinserti128 \$1,32(%r12),@X[2],@X[2] 1982 vpshufb $t3,@X[1],@X[1] 1983 vinserti128 \$1,48(%r12),@X[3],@X[3] 1984 1985 lea $TABLE(%rip),$Tbl 1986 vpshufb $t3,@X[2],@X[2] 1987 vpaddd 0x00($Tbl),@X[0],$t0 1988 vpshufb $t3,@X[3],@X[3] 1989 vpaddd 0x20($Tbl),@X[1],$t1 1990 vpaddd 0x40($Tbl),@X[2],$t2 1991 vpaddd 0x60($Tbl),@X[3],$t3 1992 vmovdqa $t0,0x00(%rsp) 1993 xor $a1,$a1 1994 vmovdqa $t1,0x20(%rsp) 1995 lea -$PUSH8(%rsp),%rsp 1996 mov $B,$a3 1997 vmovdqa $t2,0x00(%rsp) 1998 xor $C,$a3 # magic 1999 vmovdqa $t3,0x20(%rsp) 2000 mov $F,$a4 2001 sub \$-16*2*$SZ,$Tbl # size optimization 2002 jmp .Lavx2_00_47 2003 2004.align 16 2005.Lavx2_00_47: 2006___ 2007 2008sub AVX2_256_00_47 () { 2009my $j = shift; 2010my $body = shift; 2011my @X = @_; 2012my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 2013my $base = "+2*$PUSH8(%rsp)"; 2014 2015 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 2016 foreach (Xupdate_256_AVX()) { # 29 instructions 2017 eval; 2018 eval(shift(@insns)); 2019 eval(shift(@insns)); 2020 eval(shift(@insns)); 2021 } 2022 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 2023 foreach (@insns) { eval; } # remaining instructions 2024 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2025} 2026 2027 for ($i=0,$j=0; $j<4; $j++) { 2028 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 2029 push(@X,shift(@X)); # rotate(@X) 2030 } 2031 &lea ($Tbl,16*2*$SZ."($Tbl)"); 2032 &cmpb (($SZ-1)."($Tbl)",0); 2033 &jne (".Lavx2_00_47"); 2034 2035 for ($i=0; $i<16; ) { 2036 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2037 foreach(bodyx_00_15()) { eval; } 2038 } 2039 } else { # SHA512 2040 my @X = map("%ymm$_",(0..7)); 2041 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 2042 2043$code.=<<___; 2044 jmp .Loop_avx2 2045.align 16 2046.Loop_avx2: 2047 vmovdqu -16*$SZ($inp),%xmm0 2048 vmovdqu -16*$SZ+16($inp),%xmm1 2049 vmovdqu -16*$SZ+32($inp),%xmm2 2050 lea $TABLE+0x80(%rip),$Tbl # size optimization 2051 vmovdqu -16*$SZ+48($inp),%xmm3 2052 vmovdqu -16*$SZ+64($inp),%xmm4 2053 vmovdqu -16*$SZ+80($inp),%xmm5 2054 vmovdqu -16*$SZ+96($inp),%xmm6 2055 vmovdqu -16*$SZ+112($inp),%xmm7 2056 #mov $inp,$_inp # offload $inp 2057 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 2058 vinserti128 \$1,(%r12),@X[0],@X[0] 2059 vinserti128 \$1,16(%r12),@X[1],@X[1] 2060 vpshufb $t2,@X[0],@X[0] 2061 vinserti128 \$1,32(%r12),@X[2],@X[2] 2062 vpshufb $t2,@X[1],@X[1] 2063 vinserti128 \$1,48(%r12),@X[3],@X[3] 2064 vpshufb $t2,@X[2],@X[2] 2065 vinserti128 \$1,64(%r12),@X[4],@X[4] 2066 vpshufb $t2,@X[3],@X[3] 2067 vinserti128 \$1,80(%r12),@X[5],@X[5] 2068 vpshufb $t2,@X[4],@X[4] 2069 vinserti128 \$1,96(%r12),@X[6],@X[6] 2070 vpshufb $t2,@X[5],@X[5] 2071 vinserti128 \$1,112(%r12),@X[7],@X[7] 2072 2073 vpaddq -0x80($Tbl),@X[0],$t0 2074 vpshufb $t2,@X[6],@X[6] 2075 vpaddq -0x60($Tbl),@X[1],$t1 2076 vpshufb $t2,@X[7],@X[7] 2077 vpaddq -0x40($Tbl),@X[2],$t2 2078 vpaddq -0x20($Tbl),@X[3],$t3 2079 vmovdqa $t0,0x00(%rsp) 2080 vpaddq 0x00($Tbl),@X[4],$t0 2081 vmovdqa $t1,0x20(%rsp) 2082 vpaddq 0x20($Tbl),@X[5],$t1 2083 vmovdqa $t2,0x40(%rsp) 2084 vpaddq 0x40($Tbl),@X[6],$t2 2085 vmovdqa $t3,0x60(%rsp) 2086 lea -$PUSH8(%rsp),%rsp 2087 vpaddq 0x60($Tbl),@X[7],$t3 2088 vmovdqa $t0,0x00(%rsp) 2089 xor $a1,$a1 2090 vmovdqa $t1,0x20(%rsp) 2091 mov $B,$a3 2092 vmovdqa $t2,0x40(%rsp) 2093 xor $C,$a3 # magic 2094 vmovdqa $t3,0x60(%rsp) 2095 mov $F,$a4 2096 add \$16*2*$SZ,$Tbl 2097 jmp .Lavx2_00_47 2098 2099.align 16 2100.Lavx2_00_47: 2101___ 2102 2103sub AVX2_512_00_47 () { 2104my $j = shift; 2105my $body = shift; 2106my @X = @_; 2107my @insns = (&$body,&$body); # 48 instructions 2108my $base = "+2*$PUSH8(%rsp)"; 2109 2110 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 2111 foreach (Xupdate_512_AVX()) { # 23 instructions 2112 eval; 2113 if ($_ !~ /\;$/) { 2114 eval(shift(@insns)); 2115 eval(shift(@insns)); 2116 eval(shift(@insns)); 2117 } 2118 } 2119 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 2120 foreach (@insns) { eval; } # remaining instructions 2121 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2122} 2123 2124 for ($i=0,$j=0; $j<8; $j++) { 2125 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 2126 push(@X,shift(@X)); # rotate(@X) 2127 } 2128 &lea ($Tbl,16*2*$SZ."($Tbl)"); 2129 &cmpb (($SZ-1-0x80)."($Tbl)",0); 2130 &jne (".Lavx2_00_47"); 2131 2132 for ($i=0; $i<16; ) { 2133 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2134 foreach(bodyx_00_15()) { eval; } 2135 } 2136} 2137$code.=<<___; 2138 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2139 add $a1,$A 2140 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2141 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 2142 2143 add $SZ*0($ctx),$A 2144 add $SZ*1($ctx),$B 2145 add $SZ*2($ctx),$C 2146 add $SZ*3($ctx),$D 2147 add $SZ*4($ctx),$E 2148 add $SZ*5($ctx),$F 2149 add $SZ*6($ctx),$G 2150 add $SZ*7($ctx),$H 2151 2152 mov $A,$SZ*0($ctx) 2153 mov $B,$SZ*1($ctx) 2154 mov $C,$SZ*2($ctx) 2155 mov $D,$SZ*3($ctx) 2156 mov $E,$SZ*4($ctx) 2157 mov $F,$SZ*5($ctx) 2158 mov $G,$SZ*6($ctx) 2159 mov $H,$SZ*7($ctx) 2160 2161 cmp `$PUSH8+2*8`($Tbl),$inp # $_end 2162 je .Ldone_avx2 2163 2164 xor $a1,$a1 2165 mov $B,$a3 2166 xor $C,$a3 # magic 2167 mov $F,$a4 2168 jmp .Lower_avx2 2169.align 16 2170.Lower_avx2: 2171___ 2172 for ($i=0; $i<8; ) { 2173 my $base="+16($Tbl)"; 2174 foreach(bodyx_00_15()) { eval; } 2175 } 2176$code.=<<___; 2177 lea -$PUSH8($Tbl),$Tbl 2178 cmp %rsp,$Tbl 2179 jae .Lower_avx2 2180 2181 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2182 add $a1,$A 2183 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2184 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 2185 2186 add $SZ*0($ctx),$A 2187 add $SZ*1($ctx),$B 2188 add $SZ*2($ctx),$C 2189 add $SZ*3($ctx),$D 2190 add $SZ*4($ctx),$E 2191 add $SZ*5($ctx),$F 2192 lea `2*16*$SZ`($inp),$inp # inp+=2 2193 add $SZ*6($ctx),$G 2194 mov $inp,%r12 2195 add $SZ*7($ctx),$H 2196 cmp $_end,$inp 2197 2198 mov $A,$SZ*0($ctx) 2199 cmove %rsp,%r12 # next block or stale data 2200 mov $B,$SZ*1($ctx) 2201 mov $C,$SZ*2($ctx) 2202 mov $D,$SZ*3($ctx) 2203 mov $E,$SZ*4($ctx) 2204 mov $F,$SZ*5($ctx) 2205 mov $G,$SZ*6($ctx) 2206 mov $H,$SZ*7($ctx) 2207 2208 jbe .Loop_avx2 2209 lea (%rsp),$Tbl 2210 2211.Ldone_avx2: 2212 lea ($Tbl),%rsp 2213 mov $_rsp,%rsi 2214.cfi_def_cfa %rsi,8 2215 vzeroupper 2216___ 2217$code.=<<___ if ($win64); 2218 movaps 16*$SZ+32(%rsp),%xmm6 2219 movaps 16*$SZ+48(%rsp),%xmm7 2220 movaps 16*$SZ+64(%rsp),%xmm8 2221 movaps 16*$SZ+80(%rsp),%xmm9 2222___ 2223$code.=<<___ if ($win64 && $SZ>4); 2224 movaps 16*$SZ+96(%rsp),%xmm10 2225 movaps 16*$SZ+112(%rsp),%xmm11 2226___ 2227$code.=<<___; 2228 mov -48(%rsi),%r15 2229.cfi_restore %r15 2230 mov -40(%rsi),%r14 2231.cfi_restore %r14 2232 mov -32(%rsi),%r13 2233.cfi_restore %r13 2234 mov -24(%rsi),%r12 2235.cfi_restore %r12 2236 mov -16(%rsi),%rbp 2237.cfi_restore %rbp 2238 mov -8(%rsi),%rbx 2239.cfi_restore %rbx 2240 lea (%rsi),%rsp 2241.cfi_def_cfa_register %rsp 2242.Lepilogue_avx2: 2243 ret 2244.cfi_endproc 2245.size ${func}_avx2,.-${func}_avx2 2246___ 2247}} 2248}}}}} 2249 2250# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2251# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2252if ($win64) { 2253$rec="%rcx"; 2254$frame="%rdx"; 2255$context="%r8"; 2256$disp="%r9"; 2257 2258$code.=<<___; 2259.extern __imp_RtlVirtualUnwind 2260.type se_handler,\@abi-omnipotent 2261.align 16 2262se_handler: 2263 push %rsi 2264 push %rdi 2265 push %rbx 2266 push %rbp 2267 push %r12 2268 push %r13 2269 push %r14 2270 push %r15 2271 pushfq 2272 sub \$64,%rsp 2273 2274 mov 120($context),%rax # pull context->Rax 2275 mov 248($context),%rbx # pull context->Rip 2276 2277 mov 8($disp),%rsi # disp->ImageBase 2278 mov 56($disp),%r11 # disp->HanderlData 2279 2280 mov 0(%r11),%r10d # HandlerData[0] 2281 lea (%rsi,%r10),%r10 # prologue label 2282 cmp %r10,%rbx # context->Rip<prologue label 2283 jb .Lin_prologue 2284 2285 mov 152($context),%rax # pull context->Rsp 2286 2287 mov 4(%r11),%r10d # HandlerData[1] 2288 lea (%rsi,%r10),%r10 # epilogue label 2289 cmp %r10,%rbx # context->Rip>=epilogue label 2290 jae .Lin_prologue 2291___ 2292$code.=<<___ if ($avx>1); 2293 lea .Lavx2_shortcut(%rip),%r10 2294 cmp %r10,%rbx # context->Rip<avx2_shortcut 2295 jb .Lnot_in_avx2 2296 2297 and \$-256*$SZ,%rax 2298 add \$`2*$SZ*($rounds-8)`,%rax 2299.Lnot_in_avx2: 2300___ 2301$code.=<<___; 2302 mov %rax,%rsi # put aside Rsp 2303 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 2304 2305 mov -8(%rax),%rbx 2306 mov -16(%rax),%rbp 2307 mov -24(%rax),%r12 2308 mov -32(%rax),%r13 2309 mov -40(%rax),%r14 2310 mov -48(%rax),%r15 2311 mov %rbx,144($context) # restore context->Rbx 2312 mov %rbp,160($context) # restore context->Rbp 2313 mov %r12,216($context) # restore context->R12 2314 mov %r13,224($context) # restore context->R13 2315 mov %r14,232($context) # restore context->R14 2316 mov %r15,240($context) # restore context->R15 2317 2318 lea .Lepilogue(%rip),%r10 2319 cmp %r10,%rbx 2320 jb .Lin_prologue # non-AVX code 2321 2322 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 2323 lea 512($context),%rdi # &context.Xmm6 2324 mov \$`$SZ==4?8:12`,%ecx 2325 .long 0xa548f3fc # cld; rep movsq 2326 2327.Lin_prologue: 2328 mov 8(%rax),%rdi 2329 mov 16(%rax),%rsi 2330 mov %rax,152($context) # restore context->Rsp 2331 mov %rsi,168($context) # restore context->Rsi 2332 mov %rdi,176($context) # restore context->Rdi 2333 2334 mov 40($disp),%rdi # disp->ContextRecord 2335 mov $context,%rsi # context 2336 mov \$154,%ecx # sizeof(CONTEXT) 2337 .long 0xa548f3fc # cld; rep movsq 2338 2339 mov $disp,%rsi 2340 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2341 mov 8(%rsi),%rdx # arg2, disp->ImageBase 2342 mov 0(%rsi),%r8 # arg3, disp->ControlPc 2343 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2344 mov 40(%rsi),%r10 # disp->ContextRecord 2345 lea 56(%rsi),%r11 # &disp->HandlerData 2346 lea 24(%rsi),%r12 # &disp->EstablisherFrame 2347 mov %r10,32(%rsp) # arg5 2348 mov %r11,40(%rsp) # arg6 2349 mov %r12,48(%rsp) # arg7 2350 mov %rcx,56(%rsp) # arg8, (NULL) 2351 call *__imp_RtlVirtualUnwind(%rip) 2352 2353 mov \$1,%eax # ExceptionContinueSearch 2354 add \$64,%rsp 2355 popfq 2356 pop %r15 2357 pop %r14 2358 pop %r13 2359 pop %r12 2360 pop %rbp 2361 pop %rbx 2362 pop %rdi 2363 pop %rsi 2364 ret 2365.size se_handler,.-se_handler 2366___ 2367 2368$code.=<<___ if ($SZ==4 && $shaext); 2369.type shaext_handler,\@abi-omnipotent 2370.align 16 2371shaext_handler: 2372 push %rsi 2373 push %rdi 2374 push %rbx 2375 push %rbp 2376 push %r12 2377 push %r13 2378 push %r14 2379 push %r15 2380 pushfq 2381 sub \$64,%rsp 2382 2383 mov 120($context),%rax # pull context->Rax 2384 mov 248($context),%rbx # pull context->Rip 2385 2386 lea .Lprologue_shaext(%rip),%r10 2387 cmp %r10,%rbx # context->Rip<.Lprologue 2388 jb .Lin_prologue 2389 2390 lea .Lepilogue_shaext(%rip),%r10 2391 cmp %r10,%rbx # context->Rip>=.Lepilogue 2392 jae .Lin_prologue 2393 2394 lea -8-5*16(%rax),%rsi 2395 lea 512($context),%rdi # &context.Xmm6 2396 mov \$10,%ecx 2397 .long 0xa548f3fc # cld; rep movsq 2398 2399 jmp .Lin_prologue 2400.size shaext_handler,.-shaext_handler 2401___ 2402 2403$code.=<<___; 2404.section .pdata 2405.align 4 2406 .rva .LSEH_begin_$func 2407 .rva .LSEH_end_$func 2408 .rva .LSEH_info_$func 2409___ 2410$code.=<<___ if ($SZ==4 && $shaext); 2411 .rva .LSEH_begin_${func}_shaext 2412 .rva .LSEH_end_${func}_shaext 2413 .rva .LSEH_info_${func}_shaext 2414___ 2415$code.=<<___ if ($SZ==4); 2416 .rva .LSEH_begin_${func}_ssse3 2417 .rva .LSEH_end_${func}_ssse3 2418 .rva .LSEH_info_${func}_ssse3 2419___ 2420$code.=<<___ if ($avx && $SZ==8); 2421 .rva .LSEH_begin_${func}_xop 2422 .rva .LSEH_end_${func}_xop 2423 .rva .LSEH_info_${func}_xop 2424___ 2425$code.=<<___ if ($avx); 2426 .rva .LSEH_begin_${func}_avx 2427 .rva .LSEH_end_${func}_avx 2428 .rva .LSEH_info_${func}_avx 2429___ 2430$code.=<<___ if ($avx>1); 2431 .rva .LSEH_begin_${func}_avx2 2432 .rva .LSEH_end_${func}_avx2 2433 .rva .LSEH_info_${func}_avx2 2434___ 2435$code.=<<___; 2436.section .xdata 2437.align 8 2438.LSEH_info_$func: 2439 .byte 9,0,0,0 2440 .rva se_handler 2441 .rva .Lprologue,.Lepilogue # HandlerData[] 2442___ 2443$code.=<<___ if ($SZ==4 && $shaext); 2444.LSEH_info_${func}_shaext: 2445 .byte 9,0,0,0 2446 .rva shaext_handler 2447___ 2448$code.=<<___ if ($SZ==4); 2449.LSEH_info_${func}_ssse3: 2450 .byte 9,0,0,0 2451 .rva se_handler 2452 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 2453___ 2454$code.=<<___ if ($avx && $SZ==8); 2455.LSEH_info_${func}_xop: 2456 .byte 9,0,0,0 2457 .rva se_handler 2458 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 2459___ 2460$code.=<<___ if ($avx); 2461.LSEH_info_${func}_avx: 2462 .byte 9,0,0,0 2463 .rva se_handler 2464 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 2465___ 2466$code.=<<___ if ($avx>1); 2467.LSEH_info_${func}_avx2: 2468 .byte 9,0,0,0 2469 .rva se_handler 2470 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 2471___ 2472} 2473 2474sub sha256op38 { 2475 my $instr = shift; 2476 my %opcodelet = ( 2477 "sha256rnds2" => 0xcb, 2478 "sha256msg1" => 0xcc, 2479 "sha256msg2" => 0xcd ); 2480 2481 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { 2482 my @opcode=(0x0f,0x38); 2483 push @opcode,$opcodelet{$instr}; 2484 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 2485 return ".byte\t".join(',',@opcode); 2486 } else { 2487 return $instr."\t".@_[0]; 2488 } 2489} 2490 2491foreach (split("\n",$code)) { 2492 s/\`([^\`]*)\`/eval $1/geo; 2493 2494 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; 2495 2496 print $_,"\n"; 2497} 2498close STDOUT; 2499