1#! /usr/bin/env perl 2# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# SHA256 block transform for x86. September 2007. 18# 19# Performance improvement over compiler generated code varies from 20# 10% to 40% [see below]. Not very impressive on some µ-archs, but 21# it's 5 times smaller and optimizes amount of writes. 22# 23# May 2012. 24# 25# Optimization including two of Pavel Semjanov's ideas, alternative 26# Maj and full unroll, resulted in ~20-25% improvement on most CPUs, 27# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost 28# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not 29# on P4, where it kills performance, nor Sandy Bridge, where folded 30# loop is approximately as fast... 31# 32# June 2012. 33# 34# Add AMD XOP-specific code path, >30% improvement on Bulldozer over 35# May version, >60% over original. Add AVX+shrd code path, >25% 36# improvement on Sandy Bridge over May version, 60% over original. 37# 38# May 2013. 39# 40# Replace AMD XOP code path with SSSE3 to cover more processors. 41# (Biggest improvement coefficient is on upcoming Atom Silvermont, 42# not shown.) Add AVX+BMI code path. 43# 44# March 2014. 45# 46# Add support for Intel SHA Extensions. 47# 48# Performance in clock cycles per processed byte (less is better): 49# 50# gcc icc x86 asm(*) SIMD x86_64 asm(**) 51# Pentium 46 57 40/38 - - 52# PIII 36 33 27/24 - - 53# P4 41 38 28 - 17.3 54# AMD K8 27 25 19/15.5 - 14.9 55# Core2 26 23 18/15.6 14.3 13.8 56# Westmere 27 - 19/15.7 13.4 12.3 57# Sandy Bridge 25 - 15.9 12.4 11.6 58# Ivy Bridge 24 - 15.0 11.4 10.3 59# Haswell 22 - 13.9 9.46 7.80 60# Skylake 20 - 14.9 9.50 7.70 61# Bulldozer 36 - 27/22 17.0 13.6 62# VIA Nano 36 - 25/22 16.8 16.5 63# Atom 50 - 30/25 21.9 18.9 64# Silvermont 40 - 34/31 22.9 20.6 65# Goldmont 29 - 20 16.3(***) 66# 67# (*) numbers after slash are for unrolled loop, where applicable; 68# (**) x86_64 assembly performance is presented for reference 69# purposes, results are best-available; 70# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; 71 72$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 73push(@INC,"${dir}","${dir}../../perlasm"); 74require "x86asm.pl"; 75 76$output=pop; 77open STDOUT,">$output"; 78 79&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 80 81$xmm=$avx=0; 82for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } 83 84if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 85 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 86 $avx = ($1>=2.19) + ($1>=2.22); 87} 88 89if ($xmm && !$avx && $ARGV[0] eq "win32n" && 90 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 91 $avx = ($1>=2.03) + ($1>=2.10); 92} 93 94if ($xmm && !$avx && $ARGV[0] eq "win32" && 95 `ml 2>&1` =~ /Version ([0-9]+)\./) { 96 $avx = ($1>=10) + ($1>=11); 97} 98 99if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { 100 $avx = ($2>=3.0) + ($2>3.0); 101} 102 103$shaext=$xmm; ### set to zero if compiling for 1.0.1 104 105$unroll_after = 64*4; # If pre-evicted from L1P cache first spin of 106 # fully unrolled loop was measured to run about 107 # 3-4x slower. If slowdown coefficient is N and 108 # unrolled loop is m times faster, then you break 109 # even at (N-1)/(m-1) blocks. Then it needs to be 110 # adjusted for probability of code being evicted, 111 # code size/cache size=1/4. Typical m is 1.15... 112 113$A="eax"; 114$E="edx"; 115$T="ebx"; 116$Aoff=&DWP(4,"esp"); 117$Boff=&DWP(8,"esp"); 118$Coff=&DWP(12,"esp"); 119$Doff=&DWP(16,"esp"); 120$Eoff=&DWP(20,"esp"); 121$Foff=&DWP(24,"esp"); 122$Goff=&DWP(28,"esp"); 123$Hoff=&DWP(32,"esp"); 124$Xoff=&DWP(36,"esp"); 125$K256="ebp"; 126 127sub BODY_16_63() { 128 &mov ($T,"ecx"); # "ecx" is preloaded 129 &mov ("esi",&DWP(4*(9+15+16-14),"esp")); 130 &ror ("ecx",18-7); 131 &mov ("edi","esi"); 132 &ror ("esi",19-17); 133 &xor ("ecx",$T); 134 &shr ($T,3); 135 &ror ("ecx",7); 136 &xor ("esi","edi"); 137 &xor ($T,"ecx"); # T = sigma0(X[-15]) 138 &ror ("esi",17); 139 &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] 140 &shr ("edi",10); 141 &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] 142 #&xor ("edi","esi") # sigma1(X[-2]) 143 # &add ($T,"edi"); # T += sigma1(X[-2]) 144 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 145 146 &BODY_00_15(1); 147} 148sub BODY_00_15() { 149 my $in_16_63=shift; 150 151 &mov ("ecx",$E); 152 &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) 153 &mov ("esi",$Foff); 154 &ror ("ecx",25-11); 155 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) 156 &mov ("edi",$Goff); 157 &xor ("ecx",$E); 158 &xor ("esi","edi"); 159 &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); 160 &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] 161 &ror ("ecx",11-6); 162 &and ("esi",$E); 163 &mov ($Eoff,$E); # modulo-scheduled 164 &xor ($E,"ecx"); 165 &add ($T,$Hoff); # T += h 166 &xor ("esi","edi"); # Ch(e,f,g) 167 &ror ($E,6); # Sigma1(e) 168 &mov ("ecx",$A); 169 &add ($T,"esi"); # T += Ch(e,f,g) 170 171 &ror ("ecx",22-13); 172 &add ($T,$E); # T += Sigma1(e) 173 &mov ("edi",$Boff); 174 &xor ("ecx",$A); 175 &mov ($Aoff,$A); # modulo-scheduled 176 &lea ("esp",&DWP(-4,"esp")); 177 &ror ("ecx",13-2); 178 &mov ("esi",&DWP(0,$K256)); 179 &xor ("ecx",$A); 180 &mov ($E,$Eoff); # e in next iteration, d in this one 181 &xor ($A,"edi"); # a ^= b 182 &ror ("ecx",2); # Sigma0(a) 183 184 &add ($T,"esi"); # T+= K[i] 185 &mov (&DWP(0,"esp"),$A); # (b^c) in next round 186 &add ($E,$T); # d += T 187 &and ($A,&DWP(4,"esp")); # a &= (b^c) 188 &add ($T,"ecx"); # T += Sigma0(a) 189 &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 190 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T 191 &add ($K256,4); 192 &add ($A,$T); # h += T 193} 194 195&external_label("OPENSSL_ia32cap_P") if (!$i386); 196 197&function_begin("sha256_block_data_order"); 198 &mov ("esi",wparam(0)); # ctx 199 &mov ("edi",wparam(1)); # inp 200 &mov ("eax",wparam(2)); # num 201 &mov ("ebx","esp"); # saved sp 202 203 &call (&label("pic_point")); # make it PIC! 204&set_label("pic_point"); 205 &blindpop($K256); 206 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); 207 208 &sub ("esp",16); 209 &and ("esp",-64); 210 211 &shl ("eax",6); 212 &add ("eax","edi"); 213 &mov (&DWP(0,"esp"),"esi"); # ctx 214 &mov (&DWP(4,"esp"),"edi"); # inp 215 &mov (&DWP(8,"esp"),"eax"); # inp+num*128 216 &mov (&DWP(12,"esp"),"ebx"); # saved sp 217 if (!$i386 && $xmm) { 218 &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); 219 &mov ("ecx",&DWP(0,"edx")); 220 &mov ("ebx",&DWP(4,"edx")); 221 &test ("ecx",1<<20); # check for P4 222 &jnz (&label("loop")); 223 &mov ("edx",&DWP(8,"edx")) if ($xmm); 224 &test ("ecx",1<<24); # check for FXSR 225 &jz ($unroll_after?&label("no_xmm"):&label("loop")); 226 &and ("ecx",1<<30); # mask "Intel CPU" bit 227 &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits 228 &test ("edx",1<<29) if ($shaext); # check for SHA 229 &jnz (&label("shaext")) if ($shaext); 230 &or ("ecx","ebx"); 231 &and ("ecx",1<<28|1<<30); 232 &cmp ("ecx",1<<28|1<<30); 233 if ($xmm) { 234 &je (&label("AVX")) if ($avx); 235 &test ("ebx",1<<9); # check for SSSE3 236 &jnz (&label("SSSE3")); 237 } else { 238 &je (&label("loop_shrd")); 239 } 240 if ($unroll_after) { 241&set_label("no_xmm"); 242 &sub ("eax","edi"); 243 &cmp ("eax",$unroll_after); 244 &jae (&label("unrolled")); 245 } } 246 &jmp (&label("loop")); 247 248sub COMPACT_LOOP() { 249my $suffix=shift; 250 251&set_label("loop$suffix",$suffix?32:16); 252 # copy input block to stack reversing byte and dword order 253 for($i=0;$i<4;$i++) { 254 &mov ("eax",&DWP($i*16+0,"edi")); 255 &mov ("ebx",&DWP($i*16+4,"edi")); 256 &mov ("ecx",&DWP($i*16+8,"edi")); 257 &bswap ("eax"); 258 &mov ("edx",&DWP($i*16+12,"edi")); 259 &bswap ("ebx"); 260 &push ("eax"); 261 &bswap ("ecx"); 262 &push ("ebx"); 263 &bswap ("edx"); 264 &push ("ecx"); 265 &push ("edx"); 266 } 267 &add ("edi",64); 268 &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H 269 &mov (&DWP(4*(9+16)+4,"esp"),"edi"); 270 271 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 272 &mov ($A,&DWP(0,"esi")); 273 &mov ("ebx",&DWP(4,"esi")); 274 &mov ("ecx",&DWP(8,"esi")); 275 &mov ("edi",&DWP(12,"esi")); 276 # &mov ($Aoff,$A); 277 &mov ($Boff,"ebx"); 278 &xor ("ebx","ecx"); 279 &mov ($Coff,"ecx"); 280 &mov ($Doff,"edi"); 281 &mov (&DWP(0,"esp"),"ebx"); # magic 282 &mov ($E,&DWP(16,"esi")); 283 &mov ("ebx",&DWP(20,"esi")); 284 &mov ("ecx",&DWP(24,"esi")); 285 &mov ("edi",&DWP(28,"esi")); 286 # &mov ($Eoff,$E); 287 &mov ($Foff,"ebx"); 288 &mov ($Goff,"ecx"); 289 &mov ($Hoff,"edi"); 290 291&set_label("00_15$suffix",16); 292 293 &BODY_00_15(); 294 295 &cmp ("esi",0xc19bf174); 296 &jne (&label("00_15$suffix")); 297 298 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) 299 &jmp (&label("16_63$suffix")); 300 301&set_label("16_63$suffix",16); 302 303 &BODY_16_63(); 304 305 &cmp ("esi",0xc67178f2); 306 &jne (&label("16_63$suffix")); 307 308 &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx 309 # &mov ($A,$Aoff); 310 &mov ("ebx",$Boff); 311 # &mov ("edi",$Coff); 312 &mov ("ecx",$Doff); 313 &add ($A,&DWP(0,"esi")); 314 &add ("ebx",&DWP(4,"esi")); 315 &add ("edi",&DWP(8,"esi")); 316 &add ("ecx",&DWP(12,"esi")); 317 &mov (&DWP(0,"esi"),$A); 318 &mov (&DWP(4,"esi"),"ebx"); 319 &mov (&DWP(8,"esi"),"edi"); 320 &mov (&DWP(12,"esi"),"ecx"); 321 # &mov ($E,$Eoff); 322 &mov ("eax",$Foff); 323 &mov ("ebx",$Goff); 324 &mov ("ecx",$Hoff); 325 &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp 326 &add ($E,&DWP(16,"esi")); 327 &add ("eax",&DWP(20,"esi")); 328 &add ("ebx",&DWP(24,"esi")); 329 &add ("ecx",&DWP(28,"esi")); 330 &mov (&DWP(16,"esi"),$E); 331 &mov (&DWP(20,"esi"),"eax"); 332 &mov (&DWP(24,"esi"),"ebx"); 333 &mov (&DWP(28,"esi"),"ecx"); 334 335 &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame 336 &sub ($K256,4*64); # rewind K 337 338 &cmp ("edi",&DWP(8,"esp")); # are we done yet? 339 &jb (&label("loop$suffix")); 340} 341 &COMPACT_LOOP(); 342 &mov ("esp",&DWP(12,"esp")); # restore sp 343&function_end_A(); 344 if (!$i386 && !$xmm) { 345 # ~20% improvement on Sandy Bridge 346 local *ror = sub { &shrd(@_[0],@_) }; 347 &COMPACT_LOOP("_shrd"); 348 &mov ("esp",&DWP(12,"esp")); # restore sp 349&function_end_A(); 350 } 351 352&set_label("K256",64); # Yes! I keep it in the code segment! 353@K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 354 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 355 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 356 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 357 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 358 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 359 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 360 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 361 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 362 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 363 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 364 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 365 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 366 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 367 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 368 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 369&data_word(@K256); 370&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask 371&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); 372 373($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets 374sub off { &DWP(4*(((shift)-$i)&7),"esp"); } 375 376if (!$i386 && $unroll_after) { 377my @AH=($A,$K256); 378 379&set_label("unrolled",16); 380 &lea ("esp",&DWP(-96,"esp")); 381 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 382 &mov ($AH[0],&DWP(0,"esi")); 383 &mov ($AH[1],&DWP(4,"esi")); 384 &mov ("ecx",&DWP(8,"esi")); 385 &mov ("ebx",&DWP(12,"esi")); 386 #&mov (&DWP(0,"esp"),$AH[0]); 387 &mov (&DWP(4,"esp"),$AH[1]); 388 &xor ($AH[1],"ecx"); # magic 389 &mov (&DWP(8,"esp"),"ecx"); 390 &mov (&DWP(12,"esp"),"ebx"); 391 &mov ($E,&DWP(16,"esi")); 392 &mov ("ebx",&DWP(20,"esi")); 393 &mov ("ecx",&DWP(24,"esi")); 394 &mov ("esi",&DWP(28,"esi")); 395 #&mov (&DWP(16,"esp"),$E); 396 &mov (&DWP(20,"esp"),"ebx"); 397 &mov (&DWP(24,"esp"),"ecx"); 398 &mov (&DWP(28,"esp"),"esi"); 399 &jmp (&label("grand_loop")); 400 401&set_label("grand_loop",16); 402 # copy input block to stack reversing byte order 403 for($i=0;$i<5;$i++) { 404 &mov ("ebx",&DWP(12*$i+0,"edi")); 405 &mov ("ecx",&DWP(12*$i+4,"edi")); 406 &bswap ("ebx"); 407 &mov ("esi",&DWP(12*$i+8,"edi")); 408 &bswap ("ecx"); 409 &mov (&DWP(32+12*$i+0,"esp"),"ebx"); 410 &bswap ("esi"); 411 &mov (&DWP(32+12*$i+4,"esp"),"ecx"); 412 &mov (&DWP(32+12*$i+8,"esp"),"esi"); 413 } 414 &mov ("ebx",&DWP($i*12,"edi")); 415 &add ("edi",64); 416 &bswap ("ebx"); 417 &mov (&DWP(96+4,"esp"),"edi"); 418 &mov (&DWP(32+12*$i,"esp"),"ebx"); 419 420 my ($t1,$t2) = ("ecx","esi"); 421 422 for ($i=0;$i<64;$i++) { 423 424 if ($i>=16) { 425 &mov ($T,$t1); # $t1 is preloaded 426 # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); 427 &ror ($t1,18-7); 428 &mov ("edi",$t2); 429 &ror ($t2,19-17); 430 &xor ($t1,$T); 431 &shr ($T,3); 432 &ror ($t1,7); 433 &xor ($t2,"edi"); 434 &xor ($T,$t1); # T = sigma0(X[-15]) 435 &ror ($t2,17); 436 &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] 437 &shr ("edi",10); 438 &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] 439 #&xor ("edi",$t2) # sigma1(X[-2]) 440 # &add ($T,"edi"); # T += sigma1(X[-2]) 441 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 442 } 443 &mov ($t1,$E); 444 &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) 445 &mov ($t2,&off($f)); 446 &ror ($E,25-11); 447 &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) 448 &mov ("edi",&off($g)); 449 &xor ($E,$t1); 450 &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] 451 &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] 452 &xor ($t2,"edi"); 453 &ror ($E,11-6); 454 &and ($t2,$t1); 455 &mov (&off($e),$t1); # save $E, modulo-scheduled 456 &xor ($E,$t1); 457 &add ($T,&off($h)); # T += h 458 &xor ("edi",$t2); # Ch(e,f,g) 459 &ror ($E,6); # Sigma1(e) 460 &mov ($t1,$AH[0]); 461 &add ($T,"edi"); # T += Ch(e,f,g) 462 463 &ror ($t1,22-13); 464 &mov ($t2,$AH[0]); 465 &mov ("edi",&off($b)); 466 &xor ($t1,$AH[0]); 467 &mov (&off($a),$AH[0]); # save $A, modulo-scheduled 468 &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round 469 &ror ($t1,13-2); 470 &and ($AH[1],$AH[0]); # (b^c) &= (a^b) 471 &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] 472 &xor ($t1,$t2); 473 &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 474 &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); 475 &ror ($t1,2); # Sigma0(a) 476 477 &add ($AH[1],$E); # h += T 478 &add ($E,&off($d)); # d += T 479 &add ($AH[1],$t1); # h += Sigma0(a) 480 &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); 481 482 @AH = reverse(@AH); # rotate(a,h) 483 ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) 484 } 485 &mov ("esi",&DWP(96,"esp")); #ctx 486 #&mov ($AH[0],&DWP(0,"esp")); 487 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 488 #&mov ("edi", &DWP(8,"esp")); 489 &mov ("ecx",&DWP(12,"esp")); 490 &add ($AH[0],&DWP(0,"esi")); 491 &add ($AH[1],&DWP(4,"esi")); 492 &add ("edi",&DWP(8,"esi")); 493 &add ("ecx",&DWP(12,"esi")); 494 &mov (&DWP(0,"esi"),$AH[0]); 495 &mov (&DWP(4,"esi"),$AH[1]); 496 &mov (&DWP(8,"esi"),"edi"); 497 &mov (&DWP(12,"esi"),"ecx"); 498 #&mov (&DWP(0,"esp"),$AH[0]); 499 &mov (&DWP(4,"esp"),$AH[1]); 500 &xor ($AH[1],"edi"); # magic 501 &mov (&DWP(8,"esp"),"edi"); 502 &mov (&DWP(12,"esp"),"ecx"); 503 #&mov ($E,&DWP(16,"esp")); 504 &mov ("edi",&DWP(20,"esp")); 505 &mov ("ebx",&DWP(24,"esp")); 506 &mov ("ecx",&DWP(28,"esp")); 507 &add ($E,&DWP(16,"esi")); 508 &add ("edi",&DWP(20,"esi")); 509 &add ("ebx",&DWP(24,"esi")); 510 &add ("ecx",&DWP(28,"esi")); 511 &mov (&DWP(16,"esi"),$E); 512 &mov (&DWP(20,"esi"),"edi"); 513 &mov (&DWP(24,"esi"),"ebx"); 514 &mov (&DWP(28,"esi"),"ecx"); 515 #&mov (&DWP(16,"esp"),$E); 516 &mov (&DWP(20,"esp"),"edi"); 517 &mov ("edi",&DWP(96+4,"esp")); # inp 518 &mov (&DWP(24,"esp"),"ebx"); 519 &mov (&DWP(28,"esp"),"ecx"); 520 521 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 522 &jb (&label("grand_loop")); 523 524 &mov ("esp",&DWP(96+12,"esp")); # restore sp 525&function_end_A(); 526} 527 if (!$i386 && $xmm) {{{ 528if ($shaext) { 529###################################################################### 530# Intel SHA Extensions implementation of SHA256 update function. 531# 532my ($ctx,$inp,$end)=("esi","edi","eax"); 533my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); 534my @MSG=map("xmm$_",(3..6)); 535 536sub sha256op38 { 537 my ($opcodelet,$dst,$src)=@_; 538 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 539 { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } 540} 541sub sha256rnds2 { sha256op38(0xcb,@_); } 542sub sha256msg1 { sha256op38(0xcc,@_); } 543sub sha256msg2 { sha256op38(0xcd,@_); } 544 545&set_label("shaext",32); 546 &sub ("esp",32); 547 548 &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA 549 &lea ($K256,&DWP(0x80,$K256)); 550 &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE 551 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 552 553 &pshufd ($Wi,$ABEF,0x1b); # ABCD 554 &pshufd ($ABEF,$ABEF,0xb1); # CDAB 555 &pshufd ($CDGH,$CDGH,0x1b); # EFGH 556 &palignr ($ABEF,$CDGH,8); # ABEF 557 &punpcklqdq ($CDGH,$Wi); # CDGH 558 &jmp (&label("loop_shaext")); 559 560&set_label("loop_shaext",16); 561 &movdqu (@MSG[0],&QWP(0,$inp)); 562 &movdqu (@MSG[1],&QWP(0x10,$inp)); 563 &movdqu (@MSG[2],&QWP(0x20,$inp)); 564 &pshufb (@MSG[0],$TMP); 565 &movdqu (@MSG[3],&QWP(0x30,$inp)); 566 &movdqa (&QWP(16,"esp"),$CDGH); # offload 567 568 &movdqa ($Wi,&QWP(0*16-0x80,$K256)); 569 &paddd ($Wi,@MSG[0]); 570 &pshufb (@MSG[1],$TMP); 571 &sha256rnds2 ($CDGH,$ABEF); # 0-3 572 &pshufd ($Wi,$Wi,0x0e); 573 &nop (); 574 &movdqa (&QWP(0,"esp"),$ABEF); # offload 575 &sha256rnds2 ($ABEF,$CDGH); 576 577 &movdqa ($Wi,&QWP(1*16-0x80,$K256)); 578 &paddd ($Wi,@MSG[1]); 579 &pshufb (@MSG[2],$TMP); 580 &sha256rnds2 ($CDGH,$ABEF); # 4-7 581 &pshufd ($Wi,$Wi,0x0e); 582 &lea ($inp,&DWP(0x40,$inp)); 583 &sha256msg1 (@MSG[0],@MSG[1]); 584 &sha256rnds2 ($ABEF,$CDGH); 585 586 &movdqa ($Wi,&QWP(2*16-0x80,$K256)); 587 &paddd ($Wi,@MSG[2]); 588 &pshufb (@MSG[3],$TMP); 589 &sha256rnds2 ($CDGH,$ABEF); # 8-11 590 &pshufd ($Wi,$Wi,0x0e); 591 &movdqa ($TMP,@MSG[3]); 592 &palignr ($TMP,@MSG[2],4); 593 &nop (); 594 &paddd (@MSG[0],$TMP); 595 &sha256msg1 (@MSG[1],@MSG[2]); 596 &sha256rnds2 ($ABEF,$CDGH); 597 598 &movdqa ($Wi,&QWP(3*16-0x80,$K256)); 599 &paddd ($Wi,@MSG[3]); 600 &sha256msg2 (@MSG[0],@MSG[3]); 601 &sha256rnds2 ($CDGH,$ABEF); # 12-15 602 &pshufd ($Wi,$Wi,0x0e); 603 &movdqa ($TMP,@MSG[0]); 604 &palignr ($TMP,@MSG[3],4); 605 &nop (); 606 &paddd (@MSG[1],$TMP); 607 &sha256msg1 (@MSG[2],@MSG[3]); 608 &sha256rnds2 ($ABEF,$CDGH); 609 610for($i=4;$i<16-3;$i++) { 611 &movdqa ($Wi,&QWP($i*16-0x80,$K256)); 612 &paddd ($Wi,@MSG[0]); 613 &sha256msg2 (@MSG[1],@MSG[0]); 614 &sha256rnds2 ($CDGH,$ABEF); # 16-19... 615 &pshufd ($Wi,$Wi,0x0e); 616 &movdqa ($TMP,@MSG[1]); 617 &palignr ($TMP,@MSG[0],4); 618 &nop (); 619 &paddd (@MSG[2],$TMP); 620 &sha256msg1 (@MSG[3],@MSG[0]); 621 &sha256rnds2 ($ABEF,$CDGH); 622 623 push(@MSG,shift(@MSG)); 624} 625 &movdqa ($Wi,&QWP(13*16-0x80,$K256)); 626 &paddd ($Wi,@MSG[0]); 627 &sha256msg2 (@MSG[1],@MSG[0]); 628 &sha256rnds2 ($CDGH,$ABEF); # 52-55 629 &pshufd ($Wi,$Wi,0x0e); 630 &movdqa ($TMP,@MSG[1]) 631 &palignr ($TMP,@MSG[0],4); 632 &sha256rnds2 ($ABEF,$CDGH); 633 &paddd (@MSG[2],$TMP); 634 635 &movdqa ($Wi,&QWP(14*16-0x80,$K256)); 636 &paddd ($Wi,@MSG[1]); 637 &sha256rnds2 ($CDGH,$ABEF); # 56-59 638 &pshufd ($Wi,$Wi,0x0e); 639 &sha256msg2 (@MSG[2],@MSG[1]); 640 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 641 &sha256rnds2 ($ABEF,$CDGH); 642 643 &movdqa ($Wi,&QWP(15*16-0x80,$K256)); 644 &paddd ($Wi,@MSG[2]); 645 &nop (); 646 &sha256rnds2 ($CDGH,$ABEF); # 60-63 647 &pshufd ($Wi,$Wi,0x0e); 648 &cmp ($end,$inp); 649 &nop (); 650 &sha256rnds2 ($ABEF,$CDGH); 651 652 &paddd ($CDGH,&QWP(16,"esp")); 653 &paddd ($ABEF,&QWP(0,"esp")); 654 &jnz (&label("loop_shaext")); 655 656 &pshufd ($CDGH,$CDGH,0xb1); # DCHG 657 &pshufd ($TMP,$ABEF,0x1b); # FEBA 658 &pshufd ($ABEF,$ABEF,0xb1); # BAFE 659 &punpckhqdq ($ABEF,$CDGH); # DCBA 660 &palignr ($CDGH,$TMP,8); # HGFE 661 662 &mov ("esp",&DWP(32+12,"esp")); 663 &movdqu (&QWP(0,$ctx),$ABEF); 664 &movdqu (&QWP(16,$ctx),$CDGH); 665&function_end_A(); 666} 667 668my @X = map("xmm$_",(0..3)); 669my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); 670my @AH = ($A,$T); 671 672&set_label("SSSE3",32); 673 &lea ("esp",&DWP(-96,"esp")); 674 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 675 &mov ($AH[0],&DWP(0,"esi")); 676 &mov ($AH[1],&DWP(4,"esi")); 677 &mov ("ecx",&DWP(8,"esi")); 678 &mov ("edi",&DWP(12,"esi")); 679 #&mov (&DWP(0,"esp"),$AH[0]); 680 &mov (&DWP(4,"esp"),$AH[1]); 681 &xor ($AH[1],"ecx"); # magic 682 &mov (&DWP(8,"esp"),"ecx"); 683 &mov (&DWP(12,"esp"),"edi"); 684 &mov ($E,&DWP(16,"esi")); 685 &mov ("edi",&DWP(20,"esi")); 686 &mov ("ecx",&DWP(24,"esi")); 687 &mov ("esi",&DWP(28,"esi")); 688 #&mov (&DWP(16,"esp"),$E); 689 &mov (&DWP(20,"esp"),"edi"); 690 &mov ("edi",&DWP(96+4,"esp")); # inp 691 &mov (&DWP(24,"esp"),"ecx"); 692 &mov (&DWP(28,"esp"),"esi"); 693 &movdqa ($t3,&QWP(256,$K256)); 694 &jmp (&label("grand_ssse3")); 695 696&set_label("grand_ssse3",16); 697 # load input, reverse byte order, add K256[0..15], save to stack 698 &movdqu (@X[0],&QWP(0,"edi")); 699 &movdqu (@X[1],&QWP(16,"edi")); 700 &movdqu (@X[2],&QWP(32,"edi")); 701 &movdqu (@X[3],&QWP(48,"edi")); 702 &add ("edi",64); 703 &pshufb (@X[0],$t3); 704 &mov (&DWP(96+4,"esp"),"edi"); 705 &pshufb (@X[1],$t3); 706 &movdqa ($t0,&QWP(0,$K256)); 707 &pshufb (@X[2],$t3); 708 &movdqa ($t1,&QWP(16,$K256)); 709 &paddd ($t0,@X[0]); 710 &pshufb (@X[3],$t3); 711 &movdqa ($t2,&QWP(32,$K256)); 712 &paddd ($t1,@X[1]); 713 &movdqa ($t3,&QWP(48,$K256)); 714 &movdqa (&QWP(32+0,"esp"),$t0); 715 &paddd ($t2,@X[2]); 716 &movdqa (&QWP(32+16,"esp"),$t1); 717 &paddd ($t3,@X[3]); 718 &movdqa (&QWP(32+32,"esp"),$t2); 719 &movdqa (&QWP(32+48,"esp"),$t3); 720 &jmp (&label("ssse3_00_47")); 721 722&set_label("ssse3_00_47",16); 723 &add ($K256,64); 724 725sub SSSE3_00_47 () { 726my $j = shift; 727my $body = shift; 728my @X = @_; 729my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 730 731 eval(shift(@insns)); 732 &movdqa ($t0,@X[1]); 733 eval(shift(@insns)); # @ 734 eval(shift(@insns)); 735 &movdqa ($t3,@X[3]); 736 eval(shift(@insns)); 737 eval(shift(@insns)); 738 &palignr ($t0,@X[0],4); # X[1..4] 739 eval(shift(@insns)); 740 eval(shift(@insns)); # @ 741 eval(shift(@insns)); 742 &palignr ($t3,@X[2],4); # X[9..12] 743 eval(shift(@insns)); 744 eval(shift(@insns)); 745 eval(shift(@insns)); 746 &movdqa ($t1,$t0); 747 eval(shift(@insns)); # @ 748 eval(shift(@insns)); 749 &movdqa ($t2,$t0); 750 eval(shift(@insns)); 751 eval(shift(@insns)); 752 &psrld ($t0,3); 753 eval(shift(@insns)); 754 eval(shift(@insns)); # @ 755 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 756 eval(shift(@insns)); 757 eval(shift(@insns)); 758 &psrld ($t2,7); 759 eval(shift(@insns)); 760 eval(shift(@insns)); 761 eval(shift(@insns)); # @ 762 eval(shift(@insns)); 763 &pshufd ($t3,@X[3],0b11111010); # X[14..15] 764 eval(shift(@insns)); 765 eval(shift(@insns)); 766 &pslld ($t1,32-18); 767 eval(shift(@insns)); 768 eval(shift(@insns)); # @ 769 &pxor ($t0,$t2); 770 eval(shift(@insns)); 771 eval(shift(@insns)); 772 &psrld ($t2,18-7); 773 eval(shift(@insns)); 774 eval(shift(@insns)); 775 eval(shift(@insns)); # @ 776 &pxor ($t0,$t1); 777 eval(shift(@insns)); 778 eval(shift(@insns)); 779 &pslld ($t1,18-7); 780 eval(shift(@insns)); 781 eval(shift(@insns)); 782 eval(shift(@insns)); # @ 783 &pxor ($t0,$t2); 784 eval(shift(@insns)); 785 eval(shift(@insns)); 786 &movdqa ($t2,$t3); 787 eval(shift(@insns)); 788 eval(shift(@insns)); 789 eval(shift(@insns)); # @ 790 &pxor ($t0,$t1); # sigma0(X[1..4]) 791 eval(shift(@insns)); 792 eval(shift(@insns)); 793 &psrld ($t3,10); 794 eval(shift(@insns)); 795 eval(shift(@insns)); 796 eval(shift(@insns)); # @ 797 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 798 eval(shift(@insns)); 799 eval(shift(@insns)); 800 &psrlq ($t2,17); 801 eval(shift(@insns)); 802 eval(shift(@insns)); 803 eval(shift(@insns)); # @ 804 &pxor ($t3,$t2); 805 eval(shift(@insns)); 806 eval(shift(@insns)); 807 &psrlq ($t2,19-17); 808 eval(shift(@insns)); 809 eval(shift(@insns)); 810 eval(shift(@insns)); # @ 811 &pxor ($t3,$t2); 812 eval(shift(@insns)); 813 eval(shift(@insns)); 814 &pshufd ($t3,$t3,0b10000000); 815 eval(shift(@insns)); 816 eval(shift(@insns)); 817 eval(shift(@insns)); # @ 818 eval(shift(@insns)); 819 eval(shift(@insns)); 820 eval(shift(@insns)); 821 eval(shift(@insns)); 822 eval(shift(@insns)); # @ 823 eval(shift(@insns)); 824 &psrldq ($t3,8); 825 eval(shift(@insns)); 826 eval(shift(@insns)); 827 eval(shift(@insns)); 828 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 829 eval(shift(@insns)); # @ 830 eval(shift(@insns)); 831 eval(shift(@insns)); 832 eval(shift(@insns)); 833 eval(shift(@insns)); 834 eval(shift(@insns)); # @ 835 eval(shift(@insns)); 836 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 837 eval(shift(@insns)); 838 eval(shift(@insns)); 839 eval(shift(@insns)); 840 &movdqa ($t2,$t3); 841 eval(shift(@insns)); # @ 842 &psrld ($t3,10); 843 eval(shift(@insns)); 844 &psrlq ($t2,17); 845 eval(shift(@insns)); 846 eval(shift(@insns)); 847 eval(shift(@insns)); 848 eval(shift(@insns)); # @ 849 &pxor ($t3,$t2); 850 eval(shift(@insns)); 851 eval(shift(@insns)); 852 &psrlq ($t2,19-17); 853 eval(shift(@insns)); 854 eval(shift(@insns)); 855 eval(shift(@insns)); # @ 856 &pxor ($t3,$t2); 857 eval(shift(@insns)); 858 eval(shift(@insns)); 859 eval(shift(@insns)); 860 &pshufd ($t3,$t3,0b00001000); 861 eval(shift(@insns)); 862 eval(shift(@insns)); # @ 863 &movdqa ($t2,&QWP(16*$j,$K256)); 864 eval(shift(@insns)); 865 eval(shift(@insns)); 866 &pslldq ($t3,8); 867 eval(shift(@insns)); 868 eval(shift(@insns)); 869 eval(shift(@insns)); # @ 870 eval(shift(@insns)); 871 eval(shift(@insns)); 872 eval(shift(@insns)); 873 eval(shift(@insns)); 874 eval(shift(@insns)); # @ 875 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 876 eval(shift(@insns)); 877 eval(shift(@insns)); 878 eval(shift(@insns)); 879 eval(shift(@insns)); 880 &paddd ($t2,@X[0]); 881 eval(shift(@insns)); # @ 882 883 foreach (@insns) { eval; } # remaining instructions 884 885 &movdqa (&QWP(32+16*$j,"esp"),$t2); 886} 887 888sub body_00_15 () { 889 ( 890 '&mov ("ecx",$E);', 891 '&ror ($E,25-11);', 892 '&mov ("esi",&off($f));', 893 '&xor ($E,"ecx");', 894 '&mov ("edi",&off($g));', 895 '&xor ("esi","edi");', 896 '&ror ($E,11-6);', 897 '&and ("esi","ecx");', 898 '&mov (&off($e),"ecx");', # save $E, modulo-scheduled 899 '&xor ($E,"ecx");', 900 '&xor ("edi","esi");', # Ch(e,f,g) 901 '&ror ($E,6);', # T = Sigma1(e) 902 '&mov ("ecx",$AH[0]);', 903 '&add ($E,"edi");', # T += Ch(e,f,g) 904 '&mov ("edi",&off($b));', 905 '&mov ("esi",$AH[0]);', 906 907 '&ror ("ecx",22-13);', 908 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 909 '&xor ("ecx",$AH[0]);', 910 '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round 911 '&add ($E,&off($h));', # T += h 912 '&ror ("ecx",13-2);', 913 '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) 914 '&xor ("ecx","esi");', 915 '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] 916 '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) 917 '&ror ("ecx",2);', # Sigma0(a) 918 919 '&add ($AH[1],$E);', # h += T 920 '&add ($E,&off($d));', # d += T 921 '&add ($AH[1],"ecx");'. # h += Sigma0(a) 922 923 '@AH = reverse(@AH); $i++;' # rotate(a,h) 924 ); 925} 926 927 for ($i=0,$j=0; $j<4; $j++) { 928 &SSSE3_00_47($j,\&body_00_15,@X); 929 push(@X,shift(@X)); # rotate(@X) 930 } 931 &cmp (&DWP(16*$j,$K256),0x00010203); 932 &jne (&label("ssse3_00_47")); 933 934 for ($i=0; $i<16; ) { 935 foreach(body_00_15()) { eval; } 936 } 937 938 &mov ("esi",&DWP(96,"esp")); #ctx 939 #&mov ($AH[0],&DWP(0,"esp")); 940 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 941 #&mov ("edi", &DWP(8,"esp")); 942 &mov ("ecx",&DWP(12,"esp")); 943 &add ($AH[0],&DWP(0,"esi")); 944 &add ($AH[1],&DWP(4,"esi")); 945 &add ("edi",&DWP(8,"esi")); 946 &add ("ecx",&DWP(12,"esi")); 947 &mov (&DWP(0,"esi"),$AH[0]); 948 &mov (&DWP(4,"esi"),$AH[1]); 949 &mov (&DWP(8,"esi"),"edi"); 950 &mov (&DWP(12,"esi"),"ecx"); 951 #&mov (&DWP(0,"esp"),$AH[0]); 952 &mov (&DWP(4,"esp"),$AH[1]); 953 &xor ($AH[1],"edi"); # magic 954 &mov (&DWP(8,"esp"),"edi"); 955 &mov (&DWP(12,"esp"),"ecx"); 956 #&mov ($E,&DWP(16,"esp")); 957 &mov ("edi",&DWP(20,"esp")); 958 &mov ("ecx",&DWP(24,"esp")); 959 &add ($E,&DWP(16,"esi")); 960 &add ("edi",&DWP(20,"esi")); 961 &add ("ecx",&DWP(24,"esi")); 962 &mov (&DWP(16,"esi"),$E); 963 &mov (&DWP(20,"esi"),"edi"); 964 &mov (&DWP(20,"esp"),"edi"); 965 &mov ("edi",&DWP(28,"esp")); 966 &mov (&DWP(24,"esi"),"ecx"); 967 #&mov (&DWP(16,"esp"),$E); 968 &add ("edi",&DWP(28,"esi")); 969 &mov (&DWP(24,"esp"),"ecx"); 970 &mov (&DWP(28,"esi"),"edi"); 971 &mov (&DWP(28,"esp"),"edi"); 972 &mov ("edi",&DWP(96+4,"esp")); # inp 973 974 &movdqa ($t3,&QWP(64,$K256)); 975 &sub ($K256,3*64); # rewind K 976 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 977 &jb (&label("grand_ssse3")); 978 979 &mov ("esp",&DWP(96+12,"esp")); # restore sp 980&function_end_A(); 981 if ($avx) { 982&set_label("AVX",32); 983 if ($avx>1) { 984 &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 985 &cmp ("edx",1<<8|1<<3); 986 &je (&label("AVX_BMI")); 987 } 988 &lea ("esp",&DWP(-96,"esp")); 989 &vzeroall (); 990 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 991 &mov ($AH[0],&DWP(0,"esi")); 992 &mov ($AH[1],&DWP(4,"esi")); 993 &mov ("ecx",&DWP(8,"esi")); 994 &mov ("edi",&DWP(12,"esi")); 995 #&mov (&DWP(0,"esp"),$AH[0]); 996 &mov (&DWP(4,"esp"),$AH[1]); 997 &xor ($AH[1],"ecx"); # magic 998 &mov (&DWP(8,"esp"),"ecx"); 999 &mov (&DWP(12,"esp"),"edi"); 1000 &mov ($E,&DWP(16,"esi")); 1001 &mov ("edi",&DWP(20,"esi")); 1002 &mov ("ecx",&DWP(24,"esi")); 1003 &mov ("esi",&DWP(28,"esi")); 1004 #&mov (&DWP(16,"esp"),$E); 1005 &mov (&DWP(20,"esp"),"edi"); 1006 &mov ("edi",&DWP(96+4,"esp")); # inp 1007 &mov (&DWP(24,"esp"),"ecx"); 1008 &mov (&DWP(28,"esp"),"esi"); 1009 &vmovdqa ($t3,&QWP(256,$K256)); 1010 &jmp (&label("grand_avx")); 1011 1012&set_label("grand_avx",32); 1013 # load input, reverse byte order, add K256[0..15], save to stack 1014 &vmovdqu (@X[0],&QWP(0,"edi")); 1015 &vmovdqu (@X[1],&QWP(16,"edi")); 1016 &vmovdqu (@X[2],&QWP(32,"edi")); 1017 &vmovdqu (@X[3],&QWP(48,"edi")); 1018 &add ("edi",64); 1019 &vpshufb (@X[0],@X[0],$t3); 1020 &mov (&DWP(96+4,"esp"),"edi"); 1021 &vpshufb (@X[1],@X[1],$t3); 1022 &vpshufb (@X[2],@X[2],$t3); 1023 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1024 &vpshufb (@X[3],@X[3],$t3); 1025 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1026 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1027 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1028 &vmovdqa (&QWP(32+0,"esp"),$t0); 1029 &vmovdqa (&QWP(32+16,"esp"),$t1); 1030 &vmovdqa (&QWP(32+32,"esp"),$t2); 1031 &vmovdqa (&QWP(32+48,"esp"),$t3); 1032 &jmp (&label("avx_00_47")); 1033 1034&set_label("avx_00_47",16); 1035 &add ($K256,64); 1036 1037sub Xupdate_AVX () { 1038 ( 1039 '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] 1040 '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] 1041 '&vpsrld ($t2,$t0,7);', 1042 '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] 1043 '&vpsrld ($t3,$t0,3);', 1044 '&vpslld ($t1,$t0,14);', 1045 '&vpxor ($t0,$t3,$t2);', 1046 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1047 '&vpsrld ($t2,$t2,18-7);', 1048 '&vpxor ($t0,$t0,$t1);', 1049 '&vpslld ($t1,$t1,25-14);', 1050 '&vpxor ($t0,$t0,$t2);', 1051 '&vpsrld ($t2,$t3,10);', 1052 '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) 1053 '&vpsrlq ($t1,$t3,17);', 1054 '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 1055 '&vpxor ($t2,$t2,$t1);', 1056 '&vpsrlq ($t3,$t3,19);', 1057 '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15] 1058 '&vpshufd ($t3,$t2,0b10000100);', 1059 '&vpsrldq ($t3,$t3,8);', 1060 '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) 1061 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1062 '&vpsrld ($t2,$t3,10);', 1063 '&vpsrlq ($t1,$t3,17);', 1064 '&vpxor ($t2,$t2,$t1);', 1065 '&vpsrlq ($t3,$t3,19);', 1066 '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17] 1067 '&vpshufd ($t3,$t2,0b11101000);', 1068 '&vpslldq ($t3,$t3,8);', 1069 '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) 1070 ); 1071} 1072 1073local *ror = sub { &shrd(@_[0],@_) }; 1074sub AVX_00_47 () { 1075my $j = shift; 1076my $body = shift; 1077my @X = @_; 1078my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 1079my $insn; 1080 1081 foreach (Xupdate_AVX()) { # 31 instructions 1082 eval; 1083 eval(shift(@insns)); 1084 eval(shift(@insns)); 1085 eval($insn = shift(@insns)); 1086 eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); 1087 } 1088 &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); 1089 foreach (@insns) { eval; } # remaining instructions 1090 &vmovdqa (&QWP(32+16*$j,"esp"),$t2); 1091} 1092 1093 for ($i=0,$j=0; $j<4; $j++) { 1094 &AVX_00_47($j,\&body_00_15,@X); 1095 push(@X,shift(@X)); # rotate(@X) 1096 } 1097 &cmp (&DWP(16*$j,$K256),0x00010203); 1098 &jne (&label("avx_00_47")); 1099 1100 for ($i=0; $i<16; ) { 1101 foreach(body_00_15()) { eval; } 1102 } 1103 1104 &mov ("esi",&DWP(96,"esp")); #ctx 1105 #&mov ($AH[0],&DWP(0,"esp")); 1106 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1107 #&mov ("edi", &DWP(8,"esp")); 1108 &mov ("ecx",&DWP(12,"esp")); 1109 &add ($AH[0],&DWP(0,"esi")); 1110 &add ($AH[1],&DWP(4,"esi")); 1111 &add ("edi",&DWP(8,"esi")); 1112 &add ("ecx",&DWP(12,"esi")); 1113 &mov (&DWP(0,"esi"),$AH[0]); 1114 &mov (&DWP(4,"esi"),$AH[1]); 1115 &mov (&DWP(8,"esi"),"edi"); 1116 &mov (&DWP(12,"esi"),"ecx"); 1117 #&mov (&DWP(0,"esp"),$AH[0]); 1118 &mov (&DWP(4,"esp"),$AH[1]); 1119 &xor ($AH[1],"edi"); # magic 1120 &mov (&DWP(8,"esp"),"edi"); 1121 &mov (&DWP(12,"esp"),"ecx"); 1122 #&mov ($E,&DWP(16,"esp")); 1123 &mov ("edi",&DWP(20,"esp")); 1124 &mov ("ecx",&DWP(24,"esp")); 1125 &add ($E,&DWP(16,"esi")); 1126 &add ("edi",&DWP(20,"esi")); 1127 &add ("ecx",&DWP(24,"esi")); 1128 &mov (&DWP(16,"esi"),$E); 1129 &mov (&DWP(20,"esi"),"edi"); 1130 &mov (&DWP(20,"esp"),"edi"); 1131 &mov ("edi",&DWP(28,"esp")); 1132 &mov (&DWP(24,"esi"),"ecx"); 1133 #&mov (&DWP(16,"esp"),$E); 1134 &add ("edi",&DWP(28,"esi")); 1135 &mov (&DWP(24,"esp"),"ecx"); 1136 &mov (&DWP(28,"esi"),"edi"); 1137 &mov (&DWP(28,"esp"),"edi"); 1138 &mov ("edi",&DWP(96+4,"esp")); # inp 1139 1140 &vmovdqa ($t3,&QWP(64,$K256)); 1141 &sub ($K256,3*64); # rewind K 1142 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1143 &jb (&label("grand_avx")); 1144 1145 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1146 &vzeroall (); 1147&function_end_A(); 1148 if ($avx>1) { 1149sub bodyx_00_15 () { # +10% 1150 ( 1151 '&rorx ("ecx",$E,6)', 1152 '&rorx ("esi",$E,11)', 1153 '&mov (&off($e),$E)', # save $E, modulo-scheduled 1154 '&rorx ("edi",$E,25)', 1155 '&xor ("ecx","esi")', 1156 '&andn ("esi",$E,&off($g))', 1157 '&xor ("ecx","edi")', # Sigma1(e) 1158 '&and ($E,&off($f))', 1159 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 1160 '&or ($E,"esi")', # T = Ch(e,f,g) 1161 1162 '&rorx ("edi",$AH[0],2)', 1163 '&rorx ("esi",$AH[0],13)', 1164 '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) 1165 '&rorx ("ecx",$AH[0],22)', 1166 '&xor ("esi","edi")', 1167 '&mov ("edi",&off($b))', 1168 '&xor ("ecx","esi")', # Sigma0(a) 1169 1170 '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round 1171 '&add ($E,&off($h))', # T += h 1172 '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) 1173 '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] 1174 '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) 1175 1176 '&add ("ecx",$E)', # h += T 1177 '&add ($E,&off($d))', # d += T 1178 '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) 1179 1180 '@AH = reverse(@AH); $i++;' # rotate(a,h) 1181 ); 1182} 1183 1184&set_label("AVX_BMI",32); 1185 &lea ("esp",&DWP(-96,"esp")); 1186 &vzeroall (); 1187 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 1188 &mov ($AH[0],&DWP(0,"esi")); 1189 &mov ($AH[1],&DWP(4,"esi")); 1190 &mov ("ecx",&DWP(8,"esi")); 1191 &mov ("edi",&DWP(12,"esi")); 1192 #&mov (&DWP(0,"esp"),$AH[0]); 1193 &mov (&DWP(4,"esp"),$AH[1]); 1194 &xor ($AH[1],"ecx"); # magic 1195 &mov (&DWP(8,"esp"),"ecx"); 1196 &mov (&DWP(12,"esp"),"edi"); 1197 &mov ($E,&DWP(16,"esi")); 1198 &mov ("edi",&DWP(20,"esi")); 1199 &mov ("ecx",&DWP(24,"esi")); 1200 &mov ("esi",&DWP(28,"esi")); 1201 #&mov (&DWP(16,"esp"),$E); 1202 &mov (&DWP(20,"esp"),"edi"); 1203 &mov ("edi",&DWP(96+4,"esp")); # inp 1204 &mov (&DWP(24,"esp"),"ecx"); 1205 &mov (&DWP(28,"esp"),"esi"); 1206 &vmovdqa ($t3,&QWP(256,$K256)); 1207 &jmp (&label("grand_avx_bmi")); 1208 1209&set_label("grand_avx_bmi",32); 1210 # load input, reverse byte order, add K256[0..15], save to stack 1211 &vmovdqu (@X[0],&QWP(0,"edi")); 1212 &vmovdqu (@X[1],&QWP(16,"edi")); 1213 &vmovdqu (@X[2],&QWP(32,"edi")); 1214 &vmovdqu (@X[3],&QWP(48,"edi")); 1215 &add ("edi",64); 1216 &vpshufb (@X[0],@X[0],$t3); 1217 &mov (&DWP(96+4,"esp"),"edi"); 1218 &vpshufb (@X[1],@X[1],$t3); 1219 &vpshufb (@X[2],@X[2],$t3); 1220 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1221 &vpshufb (@X[3],@X[3],$t3); 1222 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1223 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1224 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1225 &vmovdqa (&QWP(32+0,"esp"),$t0); 1226 &vmovdqa (&QWP(32+16,"esp"),$t1); 1227 &vmovdqa (&QWP(32+32,"esp"),$t2); 1228 &vmovdqa (&QWP(32+48,"esp"),$t3); 1229 &jmp (&label("avx_bmi_00_47")); 1230 1231&set_label("avx_bmi_00_47",16); 1232 &add ($K256,64); 1233 1234 for ($i=0,$j=0; $j<4; $j++) { 1235 &AVX_00_47($j,\&bodyx_00_15,@X); 1236 push(@X,shift(@X)); # rotate(@X) 1237 } 1238 &cmp (&DWP(16*$j,$K256),0x00010203); 1239 &jne (&label("avx_bmi_00_47")); 1240 1241 for ($i=0; $i<16; ) { 1242 foreach(bodyx_00_15()) { eval; } 1243 } 1244 1245 &mov ("esi",&DWP(96,"esp")); #ctx 1246 #&mov ($AH[0],&DWP(0,"esp")); 1247 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1248 #&mov ("edi", &DWP(8,"esp")); 1249 &mov ("ecx",&DWP(12,"esp")); 1250 &add ($AH[0],&DWP(0,"esi")); 1251 &add ($AH[1],&DWP(4,"esi")); 1252 &add ("edi",&DWP(8,"esi")); 1253 &add ("ecx",&DWP(12,"esi")); 1254 &mov (&DWP(0,"esi"),$AH[0]); 1255 &mov (&DWP(4,"esi"),$AH[1]); 1256 &mov (&DWP(8,"esi"),"edi"); 1257 &mov (&DWP(12,"esi"),"ecx"); 1258 #&mov (&DWP(0,"esp"),$AH[0]); 1259 &mov (&DWP(4,"esp"),$AH[1]); 1260 &xor ($AH[1],"edi"); # magic 1261 &mov (&DWP(8,"esp"),"edi"); 1262 &mov (&DWP(12,"esp"),"ecx"); 1263 #&mov ($E,&DWP(16,"esp")); 1264 &mov ("edi",&DWP(20,"esp")); 1265 &mov ("ecx",&DWP(24,"esp")); 1266 &add ($E,&DWP(16,"esi")); 1267 &add ("edi",&DWP(20,"esi")); 1268 &add ("ecx",&DWP(24,"esi")); 1269 &mov (&DWP(16,"esi"),$E); 1270 &mov (&DWP(20,"esi"),"edi"); 1271 &mov (&DWP(20,"esp"),"edi"); 1272 &mov ("edi",&DWP(28,"esp")); 1273 &mov (&DWP(24,"esi"),"ecx"); 1274 #&mov (&DWP(16,"esp"),$E); 1275 &add ("edi",&DWP(28,"esi")); 1276 &mov (&DWP(24,"esp"),"ecx"); 1277 &mov (&DWP(28,"esi"),"edi"); 1278 &mov (&DWP(28,"esp"),"edi"); 1279 &mov ("edi",&DWP(96+4,"esp")); # inp 1280 1281 &vmovdqa ($t3,&QWP(64,$K256)); 1282 &sub ($K256,3*64); # rewind K 1283 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1284 &jb (&label("grand_avx_bmi")); 1285 1286 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1287 &vzeroall (); 1288&function_end_A(); 1289 } 1290 } 1291 }}} 1292&function_end_B("sha256_block_data_order"); 1293 1294&asm_finish(); 1295 1296close STDOUT or die "error closing STDOUT: $!"; 1297