1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# January 2013 18# 19# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled 20# in http://download.intel.com/design/intarch/papers/323686.pdf, is 21# that since AESNI-CBC encrypt exhibit *very* low instruction-level 22# parallelism, interleaving it with another algorithm would allow to 23# utilize processor resources better and achieve better performance. 24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and 25# AESNI code is weaved into it. As SHA256 dominates execution time, 26# stitch performance does not depend on AES key length. Below are 27# performance numbers in cycles per processed byte, less is better, 28# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched 29# subroutine: 30# 31# AES-128/-192/-256+SHA256 this(**) gain 32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% 33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% 34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% 35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% 36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% 37# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% 38# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% 39# 40# (*) there are XOP, AVX1 and AVX2 code paths, meaning that 41# Westmere is omitted from loop, this is because gain was not 42# estimated high enough to justify the effort; 43# (**) these are EVP-free results, results obtained with 'speed 44# -evp aes-256-cbc-hmac-sha256' will vary by percent or two; 45# (***) these are SHAEXT results; 46 47$flavour = shift; 48$output = shift; 49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 50 51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 52 53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 56die "can't locate x86_64-xlate.pl"; 57 58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 60 $avx = ($1>=2.19) + ($1>=2.22); 61} 62 63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 65 $avx = ($1>=2.09) + ($1>=2.10); 66} 67 68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 69 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 70 $avx = ($1>=10) + ($1>=12); 71} 72 73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 74 $avx = ($2>=3.0) + ($2>3.0); 75} 76 77$shaext=$avx; ### set to zero if compiling for 1.0.1 78$avx=1 if (!$shaext && $avx); 79 80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 81*STDOUT=*OUT; 82 83$func="aesni_cbc_sha256_enc"; 84$TABLE="K256"; 85$SZ=4; 86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 87 "%r8d","%r9d","%r10d","%r11d"); 88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); 89@Sigma0=( 2,13,22); 90@Sigma1=( 6,11,25); 91@sigma0=( 7,18, 3); 92@sigma1=(17,19,10); 93$rounds=64; 94 95######################################################################## 96# void aesni_cbc_sha256_enc(const void *inp, 97# void *out, 98# size_t length, 99# const AES_KEY *key, 100# unsigned char *iv, 101# SHA256_CTX *ctx, 102# const void *in0); 103($inp, $out, $len, $key, $ivp, $ctx, $in0) = 104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 105 106$Tbl="%rbp"; 107 108$_inp="16*$SZ+0*8(%rsp)"; 109$_out="16*$SZ+1*8(%rsp)"; 110$_end="16*$SZ+2*8(%rsp)"; 111$_key="16*$SZ+3*8(%rsp)"; 112$_ivp="16*$SZ+4*8(%rsp)"; 113$_ctx="16*$SZ+5*8(%rsp)"; 114$_in0="16*$SZ+6*8(%rsp)"; 115$_rsp="`16*$SZ+7*8`(%rsp)"; 116$framesz=16*$SZ+8*8; 117 118$code=<<___; 119.text 120 121.extern OPENSSL_ia32cap_P 122.globl $func 123.type $func,\@abi-omnipotent 124.align 16 125$func: 126.cfi_startproc 127___ 128 if ($avx) { 129$code.=<<___; 130 lea OPENSSL_ia32cap_P(%rip),%r11 131 mov \$1,%eax 132 cmp \$0,`$win64?"%rcx":"%rdi"` 133 je .Lprobe 134 mov 0(%r11),%eax 135 mov 4(%r11),%r10 136___ 137$code.=<<___ if ($shaext); 138 bt \$61,%r10 # check for SHA 139 jc ${func}_shaext 140___ 141$code.=<<___; 142 mov %r10,%r11 143 shr \$32,%r11 144 145 test \$`1<<11`,%r10d # check for XOP 146 jnz ${func}_xop 147___ 148$code.=<<___ if ($avx>1); 149 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 150 cmp \$`1<<8|1<<5|1<<3`,%r11d 151 je ${func}_avx2 152___ 153$code.=<<___; 154 and \$`1<<28`,%r10d # check for AVX 155 jnz ${func}_avx 156 ud2 157___ 158 } 159$code.=<<___; 160 xor %eax,%eax 161 cmp \$0,`$win64?"%rcx":"%rdi"` 162 je .Lprobe 163 ud2 164.Lprobe: 165 ret 166.cfi_endproc 167.size $func,.-$func 168 169.align 64 170.type $TABLE,\@object 171$TABLE: 172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 173 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 174 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 175 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 176 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 177 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 178 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 179 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 180 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 181 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 182 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 183 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 184 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 185 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 186 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 187 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 188 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 189 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 190 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 191 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 192 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 193 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 194 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 195 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 196 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 197 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 198 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 199 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 200 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 201 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 202 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 203 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 204 205 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 206 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 207 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 208 .long 0,0,0,0, 0,0,0,0 209 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 210.align 64 211___ 212 213###################################################################### 214# SIMD code paths 215# 216{{{ 217($iv,$inout,$roundkey,$temp, 218 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); 219 220$aesni_cbc_idx=0; 221@aesni_cbc_block = ( 222## &vmovdqu ($roundkey,"0x00-0x80($inp)");' 223## &vmovdqu ($inout,($inp)); 224## &mov ($_inp,$inp); 225 226 '&vpxor ($inout,$inout,$roundkey);'. 227 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', 228 229 '&vpxor ($inout,$inout,$iv);', 230 231 '&vaesenc ($inout,$inout,$roundkey);'. 232 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', 233 234 '&vaesenc ($inout,$inout,$roundkey);'. 235 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', 236 237 '&vaesenc ($inout,$inout,$roundkey);'. 238 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', 239 240 '&vaesenc ($inout,$inout,$roundkey);'. 241 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', 242 243 '&vaesenc ($inout,$inout,$roundkey);'. 244 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', 245 246 '&vaesenc ($inout,$inout,$roundkey);'. 247 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', 248 249 '&vaesenc ($inout,$inout,$roundkey);'. 250 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', 251 252 '&vaesenc ($inout,$inout,$roundkey);'. 253 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', 254 255 '&vaesenc ($inout,$inout,$roundkey);'. 256 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', 257 258 '&vaesenclast ($temp,$inout,$roundkey);'. 259 ' &vaesenc ($inout,$inout,$roundkey);'. 260 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', 261 262 '&vpand ($iv,$temp,$mask10);'. 263 ' &vaesenc ($inout,$inout,$roundkey);'. 264 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', 265 266 '&vaesenclast ($temp,$inout,$roundkey);'. 267 ' &vaesenc ($inout,$inout,$roundkey);'. 268 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', 269 270 '&vpand ($temp,$temp,$mask12);'. 271 ' &vaesenc ($inout,$inout,$roundkey);'. 272 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', 273 274 '&vpor ($iv,$iv,$temp);'. 275 ' &vaesenclast ($temp,$inout,$roundkey);'. 276 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' 277 278## &mov ($inp,$_inp); 279## &mov ($out,$_out); 280## &vpand ($temp,$temp,$mask14); 281## &vpor ($iv,$iv,$temp); 282## &vmovdqu ($iv,($out,$inp); 283## &lea (inp,16($inp)); 284); 285 286my $a4=$T1; 287my ($a,$b,$c,$d,$e,$f,$g,$h); 288 289sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 290{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 291 my $arg = pop; 292 $arg = "\$$arg" if ($arg*1 eq $arg); 293 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 294} 295 296sub body_00_15 () { 297 ( 298 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 299 300 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 301 '&mov ($a,$a1)', 302 '&mov ($a4,$f)', 303 304 '&xor ($a0,$e)', 305 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 306 '&xor ($a4,$g)', # f^g 307 308 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 309 '&xor ($a1,$a)', 310 '&and ($a4,$e)', # (f^g)&e 311 312 @aesni_cbc_block[$aesni_cbc_idx++]. 313 '&xor ($a0,$e)', 314 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 315 '&mov ($a2,$a)', 316 317 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 318 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 319 '&xor ($a2,$b)', # a^b, b^c in next round 320 321 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 322 '&add ($h,$a4)', # h+=Ch(e,f,g) 323 '&and ($a3,$a2)', # (b^c)&(a^b) 324 325 '&xor ($a1,$a)', 326 '&add ($h,$a0)', # h+=Sigma1(e) 327 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 328 329 '&add ($d,$h)', # d+=h 330 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 331 '&add ($h,$a3)', # h+=Maj(a,b,c) 332 333 '&mov ($a0,$d)', 334 '&add ($a1,$h);'. # h+=Sigma0(a) 335 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 336 ); 337} 338 339if ($avx) {{ 340###################################################################### 341# XOP code path 342# 343$code.=<<___; 344.type ${func}_xop,\@function,6 345.align 64 346${func}_xop: 347.cfi_startproc 348.Lxop_shortcut: 349 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 350 mov %rsp,%rax # copy %rsp 351.cfi_def_cfa_register %rax 352 push %rbx 353.cfi_push %rbx 354 push %rbp 355.cfi_push %rbp 356 push %r12 357.cfi_push %r12 358 push %r13 359.cfi_push %r13 360 push %r14 361.cfi_push %r14 362 push %r15 363.cfi_push %r15 364 sub \$`$framesz+$win64*16*10`,%rsp 365 and \$-64,%rsp # align stack frame 366 367 shl \$6,$len 368 sub $inp,$out # re-bias 369 sub $inp,$in0 370 add $inp,$len # end of input 371 372 #mov $inp,$_inp # saved later 373 mov $out,$_out 374 mov $len,$_end 375 #mov $key,$_key # remains resident in $inp register 376 mov $ivp,$_ivp 377 mov $ctx,$_ctx 378 mov $in0,$_in0 379 mov %rax,$_rsp 380.cfi_cfa_expression $_rsp,deref,+8 381___ 382$code.=<<___ if ($win64); 383 movaps %xmm6,`$framesz+16*0`(%rsp) 384 movaps %xmm7,`$framesz+16*1`(%rsp) 385 movaps %xmm8,`$framesz+16*2`(%rsp) 386 movaps %xmm9,`$framesz+16*3`(%rsp) 387 movaps %xmm10,`$framesz+16*4`(%rsp) 388 movaps %xmm11,`$framesz+16*5`(%rsp) 389 movaps %xmm12,`$framesz+16*6`(%rsp) 390 movaps %xmm13,`$framesz+16*7`(%rsp) 391 movaps %xmm14,`$framesz+16*8`(%rsp) 392 movaps %xmm15,`$framesz+16*9`(%rsp) 393___ 394$code.=<<___; 395.Lprologue_xop: 396 vzeroall 397 398 mov $inp,%r12 # borrow $a4 399 lea 0x80($key),$inp # size optimization, reassign 400 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 401 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 402 mov $ctx,%r15 # borrow $a2 403 mov $in0,%rsi # borrow $a3 404 vmovdqu ($ivp),$iv # load IV 405 sub \$9,%r14 406 407 mov $SZ*0(%r15),$A 408 mov $SZ*1(%r15),$B 409 mov $SZ*2(%r15),$C 410 mov $SZ*3(%r15),$D 411 mov $SZ*4(%r15),$E 412 mov $SZ*5(%r15),$F 413 mov $SZ*6(%r15),$G 414 mov $SZ*7(%r15),$H 415 416 vmovdqa 0x00(%r13,%r14,8),$mask14 417 vmovdqa 0x10(%r13,%r14,8),$mask12 418 vmovdqa 0x20(%r13,%r14,8),$mask10 419 vmovdqu 0x00-0x80($inp),$roundkey 420 jmp .Lloop_xop 421___ 422 if ($SZ==4) { # SHA256 423 my @X = map("%xmm$_",(0..3)); 424 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 425 426$code.=<<___; 427.align 16 428.Lloop_xop: 429 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 430 vmovdqu 0x00(%rsi,%r12),@X[0] 431 vmovdqu 0x10(%rsi,%r12),@X[1] 432 vmovdqu 0x20(%rsi,%r12),@X[2] 433 vmovdqu 0x30(%rsi,%r12),@X[3] 434 vpshufb $t3,@X[0],@X[0] 435 lea $TABLE(%rip),$Tbl 436 vpshufb $t3,@X[1],@X[1] 437 vpshufb $t3,@X[2],@X[2] 438 vpaddd 0x00($Tbl),@X[0],$t0 439 vpshufb $t3,@X[3],@X[3] 440 vpaddd 0x20($Tbl),@X[1],$t1 441 vpaddd 0x40($Tbl),@X[2],$t2 442 vpaddd 0x60($Tbl),@X[3],$t3 443 vmovdqa $t0,0x00(%rsp) 444 mov $A,$a1 445 vmovdqa $t1,0x10(%rsp) 446 mov $B,$a3 447 vmovdqa $t2,0x20(%rsp) 448 xor $C,$a3 # magic 449 vmovdqa $t3,0x30(%rsp) 450 mov $E,$a0 451 jmp .Lxop_00_47 452 453.align 16 454.Lxop_00_47: 455 sub \$-16*2*$SZ,$Tbl # size optimization 456 vmovdqu (%r12),$inout # $a4 457 mov %r12,$_inp # $a4 458___ 459sub XOP_256_00_47 () { 460my $j = shift; 461my $body = shift; 462my @X = @_; 463my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 464 465 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 466 eval(shift(@insns)); 467 eval(shift(@insns)); 468 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 469 eval(shift(@insns)); 470 eval(shift(@insns)); 471 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 472 eval(shift(@insns)); 473 eval(shift(@insns)); 474 &vpsrld ($t0,$t0,$sigma0[2]); 475 eval(shift(@insns)); 476 eval(shift(@insns)); 477 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 478 eval(shift(@insns)); 479 eval(shift(@insns)); 480 eval(shift(@insns)); 481 eval(shift(@insns)); 482 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 483 eval(shift(@insns)); 484 eval(shift(@insns)); 485 &vpxor ($t0,$t0,$t1); 486 eval(shift(@insns)); 487 eval(shift(@insns)); 488 eval(shift(@insns)); 489 eval(shift(@insns)); 490 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 491 eval(shift(@insns)); 492 eval(shift(@insns)); 493 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 494 eval(shift(@insns)); 495 eval(shift(@insns)); 496 &vpsrld ($t2,@X[3],$sigma1[2]); 497 eval(shift(@insns)); 498 eval(shift(@insns)); 499 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 500 eval(shift(@insns)); 501 eval(shift(@insns)); 502 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 503 eval(shift(@insns)); 504 eval(shift(@insns)); 505 &vpxor ($t3,$t3,$t2); 506 eval(shift(@insns)); 507 eval(shift(@insns)); 508 eval(shift(@insns)); 509 eval(shift(@insns)); 510 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 511 eval(shift(@insns)); 512 eval(shift(@insns)); 513 eval(shift(@insns)); 514 eval(shift(@insns)); 515 &vpsrldq ($t3,$t3,8); 516 eval(shift(@insns)); 517 eval(shift(@insns)); 518 eval(shift(@insns)); 519 eval(shift(@insns)); 520 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 521 eval(shift(@insns)); 522 eval(shift(@insns)); 523 eval(shift(@insns)); 524 eval(shift(@insns)); 525 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 526 eval(shift(@insns)); 527 eval(shift(@insns)); 528 &vpsrld ($t2,@X[0],$sigma1[2]); 529 eval(shift(@insns)); 530 eval(shift(@insns)); 531 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 532 eval(shift(@insns)); 533 eval(shift(@insns)); 534 &vpxor ($t3,$t3,$t2); 535 eval(shift(@insns)); 536 eval(shift(@insns)); 537 eval(shift(@insns)); 538 eval(shift(@insns)); 539 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 540 eval(shift(@insns)); 541 eval(shift(@insns)); 542 eval(shift(@insns)); 543 eval(shift(@insns)); 544 &vpslldq ($t3,$t3,8); # 22 instructions 545 eval(shift(@insns)); 546 eval(shift(@insns)); 547 eval(shift(@insns)); 548 eval(shift(@insns)); 549 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 550 eval(shift(@insns)); 551 eval(shift(@insns)); 552 eval(shift(@insns)); 553 eval(shift(@insns)); 554 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 555 foreach (@insns) { eval; } # remaining instructions 556 &vmovdqa (16*$j."(%rsp)",$t2); 557} 558 559 $aesni_cbc_idx=0; 560 for ($i=0,$j=0; $j<4; $j++) { 561 &XOP_256_00_47($j,\&body_00_15,@X); 562 push(@X,shift(@X)); # rotate(@X) 563 } 564 &mov ("%r12",$_inp); # borrow $a4 565 &vpand ($temp,$temp,$mask14); 566 &mov ("%r15",$_out); # borrow $a2 567 &vpor ($iv,$iv,$temp); 568 &vmovdqu ("(%r15,%r12)",$iv); # write output 569 &lea ("%r12","16(%r12)"); # inp++ 570 571 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 572 &jne (".Lxop_00_47"); 573 574 &vmovdqu ($inout,"(%r12)"); 575 &mov ($_inp,"%r12"); 576 577 $aesni_cbc_idx=0; 578 for ($i=0; $i<16; ) { 579 foreach(body_00_15()) { eval; } 580 } 581 } 582$code.=<<___; 583 mov $_inp,%r12 # borrow $a4 584 mov $_out,%r13 # borrow $a0 585 mov $_ctx,%r15 # borrow $a2 586 mov $_in0,%rsi # borrow $a3 587 588 vpand $mask14,$temp,$temp 589 mov $a1,$A 590 vpor $temp,$iv,$iv 591 vmovdqu $iv,(%r13,%r12) # write output 592 lea 16(%r12),%r12 # inp++ 593 594 add $SZ*0(%r15),$A 595 add $SZ*1(%r15),$B 596 add $SZ*2(%r15),$C 597 add $SZ*3(%r15),$D 598 add $SZ*4(%r15),$E 599 add $SZ*5(%r15),$F 600 add $SZ*6(%r15),$G 601 add $SZ*7(%r15),$H 602 603 cmp $_end,%r12 604 605 mov $A,$SZ*0(%r15) 606 mov $B,$SZ*1(%r15) 607 mov $C,$SZ*2(%r15) 608 mov $D,$SZ*3(%r15) 609 mov $E,$SZ*4(%r15) 610 mov $F,$SZ*5(%r15) 611 mov $G,$SZ*6(%r15) 612 mov $H,$SZ*7(%r15) 613 614 jb .Lloop_xop 615 616 mov $_ivp,$ivp 617 mov $_rsp,%rsi 618.cfi_def_cfa %rsi,8 619 vmovdqu $iv,($ivp) # output IV 620 vzeroall 621___ 622$code.=<<___ if ($win64); 623 movaps `$framesz+16*0`(%rsp),%xmm6 624 movaps `$framesz+16*1`(%rsp),%xmm7 625 movaps `$framesz+16*2`(%rsp),%xmm8 626 movaps `$framesz+16*3`(%rsp),%xmm9 627 movaps `$framesz+16*4`(%rsp),%xmm10 628 movaps `$framesz+16*5`(%rsp),%xmm11 629 movaps `$framesz+16*6`(%rsp),%xmm12 630 movaps `$framesz+16*7`(%rsp),%xmm13 631 movaps `$framesz+16*8`(%rsp),%xmm14 632 movaps `$framesz+16*9`(%rsp),%xmm15 633___ 634$code.=<<___; 635 mov -48(%rsi),%r15 636.cfi_restore %r15 637 mov -40(%rsi),%r14 638.cfi_restore %r14 639 mov -32(%rsi),%r13 640.cfi_restore %r13 641 mov -24(%rsi),%r12 642.cfi_restore %r12 643 mov -16(%rsi),%rbp 644.cfi_restore %rbp 645 mov -8(%rsi),%rbx 646.cfi_restore %rbx 647 lea (%rsi),%rsp 648.cfi_def_cfa_register %rsp 649.Lepilogue_xop: 650 ret 651.cfi_endproc 652.size ${func}_xop,.-${func}_xop 653___ 654###################################################################### 655# AVX+shrd code path 656# 657local *ror = sub { &shrd(@_[0],@_) }; 658 659$code.=<<___; 660.type ${func}_avx,\@function,6 661.align 64 662${func}_avx: 663.cfi_startproc 664.Lavx_shortcut: 665 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 666 mov %rsp,%rax # copy %rsp 667.cfi_def_cfa_register %rax 668 push %rbx 669.cfi_push %rbx 670 push %rbp 671.cfi_push %rbp 672 push %r12 673.cfi_push %r12 674 push %r13 675.cfi_push %r13 676 push %r14 677.cfi_push %r14 678 push %r15 679.cfi_push %r15 680 sub \$`$framesz+$win64*16*10`,%rsp 681 and \$-64,%rsp # align stack frame 682 683 shl \$6,$len 684 sub $inp,$out # re-bias 685 sub $inp,$in0 686 add $inp,$len # end of input 687 688 #mov $inp,$_inp # saved later 689 mov $out,$_out 690 mov $len,$_end 691 #mov $key,$_key # remains resident in $inp register 692 mov $ivp,$_ivp 693 mov $ctx,$_ctx 694 mov $in0,$_in0 695 mov %rax,$_rsp 696.cfi_cfa_expression $_rsp,deref,+8 697___ 698$code.=<<___ if ($win64); 699 movaps %xmm6,`$framesz+16*0`(%rsp) 700 movaps %xmm7,`$framesz+16*1`(%rsp) 701 movaps %xmm8,`$framesz+16*2`(%rsp) 702 movaps %xmm9,`$framesz+16*3`(%rsp) 703 movaps %xmm10,`$framesz+16*4`(%rsp) 704 movaps %xmm11,`$framesz+16*5`(%rsp) 705 movaps %xmm12,`$framesz+16*6`(%rsp) 706 movaps %xmm13,`$framesz+16*7`(%rsp) 707 movaps %xmm14,`$framesz+16*8`(%rsp) 708 movaps %xmm15,`$framesz+16*9`(%rsp) 709___ 710$code.=<<___; 711.Lprologue_avx: 712 vzeroall 713 714 mov $inp,%r12 # borrow $a4 715 lea 0x80($key),$inp # size optimization, reassign 716 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 717 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 718 mov $ctx,%r15 # borrow $a2 719 mov $in0,%rsi # borrow $a3 720 vmovdqu ($ivp),$iv # load IV 721 sub \$9,%r14 722 723 mov $SZ*0(%r15),$A 724 mov $SZ*1(%r15),$B 725 mov $SZ*2(%r15),$C 726 mov $SZ*3(%r15),$D 727 mov $SZ*4(%r15),$E 728 mov $SZ*5(%r15),$F 729 mov $SZ*6(%r15),$G 730 mov $SZ*7(%r15),$H 731 732 vmovdqa 0x00(%r13,%r14,8),$mask14 733 vmovdqa 0x10(%r13,%r14,8),$mask12 734 vmovdqa 0x20(%r13,%r14,8),$mask10 735 vmovdqu 0x00-0x80($inp),$roundkey 736___ 737 if ($SZ==4) { # SHA256 738 my @X = map("%xmm$_",(0..3)); 739 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 740 741$code.=<<___; 742 jmp .Lloop_avx 743.align 16 744.Lloop_avx: 745 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 746 vmovdqu 0x00(%rsi,%r12),@X[0] 747 vmovdqu 0x10(%rsi,%r12),@X[1] 748 vmovdqu 0x20(%rsi,%r12),@X[2] 749 vmovdqu 0x30(%rsi,%r12),@X[3] 750 vpshufb $t3,@X[0],@X[0] 751 lea $TABLE(%rip),$Tbl 752 vpshufb $t3,@X[1],@X[1] 753 vpshufb $t3,@X[2],@X[2] 754 vpaddd 0x00($Tbl),@X[0],$t0 755 vpshufb $t3,@X[3],@X[3] 756 vpaddd 0x20($Tbl),@X[1],$t1 757 vpaddd 0x40($Tbl),@X[2],$t2 758 vpaddd 0x60($Tbl),@X[3],$t3 759 vmovdqa $t0,0x00(%rsp) 760 mov $A,$a1 761 vmovdqa $t1,0x10(%rsp) 762 mov $B,$a3 763 vmovdqa $t2,0x20(%rsp) 764 xor $C,$a3 # magic 765 vmovdqa $t3,0x30(%rsp) 766 mov $E,$a0 767 jmp .Lavx_00_47 768 769.align 16 770.Lavx_00_47: 771 sub \$-16*2*$SZ,$Tbl # size optimization 772 vmovdqu (%r12),$inout # $a4 773 mov %r12,$_inp # $a4 774___ 775sub Xupdate_256_AVX () { 776 ( 777 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 778 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 779 '&vpsrld ($t2,$t0,$sigma0[0]);', 780 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 781 '&vpsrld ($t3,$t0,$sigma0[2])', 782 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 783 '&vpxor ($t0,$t3,$t2)', 784 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 785 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 786 '&vpxor ($t0,$t0,$t1)', 787 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 788 '&vpxor ($t0,$t0,$t2)', 789 '&vpsrld ($t2,$t3,$sigma1[2]);', 790 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 791 '&vpsrlq ($t3,$t3,$sigma1[0]);', 792 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 793 '&vpxor ($t2,$t2,$t3);', 794 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 795 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) 796 '&vpshufd ($t2,$t2,0b10000100)', 797 '&vpsrldq ($t2,$t2,8)', 798 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 799 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 800 '&vpsrld ($t2,$t3,$sigma1[2])', 801 '&vpsrlq ($t3,$t3,$sigma1[0])', 802 '&vpxor ($t2,$t2,$t3);', 803 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 804 '&vpxor ($t2,$t2,$t3)', 805 '&vpshufd ($t2,$t2,0b11101000)', 806 '&vpslldq ($t2,$t2,8)', 807 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 808 ); 809} 810 811sub AVX_256_00_47 () { 812my $j = shift; 813my $body = shift; 814my @X = @_; 815my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 816 817 foreach (Xupdate_256_AVX()) { # 29 instructions 818 eval; 819 eval(shift(@insns)); 820 eval(shift(@insns)); 821 eval(shift(@insns)); 822 } 823 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 824 foreach (@insns) { eval; } # remaining instructions 825 &vmovdqa (16*$j."(%rsp)",$t2); 826} 827 828 $aesni_cbc_idx=0; 829 for ($i=0,$j=0; $j<4; $j++) { 830 &AVX_256_00_47($j,\&body_00_15,@X); 831 push(@X,shift(@X)); # rotate(@X) 832 } 833 &mov ("%r12",$_inp); # borrow $a4 834 &vpand ($temp,$temp,$mask14); 835 &mov ("%r15",$_out); # borrow $a2 836 &vpor ($iv,$iv,$temp); 837 &vmovdqu ("(%r15,%r12)",$iv); # write output 838 &lea ("%r12","16(%r12)"); # inp++ 839 840 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 841 &jne (".Lavx_00_47"); 842 843 &vmovdqu ($inout,"(%r12)"); 844 &mov ($_inp,"%r12"); 845 846 $aesni_cbc_idx=0; 847 for ($i=0; $i<16; ) { 848 foreach(body_00_15()) { eval; } 849 } 850 851 } 852$code.=<<___; 853 mov $_inp,%r12 # borrow $a4 854 mov $_out,%r13 # borrow $a0 855 mov $_ctx,%r15 # borrow $a2 856 mov $_in0,%rsi # borrow $a3 857 858 vpand $mask14,$temp,$temp 859 mov $a1,$A 860 vpor $temp,$iv,$iv 861 vmovdqu $iv,(%r13,%r12) # write output 862 lea 16(%r12),%r12 # inp++ 863 864 add $SZ*0(%r15),$A 865 add $SZ*1(%r15),$B 866 add $SZ*2(%r15),$C 867 add $SZ*3(%r15),$D 868 add $SZ*4(%r15),$E 869 add $SZ*5(%r15),$F 870 add $SZ*6(%r15),$G 871 add $SZ*7(%r15),$H 872 873 cmp $_end,%r12 874 875 mov $A,$SZ*0(%r15) 876 mov $B,$SZ*1(%r15) 877 mov $C,$SZ*2(%r15) 878 mov $D,$SZ*3(%r15) 879 mov $E,$SZ*4(%r15) 880 mov $F,$SZ*5(%r15) 881 mov $G,$SZ*6(%r15) 882 mov $H,$SZ*7(%r15) 883 jb .Lloop_avx 884 885 mov $_ivp,$ivp 886 mov $_rsp,%rsi 887.cfi_def_cfa %rsi,8 888 vmovdqu $iv,($ivp) # output IV 889 vzeroall 890___ 891$code.=<<___ if ($win64); 892 movaps `$framesz+16*0`(%rsp),%xmm6 893 movaps `$framesz+16*1`(%rsp),%xmm7 894 movaps `$framesz+16*2`(%rsp),%xmm8 895 movaps `$framesz+16*3`(%rsp),%xmm9 896 movaps `$framesz+16*4`(%rsp),%xmm10 897 movaps `$framesz+16*5`(%rsp),%xmm11 898 movaps `$framesz+16*6`(%rsp),%xmm12 899 movaps `$framesz+16*7`(%rsp),%xmm13 900 movaps `$framesz+16*8`(%rsp),%xmm14 901 movaps `$framesz+16*9`(%rsp),%xmm15 902___ 903$code.=<<___; 904 mov -48(%rsi),%r15 905.cfi_restore %r15 906 mov -40(%rsi),%r14 907.cfi_restore %r14 908 mov -32(%rsi),%r13 909.cfi_restore %r13 910 mov -24(%rsi),%r12 911.cfi_restore %r12 912 mov -16(%rsi),%rbp 913.cfi_restore %rbp 914 mov -8(%rsi),%rbx 915.cfi_restore %rbx 916 lea (%rsi),%rsp 917.cfi_def_cfa_register %rsp 918.Lepilogue_avx: 919 ret 920.cfi_endproc 921.size ${func}_avx,.-${func}_avx 922___ 923 924if ($avx>1) {{ 925###################################################################### 926# AVX2+BMI code path 927# 928my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 929my $PUSH8=8*2*$SZ; 930use integer; 931 932sub bodyx_00_15 () { 933 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 934 ( 935 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 936 937 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 938 '&and ($a4,$e)', # f&e 939 '&rorx ($a0,$e,$Sigma1[2])', 940 '&rorx ($a2,$e,$Sigma1[1])', 941 942 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 943 '&lea ($h,"($h,$a4)")', 944 '&andn ($a4,$e,$g)', # ~e&g 945 '&xor ($a0,$a2)', 946 947 '&rorx ($a1,$e,$Sigma1[0])', 948 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 949 '&xor ($a0,$a1)', # Sigma1(e) 950 '&mov ($a2,$a)', 951 952 '&rorx ($a4,$a,$Sigma0[2])', 953 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 954 '&xor ($a2,$b)', # a^b, b^c in next round 955 '&rorx ($a1,$a,$Sigma0[1])', 956 957 '&rorx ($a0,$a,$Sigma0[0])', 958 '&lea ($d,"($d,$h)")', # d+=h 959 '&and ($a3,$a2)', # (b^c)&(a^b) 960 @aesni_cbc_block[$aesni_cbc_idx++]. 961 '&xor ($a1,$a4)', 962 963 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 964 '&xor ($a1,$a0)', # Sigma0(a) 965 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 966 '&mov ($a4,$e)', # copy of f in future 967 968 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 969 ); 970 # and at the finish one has to $a+=$a1 971} 972 973$code.=<<___; 974.type ${func}_avx2,\@function,6 975.align 64 976${func}_avx2: 977.cfi_startproc 978.Lavx2_shortcut: 979 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 980 mov %rsp,%rax # copy %rsp 981.cfi_def_cfa_register %rax 982 push %rbx 983.cfi_push %rbx 984 push %rbp 985.cfi_push %rbp 986 push %r12 987.cfi_push %r12 988 push %r13 989.cfi_push %r13 990 push %r14 991.cfi_push %r14 992 push %r15 993.cfi_push %r15 994 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp 995 and \$-256*$SZ,%rsp # align stack frame 996 add \$`2*$SZ*($rounds-8)`,%rsp 997 998 shl \$6,$len 999 sub $inp,$out # re-bias 1000 sub $inp,$in0 1001 add $inp,$len # end of input 1002 1003 #mov $inp,$_inp # saved later 1004 #mov $out,$_out # kept in $offload 1005 mov $len,$_end 1006 #mov $key,$_key # remains resident in $inp register 1007 mov $ivp,$_ivp 1008 mov $ctx,$_ctx 1009 mov $in0,$_in0 1010 mov %rax,$_rsp 1011.cfi_cfa_expression $_rsp,deref,+8 1012___ 1013$code.=<<___ if ($win64); 1014 movaps %xmm6,`$framesz+16*0`(%rsp) 1015 movaps %xmm7,`$framesz+16*1`(%rsp) 1016 movaps %xmm8,`$framesz+16*2`(%rsp) 1017 movaps %xmm9,`$framesz+16*3`(%rsp) 1018 movaps %xmm10,`$framesz+16*4`(%rsp) 1019 movaps %xmm11,`$framesz+16*5`(%rsp) 1020 movaps %xmm12,`$framesz+16*6`(%rsp) 1021 movaps %xmm13,`$framesz+16*7`(%rsp) 1022 movaps %xmm14,`$framesz+16*8`(%rsp) 1023 movaps %xmm15,`$framesz+16*9`(%rsp) 1024___ 1025$code.=<<___; 1026.Lprologue_avx2: 1027 vzeroall 1028 1029 mov $inp,%r13 # borrow $a0 1030 vpinsrq \$1,$out,$offload,$offload 1031 lea 0x80($key),$inp # size optimization, reassign 1032 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 1033 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 1034 mov $ctx,%r15 # borrow $a2 1035 mov $in0,%rsi # borrow $a3 1036 vmovdqu ($ivp),$iv # load IV 1037 lea -9(%r14),%r14 1038 1039 vmovdqa 0x00(%r12,%r14,8),$mask14 1040 vmovdqa 0x10(%r12,%r14,8),$mask12 1041 vmovdqa 0x20(%r12,%r14,8),$mask10 1042 1043 sub \$-16*$SZ,%r13 # inp++, size optimization 1044 mov $SZ*0(%r15),$A 1045 lea (%rsi,%r13),%r12 # borrow $a0 1046 mov $SZ*1(%r15),$B 1047 cmp $len,%r13 # $_end 1048 mov $SZ*2(%r15),$C 1049 cmove %rsp,%r12 # next block or random data 1050 mov $SZ*3(%r15),$D 1051 mov $SZ*4(%r15),$E 1052 mov $SZ*5(%r15),$F 1053 mov $SZ*6(%r15),$G 1054 mov $SZ*7(%r15),$H 1055 vmovdqu 0x00-0x80($inp),$roundkey 1056___ 1057 if ($SZ==4) { # SHA256 1058 my @X = map("%ymm$_",(0..3)); 1059 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); 1060 1061$code.=<<___; 1062 jmp .Loop_avx2 1063.align 16 1064.Loop_avx2: 1065 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1066 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 1067 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 1068 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 1069 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 1070 1071 vinserti128 \$1,(%r12),@X[0],@X[0] 1072 vinserti128 \$1,16(%r12),@X[1],@X[1] 1073 vpshufb $t3,@X[0],@X[0] 1074 vinserti128 \$1,32(%r12),@X[2],@X[2] 1075 vpshufb $t3,@X[1],@X[1] 1076 vinserti128 \$1,48(%r12),@X[3],@X[3] 1077 1078 lea $TABLE(%rip),$Tbl 1079 vpshufb $t3,@X[2],@X[2] 1080 lea -16*$SZ(%r13),%r13 1081 vpaddd 0x00($Tbl),@X[0],$t0 1082 vpshufb $t3,@X[3],@X[3] 1083 vpaddd 0x20($Tbl),@X[1],$t1 1084 vpaddd 0x40($Tbl),@X[2],$t2 1085 vpaddd 0x60($Tbl),@X[3],$t3 1086 vmovdqa $t0,0x00(%rsp) 1087 xor $a1,$a1 1088 vmovdqa $t1,0x20(%rsp) 1089___ 1090$code.=<<___ if (!$win64); 1091# temporarily use %rsi as frame pointer 1092 mov $_rsp,%rsi 1093.cfi_def_cfa %rsi,8 1094___ 1095$code.=<<___; 1096 lea -$PUSH8(%rsp),%rsp 1097___ 1098$code.=<<___ if (!$win64); 1099# the frame info is at $_rsp, but the stack is moving... 1100# so a second frame pointer is saved at -8(%rsp) 1101# that is in the red zone 1102 mov %rsi,-8(%rsp) 1103.cfi_cfa_expression %rsp-8,deref,+8 1104___ 1105$code.=<<___; 1106 mov $B,$a3 1107 vmovdqa $t2,0x00(%rsp) 1108 xor $C,$a3 # magic 1109 vmovdqa $t3,0x20(%rsp) 1110 mov $F,$a4 1111 sub \$-16*2*$SZ,$Tbl # size optimization 1112 jmp .Lavx2_00_47 1113 1114.align 16 1115.Lavx2_00_47: 1116 vmovdqu (%r13),$inout 1117 vpinsrq \$0,%r13,$offload,$offload 1118___ 1119 1120sub AVX2_256_00_47 () { 1121my $j = shift; 1122my $body = shift; 1123my @X = @_; 1124my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1125my $base = "+2*$PUSH8(%rsp)"; 1126 1127 if (($j%2)==0) { 1128 &lea ("%rsp","-$PUSH8(%rsp)"); 1129$code.=<<___ if (!$win64); 1130.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 1131# copy secondary frame pointer to new location again at -8(%rsp) 1132 pushq $PUSH8-8(%rsp) 1133.cfi_cfa_expression %rsp,deref,+8 1134 lea 8(%rsp),%rsp 1135.cfi_cfa_expression %rsp-8,deref,+8 1136___ 1137 } 1138 foreach (Xupdate_256_AVX()) { # 29 instructions 1139 eval; 1140 eval(shift(@insns)); 1141 eval(shift(@insns)); 1142 eval(shift(@insns)); 1143 } 1144 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1145 foreach (@insns) { eval; } # remaining instructions 1146 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1147} 1148 $aesni_cbc_idx=0; 1149 for ($i=0,$j=0; $j<4; $j++) { 1150 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1151 push(@X,shift(@X)); # rotate(@X) 1152 } 1153 &vmovq ("%r13",$offload); # borrow $a0 1154 &vpextrq ("%r15",$offload,1); # borrow $a2 1155 &vpand ($temp,$temp,$mask14); 1156 &vpor ($iv,$iv,$temp); 1157 &vmovdqu ("(%r15,%r13)",$iv); # write output 1158 &lea ("%r13","16(%r13)"); # inp++ 1159 1160 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1161 &cmpb (($SZ-1)."($Tbl)",0); 1162 &jne (".Lavx2_00_47"); 1163 1164 &vmovdqu ($inout,"(%r13)"); 1165 &vpinsrq ($offload,$offload,"%r13",0); 1166 1167 $aesni_cbc_idx=0; 1168 for ($i=0; $i<16; ) { 1169 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1170 foreach(bodyx_00_15()) { eval; } 1171 } 1172 } 1173$code.=<<___; 1174 vpextrq \$1,$offload,%r12 # $_out, borrow $a4 1175 vmovq $offload,%r13 # $_inp, borrow $a0 1176 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1177 add $a1,$A 1178 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1179 1180 vpand $mask14,$temp,$temp 1181 vpor $temp,$iv,$iv 1182 vmovdqu $iv,(%r12,%r13) # write output 1183 lea 16(%r13),%r13 1184 1185 add $SZ*0(%r15),$A 1186 add $SZ*1(%r15),$B 1187 add $SZ*2(%r15),$C 1188 add $SZ*3(%r15),$D 1189 add $SZ*4(%r15),$E 1190 add $SZ*5(%r15),$F 1191 add $SZ*6(%r15),$G 1192 add $SZ*7(%r15),$H 1193 1194 mov $A,$SZ*0(%r15) 1195 mov $B,$SZ*1(%r15) 1196 mov $C,$SZ*2(%r15) 1197 mov $D,$SZ*3(%r15) 1198 mov $E,$SZ*4(%r15) 1199 mov $F,$SZ*5(%r15) 1200 mov $G,$SZ*6(%r15) 1201 mov $H,$SZ*7(%r15) 1202 1203 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end 1204 je .Ldone_avx2 1205 1206 xor $a1,$a1 1207 mov $B,$a3 1208 mov $F,$a4 1209 xor $C,$a3 # magic 1210 jmp .Lower_avx2 1211.align 16 1212.Lower_avx2: 1213 vmovdqu (%r13),$inout 1214 vpinsrq \$0,%r13,$offload,$offload 1215___ 1216 $aesni_cbc_idx=0; 1217 for ($i=0; $i<16; ) { 1218 my $base="+16($Tbl)"; 1219 foreach(bodyx_00_15()) { eval; } 1220 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); 1221 } 1222$code.=<<___; 1223 vmovq $offload,%r13 # borrow $a0 1224 vpextrq \$1,$offload,%r15 # borrow $a2 1225 vpand $mask14,$temp,$temp 1226 vpor $temp,$iv,$iv 1227 lea -$PUSH8($Tbl),$Tbl 1228 vmovdqu $iv,(%r15,%r13) # write output 1229 lea 16(%r13),%r13 # inp++ 1230 cmp %rsp,$Tbl 1231 jae .Lower_avx2 1232 1233 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1234 lea 16*$SZ(%r13),%r13 1235 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 1236 add $a1,$A 1237 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1238 1239 add $SZ*0(%r15),$A 1240 add $SZ*1(%r15),$B 1241 add $SZ*2(%r15),$C 1242 add $SZ*3(%r15),$D 1243 add $SZ*4(%r15),$E 1244 add $SZ*5(%r15),$F 1245 add $SZ*6(%r15),$G 1246 lea (%rsi,%r13),%r12 1247 add $SZ*7(%r15),$H 1248 1249 cmp $_end,%r13 1250 1251 mov $A,$SZ*0(%r15) 1252 cmove %rsp,%r12 # next block or stale data 1253 mov $B,$SZ*1(%r15) 1254 mov $C,$SZ*2(%r15) 1255 mov $D,$SZ*3(%r15) 1256 mov $E,$SZ*4(%r15) 1257 mov $F,$SZ*5(%r15) 1258 mov $G,$SZ*6(%r15) 1259 mov $H,$SZ*7(%r15) 1260 1261 jbe .Loop_avx2 1262 lea (%rsp),$Tbl 1263# temporarily use $Tbl as index to $_rsp 1264# this avoids the need to save a secondary frame pointer at -8(%rsp) 1265.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 1266 1267.Ldone_avx2: 1268 mov 16*$SZ+4*8($Tbl),$ivp 1269 mov 16*$SZ+7*8($Tbl),%rsi 1270.cfi_def_cfa %rsi,8 1271 vmovdqu $iv,($ivp) # output IV 1272 vzeroall 1273___ 1274$code.=<<___ if ($win64); 1275 movaps `$framesz+16*0`($Tbl),%xmm6 1276 movaps `$framesz+16*1`($Tbl),%xmm7 1277 movaps `$framesz+16*2`($Tbl),%xmm8 1278 movaps `$framesz+16*3`($Tbl),%xmm9 1279 movaps `$framesz+16*4`($Tbl),%xmm10 1280 movaps `$framesz+16*5`($Tbl),%xmm11 1281 movaps `$framesz+16*6`($Tbl),%xmm12 1282 movaps `$framesz+16*7`($Tbl),%xmm13 1283 movaps `$framesz+16*8`($Tbl),%xmm14 1284 movaps `$framesz+16*9`($Tbl),%xmm15 1285___ 1286$code.=<<___; 1287 mov -48(%rsi),%r15 1288.cfi_restore %r15 1289 mov -40(%rsi),%r14 1290.cfi_restore %r14 1291 mov -32(%rsi),%r13 1292.cfi_restore %r13 1293 mov -24(%rsi),%r12 1294.cfi_restore %r12 1295 mov -16(%rsi),%rbp 1296.cfi_restore %rbp 1297 mov -8(%rsi),%rbx 1298.cfi_restore %rbx 1299 lea (%rsi),%rsp 1300.cfi_def_cfa_register %rsp 1301.Lepilogue_avx2: 1302 ret 1303.cfi_endproc 1304.size ${func}_avx2,.-${func}_avx2 1305___ 1306}} 1307}} 1308{{ 1309my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1310 1311my ($rounds,$Tbl)=("%r11d","%rbx"); 1312 1313my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); 1314my @rndkey=("%xmm4","%xmm5"); 1315my $r=0; 1316my $sn=0; 1317 1318my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); 1319my @MSG=map("%xmm$_",(10..13)); 1320 1321my $aesenc=sub { 1322 use integer; 1323 my ($n,$k)=($r/10,$r%10); 1324 if ($k==0) { 1325 $code.=<<___; 1326 movups `16*$n`($in0),$in # load input 1327 xorps $rndkey0,$in 1328___ 1329 $code.=<<___ if ($n); 1330 movups $iv,`16*($n-1)`($out,$in0) # write output 1331___ 1332 $code.=<<___; 1333 xorps $in,$iv 1334 movups `32+16*$k-112`($key),$rndkey[1] 1335 aesenc $rndkey[0],$iv 1336___ 1337 } elsif ($k==9) { 1338 $sn++; 1339 $code.=<<___; 1340 cmp \$11,$rounds 1341 jb .Laesenclast$sn 1342 movups `32+16*($k+0)-112`($key),$rndkey[1] 1343 aesenc $rndkey[0],$iv 1344 movups `32+16*($k+1)-112`($key),$rndkey[0] 1345 aesenc $rndkey[1],$iv 1346 je .Laesenclast$sn 1347 movups `32+16*($k+2)-112`($key),$rndkey[1] 1348 aesenc $rndkey[0],$iv 1349 movups `32+16*($k+3)-112`($key),$rndkey[0] 1350 aesenc $rndkey[1],$iv 1351.Laesenclast$sn: 1352 aesenclast $rndkey[0],$iv 1353 movups 16-112($key),$rndkey[1] # forward reference 1354 nop 1355___ 1356 } else { 1357 $code.=<<___; 1358 movups `32+16*$k-112`($key),$rndkey[1] 1359 aesenc $rndkey[0],$iv 1360___ 1361 } 1362 $r++; unshift(@rndkey,pop(@rndkey)); 1363}; 1364 1365if ($shaext) { 1366my $Tbl="%rax"; 1367 1368$code.=<<___; 1369.type ${func}_shaext,\@function,6 1370.align 32 1371${func}_shaext: 1372.cfi_startproc 1373 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1374___ 1375$code.=<<___ if ($win64); 1376 lea `-8-10*16`(%rsp),%rsp 1377 movaps %xmm6,-8-10*16(%rax) 1378 movaps %xmm7,-8-9*16(%rax) 1379 movaps %xmm8,-8-8*16(%rax) 1380 movaps %xmm9,-8-7*16(%rax) 1381 movaps %xmm10,-8-6*16(%rax) 1382 movaps %xmm11,-8-5*16(%rax) 1383 movaps %xmm12,-8-4*16(%rax) 1384 movaps %xmm13,-8-3*16(%rax) 1385 movaps %xmm14,-8-2*16(%rax) 1386 movaps %xmm15,-8-1*16(%rax) 1387.Lprologue_shaext: 1388___ 1389$code.=<<___; 1390 lea K256+0x80(%rip),$Tbl 1391 movdqu ($ctx),$ABEF # DCBA 1392 movdqu 16($ctx),$CDGH # HGFE 1393 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 1394 1395 mov 240($key),$rounds 1396 sub $in0,$out 1397 movups ($key),$rndkey0 # $key[0] 1398 movups ($ivp),$iv # load IV 1399 movups 16($key),$rndkey[0] # forward reference 1400 lea 112($key),$key # size optimization 1401 1402 pshufd \$0x1b,$ABEF,$Wi # ABCD 1403 pshufd \$0xb1,$ABEF,$ABEF # CDAB 1404 pshufd \$0x1b,$CDGH,$CDGH # EFGH 1405 movdqa $TMP,$BSWAP # offload 1406 palignr \$8,$CDGH,$ABEF # ABEF 1407 punpcklqdq $Wi,$CDGH # CDGH 1408 1409 jmp .Loop_shaext 1410 1411.align 16 1412.Loop_shaext: 1413 movdqu ($inp),@MSG[0] 1414 movdqu 0x10($inp),@MSG[1] 1415 movdqu 0x20($inp),@MSG[2] 1416 pshufb $TMP,@MSG[0] 1417 movdqu 0x30($inp),@MSG[3] 1418 1419 movdqa 0*32-0x80($Tbl),$Wi 1420 paddd @MSG[0],$Wi 1421 pshufb $TMP,@MSG[1] 1422 movdqa $CDGH,$CDGH_SAVE # offload 1423 movdqa $ABEF,$ABEF_SAVE # offload 1424___ 1425 &$aesenc(); 1426$code.=<<___; 1427 sha256rnds2 $ABEF,$CDGH # 0-3 1428 pshufd \$0x0e,$Wi,$Wi 1429___ 1430 &$aesenc(); 1431$code.=<<___; 1432 sha256rnds2 $CDGH,$ABEF 1433 1434 movdqa 1*32-0x80($Tbl),$Wi 1435 paddd @MSG[1],$Wi 1436 pshufb $TMP,@MSG[2] 1437 lea 0x40($inp),$inp 1438___ 1439 &$aesenc(); 1440$code.=<<___; 1441 sha256rnds2 $ABEF,$CDGH # 4-7 1442 pshufd \$0x0e,$Wi,$Wi 1443___ 1444 &$aesenc(); 1445$code.=<<___; 1446 sha256rnds2 $CDGH,$ABEF 1447 1448 movdqa 2*32-0x80($Tbl),$Wi 1449 paddd @MSG[2],$Wi 1450 pshufb $TMP,@MSG[3] 1451 sha256msg1 @MSG[1],@MSG[0] 1452___ 1453 &$aesenc(); 1454$code.=<<___; 1455 sha256rnds2 $ABEF,$CDGH # 8-11 1456 pshufd \$0x0e,$Wi,$Wi 1457 movdqa @MSG[3],$TMP 1458 palignr \$4,@MSG[2],$TMP 1459 paddd $TMP,@MSG[0] 1460___ 1461 &$aesenc(); 1462$code.=<<___; 1463 sha256rnds2 $CDGH,$ABEF 1464 1465 movdqa 3*32-0x80($Tbl),$Wi 1466 paddd @MSG[3],$Wi 1467 sha256msg2 @MSG[3],@MSG[0] 1468 sha256msg1 @MSG[2],@MSG[1] 1469___ 1470 &$aesenc(); 1471$code.=<<___; 1472 sha256rnds2 $ABEF,$CDGH # 12-15 1473 pshufd \$0x0e,$Wi,$Wi 1474___ 1475 &$aesenc(); 1476$code.=<<___; 1477 movdqa @MSG[0],$TMP 1478 palignr \$4,@MSG[3],$TMP 1479 paddd $TMP,@MSG[1] 1480 sha256rnds2 $CDGH,$ABEF 1481___ 1482for($i=4;$i<16-3;$i++) { 1483 &$aesenc() if (($r%10)==0); 1484$code.=<<___; 1485 movdqa $i*32-0x80($Tbl),$Wi 1486 paddd @MSG[0],$Wi 1487 sha256msg2 @MSG[0],@MSG[1] 1488 sha256msg1 @MSG[3],@MSG[2] 1489___ 1490 &$aesenc(); 1491$code.=<<___; 1492 sha256rnds2 $ABEF,$CDGH # 16-19... 1493 pshufd \$0x0e,$Wi,$Wi 1494 movdqa @MSG[1],$TMP 1495 palignr \$4,@MSG[0],$TMP 1496 paddd $TMP,@MSG[2] 1497___ 1498 &$aesenc(); 1499 &$aesenc() if ($r==19); 1500$code.=<<___; 1501 sha256rnds2 $CDGH,$ABEF 1502___ 1503 push(@MSG,shift(@MSG)); 1504} 1505$code.=<<___; 1506 movdqa 13*32-0x80($Tbl),$Wi 1507 paddd @MSG[0],$Wi 1508 sha256msg2 @MSG[0],@MSG[1] 1509 sha256msg1 @MSG[3],@MSG[2] 1510___ 1511 &$aesenc(); 1512$code.=<<___; 1513 sha256rnds2 $ABEF,$CDGH # 52-55 1514 pshufd \$0x0e,$Wi,$Wi 1515 movdqa @MSG[1],$TMP 1516 palignr \$4,@MSG[0],$TMP 1517 paddd $TMP,@MSG[2] 1518___ 1519 &$aesenc(); 1520 &$aesenc(); 1521$code.=<<___; 1522 sha256rnds2 $CDGH,$ABEF 1523 1524 movdqa 14*32-0x80($Tbl),$Wi 1525 paddd @MSG[1],$Wi 1526 sha256msg2 @MSG[1],@MSG[2] 1527 movdqa $BSWAP,$TMP 1528___ 1529 &$aesenc(); 1530$code.=<<___; 1531 sha256rnds2 $ABEF,$CDGH # 56-59 1532 pshufd \$0x0e,$Wi,$Wi 1533___ 1534 &$aesenc(); 1535$code.=<<___; 1536 sha256rnds2 $CDGH,$ABEF 1537 1538 movdqa 15*32-0x80($Tbl),$Wi 1539 paddd @MSG[2],$Wi 1540___ 1541 &$aesenc(); 1542 &$aesenc(); 1543$code.=<<___; 1544 sha256rnds2 $ABEF,$CDGH # 60-63 1545 pshufd \$0x0e,$Wi,$Wi 1546___ 1547 &$aesenc(); 1548$code.=<<___; 1549 sha256rnds2 $CDGH,$ABEF 1550 #pxor $CDGH,$rndkey0 # black magic 1551___ 1552 while ($r<40) { &$aesenc(); } # remaining aesenc's 1553$code.=<<___; 1554 #xorps $CDGH,$rndkey0 # black magic 1555 paddd $CDGH_SAVE,$CDGH 1556 paddd $ABEF_SAVE,$ABEF 1557 1558 dec $len 1559 movups $iv,48($out,$in0) # write output 1560 lea 64($in0),$in0 1561 jnz .Loop_shaext 1562 1563 pshufd \$0xb1,$CDGH,$CDGH # DCHG 1564 pshufd \$0x1b,$ABEF,$TMP # FEBA 1565 pshufd \$0xb1,$ABEF,$ABEF # BAFE 1566 punpckhqdq $CDGH,$ABEF # DCBA 1567 palignr \$8,$TMP,$CDGH # HGFE 1568 1569 movups $iv,($ivp) # write IV 1570 movdqu $ABEF,($ctx) 1571 movdqu $CDGH,16($ctx) 1572___ 1573$code.=<<___ if ($win64); 1574 movaps 0*16(%rsp),%xmm6 1575 movaps 1*16(%rsp),%xmm7 1576 movaps 2*16(%rsp),%xmm8 1577 movaps 3*16(%rsp),%xmm9 1578 movaps 4*16(%rsp),%xmm10 1579 movaps 5*16(%rsp),%xmm11 1580 movaps 6*16(%rsp),%xmm12 1581 movaps 7*16(%rsp),%xmm13 1582 movaps 8*16(%rsp),%xmm14 1583 movaps 9*16(%rsp),%xmm15 1584 lea 8+10*16(%rsp),%rsp 1585.Lepilogue_shaext: 1586___ 1587$code.=<<___; 1588 ret 1589.cfi_endproc 1590.size ${func}_shaext,.-${func}_shaext 1591___ 1592} 1593}}}}} 1594 1595# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1596# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1597if ($win64 && $avx) { 1598$rec="%rcx"; 1599$frame="%rdx"; 1600$context="%r8"; 1601$disp="%r9"; 1602 1603$code.=<<___; 1604.extern __imp_RtlVirtualUnwind 1605.type se_handler,\@abi-omnipotent 1606.align 16 1607se_handler: 1608 push %rsi 1609 push %rdi 1610 push %rbx 1611 push %rbp 1612 push %r12 1613 push %r13 1614 push %r14 1615 push %r15 1616 pushfq 1617 sub \$64,%rsp 1618 1619 mov 120($context),%rax # pull context->Rax 1620 mov 248($context),%rbx # pull context->Rip 1621 1622 mov 8($disp),%rsi # disp->ImageBase 1623 mov 56($disp),%r11 # disp->HanderlData 1624 1625 mov 0(%r11),%r10d # HandlerData[0] 1626 lea (%rsi,%r10),%r10 # prologue label 1627 cmp %r10,%rbx # context->Rip<prologue label 1628 jb .Lin_prologue 1629 1630 mov 152($context),%rax # pull context->Rsp 1631 1632 mov 4(%r11),%r10d # HandlerData[1] 1633 lea (%rsi,%r10),%r10 # epilogue label 1634 cmp %r10,%rbx # context->Rip>=epilogue label 1635 jae .Lin_prologue 1636___ 1637$code.=<<___ if ($shaext); 1638 lea aesni_cbc_sha256_enc_shaext(%rip),%r10 1639 cmp %r10,%rbx 1640 jb .Lnot_in_shaext 1641 1642 lea (%rax),%rsi 1643 lea 512($context),%rdi # &context.Xmm6 1644 mov \$20,%ecx 1645 .long 0xa548f3fc # cld; rep movsq 1646 lea 168(%rax),%rax # adjust stack pointer 1647 jmp .Lin_prologue 1648.Lnot_in_shaext: 1649___ 1650$code.=<<___ if ($avx>1); 1651 lea .Lavx2_shortcut(%rip),%r10 1652 cmp %r10,%rbx # context->Rip<avx2_shortcut 1653 jb .Lnot_in_avx2 1654 1655 and \$-256*$SZ,%rax 1656 add \$`2*$SZ*($rounds-8)`,%rax 1657.Lnot_in_avx2: 1658___ 1659$code.=<<___; 1660 mov %rax,%rsi # put aside Rsp 1661 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp 1662 1663 mov -8(%rax),%rbx 1664 mov -16(%rax),%rbp 1665 mov -24(%rax),%r12 1666 mov -32(%rax),%r13 1667 mov -40(%rax),%r14 1668 mov -48(%rax),%r15 1669 mov %rbx,144($context) # restore context->Rbx 1670 mov %rbp,160($context) # restore context->Rbp 1671 mov %r12,216($context) # restore context->R12 1672 mov %r13,224($context) # restore context->R13 1673 mov %r14,232($context) # restore context->R14 1674 mov %r15,240($context) # restore context->R15 1675 1676 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area 1677 lea 512($context),%rdi # &context.Xmm6 1678 mov \$20,%ecx 1679 .long 0xa548f3fc # cld; rep movsq 1680 1681.Lin_prologue: 1682 mov 8(%rax),%rdi 1683 mov 16(%rax),%rsi 1684 mov %rax,152($context) # restore context->Rsp 1685 mov %rsi,168($context) # restore context->Rsi 1686 mov %rdi,176($context) # restore context->Rdi 1687 1688 mov 40($disp),%rdi # disp->ContextRecord 1689 mov $context,%rsi # context 1690 mov \$154,%ecx # sizeof(CONTEXT) 1691 .long 0xa548f3fc # cld; rep movsq 1692 1693 mov $disp,%rsi 1694 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1695 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1696 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1697 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1698 mov 40(%rsi),%r10 # disp->ContextRecord 1699 lea 56(%rsi),%r11 # &disp->HandlerData 1700 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1701 mov %r10,32(%rsp) # arg5 1702 mov %r11,40(%rsp) # arg6 1703 mov %r12,48(%rsp) # arg7 1704 mov %rcx,56(%rsp) # arg8, (NULL) 1705 call *__imp_RtlVirtualUnwind(%rip) 1706 1707 mov \$1,%eax # ExceptionContinueSearch 1708 add \$64,%rsp 1709 popfq 1710 pop %r15 1711 pop %r14 1712 pop %r13 1713 pop %r12 1714 pop %rbp 1715 pop %rbx 1716 pop %rdi 1717 pop %rsi 1718 ret 1719.size se_handler,.-se_handler 1720 1721.section .pdata 1722 .rva .LSEH_begin_${func}_xop 1723 .rva .LSEH_end_${func}_xop 1724 .rva .LSEH_info_${func}_xop 1725 1726 .rva .LSEH_begin_${func}_avx 1727 .rva .LSEH_end_${func}_avx 1728 .rva .LSEH_info_${func}_avx 1729___ 1730$code.=<<___ if ($avx>1); 1731 .rva .LSEH_begin_${func}_avx2 1732 .rva .LSEH_end_${func}_avx2 1733 .rva .LSEH_info_${func}_avx2 1734___ 1735$code.=<<___ if ($shaext); 1736 .rva .LSEH_begin_${func}_shaext 1737 .rva .LSEH_end_${func}_shaext 1738 .rva .LSEH_info_${func}_shaext 1739___ 1740$code.=<<___; 1741.section .xdata 1742.align 8 1743.LSEH_info_${func}_xop: 1744 .byte 9,0,0,0 1745 .rva se_handler 1746 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 1747 1748.LSEH_info_${func}_avx: 1749 .byte 9,0,0,0 1750 .rva se_handler 1751 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1752___ 1753$code.=<<___ if ($avx>1); 1754.LSEH_info_${func}_avx2: 1755 .byte 9,0,0,0 1756 .rva se_handler 1757 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 1758___ 1759$code.=<<___ if ($shaext); 1760.LSEH_info_${func}_shaext: 1761 .byte 9,0,0,0 1762 .rva se_handler 1763 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] 1764___ 1765} 1766 1767#################################################################### 1768sub rex { 1769 local *opcode=shift; 1770 my ($dst,$src)=@_; 1771 my $rex=0; 1772 1773 $rex|=0x04 if($dst>=8); 1774 $rex|=0x01 if($src>=8); 1775 unshift @opcode,$rex|0x40 if($rex); 1776} 1777 1778{ 1779 my %opcodelet = ( 1780 "sha256rnds2" => 0xcb, 1781 "sha256msg1" => 0xcc, 1782 "sha256msg2" => 0xcd ); 1783 1784 sub sha256op38 { 1785 my $instr = shift; 1786 1787 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1788 my @opcode=(0x0f,0x38); 1789 rex(\@opcode,$2,$1); 1790 push @opcode,$opcodelet{$instr}; 1791 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1792 return ".byte\t".join(',',@opcode); 1793 } else { 1794 return $instr."\t".@_[0]; 1795 } 1796 } 1797} 1798 1799$code =~ s/\`([^\`]*)\`/eval $1/gem; 1800$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; 1801print $code; 1802close STDOUT or die "error closing STDOUT: $!"; 1803