#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and therefore better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, lower is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#		   AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	   5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	   5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	   4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	   2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	   5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)	   2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)	   3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the estimated gain was not high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
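# For orientation: this is a perlasm generator rather than assembler
# source, so it is normally driven by the OpenSSL build system. A manual
# invocation would look roughly like the lines below (the flavour and
# output names are illustrative assumptions, not taken from this file):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.s    # Linux/ELF
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm  # Win64/NASM
#
# The chosen flavour is forwarded to x86_64-xlate.pl, which translates the
# AT&T-style code emitted here into the target assembler's syntax.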

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
$framesz=16*$SZ+8*8;

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@abi-omnipotent
.align	16
$func:
.cfi_startproc
___
						if ($avx) {
$code.=<<___;
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	\$1,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10
___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext
___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	${func}_avx2
___
$code.=<<___;
	and	\$`1<<28`,%r10d			# check for AVX
	jnz	${func}_avx
	ud2
___
						}
$code.=<<___;
	xor	%eax,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	ud2
.Lprobe:
	ret
.cfi_endproc
.size	$func,.-$func
___
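# A gloss on the dispatch above (my reading, kept out of the generated
# code): the qword loaded from OPENSSL_ia32cap_P+4 carries CPUID leaf-1
# ECX in its low dword and leaf-7 EBX in its high dword, so
#
#	bt	$61,%r10	tests leaf-7 EBX bit 29 (SHA extensions);
#	test	$1<<11,%r10d	tests leaf-1 ECX bit 11 (AMD XOP);
#	and/cmp	$1<<8|1<<5|1<<3	test leaf-7 EBX for BMI2+AVX2+BMI1;
#	and	$1<<28,%r10d	tests leaf-1 ECX bit 28 (AVX).
#
# Calling the function with a NULL first argument acts as a presence
# probe: it returns 1 when the SIMD paths are compiled in and 0
# otherwise, without touching memory, which the EVP glue can use as a
# feature test.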
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
);
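# How the stitching works, in outline (an informal sketch of the
# mechanism above, not additional logic): @aesni_cbc_block holds 16 code
# fragments, one AES round step each, and body_00_15() below splices
# fragment $aesni_cbc_idx++ into successive SHA256 rounds, e.g.
#
#	for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } }
#
# so exactly one 16-byte CBC block is encrypted per 16 SHA256 rounds,
# i.e. per 64 bytes hashed. Because the key length is not known at
# generation time, the sequence executes vaesenclast speculatively at
# each of the three possible end points (10, 12 and 14 rounds) and blends
# the candidate results through $mask10/$mask12/$mask14, which the
# prologue loads according to the actual round count.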
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&xor	($a0,$e)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type	${func}_xop,\@function,6
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
	jmp	.Lloop_xop
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld		($t2,@X[3],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld		($t2,@X[0],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
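# The scheduling idiom above, restated informally: each call flattens
# four rounds of body_00_15() into @insns and then drips them out between
# the SIMD message-schedule instructions,
#
#	&vpalignr	(...);		# one vector op
#	 eval(shift(@insns));		# a couple of scalar round ops
#	 eval(shift(@insns));
#
# so the scalar SHA256 rounds, the AES round hidden inside body_00_15()
# and the W[] update all compete for independent issue slots. XOP's
# vprotd supplies the sigma rotations directly; the plain-AVX path below
# has to synthesize them from shifts and xors.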

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_xop

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
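# Note on the override above (my gloss): with identical source and
# destination registers, SHRD performs a rotate, i.e.
#
#	shrd	$n,%eax,%eax		# same result as: ror $n,%eax
#
# so aliasing &ror to &shrd lets the AVX path reuse body_00_15()
# unchanged while emitting SHRD, which was presumably measured to run
# faster on the AVX-class processors this path targets.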
$code.=<<___;
.type	${func}_avx,\@function,6
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
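# For reference, the message schedule computed above is the standard
# SHA256 one (the spec, restated):
#
#	sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
#	sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# Four W[] values are produced per call. Because AVX1 has no variable
# 32-bit rotate, each rotation is assembled from a left shift, a right
# shift and an xor, and the sigma1 half is evaluated twice: vpshufd first
# selects W[14..15], then W[16..17], since vpsrlq yields only two usable
# 32-bit lanes at a time.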
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)
	jb	.Lloop_avx

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;
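# A word on the BMI flavour defined below (informal): rorx computes a
# rotate without touching the flags and with a free choice of
# destination, and andn computes ~e&g in one instruction, so the round
# can be expressed as
#
#	Ch(e,f,g) = (e&f) + (~e&g)	# and + andn; '+' works because the
#					# two terms have disjoint bit masks
#	h += X[i] + K[i] + Ch(e,f,g) + Sigma1(e)
#
# with the final h += Sigma0(a) deferred into the *next* round's lea
# (note the "from the past" comment), which shortens the critical path.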
sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}

$code.=<<___;
.type	${func}_avx2,\@function,6
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroall

	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	lea	-9(%r14),%r14

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13		# inp++, size optimization
	mov	$SZ*0(%r15),$A
	lea	(%rsi,%r13),%r12	# borrow $a0
	mov	$SZ*1(%r15),$B
	cmp	$len,%r13		# $_end
	mov	$SZ*2(%r15),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
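# The AVX2 path's trick, in outline (my summary of the code around here):
# each %ymm register holds the message schedule for *two* consecutive
# 64-byte blocks, with the current block in the low 128-bit lane and the
# next block vinserti128-ed into the high lane, so the W[] computation is
# amortized over 128 bytes. On the final iteration there is no next
# block, so
#
#	cmp	$len,%r13
#	cmove	%rsp,%r12	# next block or random data
#
# points the high lane at stack contents instead; the .Ldone_avx2 exit is
# taken before that lane's schedule is ever consumed. The scalar rounds
# for the genuine second block are replayed later from the saved schedule
# in the .Lower_avx2 loop.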
$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rsi as frame pointer
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rsi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq		("%r13",$offload);	# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand		($temp,$temp,$mask14);
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea		("%r13","16(%r13)");	# inp++

	&lea		($Tbl,16*2*$SZ."($Tbl)");
	&cmpb		(($SZ-1)."($Tbl)",0);
	&jne		(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					}
$code.=<<___;
	vpextrq	\$1,$offload,%r12	# $_out, borrow $a4
	vmovq	$offload,%r13		# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r12,%r13)		# write output
	lea	16(%r13),%r13

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	cmp	`$PUSH8+2*8`($Tbl),%r13	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	mov	$F,$a4
	xor	$C,$a3			# magic
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
$code.=<<___;
	vmovq	$offload,%r13		# borrow $a0
	vpextrq	\$1,$offload,%r15	# borrow $a2
	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)		# write output
	lea	16(%r13),%r13		# inp++
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	lea	(%rsi,%r13),%r12
	add	$SZ*7(%r15),$H

	cmp	$_end,%r13

	mov	$A,$SZ*0(%r15)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+7*8`,deref,+8

.Ldone_avx2:
	mov	16*$SZ+4*8($Tbl),$ivp
	mov	16*$SZ+7*8($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`($Tbl),%xmm6
	movaps	`$framesz+16*1`($Tbl),%xmm7
	movaps	`$framesz+16*2`($Tbl),%xmm8
	movaps	`$framesz+16*3`($Tbl),%xmm9
	movaps	`$framesz+16*4`($Tbl),%xmm10
	movaps	`$framesz+16*5`($Tbl),%xmm11
	movaps	`$framesz+16*6`($Tbl),%xmm12
	movaps	`$framesz+16*7`($Tbl),%xmm13
	movaps	`$framesz+16*8`($Tbl),%xmm14
	movaps	`$framesz+16*9`($Tbl),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
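# Shape of the helper defined below (informal): each &$aesenc() call
# emits exactly one AES round, keyed off the global counter $r, so the
# SHAEXT code can sprinkle ten-plus rounds of CBC encryption between
# SHA256 instructions. $k = $r%10 selects the step: $k==0 loads and
# whitens the next input block (and stores the previous ciphertext),
# $k==9 emits the aesenclast tail with a cmp/jb/je ladder that extends to
# 12 or 14 rounds for AES-192/-256. The `32+16*$k-112` offsets work out
# because $key is advanced by 112 bytes ("lea 112($key),$key") as a
# displacement-size optimization.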
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	lea	0x40($inp),$inp
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	paddd	$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	paddd	$TMP,@MSG[1]
	sha256rnds2	$CDGH,$ABEF
___
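# For orientation (restating the ISA semantics relied on above):
# sha256rnds2 performs two SHA256 rounds per invocation and takes the
# round-constant-plus-message words from the implicit operand %xmm0 --
# $Wi here, which is why the map() above pins it to xmm0. The
# "pshufd \$0x0e,$Wi,$Wi" between the paired invocations moves the upper
# two W+K words into the low lanes for rounds 2-3 of each group of four.
# State lives in the ABEF/CDGH register pairing that the prologue's
# pshufd/palignr/punpcklqdq dance converts to and from the canonical
# ABCDEFGH layout.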
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	paddd	$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc()	if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	paddd	$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
___
	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	dec	$len
	movups	$iv,48($out,$in0)	# write output
	lea	64($in0),$in0
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}
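# A brief gloss on the Win64 exception-handler block that follows:
# se_handler() runs during unwinding, checks whether RIP lies between the
# prologue/epilogue labels carried in HandlerData[], recovers the
# caller's %rsp either from the $_rsp frame slot or, for the shaext/avx2
# variants, by redoing their distinctive stack arithmetic, then restores
# the callee-saved GPRs and the %xmm6-%xmm15 save area into the CONTEXT
# record before handing off to RtlVirtualUnwind.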

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
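# The .xdata records below follow the usual perlasm convention, as I read
# it: the leading ".byte 9,0,0,0" encodes UNWIND_INFO version 1 with
# UNW_FLAG_EHANDLER and no unwind codes, and each HandlerData[] pair
# gives se_handler the prologue/epilogue labels bounding the region where
# the full frame is live.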
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
      return $instr."\t".@_[0];
    }
  }
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
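# Worked example for the sha256op38() fallback above (my arithmetic,
# worth double-checking against an assembler that knows the SHA
# extensions): for "sha256rnds2 %xmm4,%xmm8" the destination is %xmm8 and
# the source %xmm4, so rex() prepends REX.R (0x44), the opcode bytes are
# 0f 38 cb, and the ModR/M byte is 0xc0 | 4 | (0<<3) = 0xc4, giving
#
#	.byte	0x44,0x0f,0x38,0xcb,0xc4	# sha256rnds2 %xmm4,%xmm8
#
# which is what lets this module assemble on toolchains that predate
# these instructions.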