1#! /usr/bin/env perl 2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# January 2013 18# 19# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled 20# in http://download.intel.com/design/intarch/papers/323686.pdf, is 21# that since AESNI-CBC encrypt exhibit *very* low instruction-level 22# parallelism, interleaving it with another algorithm would allow to 23# utilize processor resources better and achieve better performance. 24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and 25# AESNI code is weaved into it. As SHA256 dominates execution time, 26# stitch performance does not depend on AES key length. Below are 27# performance numbers in cycles per processed byte, less is better, 28# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched 29# subroutine: 30# 31# AES-128/-192/-256+SHA256 this(**) gain 32# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% 33# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% 34# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% 35# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% 36# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% 37# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% 38# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% 39# 40# (*) there are XOP, AVX1 and AVX2 code paths, meaning that 41# Westmere is omitted from loop, this is because gain was not 42# estimated high enough to justify the effort; 43# (**) these are EVP-free results, results obtained with 'speed 44# -evp aes-256-cbc-hmac-sha256' will vary by percent or two; 45# (***) these are SHAEXT results; 46 47$flavour = shift; 48$output = shift; 49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 50 51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 52 53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 56die "can't locate x86_64-xlate.pl"; 57 58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 60 $avx = ($1>=2.19) + ($1>=2.22); 61} 62 63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 65 $avx = ($1>=2.09) + ($1>=2.10); 66} 67 68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 69 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 70 $avx = ($1>=10) + ($1>=12); 71} 72 73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 74 $avx = ($2>=3.0) + ($2>3.0); 75} 76 77$shaext=$avx; ### set to zero if compiling for 1.0.1 78$avx=1 if (!$shaext && $avx); 79 80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 81*STDOUT=*OUT; 82 83$func="aesni_cbc_sha256_enc"; 84$TABLE="K256"; 85$SZ=4; 86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 87 "%r8d","%r9d","%r10d","%r11d"); 88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); 89@Sigma0=( 2,13,22); 90@Sigma1=( 6,11,25); 91@sigma0=( 7,18, 3); 92@sigma1=(17,19,10); 93$rounds=64; 94 95######################################################################## 96# void aesni_cbc_sha256_enc(const void *inp, 97# void *out, 98# size_t length, 99# const AES_KEY *key, 100# unsigned char *iv, 101# SHA256_CTX *ctx, 102# const void *in0); 103($inp, $out, $len, $key, $ivp, $ctx, $in0) = 104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 105 106$Tbl="%rbp"; 107 108$_inp="16*$SZ+0*8(%rsp)"; 109$_out="16*$SZ+1*8(%rsp)"; 110$_end="16*$SZ+2*8(%rsp)"; 111$_key="16*$SZ+3*8(%rsp)"; 112$_ivp="16*$SZ+4*8(%rsp)"; 113$_ctx="16*$SZ+5*8(%rsp)"; 114$_in0="16*$SZ+6*8(%rsp)"; 115$_rsp="`16*$SZ+7*8`(%rsp)"; 116$framesz=16*$SZ+8*8; 117 118$code=<<___; 119.text 120 121.extern OPENSSL_ia32cap_P 122.globl $func 123.type $func,\@abi-omnipotent 124.align 16 125$func: 126___ 127 if ($avx) { 128$code.=<<___; 129 lea OPENSSL_ia32cap_P(%rip),%r11 130 mov \$1,%eax 131 cmp \$0,`$win64?"%rcx":"%rdi"` 132 je .Lprobe 133 mov 0(%r11),%eax 134 mov 4(%r11),%r10 135___ 136$code.=<<___ if ($shaext); 137 bt \$61,%r10 # check for SHA 138 jc ${func}_shaext 139___ 140$code.=<<___; 141 mov %r10,%r11 142 shr \$32,%r11 143 144 test \$`1<<11`,%r10d # check for XOP 145 jnz ${func}_xop 146___ 147$code.=<<___ if ($avx>1); 148 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 149 cmp \$`1<<8|1<<5|1<<3`,%r11d 150 je ${func}_avx2 151___ 152$code.=<<___; 153 and \$`1<<28`,%r10d # check for AVX 154 jnz ${func}_avx 155 ud2 156___ 157 } 158$code.=<<___; 159 xor %eax,%eax 160 cmp \$0,`$win64?"%rcx":"%rdi"` 161 je .Lprobe 162 ud2 163.Lprobe: 164 ret 165.size $func,.-$func 166 167.align 64 168.type $TABLE,\@object 169$TABLE: 170 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 171 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 172 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 173 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 174 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 175 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 176 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 177 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 178 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 179 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 180 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 181 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 182 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 183 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 184 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 185 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 186 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 187 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 188 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 189 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 190 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 191 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 192 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 193 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 194 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 195 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 196 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 197 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 198 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 199 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 200 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 201 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 202 203 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 204 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 205 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 206 .long 0,0,0,0, 0,0,0,0 207 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 208.align 64 209___ 210 211###################################################################### 212# SIMD code paths 213# 214{{{ 215($iv,$inout,$roundkey,$temp, 216 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); 217 218$aesni_cbc_idx=0; 219@aesni_cbc_block = ( 220## &vmovdqu ($roundkey,"0x00-0x80($inp)");' 221## &vmovdqu ($inout,($inp)); 222## &mov ($_inp,$inp); 223 224 '&vpxor ($inout,$inout,$roundkey);'. 225 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', 226 227 '&vpxor ($inout,$inout,$iv);', 228 229 '&vaesenc ($inout,$inout,$roundkey);'. 230 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', 231 232 '&vaesenc ($inout,$inout,$roundkey);'. 233 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', 234 235 '&vaesenc ($inout,$inout,$roundkey);'. 236 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', 237 238 '&vaesenc ($inout,$inout,$roundkey);'. 239 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', 240 241 '&vaesenc ($inout,$inout,$roundkey);'. 242 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', 243 244 '&vaesenc ($inout,$inout,$roundkey);'. 245 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', 246 247 '&vaesenc ($inout,$inout,$roundkey);'. 248 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', 249 250 '&vaesenc ($inout,$inout,$roundkey);'. 251 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', 252 253 '&vaesenc ($inout,$inout,$roundkey);'. 254 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', 255 256 '&vaesenclast ($temp,$inout,$roundkey);'. 257 ' &vaesenc ($inout,$inout,$roundkey);'. 258 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', 259 260 '&vpand ($iv,$temp,$mask10);'. 261 ' &vaesenc ($inout,$inout,$roundkey);'. 262 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', 263 264 '&vaesenclast ($temp,$inout,$roundkey);'. 265 ' &vaesenc ($inout,$inout,$roundkey);'. 266 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', 267 268 '&vpand ($temp,$temp,$mask12);'. 269 ' &vaesenc ($inout,$inout,$roundkey);'. 270 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', 271 272 '&vpor ($iv,$iv,$temp);'. 273 ' &vaesenclast ($temp,$inout,$roundkey);'. 274 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' 275 276## &mov ($inp,$_inp); 277## &mov ($out,$_out); 278## &vpand ($temp,$temp,$mask14); 279## &vpor ($iv,$iv,$temp); 280## &vmovdqu ($iv,($out,$inp); 281## &lea (inp,16($inp)); 282); 283 284my $a4=$T1; 285my ($a,$b,$c,$d,$e,$f,$g,$h); 286 287sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 288{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 289 my $arg = pop; 290 $arg = "\$$arg" if ($arg*1 eq $arg); 291 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 292} 293 294sub body_00_15 () { 295 ( 296 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 297 298 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 299 '&mov ($a,$a1)', 300 '&mov ($a4,$f)', 301 302 '&xor ($a0,$e)', 303 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 304 '&xor ($a4,$g)', # f^g 305 306 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 307 '&xor ($a1,$a)', 308 '&and ($a4,$e)', # (f^g)&e 309 310 @aesni_cbc_block[$aesni_cbc_idx++]. 311 '&xor ($a0,$e)', 312 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 313 '&mov ($a2,$a)', 314 315 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 316 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 317 '&xor ($a2,$b)', # a^b, b^c in next round 318 319 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 320 '&add ($h,$a4)', # h+=Ch(e,f,g) 321 '&and ($a3,$a2)', # (b^c)&(a^b) 322 323 '&xor ($a1,$a)', 324 '&add ($h,$a0)', # h+=Sigma1(e) 325 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 326 327 '&add ($d,$h)', # d+=h 328 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 329 '&add ($h,$a3)', # h+=Maj(a,b,c) 330 331 '&mov ($a0,$d)', 332 '&add ($a1,$h);'. # h+=Sigma0(a) 333 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 334 ); 335} 336 337if ($avx) {{ 338###################################################################### 339# XOP code path 340# 341$code.=<<___; 342.type ${func}_xop,\@function,6 343.align 64 344${func}_xop: 345.cfi_startproc 346.Lxop_shortcut: 347 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 348 mov %rsp,%rax # copy %rsp 349.cfi_def_cfa_register %rax 350 push %rbx 351.cfi_push %rbx 352 push %rbp 353.cfi_push %rbp 354 push %r12 355.cfi_push %r12 356 push %r13 357.cfi_push %r13 358 push %r14 359.cfi_push %r14 360 push %r15 361.cfi_push %r15 362 sub \$`$framesz+$win64*16*10`,%rsp 363 and \$-64,%rsp # align stack frame 364 365 shl \$6,$len 366 sub $inp,$out # re-bias 367 sub $inp,$in0 368 add $inp,$len # end of input 369 370 #mov $inp,$_inp # saved later 371 mov $out,$_out 372 mov $len,$_end 373 #mov $key,$_key # remains resident in $inp register 374 mov $ivp,$_ivp 375 mov $ctx,$_ctx 376 mov $in0,$_in0 377 mov %rax,$_rsp 378.cfi_cfa_expression $_rsp,deref,+8 379___ 380$code.=<<___ if ($win64); 381 movaps %xmm6,`$framesz+16*0`(%rsp) 382 movaps %xmm7,`$framesz+16*1`(%rsp) 383 movaps %xmm8,`$framesz+16*2`(%rsp) 384 movaps %xmm9,`$framesz+16*3`(%rsp) 385 movaps %xmm10,`$framesz+16*4`(%rsp) 386 movaps %xmm11,`$framesz+16*5`(%rsp) 387 movaps %xmm12,`$framesz+16*6`(%rsp) 388 movaps %xmm13,`$framesz+16*7`(%rsp) 389 movaps %xmm14,`$framesz+16*8`(%rsp) 390 movaps %xmm15,`$framesz+16*9`(%rsp) 391___ 392$code.=<<___; 393.Lprologue_xop: 394 vzeroall 395 396 mov $inp,%r12 # borrow $a4 397 lea 0x80($key),$inp # size optimization, reassign 398 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 399 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 400 mov $ctx,%r15 # borrow $a2 401 mov $in0,%rsi # borrow $a3 402 vmovdqu ($ivp),$iv # load IV 403 sub \$9,%r14 404 405 mov $SZ*0(%r15),$A 406 mov $SZ*1(%r15),$B 407 mov $SZ*2(%r15),$C 408 mov $SZ*3(%r15),$D 409 mov $SZ*4(%r15),$E 410 mov $SZ*5(%r15),$F 411 mov $SZ*6(%r15),$G 412 mov $SZ*7(%r15),$H 413 414 vmovdqa 0x00(%r13,%r14,8),$mask14 415 vmovdqa 0x10(%r13,%r14,8),$mask12 416 vmovdqa 0x20(%r13,%r14,8),$mask10 417 vmovdqu 0x00-0x80($inp),$roundkey 418 jmp .Lloop_xop 419___ 420 if ($SZ==4) { # SHA256 421 my @X = map("%xmm$_",(0..3)); 422 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 423 424$code.=<<___; 425.align 16 426.Lloop_xop: 427 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 428 vmovdqu 0x00(%rsi,%r12),@X[0] 429 vmovdqu 0x10(%rsi,%r12),@X[1] 430 vmovdqu 0x20(%rsi,%r12),@X[2] 431 vmovdqu 0x30(%rsi,%r12),@X[3] 432 vpshufb $t3,@X[0],@X[0] 433 lea $TABLE(%rip),$Tbl 434 vpshufb $t3,@X[1],@X[1] 435 vpshufb $t3,@X[2],@X[2] 436 vpaddd 0x00($Tbl),@X[0],$t0 437 vpshufb $t3,@X[3],@X[3] 438 vpaddd 0x20($Tbl),@X[1],$t1 439 vpaddd 0x40($Tbl),@X[2],$t2 440 vpaddd 0x60($Tbl),@X[3],$t3 441 vmovdqa $t0,0x00(%rsp) 442 mov $A,$a1 443 vmovdqa $t1,0x10(%rsp) 444 mov $B,$a3 445 vmovdqa $t2,0x20(%rsp) 446 xor $C,$a3 # magic 447 vmovdqa $t3,0x30(%rsp) 448 mov $E,$a0 449 jmp .Lxop_00_47 450 451.align 16 452.Lxop_00_47: 453 sub \$-16*2*$SZ,$Tbl # size optimization 454 vmovdqu (%r12),$inout # $a4 455 mov %r12,$_inp # $a4 456___ 457sub XOP_256_00_47 () { 458my $j = shift; 459my $body = shift; 460my @X = @_; 461my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 462 463 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 464 eval(shift(@insns)); 465 eval(shift(@insns)); 466 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 467 eval(shift(@insns)); 468 eval(shift(@insns)); 469 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 470 eval(shift(@insns)); 471 eval(shift(@insns)); 472 &vpsrld ($t0,$t0,$sigma0[2]); 473 eval(shift(@insns)); 474 eval(shift(@insns)); 475 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 476 eval(shift(@insns)); 477 eval(shift(@insns)); 478 eval(shift(@insns)); 479 eval(shift(@insns)); 480 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 481 eval(shift(@insns)); 482 eval(shift(@insns)); 483 &vpxor ($t0,$t0,$t1); 484 eval(shift(@insns)); 485 eval(shift(@insns)); 486 eval(shift(@insns)); 487 eval(shift(@insns)); 488 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 489 eval(shift(@insns)); 490 eval(shift(@insns)); 491 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 492 eval(shift(@insns)); 493 eval(shift(@insns)); 494 &vpsrld ($t2,@X[3],$sigma1[2]); 495 eval(shift(@insns)); 496 eval(shift(@insns)); 497 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 498 eval(shift(@insns)); 499 eval(shift(@insns)); 500 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 501 eval(shift(@insns)); 502 eval(shift(@insns)); 503 &vpxor ($t3,$t3,$t2); 504 eval(shift(@insns)); 505 eval(shift(@insns)); 506 eval(shift(@insns)); 507 eval(shift(@insns)); 508 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 509 eval(shift(@insns)); 510 eval(shift(@insns)); 511 eval(shift(@insns)); 512 eval(shift(@insns)); 513 &vpsrldq ($t3,$t3,8); 514 eval(shift(@insns)); 515 eval(shift(@insns)); 516 eval(shift(@insns)); 517 eval(shift(@insns)); 518 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 519 eval(shift(@insns)); 520 eval(shift(@insns)); 521 eval(shift(@insns)); 522 eval(shift(@insns)); 523 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 524 eval(shift(@insns)); 525 eval(shift(@insns)); 526 &vpsrld ($t2,@X[0],$sigma1[2]); 527 eval(shift(@insns)); 528 eval(shift(@insns)); 529 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 530 eval(shift(@insns)); 531 eval(shift(@insns)); 532 &vpxor ($t3,$t3,$t2); 533 eval(shift(@insns)); 534 eval(shift(@insns)); 535 eval(shift(@insns)); 536 eval(shift(@insns)); 537 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 538 eval(shift(@insns)); 539 eval(shift(@insns)); 540 eval(shift(@insns)); 541 eval(shift(@insns)); 542 &vpslldq ($t3,$t3,8); # 22 instructions 543 eval(shift(@insns)); 544 eval(shift(@insns)); 545 eval(shift(@insns)); 546 eval(shift(@insns)); 547 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 548 eval(shift(@insns)); 549 eval(shift(@insns)); 550 eval(shift(@insns)); 551 eval(shift(@insns)); 552 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 553 foreach (@insns) { eval; } # remaining instructions 554 &vmovdqa (16*$j."(%rsp)",$t2); 555} 556 557 $aesni_cbc_idx=0; 558 for ($i=0,$j=0; $j<4; $j++) { 559 &XOP_256_00_47($j,\&body_00_15,@X); 560 push(@X,shift(@X)); # rotate(@X) 561 } 562 &mov ("%r12",$_inp); # borrow $a4 563 &vpand ($temp,$temp,$mask14); 564 &mov ("%r15",$_out); # borrow $a2 565 &vpor ($iv,$iv,$temp); 566 &vmovdqu ("(%r15,%r12)",$iv); # write output 567 &lea ("%r12","16(%r12)"); # inp++ 568 569 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 570 &jne (".Lxop_00_47"); 571 572 &vmovdqu ($inout,"(%r12)"); 573 &mov ($_inp,"%r12"); 574 575 $aesni_cbc_idx=0; 576 for ($i=0; $i<16; ) { 577 foreach(body_00_15()) { eval; } 578 } 579 } 580$code.=<<___; 581 mov $_inp,%r12 # borrow $a4 582 mov $_out,%r13 # borrow $a0 583 mov $_ctx,%r15 # borrow $a2 584 mov $_in0,%rsi # borrow $a3 585 586 vpand $mask14,$temp,$temp 587 mov $a1,$A 588 vpor $temp,$iv,$iv 589 vmovdqu $iv,(%r13,%r12) # write output 590 lea 16(%r12),%r12 # inp++ 591 592 add $SZ*0(%r15),$A 593 add $SZ*1(%r15),$B 594 add $SZ*2(%r15),$C 595 add $SZ*3(%r15),$D 596 add $SZ*4(%r15),$E 597 add $SZ*5(%r15),$F 598 add $SZ*6(%r15),$G 599 add $SZ*7(%r15),$H 600 601 cmp $_end,%r12 602 603 mov $A,$SZ*0(%r15) 604 mov $B,$SZ*1(%r15) 605 mov $C,$SZ*2(%r15) 606 mov $D,$SZ*3(%r15) 607 mov $E,$SZ*4(%r15) 608 mov $F,$SZ*5(%r15) 609 mov $G,$SZ*6(%r15) 610 mov $H,$SZ*7(%r15) 611 612 jb .Lloop_xop 613 614 mov $_ivp,$ivp 615 mov $_rsp,%rsi 616.cfi_def_cfa %rsi,8 617 vmovdqu $iv,($ivp) # output IV 618 vzeroall 619___ 620$code.=<<___ if ($win64); 621 movaps `$framesz+16*0`(%rsp),%xmm6 622 movaps `$framesz+16*1`(%rsp),%xmm7 623 movaps `$framesz+16*2`(%rsp),%xmm8 624 movaps `$framesz+16*3`(%rsp),%xmm9 625 movaps `$framesz+16*4`(%rsp),%xmm10 626 movaps `$framesz+16*5`(%rsp),%xmm11 627 movaps `$framesz+16*6`(%rsp),%xmm12 628 movaps `$framesz+16*7`(%rsp),%xmm13 629 movaps `$framesz+16*8`(%rsp),%xmm14 630 movaps `$framesz+16*9`(%rsp),%xmm15 631___ 632$code.=<<___; 633 mov -48(%rsi),%r15 634.cfi_restore %r15 635 mov -40(%rsi),%r14 636.cfi_restore %r14 637 mov -32(%rsi),%r13 638.cfi_restore %r13 639 mov -24(%rsi),%r12 640.cfi_restore %r12 641 mov -16(%rsi),%rbp 642.cfi_restore %rbp 643 mov -8(%rsi),%rbx 644.cfi_restore %rbx 645 lea (%rsi),%rsp 646.cfi_def_cfa_register %rsp 647.Lepilogue_xop: 648 ret 649.cfi_endproc 650.size ${func}_xop,.-${func}_xop 651___ 652###################################################################### 653# AVX+shrd code path 654# 655local *ror = sub { &shrd(@_[0],@_) }; 656 657$code.=<<___; 658.type ${func}_avx,\@function,6 659.align 64 660${func}_avx: 661.cfi_startproc 662.Lavx_shortcut: 663 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 664 mov %rsp,%rax # copy %rsp 665.cfi_def_cfa_register %rax 666 push %rbx 667.cfi_push %rbx 668 push %rbp 669.cfi_push %rbp 670 push %r12 671.cfi_push %r12 672 push %r13 673.cfi_push %r13 674 push %r14 675.cfi_push %r14 676 push %r15 677.cfi_push %r15 678 sub \$`$framesz+$win64*16*10`,%rsp 679 and \$-64,%rsp # align stack frame 680 681 shl \$6,$len 682 sub $inp,$out # re-bias 683 sub $inp,$in0 684 add $inp,$len # end of input 685 686 #mov $inp,$_inp # saved later 687 mov $out,$_out 688 mov $len,$_end 689 #mov $key,$_key # remains resident in $inp register 690 mov $ivp,$_ivp 691 mov $ctx,$_ctx 692 mov $in0,$_in0 693 mov %rax,$_rsp 694.cfi_cfa_expression $_rsp,deref,+8 695___ 696$code.=<<___ if ($win64); 697 movaps %xmm6,`$framesz+16*0`(%rsp) 698 movaps %xmm7,`$framesz+16*1`(%rsp) 699 movaps %xmm8,`$framesz+16*2`(%rsp) 700 movaps %xmm9,`$framesz+16*3`(%rsp) 701 movaps %xmm10,`$framesz+16*4`(%rsp) 702 movaps %xmm11,`$framesz+16*5`(%rsp) 703 movaps %xmm12,`$framesz+16*6`(%rsp) 704 movaps %xmm13,`$framesz+16*7`(%rsp) 705 movaps %xmm14,`$framesz+16*8`(%rsp) 706 movaps %xmm15,`$framesz+16*9`(%rsp) 707___ 708$code.=<<___; 709.Lprologue_avx: 710 vzeroall 711 712 mov $inp,%r12 # borrow $a4 713 lea 0x80($key),$inp # size optimization, reassign 714 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 715 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 716 mov $ctx,%r15 # borrow $a2 717 mov $in0,%rsi # borrow $a3 718 vmovdqu ($ivp),$iv # load IV 719 sub \$9,%r14 720 721 mov $SZ*0(%r15),$A 722 mov $SZ*1(%r15),$B 723 mov $SZ*2(%r15),$C 724 mov $SZ*3(%r15),$D 725 mov $SZ*4(%r15),$E 726 mov $SZ*5(%r15),$F 727 mov $SZ*6(%r15),$G 728 mov $SZ*7(%r15),$H 729 730 vmovdqa 0x00(%r13,%r14,8),$mask14 731 vmovdqa 0x10(%r13,%r14,8),$mask12 732 vmovdqa 0x20(%r13,%r14,8),$mask10 733 vmovdqu 0x00-0x80($inp),$roundkey 734___ 735 if ($SZ==4) { # SHA256 736 my @X = map("%xmm$_",(0..3)); 737 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 738 739$code.=<<___; 740 jmp .Lloop_avx 741.align 16 742.Lloop_avx: 743 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 744 vmovdqu 0x00(%rsi,%r12),@X[0] 745 vmovdqu 0x10(%rsi,%r12),@X[1] 746 vmovdqu 0x20(%rsi,%r12),@X[2] 747 vmovdqu 0x30(%rsi,%r12),@X[3] 748 vpshufb $t3,@X[0],@X[0] 749 lea $TABLE(%rip),$Tbl 750 vpshufb $t3,@X[1],@X[1] 751 vpshufb $t3,@X[2],@X[2] 752 vpaddd 0x00($Tbl),@X[0],$t0 753 vpshufb $t3,@X[3],@X[3] 754 vpaddd 0x20($Tbl),@X[1],$t1 755 vpaddd 0x40($Tbl),@X[2],$t2 756 vpaddd 0x60($Tbl),@X[3],$t3 757 vmovdqa $t0,0x00(%rsp) 758 mov $A,$a1 759 vmovdqa $t1,0x10(%rsp) 760 mov $B,$a3 761 vmovdqa $t2,0x20(%rsp) 762 xor $C,$a3 # magic 763 vmovdqa $t3,0x30(%rsp) 764 mov $E,$a0 765 jmp .Lavx_00_47 766 767.align 16 768.Lavx_00_47: 769 sub \$-16*2*$SZ,$Tbl # size optimization 770 vmovdqu (%r12),$inout # $a4 771 mov %r12,$_inp # $a4 772___ 773sub Xupdate_256_AVX () { 774 ( 775 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 776 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 777 '&vpsrld ($t2,$t0,$sigma0[0]);', 778 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 779 '&vpsrld ($t3,$t0,$sigma0[2])', 780 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 781 '&vpxor ($t0,$t3,$t2)', 782 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 783 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 784 '&vpxor ($t0,$t0,$t1)', 785 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 786 '&vpxor ($t0,$t0,$t2)', 787 '&vpsrld ($t2,$t3,$sigma1[2]);', 788 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 789 '&vpsrlq ($t3,$t3,$sigma1[0]);', 790 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 791 '&vpxor ($t2,$t2,$t3);', 792 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 793 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) 794 '&vpshufd ($t2,$t2,0b10000100)', 795 '&vpsrldq ($t2,$t2,8)', 796 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 797 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 798 '&vpsrld ($t2,$t3,$sigma1[2])', 799 '&vpsrlq ($t3,$t3,$sigma1[0])', 800 '&vpxor ($t2,$t2,$t3);', 801 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 802 '&vpxor ($t2,$t2,$t3)', 803 '&vpshufd ($t2,$t2,0b11101000)', 804 '&vpslldq ($t2,$t2,8)', 805 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 806 ); 807} 808 809sub AVX_256_00_47 () { 810my $j = shift; 811my $body = shift; 812my @X = @_; 813my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 814 815 foreach (Xupdate_256_AVX()) { # 29 instructions 816 eval; 817 eval(shift(@insns)); 818 eval(shift(@insns)); 819 eval(shift(@insns)); 820 } 821 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 822 foreach (@insns) { eval; } # remaining instructions 823 &vmovdqa (16*$j."(%rsp)",$t2); 824} 825 826 $aesni_cbc_idx=0; 827 for ($i=0,$j=0; $j<4; $j++) { 828 &AVX_256_00_47($j,\&body_00_15,@X); 829 push(@X,shift(@X)); # rotate(@X) 830 } 831 &mov ("%r12",$_inp); # borrow $a4 832 &vpand ($temp,$temp,$mask14); 833 &mov ("%r15",$_out); # borrow $a2 834 &vpor ($iv,$iv,$temp); 835 &vmovdqu ("(%r15,%r12)",$iv); # write output 836 &lea ("%r12","16(%r12)"); # inp++ 837 838 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 839 &jne (".Lavx_00_47"); 840 841 &vmovdqu ($inout,"(%r12)"); 842 &mov ($_inp,"%r12"); 843 844 $aesni_cbc_idx=0; 845 for ($i=0; $i<16; ) { 846 foreach(body_00_15()) { eval; } 847 } 848 849 } 850$code.=<<___; 851 mov $_inp,%r12 # borrow $a4 852 mov $_out,%r13 # borrow $a0 853 mov $_ctx,%r15 # borrow $a2 854 mov $_in0,%rsi # borrow $a3 855 856 vpand $mask14,$temp,$temp 857 mov $a1,$A 858 vpor $temp,$iv,$iv 859 vmovdqu $iv,(%r13,%r12) # write output 860 lea 16(%r12),%r12 # inp++ 861 862 add $SZ*0(%r15),$A 863 add $SZ*1(%r15),$B 864 add $SZ*2(%r15),$C 865 add $SZ*3(%r15),$D 866 add $SZ*4(%r15),$E 867 add $SZ*5(%r15),$F 868 add $SZ*6(%r15),$G 869 add $SZ*7(%r15),$H 870 871 cmp $_end,%r12 872 873 mov $A,$SZ*0(%r15) 874 mov $B,$SZ*1(%r15) 875 mov $C,$SZ*2(%r15) 876 mov $D,$SZ*3(%r15) 877 mov $E,$SZ*4(%r15) 878 mov $F,$SZ*5(%r15) 879 mov $G,$SZ*6(%r15) 880 mov $H,$SZ*7(%r15) 881 jb .Lloop_avx 882 883 mov $_ivp,$ivp 884 mov $_rsp,%rsi 885.cfi_def_cfa %rsi,8 886 vmovdqu $iv,($ivp) # output IV 887 vzeroall 888___ 889$code.=<<___ if ($win64); 890 movaps `$framesz+16*0`(%rsp),%xmm6 891 movaps `$framesz+16*1`(%rsp),%xmm7 892 movaps `$framesz+16*2`(%rsp),%xmm8 893 movaps `$framesz+16*3`(%rsp),%xmm9 894 movaps `$framesz+16*4`(%rsp),%xmm10 895 movaps `$framesz+16*5`(%rsp),%xmm11 896 movaps `$framesz+16*6`(%rsp),%xmm12 897 movaps `$framesz+16*7`(%rsp),%xmm13 898 movaps `$framesz+16*8`(%rsp),%xmm14 899 movaps `$framesz+16*9`(%rsp),%xmm15 900___ 901$code.=<<___; 902 mov -48(%rsi),%r15 903.cfi_restore %r15 904 mov -40(%rsi),%r14 905.cfi_restore %r14 906 mov -32(%rsi),%r13 907.cfi_restore %r13 908 mov -24(%rsi),%r12 909.cfi_restore %r12 910 mov -16(%rsi),%rbp 911.cfi_restore %rbp 912 mov -8(%rsi),%rbx 913.cfi_restore %rbx 914 lea (%rsi),%rsp 915.cfi_def_cfa_register %rsp 916.Lepilogue_avx: 917 ret 918.cfi_endproc 919.size ${func}_avx,.-${func}_avx 920___ 921 922if ($avx>1) {{ 923###################################################################### 924# AVX2+BMI code path 925# 926my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 927my $PUSH8=8*2*$SZ; 928use integer; 929 930sub bodyx_00_15 () { 931 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 932 ( 933 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 934 935 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 936 '&and ($a4,$e)', # f&e 937 '&rorx ($a0,$e,$Sigma1[2])', 938 '&rorx ($a2,$e,$Sigma1[1])', 939 940 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 941 '&lea ($h,"($h,$a4)")', 942 '&andn ($a4,$e,$g)', # ~e&g 943 '&xor ($a0,$a2)', 944 945 '&rorx ($a1,$e,$Sigma1[0])', 946 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 947 '&xor ($a0,$a1)', # Sigma1(e) 948 '&mov ($a2,$a)', 949 950 '&rorx ($a4,$a,$Sigma0[2])', 951 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 952 '&xor ($a2,$b)', # a^b, b^c in next round 953 '&rorx ($a1,$a,$Sigma0[1])', 954 955 '&rorx ($a0,$a,$Sigma0[0])', 956 '&lea ($d,"($d,$h)")', # d+=h 957 '&and ($a3,$a2)', # (b^c)&(a^b) 958 @aesni_cbc_block[$aesni_cbc_idx++]. 959 '&xor ($a1,$a4)', 960 961 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 962 '&xor ($a1,$a0)', # Sigma0(a) 963 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 964 '&mov ($a4,$e)', # copy of f in future 965 966 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 967 ); 968 # and at the finish one has to $a+=$a1 969} 970 971$code.=<<___; 972.type ${func}_avx2,\@function,6 973.align 64 974${func}_avx2: 975.cfi_startproc 976.Lavx2_shortcut: 977 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter 978 mov %rsp,%rax # copy %rsp 979.cfi_def_cfa_register %rax 980 push %rbx 981.cfi_push %rbx 982 push %rbp 983.cfi_push %rbp 984 push %r12 985.cfi_push %r12 986 push %r13 987.cfi_push %r13 988 push %r14 989.cfi_push %r14 990 push %r15 991.cfi_push %r15 992 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp 993 and \$-256*$SZ,%rsp # align stack frame 994 add \$`2*$SZ*($rounds-8)`,%rsp 995 996 shl \$6,$len 997 sub $inp,$out # re-bias 998 sub $inp,$in0 999 add $inp,$len # end of input 1000 1001 #mov $inp,$_inp # saved later 1002 #mov $out,$_out # kept in $offload 1003 mov $len,$_end 1004 #mov $key,$_key # remains resident in $inp register 1005 mov $ivp,$_ivp 1006 mov $ctx,$_ctx 1007 mov $in0,$_in0 1008 mov %rax,$_rsp 1009.cfi_cfa_expression $_rsp,deref,+8 1010___ 1011$code.=<<___ if ($win64); 1012 movaps %xmm6,`$framesz+16*0`(%rsp) 1013 movaps %xmm7,`$framesz+16*1`(%rsp) 1014 movaps %xmm8,`$framesz+16*2`(%rsp) 1015 movaps %xmm9,`$framesz+16*3`(%rsp) 1016 movaps %xmm10,`$framesz+16*4`(%rsp) 1017 movaps %xmm11,`$framesz+16*5`(%rsp) 1018 movaps %xmm12,`$framesz+16*6`(%rsp) 1019 movaps %xmm13,`$framesz+16*7`(%rsp) 1020 movaps %xmm14,`$framesz+16*8`(%rsp) 1021 movaps %xmm15,`$framesz+16*9`(%rsp) 1022___ 1023$code.=<<___; 1024.Lprologue_avx2: 1025 vzeroall 1026 1027 mov $inp,%r13 # borrow $a0 1028 vpinsrq \$1,$out,$offload,$offload 1029 lea 0x80($key),$inp # size optimization, reassign 1030 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 1031 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 1032 mov $ctx,%r15 # borrow $a2 1033 mov $in0,%rsi # borrow $a3 1034 vmovdqu ($ivp),$iv # load IV 1035 lea -9(%r14),%r14 1036 1037 vmovdqa 0x00(%r12,%r14,8),$mask14 1038 vmovdqa 0x10(%r12,%r14,8),$mask12 1039 vmovdqa 0x20(%r12,%r14,8),$mask10 1040 1041 sub \$-16*$SZ,%r13 # inp++, size optimization 1042 mov $SZ*0(%r15),$A 1043 lea (%rsi,%r13),%r12 # borrow $a0 1044 mov $SZ*1(%r15),$B 1045 cmp $len,%r13 # $_end 1046 mov $SZ*2(%r15),$C 1047 cmove %rsp,%r12 # next block or random data 1048 mov $SZ*3(%r15),$D 1049 mov $SZ*4(%r15),$E 1050 mov $SZ*5(%r15),$F 1051 mov $SZ*6(%r15),$G 1052 mov $SZ*7(%r15),$H 1053 vmovdqu 0x00-0x80($inp),$roundkey 1054___ 1055 if ($SZ==4) { # SHA256 1056 my @X = map("%ymm$_",(0..3)); 1057 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); 1058 1059$code.=<<___; 1060 jmp .Loop_avx2 1061.align 16 1062.Loop_avx2: 1063 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1064 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 1065 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 1066 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 1067 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 1068 1069 vinserti128 \$1,(%r12),@X[0],@X[0] 1070 vinserti128 \$1,16(%r12),@X[1],@X[1] 1071 vpshufb $t3,@X[0],@X[0] 1072 vinserti128 \$1,32(%r12),@X[2],@X[2] 1073 vpshufb $t3,@X[1],@X[1] 1074 vinserti128 \$1,48(%r12),@X[3],@X[3] 1075 1076 lea $TABLE(%rip),$Tbl 1077 vpshufb $t3,@X[2],@X[2] 1078 lea -16*$SZ(%r13),%r13 1079 vpaddd 0x00($Tbl),@X[0],$t0 1080 vpshufb $t3,@X[3],@X[3] 1081 vpaddd 0x20($Tbl),@X[1],$t1 1082 vpaddd 0x40($Tbl),@X[2],$t2 1083 vpaddd 0x60($Tbl),@X[3],$t3 1084 vmovdqa $t0,0x00(%rsp) 1085 xor $a1,$a1 1086 vmovdqa $t1,0x20(%rsp) 1087 lea -$PUSH8(%rsp),%rsp 1088 mov $B,$a3 1089 vmovdqa $t2,0x00(%rsp) 1090 xor $C,$a3 # magic 1091 vmovdqa $t3,0x20(%rsp) 1092 mov $F,$a4 1093 sub \$-16*2*$SZ,$Tbl # size optimization 1094 jmp .Lavx2_00_47 1095 1096.align 16 1097.Lavx2_00_47: 1098 vmovdqu (%r13),$inout 1099 vpinsrq \$0,%r13,$offload,$offload 1100___ 1101 1102sub AVX2_256_00_47 () { 1103my $j = shift; 1104my $body = shift; 1105my @X = @_; 1106my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1107my $base = "+2*$PUSH8(%rsp)"; 1108 1109 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1110 foreach (Xupdate_256_AVX()) { # 29 instructions 1111 eval; 1112 eval(shift(@insns)); 1113 eval(shift(@insns)); 1114 eval(shift(@insns)); 1115 } 1116 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1117 foreach (@insns) { eval; } # remaining instructions 1118 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1119} 1120 $aesni_cbc_idx=0; 1121 for ($i=0,$j=0; $j<4; $j++) { 1122 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1123 push(@X,shift(@X)); # rotate(@X) 1124 } 1125 &vmovq ("%r13",$offload); # borrow $a0 1126 &vpextrq ("%r15",$offload,1); # borrow $a2 1127 &vpand ($temp,$temp,$mask14); 1128 &vpor ($iv,$iv,$temp); 1129 &vmovdqu ("(%r15,%r13)",$iv); # write output 1130 &lea ("%r13","16(%r13)"); # inp++ 1131 1132 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1133 &cmpb (($SZ-1)."($Tbl)",0); 1134 &jne (".Lavx2_00_47"); 1135 1136 &vmovdqu ($inout,"(%r13)"); 1137 &vpinsrq ($offload,$offload,"%r13",0); 1138 1139 $aesni_cbc_idx=0; 1140 for ($i=0; $i<16; ) { 1141 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1142 foreach(bodyx_00_15()) { eval; } 1143 } 1144 } 1145$code.=<<___; 1146 vpextrq \$1,$offload,%r12 # $_out, borrow $a4 1147 vmovq $offload,%r13 # $_inp, borrow $a0 1148 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1149 add $a1,$A 1150 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1151 1152 vpand $mask14,$temp,$temp 1153 vpor $temp,$iv,$iv 1154 vmovdqu $iv,(%r12,%r13) # write output 1155 lea 16(%r13),%r13 1156 1157 add $SZ*0(%r15),$A 1158 add $SZ*1(%r15),$B 1159 add $SZ*2(%r15),$C 1160 add $SZ*3(%r15),$D 1161 add $SZ*4(%r15),$E 1162 add $SZ*5(%r15),$F 1163 add $SZ*6(%r15),$G 1164 add $SZ*7(%r15),$H 1165 1166 mov $A,$SZ*0(%r15) 1167 mov $B,$SZ*1(%r15) 1168 mov $C,$SZ*2(%r15) 1169 mov $D,$SZ*3(%r15) 1170 mov $E,$SZ*4(%r15) 1171 mov $F,$SZ*5(%r15) 1172 mov $G,$SZ*6(%r15) 1173 mov $H,$SZ*7(%r15) 1174 1175 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end 1176 je .Ldone_avx2 1177 1178 xor $a1,$a1 1179 mov $B,$a3 1180 mov $F,$a4 1181 xor $C,$a3 # magic 1182 jmp .Lower_avx2 1183.align 16 1184.Lower_avx2: 1185 vmovdqu (%r13),$inout 1186 vpinsrq \$0,%r13,$offload,$offload 1187___ 1188 $aesni_cbc_idx=0; 1189 for ($i=0; $i<16; ) { 1190 my $base="+16($Tbl)"; 1191 foreach(bodyx_00_15()) { eval; } 1192 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); 1193 } 1194$code.=<<___; 1195 vmovq $offload,%r13 # borrow $a0 1196 vpextrq \$1,$offload,%r15 # borrow $a2 1197 vpand $mask14,$temp,$temp 1198 vpor $temp,$iv,$iv 1199 lea -$PUSH8($Tbl),$Tbl 1200 vmovdqu $iv,(%r15,%r13) # write output 1201 lea 16(%r13),%r13 # inp++ 1202 cmp %rsp,$Tbl 1203 jae .Lower_avx2 1204 1205 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 1206 lea 16*$SZ(%r13),%r13 1207 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 1208 add $a1,$A 1209 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1210 1211 add $SZ*0(%r15),$A 1212 add $SZ*1(%r15),$B 1213 add $SZ*2(%r15),$C 1214 add $SZ*3(%r15),$D 1215 add $SZ*4(%r15),$E 1216 add $SZ*5(%r15),$F 1217 add $SZ*6(%r15),$G 1218 lea (%rsi,%r13),%r12 1219 add $SZ*7(%r15),$H 1220 1221 cmp $_end,%r13 1222 1223 mov $A,$SZ*0(%r15) 1224 cmove %rsp,%r12 # next block or stale data 1225 mov $B,$SZ*1(%r15) 1226 mov $C,$SZ*2(%r15) 1227 mov $D,$SZ*3(%r15) 1228 mov $E,$SZ*4(%r15) 1229 mov $F,$SZ*5(%r15) 1230 mov $G,$SZ*6(%r15) 1231 mov $H,$SZ*7(%r15) 1232 1233 jbe .Loop_avx2 1234 lea (%rsp),$Tbl 1235 1236.Ldone_avx2: 1237 lea ($Tbl),%rsp 1238 mov $_ivp,$ivp 1239 mov $_rsp,%rsi 1240.cfi_def_cfa %rsi,8 1241 vmovdqu $iv,($ivp) # output IV 1242 vzeroall 1243___ 1244$code.=<<___ if ($win64); 1245 movaps `$framesz+16*0`(%rsp),%xmm6 1246 movaps `$framesz+16*1`(%rsp),%xmm7 1247 movaps `$framesz+16*2`(%rsp),%xmm8 1248 movaps `$framesz+16*3`(%rsp),%xmm9 1249 movaps `$framesz+16*4`(%rsp),%xmm10 1250 movaps `$framesz+16*5`(%rsp),%xmm11 1251 movaps `$framesz+16*6`(%rsp),%xmm12 1252 movaps `$framesz+16*7`(%rsp),%xmm13 1253 movaps `$framesz+16*8`(%rsp),%xmm14 1254 movaps `$framesz+16*9`(%rsp),%xmm15 1255___ 1256$code.=<<___; 1257 mov -48(%rsi),%r15 1258.cfi_restore %r15 1259 mov -40(%rsi),%r14 1260.cfi_restore %r14 1261 mov -32(%rsi),%r13 1262.cfi_restore %r13 1263 mov -24(%rsi),%r12 1264.cfi_restore %r12 1265 mov -16(%rsi),%rbp 1266.cfi_restore %rbp 1267 mov -8(%rsi),%rbx 1268.cfi_restore %rbx 1269 lea (%rsi),%rsp 1270.cfi_def_cfa_register %rsp 1271.Lepilogue_avx2: 1272 ret 1273.cfi_endproc 1274.size ${func}_avx2,.-${func}_avx2 1275___ 1276}} 1277}} 1278{{ 1279my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1280 1281my ($rounds,$Tbl)=("%r11d","%rbx"); 1282 1283my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); 1284my @rndkey=("%xmm4","%xmm5"); 1285my $r=0; 1286my $sn=0; 1287 1288my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); 1289my @MSG=map("%xmm$_",(10..13)); 1290 1291my $aesenc=sub { 1292 use integer; 1293 my ($n,$k)=($r/10,$r%10); 1294 if ($k==0) { 1295 $code.=<<___; 1296 movups `16*$n`($in0),$in # load input 1297 xorps $rndkey0,$in 1298___ 1299 $code.=<<___ if ($n); 1300 movups $iv,`16*($n-1)`($out,$in0) # write output 1301___ 1302 $code.=<<___; 1303 xorps $in,$iv 1304 movups `32+16*$k-112`($key),$rndkey[1] 1305 aesenc $rndkey[0],$iv 1306___ 1307 } elsif ($k==9) { 1308 $sn++; 1309 $code.=<<___; 1310 cmp \$11,$rounds 1311 jb .Laesenclast$sn 1312 movups `32+16*($k+0)-112`($key),$rndkey[1] 1313 aesenc $rndkey[0],$iv 1314 movups `32+16*($k+1)-112`($key),$rndkey[0] 1315 aesenc $rndkey[1],$iv 1316 je .Laesenclast$sn 1317 movups `32+16*($k+2)-112`($key),$rndkey[1] 1318 aesenc $rndkey[0],$iv 1319 movups `32+16*($k+3)-112`($key),$rndkey[0] 1320 aesenc $rndkey[1],$iv 1321.Laesenclast$sn: 1322 aesenclast $rndkey[0],$iv 1323 movups 16-112($key),$rndkey[1] # forward reference 1324 nop 1325___ 1326 } else { 1327 $code.=<<___; 1328 movups `32+16*$k-112`($key),$rndkey[1] 1329 aesenc $rndkey[0],$iv 1330___ 1331 } 1332 $r++; unshift(@rndkey,pop(@rndkey)); 1333}; 1334 1335if ($shaext) { 1336my $Tbl="%rax"; 1337 1338$code.=<<___; 1339.type ${func}_shaext,\@function,6 1340.align 32 1341${func}_shaext: 1342 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1343___ 1344$code.=<<___ if ($win64); 1345 lea `-8-10*16`(%rsp),%rsp 1346 movaps %xmm6,-8-10*16(%rax) 1347 movaps %xmm7,-8-9*16(%rax) 1348 movaps %xmm8,-8-8*16(%rax) 1349 movaps %xmm9,-8-7*16(%rax) 1350 movaps %xmm10,-8-6*16(%rax) 1351 movaps %xmm11,-8-5*16(%rax) 1352 movaps %xmm12,-8-4*16(%rax) 1353 movaps %xmm13,-8-3*16(%rax) 1354 movaps %xmm14,-8-2*16(%rax) 1355 movaps %xmm15,-8-1*16(%rax) 1356.Lprologue_shaext: 1357___ 1358$code.=<<___; 1359 lea K256+0x80(%rip),$Tbl 1360 movdqu ($ctx),$ABEF # DCBA 1361 movdqu 16($ctx),$CDGH # HGFE 1362 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 1363 1364 mov 240($key),$rounds 1365 sub $in0,$out 1366 movups ($key),$rndkey0 # $key[0] 1367 movups ($ivp),$iv # load IV 1368 movups 16($key),$rndkey[0] # forward reference 1369 lea 112($key),$key # size optimization 1370 1371 pshufd \$0x1b,$ABEF,$Wi # ABCD 1372 pshufd \$0xb1,$ABEF,$ABEF # CDAB 1373 pshufd \$0x1b,$CDGH,$CDGH # EFGH 1374 movdqa $TMP,$BSWAP # offload 1375 palignr \$8,$CDGH,$ABEF # ABEF 1376 punpcklqdq $Wi,$CDGH # CDGH 1377 1378 jmp .Loop_shaext 1379 1380.align 16 1381.Loop_shaext: 1382 movdqu ($inp),@MSG[0] 1383 movdqu 0x10($inp),@MSG[1] 1384 movdqu 0x20($inp),@MSG[2] 1385 pshufb $TMP,@MSG[0] 1386 movdqu 0x30($inp),@MSG[3] 1387 1388 movdqa 0*32-0x80($Tbl),$Wi 1389 paddd @MSG[0],$Wi 1390 pshufb $TMP,@MSG[1] 1391 movdqa $CDGH,$CDGH_SAVE # offload 1392 movdqa $ABEF,$ABEF_SAVE # offload 1393___ 1394 &$aesenc(); 1395$code.=<<___; 1396 sha256rnds2 $ABEF,$CDGH # 0-3 1397 pshufd \$0x0e,$Wi,$Wi 1398___ 1399 &$aesenc(); 1400$code.=<<___; 1401 sha256rnds2 $CDGH,$ABEF 1402 1403 movdqa 1*32-0x80($Tbl),$Wi 1404 paddd @MSG[1],$Wi 1405 pshufb $TMP,@MSG[2] 1406 lea 0x40($inp),$inp 1407___ 1408 &$aesenc(); 1409$code.=<<___; 1410 sha256rnds2 $ABEF,$CDGH # 4-7 1411 pshufd \$0x0e,$Wi,$Wi 1412___ 1413 &$aesenc(); 1414$code.=<<___; 1415 sha256rnds2 $CDGH,$ABEF 1416 1417 movdqa 2*32-0x80($Tbl),$Wi 1418 paddd @MSG[2],$Wi 1419 pshufb $TMP,@MSG[3] 1420 sha256msg1 @MSG[1],@MSG[0] 1421___ 1422 &$aesenc(); 1423$code.=<<___; 1424 sha256rnds2 $ABEF,$CDGH # 8-11 1425 pshufd \$0x0e,$Wi,$Wi 1426 movdqa @MSG[3],$TMP 1427 palignr \$4,@MSG[2],$TMP 1428 paddd $TMP,@MSG[0] 1429___ 1430 &$aesenc(); 1431$code.=<<___; 1432 sha256rnds2 $CDGH,$ABEF 1433 1434 movdqa 3*32-0x80($Tbl),$Wi 1435 paddd @MSG[3],$Wi 1436 sha256msg2 @MSG[3],@MSG[0] 1437 sha256msg1 @MSG[2],@MSG[1] 1438___ 1439 &$aesenc(); 1440$code.=<<___; 1441 sha256rnds2 $ABEF,$CDGH # 12-15 1442 pshufd \$0x0e,$Wi,$Wi 1443___ 1444 &$aesenc(); 1445$code.=<<___; 1446 movdqa @MSG[0],$TMP 1447 palignr \$4,@MSG[3],$TMP 1448 paddd $TMP,@MSG[1] 1449 sha256rnds2 $CDGH,$ABEF 1450___ 1451for($i=4;$i<16-3;$i++) { 1452 &$aesenc() if (($r%10)==0); 1453$code.=<<___; 1454 movdqa $i*32-0x80($Tbl),$Wi 1455 paddd @MSG[0],$Wi 1456 sha256msg2 @MSG[0],@MSG[1] 1457 sha256msg1 @MSG[3],@MSG[2] 1458___ 1459 &$aesenc(); 1460$code.=<<___; 1461 sha256rnds2 $ABEF,$CDGH # 16-19... 1462 pshufd \$0x0e,$Wi,$Wi 1463 movdqa @MSG[1],$TMP 1464 palignr \$4,@MSG[0],$TMP 1465 paddd $TMP,@MSG[2] 1466___ 1467 &$aesenc(); 1468 &$aesenc() if ($r==19); 1469$code.=<<___; 1470 sha256rnds2 $CDGH,$ABEF 1471___ 1472 push(@MSG,shift(@MSG)); 1473} 1474$code.=<<___; 1475 movdqa 13*32-0x80($Tbl),$Wi 1476 paddd @MSG[0],$Wi 1477 sha256msg2 @MSG[0],@MSG[1] 1478 sha256msg1 @MSG[3],@MSG[2] 1479___ 1480 &$aesenc(); 1481$code.=<<___; 1482 sha256rnds2 $ABEF,$CDGH # 52-55 1483 pshufd \$0x0e,$Wi,$Wi 1484 movdqa @MSG[1],$TMP 1485 palignr \$4,@MSG[0],$TMP 1486 paddd $TMP,@MSG[2] 1487___ 1488 &$aesenc(); 1489 &$aesenc(); 1490$code.=<<___; 1491 sha256rnds2 $CDGH,$ABEF 1492 1493 movdqa 14*32-0x80($Tbl),$Wi 1494 paddd @MSG[1],$Wi 1495 sha256msg2 @MSG[1],@MSG[2] 1496 movdqa $BSWAP,$TMP 1497___ 1498 &$aesenc(); 1499$code.=<<___; 1500 sha256rnds2 $ABEF,$CDGH # 56-59 1501 pshufd \$0x0e,$Wi,$Wi 1502___ 1503 &$aesenc(); 1504$code.=<<___; 1505 sha256rnds2 $CDGH,$ABEF 1506 1507 movdqa 15*32-0x80($Tbl),$Wi 1508 paddd @MSG[2],$Wi 1509___ 1510 &$aesenc(); 1511 &$aesenc(); 1512$code.=<<___; 1513 sha256rnds2 $ABEF,$CDGH # 60-63 1514 pshufd \$0x0e,$Wi,$Wi 1515___ 1516 &$aesenc(); 1517$code.=<<___; 1518 sha256rnds2 $CDGH,$ABEF 1519 #pxor $CDGH,$rndkey0 # black magic 1520___ 1521 while ($r<40) { &$aesenc(); } # remaining aesenc's 1522$code.=<<___; 1523 #xorps $CDGH,$rndkey0 # black magic 1524 paddd $CDGH_SAVE,$CDGH 1525 paddd $ABEF_SAVE,$ABEF 1526 1527 dec $len 1528 movups $iv,48($out,$in0) # write output 1529 lea 64($in0),$in0 1530 jnz .Loop_shaext 1531 1532 pshufd \$0xb1,$CDGH,$CDGH # DCHG 1533 pshufd \$0x1b,$ABEF,$TMP # FEBA 1534 pshufd \$0xb1,$ABEF,$ABEF # BAFE 1535 punpckhqdq $CDGH,$ABEF # DCBA 1536 palignr \$8,$TMP,$CDGH # HGFE 1537 1538 movups $iv,($ivp) # write IV 1539 movdqu $ABEF,($ctx) 1540 movdqu $CDGH,16($ctx) 1541___ 1542$code.=<<___ if ($win64); 1543 movaps 0*16(%rsp),%xmm6 1544 movaps 1*16(%rsp),%xmm7 1545 movaps 2*16(%rsp),%xmm8 1546 movaps 3*16(%rsp),%xmm9 1547 movaps 4*16(%rsp),%xmm10 1548 movaps 5*16(%rsp),%xmm11 1549 movaps 6*16(%rsp),%xmm12 1550 movaps 7*16(%rsp),%xmm13 1551 movaps 8*16(%rsp),%xmm14 1552 movaps 9*16(%rsp),%xmm15 1553 lea 8+10*16(%rsp),%rsp 1554.Lepilogue_shaext: 1555___ 1556$code.=<<___; 1557 ret 1558.size ${func}_shaext,.-${func}_shaext 1559___ 1560} 1561}}}}} 1562 1563# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1564# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1565if ($win64 && $avx) { 1566$rec="%rcx"; 1567$frame="%rdx"; 1568$context="%r8"; 1569$disp="%r9"; 1570 1571$code.=<<___; 1572.extern __imp_RtlVirtualUnwind 1573.type se_handler,\@abi-omnipotent 1574.align 16 1575se_handler: 1576 push %rsi 1577 push %rdi 1578 push %rbx 1579 push %rbp 1580 push %r12 1581 push %r13 1582 push %r14 1583 push %r15 1584 pushfq 1585 sub \$64,%rsp 1586 1587 mov 120($context),%rax # pull context->Rax 1588 mov 248($context),%rbx # pull context->Rip 1589 1590 mov 8($disp),%rsi # disp->ImageBase 1591 mov 56($disp),%r11 # disp->HanderlData 1592 1593 mov 0(%r11),%r10d # HandlerData[0] 1594 lea (%rsi,%r10),%r10 # prologue label 1595 cmp %r10,%rbx # context->Rip<prologue label 1596 jb .Lin_prologue 1597 1598 mov 152($context),%rax # pull context->Rsp 1599 1600 mov 4(%r11),%r10d # HandlerData[1] 1601 lea (%rsi,%r10),%r10 # epilogue label 1602 cmp %r10,%rbx # context->Rip>=epilogue label 1603 jae .Lin_prologue 1604___ 1605$code.=<<___ if ($shaext); 1606 lea aesni_cbc_sha256_enc_shaext(%rip),%r10 1607 cmp %r10,%rbx 1608 jb .Lnot_in_shaext 1609 1610 lea (%rax),%rsi 1611 lea 512($context),%rdi # &context.Xmm6 1612 mov \$20,%ecx 1613 .long 0xa548f3fc # cld; rep movsq 1614 lea 168(%rax),%rax # adjust stack pointer 1615 jmp .Lin_prologue 1616.Lnot_in_shaext: 1617___ 1618$code.=<<___ if ($avx>1); 1619 lea .Lavx2_shortcut(%rip),%r10 1620 cmp %r10,%rbx # context->Rip<avx2_shortcut 1621 jb .Lnot_in_avx2 1622 1623 and \$-256*$SZ,%rax 1624 add \$`2*$SZ*($rounds-8)`,%rax 1625.Lnot_in_avx2: 1626___ 1627$code.=<<___; 1628 mov %rax,%rsi # put aside Rsp 1629 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp 1630 1631 mov -8(%rax),%rbx 1632 mov -16(%rax),%rbp 1633 mov -24(%rax),%r12 1634 mov -32(%rax),%r13 1635 mov -40(%rax),%r14 1636 mov -48(%rax),%r15 1637 mov %rbx,144($context) # restore context->Rbx 1638 mov %rbp,160($context) # restore context->Rbp 1639 mov %r12,216($context) # restore context->R12 1640 mov %r13,224($context) # restore context->R13 1641 mov %r14,232($context) # restore context->R14 1642 mov %r15,240($context) # restore context->R15 1643 1644 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area 1645 lea 512($context),%rdi # &context.Xmm6 1646 mov \$20,%ecx 1647 .long 0xa548f3fc # cld; rep movsq 1648 1649.Lin_prologue: 1650 mov 8(%rax),%rdi 1651 mov 16(%rax),%rsi 1652 mov %rax,152($context) # restore context->Rsp 1653 mov %rsi,168($context) # restore context->Rsi 1654 mov %rdi,176($context) # restore context->Rdi 1655 1656 mov 40($disp),%rdi # disp->ContextRecord 1657 mov $context,%rsi # context 1658 mov \$154,%ecx # sizeof(CONTEXT) 1659 .long 0xa548f3fc # cld; rep movsq 1660 1661 mov $disp,%rsi 1662 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1663 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1664 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1665 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1666 mov 40(%rsi),%r10 # disp->ContextRecord 1667 lea 56(%rsi),%r11 # &disp->HandlerData 1668 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1669 mov %r10,32(%rsp) # arg5 1670 mov %r11,40(%rsp) # arg6 1671 mov %r12,48(%rsp) # arg7 1672 mov %rcx,56(%rsp) # arg8, (NULL) 1673 call *__imp_RtlVirtualUnwind(%rip) 1674 1675 mov \$1,%eax # ExceptionContinueSearch 1676 add \$64,%rsp 1677 popfq 1678 pop %r15 1679 pop %r14 1680 pop %r13 1681 pop %r12 1682 pop %rbp 1683 pop %rbx 1684 pop %rdi 1685 pop %rsi 1686 ret 1687.size se_handler,.-se_handler 1688 1689.section .pdata 1690 .rva .LSEH_begin_${func}_xop 1691 .rva .LSEH_end_${func}_xop 1692 .rva .LSEH_info_${func}_xop 1693 1694 .rva .LSEH_begin_${func}_avx 1695 .rva .LSEH_end_${func}_avx 1696 .rva .LSEH_info_${func}_avx 1697___ 1698$code.=<<___ if ($avx>1); 1699 .rva .LSEH_begin_${func}_avx2 1700 .rva .LSEH_end_${func}_avx2 1701 .rva .LSEH_info_${func}_avx2 1702___ 1703$code.=<<___ if ($shaext); 1704 .rva .LSEH_begin_${func}_shaext 1705 .rva .LSEH_end_${func}_shaext 1706 .rva .LSEH_info_${func}_shaext 1707___ 1708$code.=<<___; 1709.section .xdata 1710.align 8 1711.LSEH_info_${func}_xop: 1712 .byte 9,0,0,0 1713 .rva se_handler 1714 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 1715 1716.LSEH_info_${func}_avx: 1717 .byte 9,0,0,0 1718 .rva se_handler 1719 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1720___ 1721$code.=<<___ if ($avx>1); 1722.LSEH_info_${func}_avx2: 1723 .byte 9,0,0,0 1724 .rva se_handler 1725 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 1726___ 1727$code.=<<___ if ($shaext); 1728.LSEH_info_${func}_shaext: 1729 .byte 9,0,0,0 1730 .rva se_handler 1731 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] 1732___ 1733} 1734 1735#################################################################### 1736sub rex { 1737 local *opcode=shift; 1738 my ($dst,$src)=@_; 1739 my $rex=0; 1740 1741 $rex|=0x04 if($dst>=8); 1742 $rex|=0x01 if($src>=8); 1743 unshift @opcode,$rex|0x40 if($rex); 1744} 1745 1746{ 1747 my %opcodelet = ( 1748 "sha256rnds2" => 0xcb, 1749 "sha256msg1" => 0xcc, 1750 "sha256msg2" => 0xcd ); 1751 1752 sub sha256op38 { 1753 my $instr = shift; 1754 1755 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1756 my @opcode=(0x0f,0x38); 1757 rex(\@opcode,$2,$1); 1758 push @opcode,$opcodelet{$instr}; 1759 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1760 return ".byte\t".join(',',@opcode); 1761 } else { 1762 return $instr."\t".@_[0]; 1763 } 1764 } 1765} 1766 1767$code =~ s/\`([^\`]*)\`/eval $1/gem; 1768$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; 1769print $code; 1770close STDOUT; 1771