#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and therefore better performance.
# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
# AESNI code is woven into them. Below are performance numbers in
# cycles per processed byte (less is better) for standalone AESNI-CBC
# encrypt, for the sum of the latter and standalone SHA1, and for the
# "stitched" subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54	    +74%
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Skylake	2.63[+3.5(4.1)]	6.17(6.69)	4.23(4.44)  +46%(+51%)
# Bulldozer	5.77[+6.0]	11.72		6.37	    +84%
# Ryzen(**)	2.71[+1.93]	4.64		2.74	    +69%
# Goldmont(**)	3.82[+1.70]	5.52		4.20	    +31%
#
#		AES-192-CBC
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07	    +75%
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96	    +84%
#
#		AES-256-CBC
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12	    +64%
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Skylake	3.62		7.16(7.68)	4.56(4.76)  +57%(+61%)
# Bulldozer	8.00		13.95		8.25	    +69%
# Ryzen(**)	3.71		5.64		3.72	    +52%
# Goldmont(**)	5.35		7.05		5.76	    +22%
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. The numbers in parentheses above are SSSE3
#	results collected on an AVX-capable CPU, i.e. they apply to OSes
#	that don't support AVX.
# (**)	SHAEXT results.
#
# Needless to say, it makes no sense to implement a "stitched" *decrypt*
# subroutine: *both* AESNI-CBC decrypt and SHA1 already fully utilize the
# available parallelism, so stitching would not give any gain anyway.
# Well, there might be some, e.g. because of better cache locality...
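#
# (For orientation: the "gain" column in the tables above works out to the
# ratio of the summed standalone results to the stitched result. E.g. for
# Westmere AES-128-CBC, 3.77+5.3 = 9.07 cycles per processed byte when the
# two primitives run back-to-back, versus 6.55 when stitched, and
# 9.07/6.55 ~= 1.38, i.e. +38%.)
#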
# For reference, here are the performance results for standalone
# AESNI-CBC decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.25		1.50		1.75
# Sandy Bridge	0.74		0.91		1.09
# Ivy Bridge	0.74		0.90		1.11
# Haswell	0.63		0.76		0.88
# Bulldozer	0.70		0.85		0.99

# And indeed:
#
#		AES-256-CBC	+SHA1		stitch	    gain
# Westmere	1.75		7.20		6.68	    +7.8%
# Sandy Bridge	1.09		6.09(7.22)	5.82(6.95)  +4.6%(+3.9%)
# Ivy Bridge	1.11		5.70		5.45	    +4.6%
# Haswell	0.88		4.45(5.00)	4.39(4.69)  +1.4%(*)(+6.6%)
# Bulldozer	0.99		6.95		5.95	    +17%(**)
#
# (*)	The tiny improvement coefficient on Haswell is because we compare
#	the AVX1 stitch to the sum with AVX2 SHA1.
# (**)	Execution is fully dominated by the integer code sequence and
#	SIMD still hardly shows [in single-process benchmark;-]

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0);

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));			# for enc
my @rndkey=("%xmm14","%xmm15");					# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize the number of instructions with more than
    # 3 prefix bytes.
Or in more practical terms to keep AES-NI *and* 175 # SSSE3 instructions to upper half of the register bank. 176 @X=map("%xmm$_",(8..11,4..7)); 177 @Tx=map("%xmm$_",(12,13,3)); 178 ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); 179 @rndkey=("%xmm0","%xmm1"); 180} 181 182sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 183{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 184 my $arg = pop; 185 $arg = "\$$arg" if ($arg*1 eq $arg); 186 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 187} 188 189my $_rol=sub { &rol(@_) }; 190my $_ror=sub { &ror(@_) }; 191 192$code.=<<___; 193.type aesni_cbc_sha1_enc_ssse3,\@function,6 194.align 32 195aesni_cbc_sha1_enc_ssse3: 196.cfi_startproc 197 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 198 #shr \$6,$len # debugging artefact 199 #jz .Lepilogue_ssse3 # debugging artefact 200 push %rbx 201.cfi_push %rbx 202 push %rbp 203.cfi_push %rbp 204 push %r12 205.cfi_push %r12 206 push %r13 207.cfi_push %r13 208 push %r14 209.cfi_push %r14 210 push %r15 211.cfi_push %r15 212 lea `-104-($win64?10*16:0)`(%rsp),%rsp 213.cfi_adjust_cfa_offset `104+($win64?10*16:0)` 214 #mov $in0,$inp # debugging artefact 215 #lea 64(%rsp),$ctx # debugging artefact 216___ 217$code.=<<___ if ($win64); 218 movaps %xmm6,96+0(%rsp) 219 movaps %xmm7,96+16(%rsp) 220 movaps %xmm8,96+32(%rsp) 221 movaps %xmm9,96+48(%rsp) 222 movaps %xmm10,96+64(%rsp) 223 movaps %xmm11,96+80(%rsp) 224 movaps %xmm12,96+96(%rsp) 225 movaps %xmm13,96+112(%rsp) 226 movaps %xmm14,96+128(%rsp) 227 movaps %xmm15,96+144(%rsp) 228.Lprologue_ssse3: 229___ 230$code.=<<___; 231 mov $in0,%r12 # reassign arguments 232 mov $out,%r13 233 mov $len,%r14 234 lea 112($key),%r15 # size optimization 235 movdqu ($ivp),$iv # load IV 236 mov $ivp,88(%rsp) # save $ivp 237___ 238($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments 239my $rounds="${ivp}d"; 240$code.=<<___; 241 shl \$6,$len 242 sub $in0,$out 243 mov 240-112($key),$rounds 244 add $inp,$len # end of input 245 246 lea K_XX_XX(%rip),$K_XX_XX 247 mov 0($ctx),$A # load context 248 mov 4($ctx),$B 249 mov 8($ctx),$C 250 mov 12($ctx),$D 251 mov $B,@T[0] # magic seed 252 mov 16($ctx),$E 253 mov $C,@T[1] 254 xor $D,@T[1] 255 and @T[1],@T[0] 256 257 movdqa 64($K_XX_XX),@Tx[2] # pbswap mask 258 movdqa 0($K_XX_XX),@Tx[1] # K_00_19 259 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 260 movdqu 16($inp),@X[-3&7] 261 movdqu 32($inp),@X[-2&7] 262 movdqu 48($inp),@X[-1&7] 263 pshufb @Tx[2],@X[-4&7] # byte swap 264 pshufb @Tx[2],@X[-3&7] 265 pshufb @Tx[2],@X[-2&7] 266 add \$64,$inp 267 paddd @Tx[1],@X[-4&7] # add K_00_19 268 pshufb @Tx[2],@X[-1&7] 269 paddd @Tx[1],@X[-3&7] 270 paddd @Tx[1],@X[-2&7] 271 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU 272 psubd @Tx[1],@X[-4&7] # restore X[] 273 movdqa @X[-3&7],16(%rsp) 274 psubd @Tx[1],@X[-3&7] 275 movdqa @X[-2&7],32(%rsp) 276 psubd @Tx[1],@X[-2&7] 277 movups -112($key),$rndkey0 # $key[0] 278 movups 16-112($key),$rndkey[0] # forward reference 279 jmp .Loop_ssse3 280___ 281 282my $aesenc=sub { 283 use integer; 284 my ($n,$k)=($r/10,$r%10); 285 if ($k==0) { 286 $code.=<<___; 287 movups `16*$n`($in0),$in # load input 288 xorps $rndkey0,$in 289___ 290 $code.=<<___ if ($n); 291 movups $iv,`16*($n-1)`($out,$in0) # write output 292___ 293 $code.=<<___; 294 xorps $in,$iv 295 movups `32+16*$k-112`($key),$rndkey[1] 296 aesenc $rndkey[0],$iv 297___ 298 } elsif ($k==9) { 299 $sn++; 300 $code.=<<___; 301 cmp \$11,$rounds 302 jb .Laesenclast$sn 303 movups `32+16*($k+0)-112`($key),$rndkey[1] 304 aesenc $rndkey[0],$iv 305 movups 
`32+16*($k+1)-112`($key),$rndkey[0] 306 aesenc $rndkey[1],$iv 307 je .Laesenclast$sn 308 movups `32+16*($k+2)-112`($key),$rndkey[1] 309 aesenc $rndkey[0],$iv 310 movups `32+16*($k+3)-112`($key),$rndkey[0] 311 aesenc $rndkey[1],$iv 312.Laesenclast$sn: 313 aesenclast $rndkey[0],$iv 314 movups 16-112($key),$rndkey[1] # forward reference 315___ 316 } else { 317 $code.=<<___; 318 movups `32+16*$k-112`($key),$rndkey[1] 319 aesenc $rndkey[0],$iv 320___ 321 } 322 $r++; unshift(@rndkey,pop(@rndkey)); 323}; 324 325sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 326{ use integer; 327 my $body = shift; 328 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 329 my ($a,$b,$c,$d,$e); 330 331 eval(shift(@insns)); # ror 332 &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); 333 eval(shift(@insns)); 334 &movdqa (@Tx[0],@X[-1&7]); 335 &paddd (@Tx[1],@X[-1&7]); 336 eval(shift(@insns)); 337 eval(shift(@insns)); 338 339 &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); 340 eval(shift(@insns)); 341 eval(shift(@insns)); # rol 342 eval(shift(@insns)); 343 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords 344 eval(shift(@insns)); 345 eval(shift(@insns)); 346 347 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 348 eval(shift(@insns)); 349 eval(shift(@insns)); # ror 350 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" 351 eval(shift(@insns)); 352 eval(shift(@insns)); 353 eval(shift(@insns)); 354 355 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" 356 eval(shift(@insns)); 357 eval(shift(@insns)); # rol 358 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 359 eval(shift(@insns)); 360 eval(shift(@insns)); 361 362 &movdqa (@Tx[2],@X[0]); 363 eval(shift(@insns)); 364 eval(shift(@insns)); 365 eval(shift(@insns)); # ror 366 &movdqa (@Tx[0],@X[0]); 367 eval(shift(@insns)); 368 369 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword 370 &paddd (@X[0],@X[0]); 371 eval(shift(@insns)); 372 eval(shift(@insns)); 373 374 &psrld (@Tx[0],31); 375 eval(shift(@insns)); 376 eval(shift(@insns)); # rol 377 eval(shift(@insns)); 378 &movdqa (@Tx[1],@Tx[2]); 379 eval(shift(@insns)); 380 eval(shift(@insns)); 381 382 &psrld (@Tx[2],30); 383 eval(shift(@insns)); 384 eval(shift(@insns)); # ror 385 &por (@X[0],@Tx[0]); # "X[0]"<<<=1 386 eval(shift(@insns)); 387 eval(shift(@insns)); 388 eval(shift(@insns)); 389 390 &pslld (@Tx[1],2); 391 &pxor (@X[0],@Tx[2]); 392 eval(shift(@insns)); 393 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX 394 eval(shift(@insns)); # rol 395 eval(shift(@insns)); 396 eval(shift(@insns)); 397 398 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 399 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 400 401 foreach (@insns) { eval; } # remaining instructions [if any] 402 403 $Xi++; push(@X,shift(@X)); # "rotate" X[] 404 push(@Tx,shift(@Tx)); 405} 406 407sub Xupdate_ssse3_32_79() 408{ use integer; 409 my $body = shift; 410 my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions 411 my ($a,$b,$c,$d,$e); 412 413 eval(shift(@insns)) if ($Xi==8); 414 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 415 eval(shift(@insns)) if ($Xi==8); 416 eval(shift(@insns)); # body_20_39 417 eval(shift(@insns)); 418 eval(shift(@insns)) if (@insns[1] =~ /_ror/); 419 eval(shift(@insns)) if (@insns[0] =~ /_ror/); 420 &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); 421 eval(shift(@insns)); 422 eval(shift(@insns)); # rol 423 424 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 
425 eval(shift(@insns)); 426 eval(shift(@insns)); 427 if ($Xi%5) { 428 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... 429 } else { # ... or load next one 430 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); 431 } 432 eval(shift(@insns)); # ror 433 &paddd (@Tx[1],@X[-1&7]); 434 eval(shift(@insns)); 435 436 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" 437 eval(shift(@insns)); # body_20_39 438 eval(shift(@insns)); 439 eval(shift(@insns)); 440 eval(shift(@insns)); # rol 441 eval(shift(@insns)) if (@insns[0] =~ /_ror/); 442 443 &movdqa (@Tx[0],@X[0]); 444 eval(shift(@insns)); 445 eval(shift(@insns)); 446 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 447 eval(shift(@insns)); # ror 448 eval(shift(@insns)); 449 eval(shift(@insns)); # body_20_39 450 451 &pslld (@X[0],2); 452 eval(shift(@insns)); 453 eval(shift(@insns)); 454 &psrld (@Tx[0],30); 455 eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol 456 eval(shift(@insns)); 457 eval(shift(@insns)); 458 eval(shift(@insns)); # ror 459 460 &por (@X[0],@Tx[0]); # "X[0]"<<<=2 461 eval(shift(@insns)); 462 eval(shift(@insns)); # body_20_39 463 eval(shift(@insns)) if (@insns[1] =~ /_rol/); 464 eval(shift(@insns)) if (@insns[0] =~ /_rol/); 465 &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) 466 eval(shift(@insns)); 467 eval(shift(@insns)); # rol 468 eval(shift(@insns)); 469 eval(shift(@insns)); 470 eval(shift(@insns)); # rol 471 eval(shift(@insns)); 472 473 foreach (@insns) { eval; } # remaining instructions 474 475 $Xi++; push(@X,shift(@X)); # "rotate" X[] 476 push(@Tx,shift(@Tx)); 477} 478 479sub Xuplast_ssse3_80() 480{ use integer; 481 my $body = shift; 482 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 483 my ($a,$b,$c,$d,$e); 484 485 eval(shift(@insns)); 486 eval(shift(@insns)); 487 eval(shift(@insns)); 488 eval(shift(@insns)); 489 &paddd (@Tx[1],@X[-1&7]); 490 eval(shift(@insns)); 491 eval(shift(@insns)); 492 493 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU 494 495 foreach (@insns) { eval; } # remaining instructions 496 497 &cmp ($inp,$len); 498 &je (shift); 499 500 unshift(@Tx,pop(@Tx)); 501 502 &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask 503 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 504 &movdqu (@X[-4&7],"0($inp)"); # load input 505 &movdqu (@X[-3&7],"16($inp)"); 506 &movdqu (@X[-2&7],"32($inp)"); 507 &movdqu (@X[-1&7],"48($inp)"); 508 &pshufb (@X[-4&7],@Tx[2]); # byte swap 509 &add ($inp,64); 510 511 $Xi=0; 512} 513 514sub Xloop_ssse3() 515{ use integer; 516 my $body = shift; 517 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 518 my ($a,$b,$c,$d,$e); 519 520 eval(shift(@insns)); 521 eval(shift(@insns)); 522 eval(shift(@insns)); 523 &pshufb (@X[($Xi-3)&7],@Tx[2]); 524 eval(shift(@insns)); 525 eval(shift(@insns)); 526 eval(shift(@insns)); 527 eval(shift(@insns)); 528 &paddd (@X[($Xi-4)&7],@Tx[1]); 529 eval(shift(@insns)); 530 eval(shift(@insns)); 531 eval(shift(@insns)); 532 eval(shift(@insns)); 533 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU 534 eval(shift(@insns)); 535 eval(shift(@insns)); 536 eval(shift(@insns)); 537 eval(shift(@insns)); 538 &psubd (@X[($Xi-4)&7],@Tx[1]); 539 540 foreach (@insns) { eval; } 541 $Xi++; 542} 543 544sub Xtail_ssse3() 545{ use integer; 546 my $body = shift; 547 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 548 my ($a,$b,$c,$d,$e); 549 550 foreach (@insns) { eval; } 551} 552 553my @body_00_19 = ( 554 '($a,$b,$c,$d,$e)=@V;'. 
555 '&$_ror ($b,$j?7:2);', # $b>>>2 556 '&xor (@T[0],$d);', 557 '&mov (@T[1],$a);', # $b for next round 558 559 '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer 560 '&xor ($b,$c);', # $c^$d for next round 561 562 '&$_rol ($a,5);', 563 '&add ($e,@T[0]);', 564 '&and (@T[1],$b);', # ($b&($c^$d)) for next round 565 566 '&xor ($b,$c);', # restore $b 567 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 568 ); 569 570sub body_00_19 () { # ((c^d)&b)^d 571 # on start @T[0]=(c^d)&b 572 return &body_20_39() if ($rx==19); $rx++; 573 574 use integer; 575 my ($k,$n); 576 my @r=@body_00_19; 577 578 $n = scalar(@r); 579 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds 580 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); 581 $jj++; 582 583 return @r; 584} 585 586my @body_20_39 = ( 587 '($a,$b,$c,$d,$e)=@V;'. 588 '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer 589 '&xor (@T[0],$d) if($j==19);'. 590 '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) 591 '&mov (@T[1],$a);', # $b for next round 592 593 '&$_rol ($a,5);', 594 '&add ($e,@T[0]);', 595 '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round 596 597 '&$_ror ($b,7);', # $b>>>2 598 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 599 ); 600 601sub body_20_39 () { # b^d^c 602 # on entry @T[0]=b^d 603 return &body_40_59() if ($rx==39); $rx++; 604 605 use integer; 606 my ($k,$n); 607 my @r=@body_20_39; 608 609 $n = scalar(@r); 610 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds 611 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20); 612 $jj++; 613 614 return @r; 615} 616 617my @body_40_59 = ( 618 '($a,$b,$c,$d,$e)=@V;'. 619 '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer 620 '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) 621 '&xor ($c,$d) if ($j>=40);', # restore $c 622 623 '&$_ror ($b,7);', # $b>>>2 624 '&mov (@T[1],$a);', # $b for next round 625 '&xor (@T[0],$c);', 626 627 '&$_rol ($a,5);', 628 '&add ($e,@T[0]);', 629 '&xor (@T[1],$c) if ($j==59);'. 
630 '&xor (@T[1],$b) if ($j< 59);', # b^c for next round 631 632 '&xor ($b,$c) if ($j< 59);', # c^d for next round 633 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' 634 ); 635 636sub body_40_59 () { # ((b^c)&(c^d))^c 637 # on entry @T[0]=(b^c), (c^=d) 638 $rx++; 639 640 use integer; 641 my ($k,$n); 642 my @r=@body_40_59; 643 644 $n = scalar(@r); 645 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds 646 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40); 647 $jj++; 648 649 return @r; 650} 651$code.=<<___; 652.align 32 653.Loop_ssse3: 654___ 655 &Xupdate_ssse3_16_31(\&body_00_19); 656 &Xupdate_ssse3_16_31(\&body_00_19); 657 &Xupdate_ssse3_16_31(\&body_00_19); 658 &Xupdate_ssse3_16_31(\&body_00_19); 659 &Xupdate_ssse3_32_79(\&body_00_19); 660 &Xupdate_ssse3_32_79(\&body_20_39); 661 &Xupdate_ssse3_32_79(\&body_20_39); 662 &Xupdate_ssse3_32_79(\&body_20_39); 663 &Xupdate_ssse3_32_79(\&body_20_39); 664 &Xupdate_ssse3_32_79(\&body_20_39); 665 &Xupdate_ssse3_32_79(\&body_40_59); 666 &Xupdate_ssse3_32_79(\&body_40_59); 667 &Xupdate_ssse3_32_79(\&body_40_59); 668 &Xupdate_ssse3_32_79(\&body_40_59); 669 &Xupdate_ssse3_32_79(\&body_40_59); 670 &Xupdate_ssse3_32_79(\&body_20_39); 671 &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done" 672 673 $saved_j=$j; @saved_V=@V; 674 $saved_r=$r; @saved_rndkey=@rndkey; 675 676 &Xloop_ssse3(\&body_20_39); 677 &Xloop_ssse3(\&body_20_39); 678 &Xloop_ssse3(\&body_20_39); 679 680$code.=<<___; 681 movups $iv,48($out,$in0) # write output 682 lea 64($in0),$in0 683 684 add 0($ctx),$A # update context 685 add 4($ctx),@T[0] 686 add 8($ctx),$C 687 add 12($ctx),$D 688 mov $A,0($ctx) 689 add 16($ctx),$E 690 mov @T[0],4($ctx) 691 mov @T[0],$B # magic seed 692 mov $C,8($ctx) 693 mov $C,@T[1] 694 mov $D,12($ctx) 695 xor $D,@T[1] 696 mov $E,16($ctx) 697 and @T[1],@T[0] 698 jmp .Loop_ssse3 699 700.Ldone_ssse3: 701___ 702 $jj=$j=$saved_j; @V=@saved_V; 703 $r=$saved_r; @rndkey=@saved_rndkey; 704 705 &Xtail_ssse3(\&body_20_39); 706 &Xtail_ssse3(\&body_20_39); 707 &Xtail_ssse3(\&body_20_39); 708 709$code.=<<___; 710 movups $iv,48($out,$in0) # write output 711 mov 88(%rsp),$ivp # restore $ivp 712 713 add 0($ctx),$A # update context 714 add 4($ctx),@T[0] 715 add 8($ctx),$C 716 mov $A,0($ctx) 717 add 12($ctx),$D 718 mov @T[0],4($ctx) 719 add 16($ctx),$E 720 mov $C,8($ctx) 721 mov $D,12($ctx) 722 mov $E,16($ctx) 723 movups $iv,($ivp) # write IV 724___ 725$code.=<<___ if ($win64); 726 movaps 96+0(%rsp),%xmm6 727 movaps 96+16(%rsp),%xmm7 728 movaps 96+32(%rsp),%xmm8 729 movaps 96+48(%rsp),%xmm9 730 movaps 96+64(%rsp),%xmm10 731 movaps 96+80(%rsp),%xmm11 732 movaps 96+96(%rsp),%xmm12 733 movaps 96+112(%rsp),%xmm13 734 movaps 96+128(%rsp),%xmm14 735 movaps 96+144(%rsp),%xmm15 736___ 737$code.=<<___; 738 lea `104+($win64?10*16:0)`(%rsp),%rsi 739.cfi_def_cfa %rsi,56 740 mov 0(%rsi),%r15 741.cfi_restore %r15 742 mov 8(%rsi),%r14 743.cfi_restore %r14 744 mov 16(%rsi),%r13 745.cfi_restore %r13 746 mov 24(%rsi),%r12 747.cfi_restore %r12 748 mov 32(%rsi),%rbp 749.cfi_restore %rbp 750 mov 40(%rsi),%rbx 751.cfi_restore %rbx 752 lea 48(%rsi),%rsp 753.cfi_def_cfa %rsp,8 754.Lepilogue_ssse3: 755 ret 756.cfi_endproc 757.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 758___ 759 760 if ($stitched_decrypt) {{{ 761# reset 762($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 763$j=$jj=$r=$rx=0; 764$Xi=4; 765 766# reassign for Atom Silvermont (see above) 
767($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4)); 768@X=map("%xmm$_",(8..13,6,7)); 769@Tx=map("%xmm$_",(14,15,5)); 770 771my @aes256_dec = ( 772 '&movdqu($inout0,"0x00($in0)");', 773 '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);', 774 '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);', 775 '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);', 776 777 '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");', 778 '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3] 779 undef,undef 780 ); 781for ($i=0;$i<13;$i++) { 782 push (@aes256_dec,( 783 '&aesdec ($inout0,$rndkey0);', 784 '&aesdec ($inout1,$rndkey0);', 785 '&aesdec ($inout2,$rndkey0);', 786 '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");' 787 )); 788 push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); 789 push (@aes256_dec,(undef,undef)) if ($i==5); 790} 791push(@aes256_dec,( 792 '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");', 793 '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");', 794 '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");', 795 '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");', 796 797 '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");', 798 '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);', 799 '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);', 800 '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);', 801 802 '&movups ("0x30($out,$in0)",$inout3);' 803 )); 804 805sub body_00_19_dec () { # ((c^d)&b)^d 806 # on start @T[0]=(c^d)&b 807 return &body_20_39_dec() if ($rx==19); 808 809 my @r=@body_00_19; 810 811 unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); 812 $rx++; 813 814 return @r; 815} 816 817sub body_20_39_dec () { # b^d^c 818 # on entry @T[0]=b^d 819 return &body_40_59_dec() if ($rx==39); 820 821 my @r=@body_20_39; 822 823 unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); 824 $rx++; 825 826 return @r; 827} 828 829sub body_40_59_dec () { # ((b^c)&(c^d))^c 830 # on entry @T[0]=(b^c), (c^=d) 831 832 my @r=@body_40_59; 833 834 unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); 835 $rx++; 836 837 return @r; 838} 839 840$code.=<<___; 841.globl aesni256_cbc_sha1_dec 842.type aesni256_cbc_sha1_dec,\@abi-omnipotent 843.align 32 844aesni256_cbc_sha1_dec: 845.cfi_startproc 846 # caller should check for SSSE3 and AES-NI bits 847 mov OPENSSL_ia32cap_P+0(%rip),%r10d 848 mov OPENSSL_ia32cap_P+4(%rip),%r11d 849___ 850$code.=<<___ if ($avx); 851 and \$`1<<28`,%r11d # mask AVX bit 852 and \$`1<<30`,%r10d # mask "Intel CPU" bit 853 or %r11d,%r10d 854 cmp \$`1<<28|1<<30`,%r10d 855 je aesni256_cbc_sha1_dec_avx 856___ 857$code.=<<___; 858 jmp aesni256_cbc_sha1_dec_ssse3 859 ret 860.cfi_endproc 861.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec 862 863.type aesni256_cbc_sha1_dec_ssse3,\@function,6 864.align 32 865aesni256_cbc_sha1_dec_ssse3: 866.cfi_startproc 867 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 868 push %rbx 869.cfi_push %rbx 870 push %rbp 871.cfi_push %rbp 872 push %r12 873.cfi_push %r12 874 push %r13 875.cfi_push %r13 876 push %r14 877.cfi_push %r14 878 push %r15 879.cfi_push %r15 880 lea `-104-($win64?10*16:0)`(%rsp),%rsp 881.cfi_adjust_cfa_offset `104+($win64?10*16:0)` 882___ 883$code.=<<___ if ($win64); 884 movaps %xmm6,96+0(%rsp) 885 movaps %xmm7,96+16(%rsp) 886 movaps %xmm8,96+32(%rsp) 887 movaps %xmm9,96+48(%rsp) 888 movaps %xmm10,96+64(%rsp) 889 movaps %xmm11,96+80(%rsp) 890 movaps %xmm12,96+96(%rsp) 891 
movaps %xmm13,96+112(%rsp) 892 movaps %xmm14,96+128(%rsp) 893 movaps %xmm15,96+144(%rsp) 894.Lprologue_dec_ssse3: 895___ 896$code.=<<___; 897 mov $in0,%r12 # reassign arguments 898 mov $out,%r13 899 mov $len,%r14 900 lea 112($key),%r15 # size optimization 901 movdqu ($ivp),@X[3] # load IV 902 #mov $ivp,88(%rsp) # save $ivp 903___ 904($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments 905$code.=<<___; 906 shl \$6,$len 907 sub $in0,$out 908 add $inp,$len # end of input 909 910 lea K_XX_XX(%rip),$K_XX_XX 911 mov 0($ctx),$A # load context 912 mov 4($ctx),$B 913 mov 8($ctx),$C 914 mov 12($ctx),$D 915 mov $B,@T[0] # magic seed 916 mov 16($ctx),$E 917 mov $C,@T[1] 918 xor $D,@T[1] 919 and @T[1],@T[0] 920 921 movdqa 64($K_XX_XX),@Tx[2] # pbswap mask 922 movdqa 0($K_XX_XX),@Tx[1] # K_00_19 923 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 924 movdqu 16($inp),@X[-3&7] 925 movdqu 32($inp),@X[-2&7] 926 movdqu 48($inp),@X[-1&7] 927 pshufb @Tx[2],@X[-4&7] # byte swap 928 add \$64,$inp 929 pshufb @Tx[2],@X[-3&7] 930 pshufb @Tx[2],@X[-2&7] 931 pshufb @Tx[2],@X[-1&7] 932 paddd @Tx[1],@X[-4&7] # add K_00_19 933 paddd @Tx[1],@X[-3&7] 934 paddd @Tx[1],@X[-2&7] 935 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU 936 psubd @Tx[1],@X[-4&7] # restore X[] 937 movdqa @X[-3&7],16(%rsp) 938 psubd @Tx[1],@X[-3&7] 939 movdqa @X[-2&7],32(%rsp) 940 psubd @Tx[1],@X[-2&7] 941 movdqu -112($key),$rndkey0 # $key[0] 942 jmp .Loop_dec_ssse3 943 944.align 32 945.Loop_dec_ssse3: 946___ 947 &Xupdate_ssse3_16_31(\&body_00_19_dec); 948 &Xupdate_ssse3_16_31(\&body_00_19_dec); 949 &Xupdate_ssse3_16_31(\&body_00_19_dec); 950 &Xupdate_ssse3_16_31(\&body_00_19_dec); 951 &Xupdate_ssse3_32_79(\&body_00_19_dec); 952 &Xupdate_ssse3_32_79(\&body_20_39_dec); 953 &Xupdate_ssse3_32_79(\&body_20_39_dec); 954 &Xupdate_ssse3_32_79(\&body_20_39_dec); 955 &Xupdate_ssse3_32_79(\&body_20_39_dec); 956 &Xupdate_ssse3_32_79(\&body_20_39_dec); 957 &Xupdate_ssse3_32_79(\&body_40_59_dec); 958 &Xupdate_ssse3_32_79(\&body_40_59_dec); 959 &Xupdate_ssse3_32_79(\&body_40_59_dec); 960 &Xupdate_ssse3_32_79(\&body_40_59_dec); 961 &Xupdate_ssse3_32_79(\&body_40_59_dec); 962 &Xupdate_ssse3_32_79(\&body_20_39_dec); 963 &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done" 964 965 $saved_j=$j; @saved_V=@V; 966 $saved_rx=$rx; 967 968 &Xloop_ssse3(\&body_20_39_dec); 969 &Xloop_ssse3(\&body_20_39_dec); 970 &Xloop_ssse3(\&body_20_39_dec); 971 972 eval(@aes256_dec[-1]); # last store 973$code.=<<___; 974 lea 64($in0),$in0 975 976 add 0($ctx),$A # update context 977 add 4($ctx),@T[0] 978 add 8($ctx),$C 979 add 12($ctx),$D 980 mov $A,0($ctx) 981 add 16($ctx),$E 982 mov @T[0],4($ctx) 983 mov @T[0],$B # magic seed 984 mov $C,8($ctx) 985 mov $C,@T[1] 986 mov $D,12($ctx) 987 xor $D,@T[1] 988 mov $E,16($ctx) 989 and @T[1],@T[0] 990 jmp .Loop_dec_ssse3 991 992.Ldone_dec_ssse3: 993___ 994 $jj=$j=$saved_j; @V=@saved_V; 995 $rx=$saved_rx; 996 997 &Xtail_ssse3(\&body_20_39_dec); 998 &Xtail_ssse3(\&body_20_39_dec); 999 &Xtail_ssse3(\&body_20_39_dec); 1000 1001 eval(@aes256_dec[-1]); # last store 1002$code.=<<___; 1003 add 0($ctx),$A # update context 1004 add 4($ctx),@T[0] 1005 add 8($ctx),$C 1006 mov $A,0($ctx) 1007 add 12($ctx),$D 1008 mov @T[0],4($ctx) 1009 add 16($ctx),$E 1010 mov $C,8($ctx) 1011 mov $D,12($ctx) 1012 mov $E,16($ctx) 1013 movups @X[3],($ivp) # write IV 1014___ 1015$code.=<<___ if ($win64); 1016 movaps 96+0(%rsp),%xmm6 1017 movaps 96+16(%rsp),%xmm7 1018 movaps 96+32(%rsp),%xmm8 1019 movaps 96+48(%rsp),%xmm9 1020 movaps 
96+64(%rsp),%xmm10 1021 movaps 96+80(%rsp),%xmm11 1022 movaps 96+96(%rsp),%xmm12 1023 movaps 96+112(%rsp),%xmm13 1024 movaps 96+128(%rsp),%xmm14 1025 movaps 96+144(%rsp),%xmm15 1026___ 1027$code.=<<___; 1028 lea `104+($win64?10*16:0)`(%rsp),%rsi 1029.cfi_cfa_def %rsi,56 1030 mov 0(%rsi),%r15 1031.cfi_restore %r15 1032 mov 8(%rsi),%r14 1033.cfi_restore %r14 1034 mov 16(%rsi),%r13 1035.cfi_restore %r13 1036 mov 24(%rsi),%r12 1037.cfi_restore %r12 1038 mov 32(%rsi),%rbp 1039.cfi_restore %rbp 1040 mov 40(%rsi),%rbx 1041.cfi_restore %rbx 1042 lea 48(%rsi),%rsp 1043.cfi_cfa_def %rsp,8 1044.Lepilogue_dec_ssse3: 1045 ret 1046.cfi_endproc 1047.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3 1048___ 1049 }}} 1050$j=$jj=$r=$rx=0; 1051 1052if ($avx) { 1053my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1054 1055my $Xi=4; 1056my @X=map("%xmm$_",(4..7,0..3)); 1057my @Tx=map("%xmm$_",(8..10)); 1058my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization 1059my @T=("%esi","%edi"); 1060my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); 1061my @rndkey=("%xmm14","%xmm15"); 1062my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec 1063my $Kx=@Tx[2]; 1064 1065my $_rol=sub { &shld(@_[0],@_) }; 1066my $_ror=sub { &shrd(@_[0],@_) }; 1067 1068$code.=<<___; 1069.type aesni_cbc_sha1_enc_avx,\@function,6 1070.align 32 1071aesni_cbc_sha1_enc_avx: 1072.cfi_startproc 1073 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1074 #shr \$6,$len # debugging artefact 1075 #jz .Lepilogue_avx # debugging artefact 1076 push %rbx 1077.cfi_push %rbx 1078 push %rbp 1079.cfi_push %rbp 1080 push %r12 1081.cfi_push %r12 1082 push %r13 1083.cfi_push %r13 1084 push %r14 1085.cfi_push %r14 1086 push %r15 1087.cfi_push %r15 1088 lea `-104-($win64?10*16:0)`(%rsp),%rsp 1089.cfi_adjust_cfa_offset `104+($win64?10*16:0)` 1090 #mov $in0,$inp # debugging artefact 1091 #lea 64(%rsp),$ctx # debugging artefact 1092___ 1093$code.=<<___ if ($win64); 1094 movaps %xmm6,96+0(%rsp) 1095 movaps %xmm7,96+16(%rsp) 1096 movaps %xmm8,96+32(%rsp) 1097 movaps %xmm9,96+48(%rsp) 1098 movaps %xmm10,96+64(%rsp) 1099 movaps %xmm11,96+80(%rsp) 1100 movaps %xmm12,96+96(%rsp) 1101 movaps %xmm13,96+112(%rsp) 1102 movaps %xmm14,96+128(%rsp) 1103 movaps %xmm15,96+144(%rsp) 1104.Lprologue_avx: 1105___ 1106$code.=<<___; 1107 vzeroall 1108 mov $in0,%r12 # reassign arguments 1109 mov $out,%r13 1110 mov $len,%r14 1111 lea 112($key),%r15 # size optimization 1112 vmovdqu ($ivp),$iv # load IV 1113 mov $ivp,88(%rsp) # save $ivp 1114___ 1115($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments 1116my $rounds="${ivp}d"; 1117$code.=<<___; 1118 shl \$6,$len 1119 sub $in0,$out 1120 mov 240-112($key),$rounds 1121 add $inp,$len # end of input 1122 1123 lea K_XX_XX(%rip),$K_XX_XX 1124 mov 0($ctx),$A # load context 1125 mov 4($ctx),$B 1126 mov 8($ctx),$C 1127 mov 12($ctx),$D 1128 mov $B,@T[0] # magic seed 1129 mov 16($ctx),$E 1130 mov $C,@T[1] 1131 xor $D,@T[1] 1132 and @T[1],@T[0] 1133 1134 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask 1135 vmovdqa 0($K_XX_XX),$Kx # K_00_19 1136 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 1137 vmovdqu 16($inp),@X[-3&7] 1138 vmovdqu 32($inp),@X[-2&7] 1139 vmovdqu 48($inp),@X[-1&7] 1140 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap 1141 add \$64,$inp 1142 vpshufb @X[2],@X[-3&7],@X[-3&7] 1143 vpshufb @X[2],@X[-2&7],@X[-2&7] 1144 vpshufb @X[2],@X[-1&7],@X[-1&7] 1145 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 1146 vpaddd $Kx,@X[-3&7],@X[1] 1147 vpaddd 
$Kx,@X[-2&7],@X[2] 1148 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU 1149 vmovdqa @X[1],16(%rsp) 1150 vmovdqa @X[2],32(%rsp) 1151 vmovups -112($key),$rndkey[1] # $key[0] 1152 vmovups 16-112($key),$rndkey[0] # forward reference 1153 jmp .Loop_avx 1154___ 1155 1156my $aesenc=sub { 1157 use integer; 1158 my ($n,$k)=($r/10,$r%10); 1159 if ($k==0) { 1160 $code.=<<___; 1161 vmovdqu `16*$n`($in0),$in # load input 1162 vpxor $rndkey[1],$in,$in 1163___ 1164 $code.=<<___ if ($n); 1165 vmovups $iv,`16*($n-1)`($out,$in0) # write output 1166___ 1167 $code.=<<___; 1168 vpxor $in,$iv,$iv 1169 vaesenc $rndkey[0],$iv,$iv 1170 vmovups `32+16*$k-112`($key),$rndkey[1] 1171___ 1172 } elsif ($k==9) { 1173 $sn++; 1174 $code.=<<___; 1175 cmp \$11,$rounds 1176 jb .Lvaesenclast$sn 1177 vaesenc $rndkey[0],$iv,$iv 1178 vmovups `32+16*($k+0)-112`($key),$rndkey[1] 1179 vaesenc $rndkey[1],$iv,$iv 1180 vmovups `32+16*($k+1)-112`($key),$rndkey[0] 1181 je .Lvaesenclast$sn 1182 vaesenc $rndkey[0],$iv,$iv 1183 vmovups `32+16*($k+2)-112`($key),$rndkey[1] 1184 vaesenc $rndkey[1],$iv,$iv 1185 vmovups `32+16*($k+3)-112`($key),$rndkey[0] 1186.Lvaesenclast$sn: 1187 vaesenclast $rndkey[0],$iv,$iv 1188 vmovups -112($key),$rndkey[0] 1189 vmovups 16-112($key),$rndkey[1] # forward reference 1190___ 1191 } else { 1192 $code.=<<___; 1193 vaesenc $rndkey[0],$iv,$iv 1194 vmovups `32+16*$k-112`($key),$rndkey[1] 1195___ 1196 } 1197 $r++; unshift(@rndkey,pop(@rndkey)); 1198}; 1199 1200sub Xupdate_avx_16_31() # recall that $Xi starts with 4 1201{ use integer; 1202 my $body = shift; 1203 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions 1204 my ($a,$b,$c,$d,$e); 1205 1206 eval(shift(@insns)); 1207 eval(shift(@insns)); 1208 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" 1209 eval(shift(@insns)); 1210 eval(shift(@insns)); 1211 1212 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1213 eval(shift(@insns)); 1214 eval(shift(@insns)); 1215 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords 1216 eval(shift(@insns)); 1217 eval(shift(@insns)); 1218 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" 1219 eval(shift(@insns)); 1220 eval(shift(@insns)); 1221 1222 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" 1223 eval(shift(@insns)); 1224 eval(shift(@insns)); 1225 eval(shift(@insns)); 1226 eval(shift(@insns)); 1227 1228 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" 1229 eval(shift(@insns)); 1230 eval(shift(@insns)); 1231 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1232 eval(shift(@insns)); 1233 eval(shift(@insns)); 1234 1235 &vpsrld (@Tx[0],@X[0],31); 1236 eval(shift(@insns)); 1237 eval(shift(@insns)); 1238 eval(shift(@insns)); 1239 eval(shift(@insns)); 1240 1241 &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword 1242 &vpaddd (@X[0],@X[0],@X[0]); 1243 eval(shift(@insns)); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 eval(shift(@insns)); 1247 1248 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 1249 &vpsrld (@Tx[0],@Tx[1],30); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 eval(shift(@insns)); 1254 1255 &vpslld (@Tx[1],@Tx[1],2); 1256 &vpxor (@X[0],@X[0],@Tx[0]); 1257 eval(shift(@insns)); 1258 eval(shift(@insns)); 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 1262 &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 1263 eval(shift(@insns)); 1264 eval(shift(@insns)); 1265 &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX 1266 eval(shift(@insns)); 1267 eval(shift(@insns)); 1268 1269 1270 foreach (@insns) { eval; } # remaining 
instructions [if any] 1271 1272 $Xi++; push(@X,shift(@X)); # "rotate" X[] 1273} 1274 1275sub Xupdate_avx_32_79() 1276{ use integer; 1277 my $body = shift; 1278 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions 1279 my ($a,$b,$c,$d,$e); 1280 1281 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" 1282 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" 1283 eval(shift(@insns)); # body_20_39 1284 eval(shift(@insns)); 1285 eval(shift(@insns)); 1286 eval(shift(@insns)); # rol 1287 1288 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" 1289 eval(shift(@insns)); 1290 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); 1291 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1292 &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0); 1293 eval(shift(@insns)); # ror 1294 eval(shift(@insns)); 1295 1296 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" 1297 eval(shift(@insns)); # body_20_39 1298 eval(shift(@insns)); 1299 eval(shift(@insns)); 1300 eval(shift(@insns)); # rol 1301 1302 &vpsrld (@Tx[0],@X[0],30); 1303 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1304 eval(shift(@insns)); 1305 eval(shift(@insns)); 1306 eval(shift(@insns)); # ror 1307 eval(shift(@insns)); 1308 1309 &vpslld (@X[0],@X[0],2); 1310 eval(shift(@insns)); # body_20_39 1311 eval(shift(@insns)); 1312 eval(shift(@insns)); 1313 eval(shift(@insns)); # rol 1314 eval(shift(@insns)); 1315 eval(shift(@insns)); 1316 eval(shift(@insns)); # ror 1317 eval(shift(@insns)); 1318 1319 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 1320 eval(shift(@insns)); # body_20_39 1321 eval(shift(@insns)); 1322 eval(shift(@insns)); 1323 eval(shift(@insns)); # rol 1324 eval(shift(@insns)); 1325 eval(shift(@insns)); 1326 eval(shift(@insns)); # rol 1327 eval(shift(@insns)); 1328 1329 foreach (@insns) { eval; } # remaining instructions 1330 1331 $Xi++; push(@X,shift(@X)); # "rotate" X[] 1332} 1333 1334sub Xuplast_avx_80() 1335{ use integer; 1336 my $body = shift; 1337 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1338 my ($a,$b,$c,$d,$e); 1339 1340 eval(shift(@insns)); 1341 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1342 eval(shift(@insns)); 1343 eval(shift(@insns)); 1344 eval(shift(@insns)); 1345 eval(shift(@insns)); 1346 1347 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU 1348 1349 foreach (@insns) { eval; } # remaining instructions 1350 1351 &cmp ($inp,$len); 1352 &je (shift); 1353 1354 &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask 1355 &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 1356 &vmovdqu(@X[-4&7],"0($inp)"); # load input 1357 &vmovdqu(@X[-3&7],"16($inp)"); 1358 &vmovdqu(@X[-2&7],"32($inp)"); 1359 &vmovdqu(@X[-1&7],"48($inp)"); 1360 &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap 1361 &add ($inp,64); 1362 1363 $Xi=0; 1364} 1365 1366sub Xloop_avx() 1367{ use integer; 1368 my $body = shift; 1369 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1370 my ($a,$b,$c,$d,$e); 1371 1372 eval(shift(@insns)); 1373 eval(shift(@insns)); 1374 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); 1375 eval(shift(@insns)); 1376 eval(shift(@insns)); 1377 &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); 1378 eval(shift(@insns)); 1379 eval(shift(@insns)); 1380 eval(shift(@insns)); 1381 eval(shift(@insns)); 1382 &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU 1383 eval(shift(@insns)); 1384 eval(shift(@insns)); 1385 1386 foreach (@insns) { eval; } 1387 $Xi++; 1388} 1389 1390sub Xtail_avx() 1391{ use integer; 1392 my $body = shift; 1393 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1394 my ($a,$b,$c,$d,$e); 1395 
1396 foreach (@insns) { eval; } 1397} 1398 1399$code.=<<___; 1400.align 32 1401.Loop_avx: 1402___ 1403 &Xupdate_avx_16_31(\&body_00_19); 1404 &Xupdate_avx_16_31(\&body_00_19); 1405 &Xupdate_avx_16_31(\&body_00_19); 1406 &Xupdate_avx_16_31(\&body_00_19); 1407 &Xupdate_avx_32_79(\&body_00_19); 1408 &Xupdate_avx_32_79(\&body_20_39); 1409 &Xupdate_avx_32_79(\&body_20_39); 1410 &Xupdate_avx_32_79(\&body_20_39); 1411 &Xupdate_avx_32_79(\&body_20_39); 1412 &Xupdate_avx_32_79(\&body_20_39); 1413 &Xupdate_avx_32_79(\&body_40_59); 1414 &Xupdate_avx_32_79(\&body_40_59); 1415 &Xupdate_avx_32_79(\&body_40_59); 1416 &Xupdate_avx_32_79(\&body_40_59); 1417 &Xupdate_avx_32_79(\&body_40_59); 1418 &Xupdate_avx_32_79(\&body_20_39); 1419 &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" 1420 1421 $saved_j=$j; @saved_V=@V; 1422 $saved_r=$r; @saved_rndkey=@rndkey; 1423 1424 &Xloop_avx(\&body_20_39); 1425 &Xloop_avx(\&body_20_39); 1426 &Xloop_avx(\&body_20_39); 1427 1428$code.=<<___; 1429 vmovups $iv,48($out,$in0) # write output 1430 lea 64($in0),$in0 1431 1432 add 0($ctx),$A # update context 1433 add 4($ctx),@T[0] 1434 add 8($ctx),$C 1435 add 12($ctx),$D 1436 mov $A,0($ctx) 1437 add 16($ctx),$E 1438 mov @T[0],4($ctx) 1439 mov @T[0],$B # magic seed 1440 mov $C,8($ctx) 1441 mov $C,@T[1] 1442 mov $D,12($ctx) 1443 xor $D,@T[1] 1444 mov $E,16($ctx) 1445 and @T[1],@T[0] 1446 jmp .Loop_avx 1447 1448.Ldone_avx: 1449___ 1450 $jj=$j=$saved_j; @V=@saved_V; 1451 $r=$saved_r; @rndkey=@saved_rndkey; 1452 1453 &Xtail_avx(\&body_20_39); 1454 &Xtail_avx(\&body_20_39); 1455 &Xtail_avx(\&body_20_39); 1456 1457$code.=<<___; 1458 vmovups $iv,48($out,$in0) # write output 1459 mov 88(%rsp),$ivp # restore $ivp 1460 1461 add 0($ctx),$A # update context 1462 add 4($ctx),@T[0] 1463 add 8($ctx),$C 1464 mov $A,0($ctx) 1465 add 12($ctx),$D 1466 mov @T[0],4($ctx) 1467 add 16($ctx),$E 1468 mov $C,8($ctx) 1469 mov $D,12($ctx) 1470 mov $E,16($ctx) 1471 vmovups $iv,($ivp) # write IV 1472 vzeroall 1473___ 1474$code.=<<___ if ($win64); 1475 movaps 96+0(%rsp),%xmm6 1476 movaps 96+16(%rsp),%xmm7 1477 movaps 96+32(%rsp),%xmm8 1478 movaps 96+48(%rsp),%xmm9 1479 movaps 96+64(%rsp),%xmm10 1480 movaps 96+80(%rsp),%xmm11 1481 movaps 96+96(%rsp),%xmm12 1482 movaps 96+112(%rsp),%xmm13 1483 movaps 96+128(%rsp),%xmm14 1484 movaps 96+144(%rsp),%xmm15 1485___ 1486$code.=<<___; 1487 lea `104+($win64?10*16:0)`(%rsp),%rsi 1488.cfi_def_cfa %rsi,56 1489 mov 0(%rsi),%r15 1490.cfi_restore %r15 1491 mov 8(%rsi),%r14 1492.cfi_restore %r14 1493 mov 16(%rsi),%r13 1494.cfi_restore %r13 1495 mov 24(%rsi),%r12 1496.cfi_restore %r12 1497 mov 32(%rsi),%rbp 1498.cfi_restore %rbp 1499 mov 40(%rsi),%rbx 1500.cfi_restore %rbx 1501 lea 48(%rsi),%rsp 1502.cfi_def_cfa %rsp,8 1503.Lepilogue_avx: 1504 ret 1505.cfi_endproc 1506.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx 1507___ 1508 1509 if ($stitched_decrypt) {{{ 1510# reset 1511($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1512 1513$j=$jj=$r=$rx=0; 1514$Xi=4; 1515 1516@aes256_dec = ( 1517 '&vpxor ($inout0,$rndkey0,"0x00($in0)");', 1518 '&vpxor ($inout1,$rndkey0,"0x10($in0)");', 1519 '&vpxor ($inout2,$rndkey0,"0x20($in0)");', 1520 '&vpxor ($inout3,$rndkey0,"0x30($in0)");', 1521 1522 '&vmovups($rndkey0,"16-112($key)");', 1523 '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3] 1524 undef,undef 1525 ); 1526for ($i=0;$i<13;$i++) { 1527 push (@aes256_dec,( 1528 '&vaesdec ($inout0,$inout0,$rndkey0);', 1529 '&vaesdec ($inout1,$inout1,$rndkey0);', 1530 '&vaesdec 
($inout2,$inout2,$rndkey0);', 1531 '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");' 1532 )); 1533 push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); 1534 push (@aes256_dec,(undef,undef)) if ($i==5); 1535} 1536push(@aes256_dec,( 1537 '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");', 1538 '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");', 1539 '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");', 1540 '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");', 1541 1542 '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");', 1543 '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);', 1544 '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);', 1545 '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);', 1546 1547 '&vmovups ("0x30($out,$in0)",$inout3);' 1548 )); 1549 1550$code.=<<___; 1551.type aesni256_cbc_sha1_dec_avx,\@function,6 1552.align 32 1553aesni256_cbc_sha1_dec_avx: 1554.cfi_startproc 1555 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1556 push %rbx 1557.cfi_push %rbx 1558 push %rbp 1559.cfi_push %rbp 1560 push %r12 1561.cfi_push %r12 1562 push %r13 1563.cfi_push %r13 1564 push %r14 1565.cfi_push %r14 1566 push %r15 1567.cfi_push %r15 1568 lea `-104-($win64?10*16:0)`(%rsp),%rsp 1569.cfi_adjust_cfa_offset `104+($win64?10*16:0)` 1570___ 1571$code.=<<___ if ($win64); 1572 movaps %xmm6,96+0(%rsp) 1573 movaps %xmm7,96+16(%rsp) 1574 movaps %xmm8,96+32(%rsp) 1575 movaps %xmm9,96+48(%rsp) 1576 movaps %xmm10,96+64(%rsp) 1577 movaps %xmm11,96+80(%rsp) 1578 movaps %xmm12,96+96(%rsp) 1579 movaps %xmm13,96+112(%rsp) 1580 movaps %xmm14,96+128(%rsp) 1581 movaps %xmm15,96+144(%rsp) 1582.Lprologue_dec_avx: 1583___ 1584$code.=<<___; 1585 vzeroall 1586 mov $in0,%r12 # reassign arguments 1587 mov $out,%r13 1588 mov $len,%r14 1589 lea 112($key),%r15 # size optimization 1590 vmovdqu ($ivp),@X[3] # load IV 1591___ 1592($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments 1593$code.=<<___; 1594 shl \$6,$len 1595 sub $in0,$out 1596 add $inp,$len # end of input 1597 1598 lea K_XX_XX(%rip),$K_XX_XX 1599 mov 0($ctx),$A # load context 1600 mov 4($ctx),$B 1601 mov 8($ctx),$C 1602 mov 12($ctx),$D 1603 mov $B,@T[0] # magic seed 1604 mov 16($ctx),$E 1605 mov $C,@T[1] 1606 xor $D,@T[1] 1607 and @T[1],@T[0] 1608 1609 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask 1610 vmovdqa 0($K_XX_XX),$Kx # K_00_19 1611 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 1612 vmovdqu 16($inp),@X[-3&7] 1613 vmovdqu 32($inp),@X[-2&7] 1614 vmovdqu 48($inp),@X[-1&7] 1615 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap 1616 add \$64,$inp 1617 vpshufb @X[2],@X[-3&7],@X[-3&7] 1618 vpshufb @X[2],@X[-2&7],@X[-2&7] 1619 vpshufb @X[2],@X[-1&7],@X[-1&7] 1620 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 1621 vpaddd $Kx,@X[-3&7],@X[1] 1622 vpaddd $Kx,@X[-2&7],@X[2] 1623 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU 1624 vmovdqa @X[1],16(%rsp) 1625 vmovdqa @X[2],32(%rsp) 1626 vmovups -112($key),$rndkey0 # $key[0] 1627 jmp .Loop_dec_avx 1628 1629.align 32 1630.Loop_dec_avx: 1631___ 1632 &Xupdate_avx_16_31(\&body_00_19_dec); 1633 &Xupdate_avx_16_31(\&body_00_19_dec); 1634 &Xupdate_avx_16_31(\&body_00_19_dec); 1635 &Xupdate_avx_16_31(\&body_00_19_dec); 1636 &Xupdate_avx_32_79(\&body_00_19_dec); 1637 &Xupdate_avx_32_79(\&body_20_39_dec); 1638 &Xupdate_avx_32_79(\&body_20_39_dec); 1639 &Xupdate_avx_32_79(\&body_20_39_dec); 1640 
&Xupdate_avx_32_79(\&body_20_39_dec); 1641 &Xupdate_avx_32_79(\&body_20_39_dec); 1642 &Xupdate_avx_32_79(\&body_40_59_dec); 1643 &Xupdate_avx_32_79(\&body_40_59_dec); 1644 &Xupdate_avx_32_79(\&body_40_59_dec); 1645 &Xupdate_avx_32_79(\&body_40_59_dec); 1646 &Xupdate_avx_32_79(\&body_40_59_dec); 1647 &Xupdate_avx_32_79(\&body_20_39_dec); 1648 &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done" 1649 1650 $saved_j=$j; @saved_V=@V; 1651 $saved_rx=$rx; 1652 1653 &Xloop_avx(\&body_20_39_dec); 1654 &Xloop_avx(\&body_20_39_dec); 1655 &Xloop_avx(\&body_20_39_dec); 1656 1657 eval(@aes256_dec[-1]); # last store 1658$code.=<<___; 1659 lea 64($in0),$in0 1660 1661 add 0($ctx),$A # update context 1662 add 4($ctx),@T[0] 1663 add 8($ctx),$C 1664 add 12($ctx),$D 1665 mov $A,0($ctx) 1666 add 16($ctx),$E 1667 mov @T[0],4($ctx) 1668 mov @T[0],$B # magic seed 1669 mov $C,8($ctx) 1670 mov $C,@T[1] 1671 mov $D,12($ctx) 1672 xor $D,@T[1] 1673 mov $E,16($ctx) 1674 and @T[1],@T[0] 1675 jmp .Loop_dec_avx 1676 1677.Ldone_dec_avx: 1678___ 1679 $jj=$j=$saved_j; @V=@saved_V; 1680 $rx=$saved_rx; 1681 1682 &Xtail_avx(\&body_20_39_dec); 1683 &Xtail_avx(\&body_20_39_dec); 1684 &Xtail_avx(\&body_20_39_dec); 1685 1686 eval(@aes256_dec[-1]); # last store 1687$code.=<<___; 1688 1689 add 0($ctx),$A # update context 1690 add 4($ctx),@T[0] 1691 add 8($ctx),$C 1692 mov $A,0($ctx) 1693 add 12($ctx),$D 1694 mov @T[0],4($ctx) 1695 add 16($ctx),$E 1696 mov $C,8($ctx) 1697 mov $D,12($ctx) 1698 mov $E,16($ctx) 1699 vmovups @X[3],($ivp) # write IV 1700 vzeroall 1701___ 1702$code.=<<___ if ($win64); 1703 movaps 96+0(%rsp),%xmm6 1704 movaps 96+16(%rsp),%xmm7 1705 movaps 96+32(%rsp),%xmm8 1706 movaps 96+48(%rsp),%xmm9 1707 movaps 96+64(%rsp),%xmm10 1708 movaps 96+80(%rsp),%xmm11 1709 movaps 96+96(%rsp),%xmm12 1710 movaps 96+112(%rsp),%xmm13 1711 movaps 96+128(%rsp),%xmm14 1712 movaps 96+144(%rsp),%xmm15 1713___ 1714$code.=<<___; 1715 lea `104+($win64?10*16:0)`(%rsp),%rsi 1716.cfi_def_cfa %rsi,56 1717 mov 0(%rsi),%r15 1718.cfi_restore %r15 1719 mov 8(%rsi),%r14 1720.cfi_restore %r14 1721 mov 16(%rsi),%r13 1722.cfi_restore %r13 1723 mov 24(%rsi),%r12 1724.cfi_restore %r12 1725 mov 32(%rsi),%rbp 1726.cfi_restore %rbp 1727 mov 40(%rsi),%rbx 1728.cfi_restore %rbx 1729 lea 48(%rsi),%rsp 1730.cfi_def_cfa %rsp,8 1731.Lepilogue_dec_avx: 1732 ret 1733.cfi_endproc 1734.size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx 1735___ 1736 }}} 1737} 1738$code.=<<___; 1739.align 64 1740K_XX_XX: 1741.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 1742.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 1743.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 1744.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 1745.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask 1746.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 1747 1748.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1749.align 64 1750___ 1751 if ($shaext) {{{ 1752($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); 1753 1754$rounds="%r11d"; 1755 1756($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); 1757@rndkey=("%xmm0","%xmm1"); 1758$r=0; 1759 1760my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12)); 1761my @MSG=map("%xmm$_",(3..6)); 1762 1763$code.=<<___; 1764.type aesni_cbc_sha1_enc_shaext,\@function,6 1765.align 32 1766aesni_cbc_sha1_enc_shaext: 1767.cfi_startproc 1768 mov `($win64?56:8)`(%rsp),$inp # load 7th argument 1769___ 
1770$code.=<<___ if ($win64); 1771 lea `-8-10*16`(%rsp),%rsp 1772 movaps %xmm6,-8-10*16(%rax) 1773 movaps %xmm7,-8-9*16(%rax) 1774 movaps %xmm8,-8-8*16(%rax) 1775 movaps %xmm9,-8-7*16(%rax) 1776 movaps %xmm10,-8-6*16(%rax) 1777 movaps %xmm11,-8-5*16(%rax) 1778 movaps %xmm12,-8-4*16(%rax) 1779 movaps %xmm13,-8-3*16(%rax) 1780 movaps %xmm14,-8-2*16(%rax) 1781 movaps %xmm15,-8-1*16(%rax) 1782.Lprologue_shaext: 1783___ 1784$code.=<<___; 1785 movdqu ($ctx),$ABCD 1786 movd 16($ctx),$E 1787 movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap 1788 1789 mov 240($key),$rounds 1790 sub $in0,$out 1791 movups ($key),$rndkey0 # $key[0] 1792 movups ($ivp),$iv # load IV 1793 movups 16($key),$rndkey[0] # forward reference 1794 lea 112($key),$key # size optimization 1795 1796 pshufd \$0b00011011,$ABCD,$ABCD # flip word order 1797 pshufd \$0b00011011,$E,$E # flip word order 1798 jmp .Loop_shaext 1799 1800.align 16 1801.Loop_shaext: 1802___ 1803 &$aesenc(); 1804$code.=<<___; 1805 movdqu ($inp),@MSG[0] 1806 movdqa $E,$E_SAVE # offload $E 1807 pshufb $BSWAP,@MSG[0] 1808 movdqu 0x10($inp),@MSG[1] 1809 movdqa $ABCD,$ABCD_SAVE # offload $ABCD 1810___ 1811 &$aesenc(); 1812$code.=<<___; 1813 pshufb $BSWAP,@MSG[1] 1814 1815 paddd @MSG[0],$E 1816 movdqu 0x20($inp),@MSG[2] 1817 lea 0x40($inp),$inp 1818 pxor $E_SAVE,@MSG[0] # black magic 1819___ 1820 &$aesenc(); 1821$code.=<<___; 1822 pxor $E_SAVE,@MSG[0] # black magic 1823 movdqa $ABCD,$E_ 1824 pshufb $BSWAP,@MSG[2] 1825 sha1rnds4 \$0,$E,$ABCD # 0-3 1826 sha1nexte @MSG[1],$E_ 1827___ 1828 &$aesenc(); 1829$code.=<<___; 1830 sha1msg1 @MSG[1],@MSG[0] 1831 movdqu -0x10($inp),@MSG[3] 1832 movdqa $ABCD,$E 1833 pshufb $BSWAP,@MSG[3] 1834___ 1835 &$aesenc(); 1836$code.=<<___; 1837 sha1rnds4 \$0,$E_,$ABCD # 4-7 1838 sha1nexte @MSG[2],$E 1839 pxor @MSG[2],@MSG[0] 1840 sha1msg1 @MSG[2],@MSG[1] 1841___ 1842 &$aesenc(); 1843 1844for($i=2;$i<20-4;$i++) { 1845$code.=<<___; 1846 movdqa $ABCD,$E_ 1847 sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11 1848 sha1nexte @MSG[3],$E_ 1849___ 1850 &$aesenc(); 1851$code.=<<___; 1852 sha1msg2 @MSG[3],@MSG[0] 1853 pxor @MSG[3],@MSG[1] 1854 sha1msg1 @MSG[3],@MSG[2] 1855___ 1856 ($E,$E_)=($E_,$E); 1857 push(@MSG,shift(@MSG)); 1858 1859 &$aesenc(); 1860} 1861$code.=<<___; 1862 movdqa $ABCD,$E_ 1863 sha1rnds4 \$3,$E,$ABCD # 64-67 1864 sha1nexte @MSG[3],$E_ 1865 sha1msg2 @MSG[3],@MSG[0] 1866 pxor @MSG[3],@MSG[1] 1867___ 1868 &$aesenc(); 1869$code.=<<___; 1870 movdqa $ABCD,$E 1871 sha1rnds4 \$3,$E_,$ABCD # 68-71 1872 sha1nexte @MSG[0],$E 1873 sha1msg2 @MSG[0],@MSG[1] 1874___ 1875 &$aesenc(); 1876$code.=<<___; 1877 movdqa $E_SAVE,@MSG[0] 1878 movdqa $ABCD,$E_ 1879 sha1rnds4 \$3,$E,$ABCD # 72-75 1880 sha1nexte @MSG[1],$E_ 1881___ 1882 &$aesenc(); 1883$code.=<<___; 1884 movdqa $ABCD,$E 1885 sha1rnds4 \$3,$E_,$ABCD # 76-79 1886 sha1nexte $MSG[0],$E 1887___ 1888 while($r<40) { &$aesenc(); } # remaining aesenc's 1889$code.=<<___; 1890 dec $len 1891 1892 paddd $ABCD_SAVE,$ABCD 1893 movups $iv,48($out,$in0) # write output 1894 lea 64($in0),$in0 1895 jnz .Loop_shaext 1896 1897 pshufd \$0b00011011,$ABCD,$ABCD 1898 pshufd \$0b00011011,$E,$E 1899 movups $iv,($ivp) # write IV 1900 movdqu $ABCD,($ctx) 1901 movd $E,16($ctx) 1902___ 1903$code.=<<___ if ($win64); 1904 movaps -8-10*16(%rax),%xmm6 1905 movaps -8-9*16(%rax),%xmm7 1906 movaps -8-8*16(%rax),%xmm8 1907 movaps -8-7*16(%rax),%xmm9 1908 movaps -8-6*16(%rax),%xmm10 1909 movaps -8-5*16(%rax),%xmm11 1910 movaps -8-4*16(%rax),%xmm12 1911 movaps -8-3*16(%rax),%xmm13 1912 movaps -8-2*16(%rax),%xmm14 1913 movaps 
-8-1*16(%rax),%xmm15 1914 mov %rax,%rsp 1915.Lepilogue_shaext: 1916___ 1917$code.=<<___; 1918 ret 1919.cfi_endproc 1920.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext 1921___ 1922 }}} 1923# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1924# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1925if ($win64) { 1926$rec="%rcx"; 1927$frame="%rdx"; 1928$context="%r8"; 1929$disp="%r9"; 1930 1931$code.=<<___; 1932.extern __imp_RtlVirtualUnwind 1933.type ssse3_handler,\@abi-omnipotent 1934.align 16 1935ssse3_handler: 1936 push %rsi 1937 push %rdi 1938 push %rbx 1939 push %rbp 1940 push %r12 1941 push %r13 1942 push %r14 1943 push %r15 1944 pushfq 1945 sub \$64,%rsp 1946 1947 mov 120($context),%rax # pull context->Rax 1948 mov 248($context),%rbx # pull context->Rip 1949 1950 mov 8($disp),%rsi # disp->ImageBase 1951 mov 56($disp),%r11 # disp->HandlerData 1952 1953 mov 0(%r11),%r10d # HandlerData[0] 1954 lea (%rsi,%r10),%r10 # prologue label 1955 cmp %r10,%rbx # context->Rip<prologue label 1956 jb .Lcommon_seh_tail 1957 1958 mov 152($context),%rax # pull context->Rsp 1959 1960 mov 4(%r11),%r10d # HandlerData[1] 1961 lea (%rsi,%r10),%r10 # epilogue label 1962 cmp %r10,%rbx # context->Rip>=epilogue label 1963 jae .Lcommon_seh_tail 1964___ 1965$code.=<<___ if ($shaext); 1966 lea aesni_cbc_sha1_enc_shaext(%rip),%r10 1967 cmp %r10,%rbx 1968 jb .Lseh_no_shaext 1969 1970 lea (%rax),%rsi 1971 lea 512($context),%rdi # &context.Xmm6 1972 mov \$20,%ecx 1973 .long 0xa548f3fc # cld; rep movsq 1974 lea 168(%rax),%rax # adjust stack pointer 1975 jmp .Lcommon_seh_tail 1976.Lseh_no_shaext: 1977___ 1978$code.=<<___; 1979 lea 96(%rax),%rsi 1980 lea 512($context),%rdi # &context.Xmm6 1981 mov \$20,%ecx 1982 .long 0xa548f3fc # cld; rep movsq 1983 lea `104+10*16`(%rax),%rax # adjust stack pointer 1984 1985 mov 0(%rax),%r15 1986 mov 8(%rax),%r14 1987 mov 16(%rax),%r13 1988 mov 24(%rax),%r12 1989 mov 32(%rax),%rbp 1990 mov 40(%rax),%rbx 1991 lea 48(%rax),%rax 1992 mov %rbx,144($context) # restore context->Rbx 1993 mov %rbp,160($context) # restore context->Rbp 1994 mov %r12,216($context) # restore context->R12 1995 mov %r13,224($context) # restore context->R13 1996 mov %r14,232($context) # restore context->R14 1997 mov %r15,240($context) # restore context->R15 1998 1999.Lcommon_seh_tail: 2000 mov 8(%rax),%rdi 2001 mov 16(%rax),%rsi 2002 mov %rax,152($context) # restore context->Rsp 2003 mov %rsi,168($context) # restore context->Rsi 2004 mov %rdi,176($context) # restore context->Rdi 2005 2006 mov 40($disp),%rdi # disp->ContextRecord 2007 mov $context,%rsi # context 2008 mov \$154,%ecx # sizeof(CONTEXT) 2009 .long 0xa548f3fc # cld; rep movsq 2010 2011 mov $disp,%rsi 2012 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2013 mov 8(%rsi),%rdx # arg2, disp->ImageBase 2014 mov 0(%rsi),%r8 # arg3, disp->ControlPc 2015 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2016 mov 40(%rsi),%r10 # disp->ContextRecord 2017 lea 56(%rsi),%r11 # &disp->HandlerData 2018 lea 24(%rsi),%r12 # &disp->EstablisherFrame 2019 mov %r10,32(%rsp) # arg5 2020 mov %r11,40(%rsp) # arg6 2021 mov %r12,48(%rsp) # arg7 2022 mov %rcx,56(%rsp) # arg8, (NULL) 2023 call *__imp_RtlVirtualUnwind(%rip) 2024 2025 mov \$1,%eax # ExceptionContinueSearch 2026 add \$64,%rsp 2027 popfq 2028 pop %r15 2029 pop %r14 2030 pop %r13 2031 pop %r12 2032 pop %rbp 2033 pop %rbx 2034 pop %rdi 2035 pop %rsi 2036 ret 2037.size ssse3_handler,.-ssse3_handler 2038 2039.section .pdata 2040.align 4 2041 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 2042 .rva 
		.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
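
####################################################################
# Illustration only, nothing below is executed or emitted: the hard-coded
# encoders above turn the SHA/AES-NI mnemonics into raw .byte sequences
# (so that assemblers unaware of those instructions can still cope). The
# byte values below are worked out by hand from rex(), sha1rnds4() and
# aesni(), for operands that do occur in $code, as a rough sanity check of
# what the regexps in the final foreach loop substitute:
#
#	aesni("aesenc	%xmm0,%xmm2")	 # -> ".byte\t102,15,56,220,208"
#	sha1rnds4('$0,%xmm9,%xmm8')	 # -> ".byte\t69,15,58,204,193,0"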