#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Command line: [flavour] [output]. A first argument containing a dot
# is taken to be the output file name and the flavour is left
# undefined. The generated text is accumulated in $code and piped
# through x86_64-xlate.pl, which receives the same two arguments.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by flavour or by a ".asm" output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on goes to the translator; fail loudly
# if it cannot be started instead of silently producing nothing.
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";

# Key schedule load/store instruction: aligned moves for the "aesni"
# prefix, unaligned ones otherwise.
$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$inout0="%xmm0";	$inout1="%xmm1";
$inout2="%xmm2";	$inout3="%xmm3";
$rndkey0="%xmm4";	$rndkey1="%xmm5";

$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
$in1="%xmm8";		$in2="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds) appends a one-block cipher sequence
# to $code. $p is "enc" or "dec" and selects the aes${p}/aes${p}last
# instructions; $key and $rounds name the registers holding the key
# schedule pointer and the loop count, both of which the emitted code
# modifies. $sn counts invocations so that every expansion gets its
# own loop label.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds)=@_;
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	pxor	$rndkey0,$inout0
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout0
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout0
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Arguments are taken straight from the
# ABI-specific registers in @_4args; the loop count is read from
# offset 240 of the key structure.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# pull $rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# pull $rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine. As soon
# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycles I would have to
# implement 6x interleave and use it in loop...
#
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3
# to $code; $dir is "enc" or "dec".
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	pxor	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey0,$inout0
	$movkey	16($key),$rndkey1
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4
# to $code; $dir is "enc" or "dec".
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	pxor	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout1
	dec	$rounds
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey0,$inout0
	$movkey	16($key),$rndkey1
	aes${dir}	$rndkey0,$inout1
	lea	32($key),$key
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	$movkey	($key),$rndkey0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# The interleaved encrypt routines are only called from the ECB code,
# which is emitted for the "aesni" prefix only; the decrypt ones are
# also needed by CBC decrypt below.
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");

if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
$code.=<<___;
.globl	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,\@function,5
.align	16
aesni_ecb_encrypt:
	cmp	\$16,$len		# check length
	jb	.Lecb_ret

	mov	240($key),$rounds	# pull $rounds
	and	\$-16,$len
	mov	$key,$key_		# backup $key
	test	%r8d,%r8d		# 5th argument
	mov	$rounds,$rnds_		# backup $rounds
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	sub	\$0x40,$len
	jbe	.Lecb_enc_tail
	jmp	.Lecb_enc_loop3
.align 16
.Lecb_enc_loop3:
	movups	($inp),$inout0
	movups	0x10($inp),$inout1
	movups	0x20($inp),$inout2
	call	_aesni_encrypt3
	sub	\$0x30,$len
	lea	0x30($inp),$inp
	lea	0x30($out),$out
	movups	$inout0,-0x30($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,-0x20($out)
	mov	$key_,$key		# restore $key
	movups	$inout2,-0x10($out)
	ja	.Lecb_enc_loop3

.Lecb_enc_tail:
	add	\$0x40,$len
	jz	.Lecb_ret

	cmp	\$0x10,$len
	movups	($inp),$inout0
	je	.Lecb_enc_one
	cmp	\$0x20,$len
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	cmp	\$0x30,$len
	movups	0x20($inp),$inout2
	je	.Lecb_enc_three
	movups	0x30($inp),$inout3
	call	_aesni_encrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	sub	\$0x40,$len
	jbe	.Lecb_dec_tail
	jmp	.Lecb_dec_loop3
.align 16
.Lecb_dec_loop3:
	movups	($inp),$inout0
	movups	0x10($inp),$inout1
	movups	0x20($inp),$inout2
	call	_aesni_decrypt3
	sub	\$0x30,$len
	lea	0x30($inp),$inp
	lea	0x30($out),$out
	movups	$inout0,-0x30($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,-0x20($out)
	mov	$key_,$key		# restore $key
	movups	$inout2,-0x10($out)
	ja	.Lecb_dec_loop3

.Lecb_dec_tail:
	add	\$0x40,$len
	jz	.Lecb_ret

	cmp	\$0x10,$len
	movups	($inp),$inout0
	je	.Lecb_dec_one
	cmp	\$0x20,$len
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	cmp	\$0x30,$len
	movups	0x20($inp),$inout2
	je	.Lecb_dec_three
	movups	0x30($inp),$inout3
	call	_aesni_decrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)

.Lecb_ret:
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
}

# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
#
# $reserved is the %rsp-relative offset of the 16-byte scratch slot
# used when the last decrypted block is only partially copied out. On
# Win64 it sits above the four saved %xmm registers in the 0x58-byte
# frame allocated below; otherwise it is a negative offset below %rsp.
$reserved = $win64?0x40:-0x18;	# used in decrypt
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# pull $rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0		# load iv as initial state
	cmp	\$16,$len
	mov	$rnds_,$rounds
	jb	.Lcbc_enc_tail
	sub	\$16,$len
	jmp	.Lcbc_enc_loop
.align 16
.Lcbc_enc_loop:
	movups	($inp),$inout1		# load input
	lea	16($inp),$inp
	pxor	$inout1,$inout0
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	sub	\$16,$len
	lea	16($out),$out
	mov	$rnds_,$rounds		# restore $rounds
	mov	$key_,$key		# restore $key
	movups	$inout0,-16($out)	# store output
	jnc	.Lcbc_enc_loop
	add	\$16,$len
	jnz	.Lcbc_enc_tail
	movups	$inout0,($ivp)
	jmp	.Lcbc_ret

.Lcbc_enc_tail:
	mov	$len,%rcx		# zaps $key
	xchg	$inp,$out		# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3		# rep movsb
	mov	\$16,%ecx		# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3		# rep stosb
	lea	-16(%rdi),%rdi		# rewind $out by 1 block
	mov	$rnds_,$rounds		# restore $rounds
	mov	%rdi,%rsi		# $inp and $out are the same
	mov	$key_,$key		# restore $key
	xor	$len,$len		# len=16
	jmp	.Lcbc_enc_loop		# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
___
# Win64 only: allocate the frame and save %xmm6-%xmm9, which the CBC
# decrypt path uses for $iv and $in0-$in2.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
	movups	($ivp),$iv
	sub	\$0x40,$len
	mov	$rnds_,$rounds
	jbe	.Lcbc_dec_tail
	jmp	.Lcbc_dec_loop3
.align 16
.Lcbc_dec_loop3:
	movups	($inp),$inout0
	movups	0x10($inp),$inout1
	movups	0x20($inp),$inout2
	movaps	$inout0,$in0
	movaps	$inout1,$in1
	movaps	$inout2,$in2
	call	_aesni_decrypt3
	sub	\$0x30,$len
	lea	0x30($inp),$inp
	lea	0x30($out),$out
	pxor	$iv,$inout0
	pxor	$in0,$inout1
	movaps	$in2,$iv
	pxor	$in1,$inout2
	movups	$inout0,-0x30($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,-0x20($out)
	mov	$key_,$key		# restore $key
	movups	$inout2,-0x10($out)
	ja	.Lcbc_dec_loop3

.Lcbc_dec_tail:
	add	\$0x40,$len
	movups	$iv,($ivp)
	jz	.Lcbc_dec_ret

	movups	($inp),$inout0
	cmp	\$0x10,$len
	movaps	$inout0,$in0
	jbe	.Lcbc_dec_one
	movups	0x10($inp),$inout1
	cmp	\$0x20,$len
	movaps	$inout1,$in1
	jbe	.Lcbc_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x30,$len
	movaps	$inout2,$in2
	jbe	.Lcbc_dec_three
	movups	0x30($inp),$inout3
	call	_aesni_decrypt4
	pxor	$iv,$inout0
	movups	0x30($inp),$iv
	pxor	$in0,$inout1
	movups	$inout0,($out)
	pxor	$in1,$inout2
	movups	$inout1,0x10($out)
	pxor	$in2,$inout3
	movups	$inout2,0x20($out)
	movaps	$inout3,$inout0
	lea	0x30($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$iv,$inout0
	movaps	$in0,$iv
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	call	_aesni_decrypt3
	pxor	$iv,$inout0
	pxor	$in0,$inout1
	movups	$inout0,($out)
	movaps	$in1,$iv
	movaps	$inout1,$inout0
	lea	0x10($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	call	_aesni_decrypt3
	pxor	$iv,$inout0
	pxor	$in0,$inout1
	movups	$inout0,($out)
	pxor	$in1,$inout2
	movups	$inout1,0x10($out)
	movaps	$in2,$iv
	movaps	$inout2,$inout0
	lea	0x20($out),$out
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_tail_collected:
	and	\$15,$len
	movups	$iv,($ivp)
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	jmp	.Lcbc_dec_ret
.Lcbc_dec_tail_partial:
	movaps	$inout0,$reserved(%rsp)
	mov	$out,%rdi
	mov	$len,%rcx
	lea	$reserved(%rsp),%rsi
	.long	0x9066A4F3	# rep movsb

.Lcbc_dec_ret:
___
# Win64 only: restore %xmm6-%xmm9 and release the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
___
$code.=<<___;
.Lcbc_ret:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___

# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
#				int bits, AES_KEY *key)
#
# $bits is rewritten to the 32-bit name of the second argument
# register. The encrypt routine returns 0 on success, -1 if either
# pointer is NULL and -2 for an unsupported key length; the decrypt
# routine calls it and then reverses the schedule in place, applying
# aesimc to every round key except the first and the last.
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	_aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	cmp	$key,$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	$movkey	%xmm0,($inp)
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
_aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	test	$inp,$inp
	mov	\$-1,%rax
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	lea	16($key),%rax
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call	.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call	.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call	.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call	.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaning half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call	.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call	.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax
.Lenc_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	pxor	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pxor	%xmm4, %xmm0
	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
	pxor	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movaps	%xmm2,%xmm3
	pxor	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	pxor	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align 16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
	pxor	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
	pxor	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
___
}

$code.=<<___;
.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured exception handling support: two language-specific
# handlers plus the .pdata/.xdata tables that register them.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	lea	0(%rax),%rsi		# top of stack
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue

.Lrestore_rax:
	mov	120($context),%rax
.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	cbc_se_handler,.-cbc_se_handler

.type	ecb_se_handler,\@abi-omnipotent
.align	16
ecb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ecb_se_handler,.-ecb_se_handler

.section	.pdata
.align	4
___
# The ECB routine, and hence its .LSEH_begin/.LSEH_end labels, exists
# only for the "aesni" prefix, so its table entry must be emitted
# under the same condition.
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
	.rva	.LSEH_info_ecb

___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_se_handler
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00
___
}

# rex(\@opcode,$dst,$src) appends a REX prefix byte to @opcode when
# either register number is 8 or above: bit 0x04 is set for $dst and
# bit 0x01 for $src, on top of the 0x40 base. Nothing is appended when
# both registers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    push @$opcode,$rex|0x40	if ($rex);
}

# aesni($line) turns one AES-NI instruction given in "op src,dst"
# text form into an equivalent ".byte" directive. Text that is not
# recognized is returned unchanged, so that it reaches the assembler
# instead of being silently dropped.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	# $2 must be copied before the next match resets the captures.
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	if (defined($opcodelet{$1})) {
	    rex(\@opcode,$3,$2);
	    push @opcode,0x0f,0x38,$opcodelet{$1};
	    push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	    return ".byte\t".join(',',@opcode);
	}
    }
    return $line;
}

# Expand `...` expressions, then replace every AES-NI instruction with
# its byte encoding; a trailing comment on such a line is dropped.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Closing a pipe also waits for the translator and reports a non-zero
# exit status through a false return value.
close STDOUT or die "error closing pipe to $xlate: $! (exit status $?)";