#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 2.1.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		43		13.0
# EM64T		38		56		18.6(*)
# Core 2	30		42		14.5(*)
# Atom		65		86		32.1(*)
#
# (*) with hyper-threading off

# Command line: [flavour] output.  A single argument that contains a dot
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script, then in
# ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# STDOUT is redirected into the translator.  The open is checked so that
# a translator that cannot be started is reported instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

$code=".text\n";

# Register allocation shared by the code generators below.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";	$mask80="%rsi";
$acc1="%edi";	$maskfe="%rdi";
$acc2="%ebp";	$mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

# hi(reg): high-byte name of an a/b/c/d register, e.g. %eax -> %ah.
sub hi	{ my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }
# lo(reg): low-byte name of a register, e.g. %eax -> %al, %esi -> %sil,
# %r10d -> %r10b.
sub lo	{ my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }
# LO(reg): 32-bit name of a 64-bit register, e.g. %rsi -> %esi,
# %r14 -> %r14d.
sub LO	{ my $r=shift;	$r =~ s/%r([a-z]+)/%e${1}/;
			$r =~ s/%r([0-9]+)/%r${1}d/;	$r; }
# _data_word(list): append one ".long v,v" line per argument, i.e. every
# value is emitted twice.
sub _data_word
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
# data_word(list): append all arguments as one comma-separated ".long" line.
sub data_word
{ my $i;
  my $last=pop(@_);
    $code.=".long\t";
    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
    $code.=sprintf"0x%08x\n",$last;
}

# data_byte(list): append all arguments, masked to 8 bits, as one ".byte" line.
sub data_byte
{ my $i;
  my $last=pop(@_);
    $code.=".byte\t";
    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
    $code.=sprintf"0x%02x\n",$last&0xff;
}

# encvert(): append one table-driven round to $code.  The state in
# $s0..$s3 is split into bytes, each byte indexes the 8-byte-stride table
# at $sbox at byte offsets 0..3, $key is advanced by 16 and the new round
# key is XORed in.  %r8d serves as a fourth temporary, so $inp is lost.
sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enclastvert(): append the final round to $code.  Table entries are
# masked down to a single byte lane before being combined, and the result
# is XORed with the round key at 16($key).  Like encvert() it uses %r8d
# and therefore destroys $inp.
sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# encstep(i, s...): append the code computing output word i of one round
# from the state registers in @s (passed rotated by i).  Only used when
# $verticalspin is false.  For i==3 the result is built in place in $s[0],
# the other state registers double as temporaries and are reloaded from
# $t0..$t2 at the end.
sub encstep
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# enclast(i, s...): final-round counterpart of encstep(); each table
# entry is masked to one byte lane before being combined.  The round key
# is not applied here, the caller emits the closing XORs.
sub enclast
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);

	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
	$code.="	and	\$0xff000000,$tmp2\n";

	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# _x86_64_AES_encrypt: initial key XOR, a loop of rounds driven by the
# round count stored at 240($key), then the final round.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
	xor	16+0($key),$s0		# xor with key
	xor	16+4($key),$s1
	xor	16+8($key),$s2
	xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
# enccompactvert(): append one round's byte substitution to $code using
# the 256-byte table at $sbox (stride 1).  Each state byte is looked up
# individually and shifted into its destination lane.  %r8d, %r9d and
# %r13d are used as extra temporaries.
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s2
	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3

	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s0")`,$acc0
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	`&lo("$s2")`,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3

	shl	\$8,$t4
	shr	\$16,$s3
	shl	\$8,$t5
	xor	$t4,$t0
	shr	\$16,$s0
	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s1
	xor	$t5,$t1
	shl	\$8,$acc2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2

	shl	\$8,$acc0
	movzb	`&lo("$s1")`,$acc2
	shl	\$16,$acc1
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s3")`,$acc0
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc1,$t0

	shr	\$8,$s2
	movzb	`&hi("$s0")`,$acc1
	shl	\$16,$t4
	shr	\$8,$s1
	shl	\$16,$t5
	xor	$t4,$t1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2

	shl	\$16,$acc2
	xor	$t5,$t2
	shl	\$24,$acc0
	xor	$acc2,$t3
	shl	\$24,$acc1
	xor	$acc0,$t0
	shl	\$24,$s3
	xor	$acc1,$t1
	shl	\$24,$s2
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enctransform_ref(sn): append the column transform for the single state
# register named by the argument.  The 0x80808080/0xfefefefe/0x1b1b1b1b
# masks double all four bytes at once; rotations of the original and the
# doubled value are then XORed together.  This is the straight-line
# version; the main loop below calls enctransform() instead.
sub enctransform_ref
{ my $sn = shift;
  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2

	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}

# unlike decrypt case it does not pay off to parallelize enctransform
#
# enctransform(): append the column transform for all four state
# registers, handled as two interleaved pairs ($s0/$s1, then $s2/$s3).
# The trailing loads from 0/64/128/192($sbox) only prefetch the table;
# the loaded values are not used.
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	\$0x80808080,$t0
	mov	\$0x80808080,$t1
	and	$s0,$t0
	and	$s1,$t1
	mov	$t0,$acc0
	mov	$t1,$acc1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	mov	\$0x80808080,$t2
	rol	\$24,$s0
	mov	\$0x80808080,$t3
	rol	\$24,$s1
	and	$s2,$t2
	and	$s3,$t3
	xor	$r20,$s0
	xor	$r21,$s1
	mov	$t2,$acc0
	ror	\$16,$t0
	mov	$t3,$acc1
	ror	\$16,$t1
	lea	($s2,$s2),$r20
	shr	\$7,$t2
	xor	$t0,$s0
	shr	\$7,$t3
	xor	$t1,$s1
	ror	\$8,$t0
	lea	($s3,$s3),$r21
	ror	\$8,$t1
	sub	$t2,$acc0
	sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	ror	\$16,$t2
	xor	$r20,$s2
	ror	\$16,$t3
	xor	$r21,$s3
	rol	\$24,$s2
	mov	0($sbox),$acc0			# prefetch Te4
	rol	\$24,$s3
	xor	$r20,$s2
	mov	64($sbox),$acc1
	xor	$r21,$s3
	mov	128($sbox),$r20
	xor	$t2,$s2
	ror	\$8,$t2
	xor	$t3,$s3
	ror	\$8,$t3
	xor	$t2,$s2
	mov	192($sbox),$r21
	xor	$t3,$s3
___
}

# _x86_64_AES_encrypt_compact: round loop built from enccompactvert() and
# enctransform().  The loop ends when $key reaches the end-of-schedule
# pointer that GFp_aes_nohw_encrypt stored at 8(%rsp); inside this routine
# that slot is at 16(%rsp) because of the return address pushed by call.
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
		&enccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
		&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___

# void GFp_aes_nohw_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Frame layout after the prologue: 0(%rsp) key schedule, 8(%rsp) end of
# key schedule, 16(%rsp) out, 24(%rsp) original stack pointer.
$code.=<<___;
.align	16
.globl	GFp_aes_nohw_encrypt
.type	GFp_aes_nohw_encrypt,\@function,3
.hidden	GFp_aes_nohw_encrypt
GFp_aes_nohw_encrypt:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	# allocate frame "above" key schedule
	lea	-63(%rdx),%rcx	# %rdx is key argument
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp
	sub	\$32,%rsp

	mov	%rsi,16(%rsp)	# save out
	mov	%rax,24(%rsp)	# save original stack pointer
.cfi_cfa_expression	%rsp+24,deref,+8
.Lenc_prologue:

	mov	%rdx,$key
	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	mov	$key,(%rsp)	# key schedule
	mov	%rbp,8(%rsp)	# end of key schedule

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Te+2048(%rip),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsi	# restore saved stack pointer
.cfi_def_cfa	%rsi,8
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lenc_epilogue:
	ret
.cfi_endproc
.size	GFp_aes_nohw_encrypt,.-GFp_aes_nohw_encrypt
___

#------------------------------------------------------------------#

# enckey(): append one key-schedule step to $code.  The four bytes of
# %edx are substituted through the byte table at -128(%rbp), placed one
# byte lane to the right of where they came from (the lowest byte wraps
# to the top), XORed into %eax, and the round constant selected by %rcx
# is XORed in last.
sub enckey()
{
$code.=<<___;
	movz	%dl,%esi		# rk[i]>>0
	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>8
	shl	\$24,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shr	\$16,%edx
	movz	%dl,%esi		# rk[i]>>16
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>24
	shl	\$8,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shl	\$16,%ebx
	xor	%ebx,%eax

	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
___
}

# int GFp_aes_nohw_set_encrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Returns 0 on success, -1 if either pointer is NULL and -2 if bits is
# neither 128 nor 256.
$code.=<<___;
.align	16
.globl GFp_aes_nohw_set_encrypt_key
.type  GFp_aes_nohw_set_encrypt_key,\@function,3
GFp_aes_nohw_set_encrypt_key:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12			# redundant, but allows to share
.cfi_push	%r12
	push	%r13			# exception handler...
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$8,%rsp
.cfi_adjust_cfa_offset	8
.Lenc_key_prologue:

	call	_x86_64_AES_set_encrypt_key

	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	mov	48(%rsp),%rbx
.cfi_restore	%rbx
	add	\$56,%rsp
.cfi_adjust_cfa_offset	-56
.Lenc_key_epilogue:
	ret
.cfi_endproc
.size GFp_aes_nohw_set_encrypt_key,.-GFp_aes_nohw_set_encrypt_key

.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align	16
_x86_64_AES_set_encrypt_key:
	mov	%esi,%ecx			# %ecx=bits
	mov	%rdi,%rsi			# %rsi=userKey
	mov	%rdx,%rdi			# %rdi=key

	test	\$-1,%rsi
	jz	.Lbadpointer
	test	\$-1,%rdi
	jz	.Lbadpointer

	lea	.LAES_Te(%rip),%rbp
	lea	2048+128(%rbp),%rbp

	# prefetch Te4
	mov	0-128(%rbp),%eax
	mov	32-128(%rbp),%ebx
	mov	64-128(%rbp),%r8d
	mov	96-128(%rbp),%edx
	mov	128-128(%rbp),%eax
	mov	160-128(%rbp),%ebx
	mov	192-128(%rbp),%r8d
	mov	224-128(%rbp),%edx

	cmp	\$128,%ecx
	je	.L10rounds
	cmp	\$256,%ecx
	je	.L14rounds
	mov	\$-2,%rax			# invalid number of bits
	jmp	.Lexit

.L10rounds:
	mov	0(%rsi),%rax			# copy first 4 dwords
	mov	8(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rdx,8(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L10shortcut
.align	4
.L10loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	12(%rdi),%edx			# rk[3]
.L10shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,16(%rdi)			# rk[4]
		xor	4(%rdi),%eax
		mov	%eax,20(%rdi)			# rk[5]
		xor	8(%rdi),%eax
		mov	%eax,24(%rdi)			# rk[6]
		xor	12(%rdi),%eax
		mov	%eax,28(%rdi)			# rk[7]
		add	\$1,%ecx
		lea	16(%rdi),%rdi
		cmp	\$10,%ecx
	jl	.L10loop

	movl	\$10,80(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L14rounds:
	mov	0(%rsi),%rax			# copy first 8 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rcx
	mov	24(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rcx,16(%rdi)
	mov	%rdx,24(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L14shortcut
.align	4
.L14loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	28(%rdi),%edx			# rk[7]
.L14shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,32(%rdi)			# rk[8]
		xor	4(%rdi),%eax
		mov	%eax,36(%rdi)			# rk[9]
		xor	8(%rdi),%eax
		mov	%eax,40(%rdi)			# rk[10]
		xor	12(%rdi),%eax
		mov	%eax,44(%rdi)			# rk[11]

		cmp	\$6,%ecx
		je	.L14break
		add	\$1,%ecx

		mov	%eax,%edx
		mov	16(%rdi),%eax			# rk[4]
		movz	%dl,%esi			# rk[11]>>0
		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>8
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shr	\$16,%edx
		shl	\$8,%ebx
		movz	%dl,%esi			# rk[11]>>16
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>24
		shl	\$16,%ebx
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shl	\$24,%ebx
		xor	%ebx,%eax

		mov	%eax,48(%rdi)			# rk[12]
		xor	20(%rdi),%eax
		mov	%eax,52(%rdi)			# rk[13]
		xor	24(%rdi),%eax
		mov	%eax,56(%rdi)			# rk[14]
		xor	28(%rdi),%eax
		mov	%eax,60(%rdi)			# rk[15]

		lea	32(%rdi),%rdi
	jmp	.L14loop
.L14break:
	movl	\$14,48(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.Lbadpointer:
	mov	\$-1,%rax
.Lexit:
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
___

# Lookup table.  _data_word() emits every value twice, so each entry
# occupies 8 bytes; this is the stride used by the "(...,8)" addressing
# in the round code.
$code.=<<___;
.align	64
.LAES_Te:
___
&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);

#Te4	# four copies of Te4 to choose from to avoid L1 aliasing
# Each copy is 32 rows of 8 bytes (256 bytes); the copies are identical.
&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);

&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); 1059 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); 1060 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); 1061 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); 1062 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); 1063 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); 1064 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); 1065 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); 1066 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); 1067 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); 1068 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); 1069 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); 1070 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); 1071 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); 1072 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); 1073 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); 1074 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); 1075 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); 1076 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); 1077 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); 1078 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); 1079 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); 1080 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); 1081 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); 1082 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); 1083 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); 1084 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); 1085 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); 1086 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); 1087 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); 1088 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 
0xbb, 0x16); 1089#rcon: 1090$code.=<<___; 1091 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 1092 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 1093 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 1094 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b 1095___ 1096 1097$code.=<<___; 1098.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1099.align 64 1100___ 1101 1102# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1103# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1104if ($win64) { 1105$rec="%rcx"; 1106$frame="%rdx"; 1107$context="%r8"; 1108$disp="%r9"; 1109 1110$code.=<<___; 1111.extern __imp_RtlVirtualUnwind 1112.type block_se_handler,\@abi-omnipotent 1113.align 16 1114block_se_handler: 1115 push %rsi 1116 push %rdi 1117 push %rbx 1118 push %rbp 1119 push %r12 1120 push %r13 1121 push %r14 1122 push %r15 1123 pushfq 1124 sub \$64,%rsp 1125 1126 mov 120($context),%rax # pull context->Rax 1127 mov 248($context),%rbx # pull context->Rip 1128 1129 mov 8($disp),%rsi # disp->ImageBase 1130 mov 56($disp),%r11 # disp->HandlerData 1131 1132 mov 0(%r11),%r10d # HandlerData[0] 1133 lea (%rsi,%r10),%r10 # prologue label 1134 cmp %r10,%rbx # context->Rip<prologue label 1135 jb .Lin_block_prologue 1136 1137 mov 152($context),%rax # pull context->Rsp 1138 1139 mov 4(%r11),%r10d # HandlerData[1] 1140 lea (%rsi,%r10),%r10 # epilogue label 1141 cmp %r10,%rbx # context->Rip>=epilogue label 1142 jae .Lin_block_prologue 1143 1144 mov 24(%rax),%rax # pull saved real stack pointer 1145 1146 mov -8(%rax),%rbx 1147 mov -16(%rax),%rbp 1148 mov -24(%rax),%r12 1149 mov -32(%rax),%r13 1150 mov -40(%rax),%r14 1151 mov -48(%rax),%r15 1152 mov %rbx,144($context) # restore context->Rbx 1153 mov %rbp,160($context) # restore context->Rbp 1154 mov %r12,216($context) # restore context->R12 1155 mov %r13,224($context) # restore context->R13 1156 mov %r14,232($context) # restore context->R14 1157 mov %r15,240($context) # restore context->R15 1158 
1159.Lin_block_prologue: 1160 mov 8(%rax),%rdi 1161 mov 16(%rax),%rsi 1162 mov %rax,152($context) # restore context->Rsp 1163 mov %rsi,168($context) # restore context->Rsi 1164 mov %rdi,176($context) # restore context->Rdi 1165 1166 jmp .Lcommon_seh_exit 1167.size block_se_handler,.-block_se_handler 1168 1169.type key_se_handler,\@abi-omnipotent 1170.align 16 1171key_se_handler: 1172 push %rsi 1173 push %rdi 1174 push %rbx 1175 push %rbp 1176 push %r12 1177 push %r13 1178 push %r14 1179 push %r15 1180 pushfq 1181 sub \$64,%rsp 1182 1183 mov 120($context),%rax # pull context->Rax 1184 mov 248($context),%rbx # pull context->Rip 1185 1186 mov 8($disp),%rsi # disp->ImageBase 1187 mov 56($disp),%r11 # disp->HandlerData 1188 1189 mov 0(%r11),%r10d # HandlerData[0] 1190 lea (%rsi,%r10),%r10 # prologue label 1191 cmp %r10,%rbx # context->Rip<prologue label 1192 jb .Lin_key_prologue 1193 1194 mov 152($context),%rax # pull context->Rsp 1195 1196 mov 4(%r11),%r10d # HandlerData[1] 1197 lea (%rsi,%r10),%r10 # epilogue label 1198 cmp %r10,%rbx # context->Rip>=epilogue label 1199 jae .Lin_key_prologue 1200 1201 lea 56(%rax),%rax 1202 1203 mov -8(%rax),%rbx 1204 mov -16(%rax),%rbp 1205 mov -24(%rax),%r12 1206 mov -32(%rax),%r13 1207 mov -40(%rax),%r14 1208 mov -48(%rax),%r15 1209 mov %rbx,144($context) # restore context->Rbx 1210 mov %rbp,160($context) # restore context->Rbp 1211 mov %r12,216($context) # restore context->R12 1212 mov %r13,224($context) # restore context->R13 1213 mov %r14,232($context) # restore context->R14 1214 mov %r15,240($context) # restore context->R15 1215 1216.Lin_key_prologue: 1217 mov 8(%rax),%rdi 1218 mov 16(%rax),%rsi 1219 mov %rax,152($context) # restore context->Rsp 1220 mov %rsi,168($context) # restore context->Rsi 1221 mov %rdi,176($context) # restore context->Rdi 1222 1223.Lcommon_seh_exit: 1224 mov 40($disp),%rdi # disp->ContextRecord 1225 mov $context,%rsi # context 1226 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 1227 .long 0xa548f3fc # cld; rep 
movsq 1228 1229 mov $disp,%rsi 1230 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1231 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1232 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1233 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1234 mov 40(%rsi),%r10 # disp->ContextRecord 1235 lea 56(%rsi),%r11 # &disp->HandlerData 1236 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1237 mov %r10,32(%rsp) # arg5 1238 mov %r11,40(%rsp) # arg6 1239 mov %r12,48(%rsp) # arg7 1240 mov %rcx,56(%rsp) # arg8, (NULL) 1241 call *__imp_RtlVirtualUnwind(%rip) 1242 1243 mov \$1,%eax # ExceptionContinueSearch 1244 add \$64,%rsp 1245 popfq 1246 pop %r15 1247 pop %r14 1248 pop %r13 1249 pop %r12 1250 pop %rbp 1251 pop %rbx 1252 pop %rdi 1253 pop %rsi 1254 ret 1255.size key_se_handler,.-key_se_handler 1256 1257.section .pdata 1258.align 4 1259 .rva .LSEH_begin_GFp_aes_nohw_encrypt 1260 .rva .LSEH_end_GFp_aes_nohw_encrypt 1261 .rva .LSEH_info_GFp_aes_nohw_encrypt 1262 1263 .rva .LSEH_begin_GFp_aes_nohw_set_encrypt_key 1264 .rva .LSEH_end_GFp_aes_nohw_set_encrypt_key 1265 .rva .LSEH_info_GFp_aes_nohw_set_encrypt_key 1266 1267.section .xdata 1268.align 8 1269.LSEH_info_GFp_aes_nohw_encrypt: 1270 .byte 9,0,0,0 1271 .rva block_se_handler 1272 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] 1273.LSEH_info_GFp_aes_nohw_set_encrypt_key: 1274 .byte 9,0,0,0 1275 .rva key_se_handler 1276 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] 1277___ 1278} 1279 1280$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1281 1282print $code; 1283 1284close STDOUT; 1285