#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].

$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Instruction used for every round-key load/store.  The selector used to
# read `$RREFIX eq "aseni" ? "*movaps" : "*movups"`; both names were
# misspelled, so the test was never true and movups was always chosen.
# movups is kept here on purpose so that the generated code is unchanged.
# NOTE(review): switching to movaps would require the key schedule to be
# 16-byte aligned - confirm with callers before ever doing so.
$movekey = *movups;

# General-purpose register assignment.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp";    # backup copy for $key

# XMM register assignment; note that $in1 and $inout3 share xmm7.
$inout0="xmm0";
$inout1="xmm1";
$inout2="xmm2";
$rndkey0="xmm3";
$rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7";    $inout3="xmm7";

# Inline version of internal aesni_[en|de]crypt1
#
# $p is "enc" or "dec" and selects the aes${p}/aes${p}last mnemonics.
# The emitted code processes the single block in $inout0, reading round
# keys through $key and counting the loop down with $rounds.  Both $key
# and $rounds are modified, as are $rndkey0 and $rndkey1.  The loop
# label is "${p}1_loop".
sub aesni_inline_generate1
{ my $p=shift;

    &$movekey   ($rndkey0,&QWP(0,$key));
    &$movekey   ($rndkey1,&QWP(16,$key));
    &lea        ($key,&DWP(32,$key));
    &pxor       ($inout0,$rndkey0);             # round 0: plain xor
    &set_label("${p}1_loop");
        eval"&aes${p} ($inout0,$rndkey1)";
        &dec        ($rounds);
        &$movekey   ($rndkey1,&QWP(0,$key));    # fetch next round key
        &lea        ($key,&DWP(16,$key));       # lea leaves dec's flags intact
    &jnz        (&label("${p}1_loop"));
    eval"&aes${p}last ($inout0,$rndkey1)";
}

# Out-of-line counterpart of aesni_inline_generate1: emits the private
# function _aesni_${p}rypt1 ($p is "enc" or "dec").  It is only generated
# when $inline is false.  $rounds is compared against 11 to pick the
# "${p}128", "${p}192" or fall-through entry into the unrolled sequence.
sub aesni_generate1     # fully unrolled loop
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt1");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &$movekey   ($rndkey1,&QWP(0x10,$key));
        &cmp        ($rounds,11);
        &pxor       ($inout0,$rndkey0);
        &$movekey   ($rndkey0,&QWP(0x20,$key));
        &lea        ($key,&DWP(0x30,$key));
        &jb         (&label("${p}128"));
        &lea        ($key,&DWP(0x20,$key));
        &je         (&label("${p}192"));
        &lea        ($key,&DWP(0x20,$key));
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(-0x40,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(-0x20,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x10,$key));
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x20,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x30,$key));
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x40,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x50,$key));
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(0x60,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &$movekey   ($rndkey0,&QWP(0x70,$key));
        eval"&aes${p} ($inout0,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
    &mov        ("eax",&wparam(0));
    &mov        ($key,&wparam(2));
    &movups     ($inout0,&QWP(0,"eax"));        # load input block
    &mov        ($rounds,&DWP(240,$key));       # loop count kept at key+240
    &mov        ("eax",&wparam(1));
    if ($inline)
    {   &aesni_inline_generate1("enc"); }
    else
    {   &call   ("_aesni_encrypt1");    }
    &movups     (&QWP(0,"eax"),$inout0);        # store output block
    &ret        ();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
    &mov        ("eax",&wparam(0));
    &mov        ($key,&wparam(2));
    &movups     ($inout0,&QWP(0,"eax"));        # load input block
    &mov        ($rounds,&DWP(240,$key));       # loop count kept at key+240
    &mov        ("eax",&wparam(1));
    if ($inline)
    {   &aesni_inline_generate1("dec"); }
    else
    {   &call   ("_aesni_decrypt1");    }
    &movups     (&QWP(0,"eax"),$inout0);        # store output block
    &ret        ();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine. As soon
# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycle I would have to
# implement 6x interleave and use it in loop...
#
# aesni_generate3 emits _aesni_${p}rypt3 ($p is "enc" or "dec"), which
# processes $inout0..$inout2 together.  $rounds is halved on entry
# because every loop iteration applies two round keys; $key and $rounds
# are modified by the emitted code.
sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &shr        ($rounds,1);
        &$movekey   ($rndkey1,&QWP(16,$key));
        &lea        ($key,&DWP(32,$key));
        &pxor       ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);
        &pxor       ($inout2,$rndkey0);
        &jmp        (&label("${p}3_loop"));
    &set_label("${p}3_loop",16);
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey0,&QWP(0,$key));
        eval"&aes${p} ($inout1,$rndkey1)";
        &dec        ($rounds);
        eval"&aes${p} ($inout2,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(16,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &lea        ($key,&DWP(32,$key));
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        &jnz        (&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    &$movekey   ($rndkey0,&QWP(0,$key));
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
# Emits _aesni_${p}rypt4 ($p is "enc" or "dec"): same structure as the
# 3x routine but processing $inout0..$inout3.  The loop label is still
# spelled "${p}3_loop".
# NOTE(review): reusing that label name relies on labels being scoped
# per function by x86asm.pl - confirm.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey   ($rndkey0,&QWP(0,$key));
        &$movekey   ($rndkey1,&QWP(16,$key));
        &shr        ($rounds,1);
        &lea        ($key,&DWP(32,$key));
        &pxor       ($inout0,$rndkey0);
        &pxor       ($inout1,$rndkey0);
        &pxor       ($inout2,$rndkey0);
        &pxor       ($inout3,$rndkey0);
        &jmp        (&label("${p}3_loop"));
    &set_label("${p}3_loop",16);
        eval"&aes${p} ($inout0,$rndkey1)";
        &$movekey   ($rndkey0,&QWP(0,$key));
        eval"&aes${p} ($inout1,$rndkey1)";
        &dec        ($rounds);
        eval"&aes${p} ($inout2,$rndkey1)";
        eval"&aes${p} ($inout3,$rndkey1)";
        &$movekey   ($rndkey1,&QWP(16,$key));
        eval"&aes${p} ($inout0,$rndkey0)";
        &lea        ($key,&DWP(32,$key));
        eval"&aes${p} ($inout1,$rndkey0)";
        eval"&aes${p} ($inout2,$rndkey0)";
        eval"&aes${p} ($inout3,$rndkey0)";
        &jnz        (&label("${p}3_loop"));
    eval"&aes${p} ($inout0,$rndkey1)";
    &$movekey   ($rndkey0,&QWP(0,$key));
    eval"&aes${p} ($inout1,$rndkey1)";
    eval"&aes${p} ($inout2,$rndkey1)";
    eval"&aes${p} ($inout3,$rndkey1)";
    eval"&aes${p}last ($inout0,$rndkey0)";
    eval"&aes${p}last ($inout1,$rndkey0)";
    eval"&aes${p}last ($inout2,$rndkey0)";
    eval"&aes${p}last ($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# The encrypt flavours are only emitted for the "aesni" prefix (they are
# called from aesni_ecb_encrypt only); the decrypt flavours are always
# needed because CBC decryption calls them.
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");

if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# Inputs shorter than 16 bytes are ignored and the length is rounded
# down to a multiple of 16.  Blocks are processed three at a time, and
# the final one to four blocks are handled by the *_tail code.
&function_begin("aesni_ecb_encrypt");
    &mov    ($inp,&wparam(0));
    &mov    ($out,&wparam(1));
    &mov    ($len,&wparam(2));
    &mov    ($key,&wparam(3));
    &mov    ($rounds,&wparam(4));       # "enc" flag, replaced below
    &cmp    ($len,16);
    &jb     (&label("ecb_ret"));
    &and    ($len,-16);
    &test   ($rounds,$rounds);          # flags consumed by the jz below
    &mov    ($rounds,&DWP(240,$key));
    &mov    ($key_,$key);               # backup $key
    &mov    ($rounds_,$rounds);         # backup $rounds
    &jz     (&label("ecb_decrypt"));    # enc==0

    &sub    ($len,0x40);
    &jbe    (&label("ecb_enc_tail"));
    &jmp    (&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
    &movups ($inout0,&QWP(0,$inp));
    &movups ($inout1,&QWP(0x10,$inp));
    &movups ($inout2,&QWP(0x20,$inp));
    &call   ("_aesni_encrypt3");
    &sub    ($len,0x30);
    &lea    ($inp,&DWP(0x30,$inp));
    &lea    ($out,&DWP(0x30,$out));
    &movups (&QWP(-0x30,$out),$inout0);
    &mov    ($key,$key_);               # restore $key
    &movups (&QWP(-0x20,$out),$inout1);
    &mov    ($rounds,$rounds_);         # restore $rounds
    &movups (&QWP(-0x10,$out),$inout2);
    &ja     (&label("ecb_enc_loop3"));

&set_label("ecb_enc_tail");
    &add    ($len,0x40);
    &jz     (&label("ecb_ret"));

    &cmp    ($len,0x10);
    &movups ($inout0,&QWP(0,$inp));
    &je     (&label("ecb_enc_one"));
    &cmp    ($len,0x20);
    &movups ($inout1,&QWP(0x10,$inp));
    &je     (&label("ecb_enc_two"));
    &cmp    ($len,0x30);
    &movups ($inout2,&QWP(0x20,$inp));
    &je     (&label("ecb_enc_three"));
    &movups ($inout3,&QWP(0x30,$inp));
    &call   ("_aesni_encrypt4");
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &movups (&QWP(0x20,$out),$inout2);
    &movups (&QWP(0x30,$out),$inout3);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
    if ($inline)
    {   &aesni_inline_generate1("enc"); }
    else
    {   &call   ("_aesni_encrypt1");    }
    &movups (&QWP(0,$out),$inout0);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
    &call   ("_aesni_encrypt3");        # third lane's result is not stored
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
    &call   ("_aesni_encrypt3");
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &movups (&QWP(0x20,$out),$inout2);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_decrypt",16);
    &sub    ($len,0x40);
    &jbe    (&label("ecb_dec_tail"));
    &jmp    (&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
    &movups ($inout0,&QWP(0,$inp));
    &movups ($inout1,&QWP(0x10,$inp));
    &movups ($inout2,&QWP(0x20,$inp));
    &call   ("_aesni_decrypt3");
    &sub    ($len,0x30);
    &lea    ($inp,&DWP(0x30,$inp));
    &lea    ($out,&DWP(0x30,$out));
    &movups (&QWP(-0x30,$out),$inout0);
    &mov    ($key,$key_);               # restore $key
    &movups (&QWP(-0x20,$out),$inout1);
    &mov    ($rounds,$rounds_);         # restore $rounds
    &movups (&QWP(-0x10,$out),$inout2);
    &ja     (&label("ecb_dec_loop3"));

&set_label("ecb_dec_tail");
    &add    ($len,0x40);
    &jz     (&label("ecb_ret"));

    &cmp    ($len,0x10);
    &movups ($inout0,&QWP(0,$inp));
    &je     (&label("ecb_dec_one"));
    &cmp    ($len,0x20);
    &movups ($inout1,&QWP(0x10,$inp));
    &je     (&label("ecb_dec_two"));
    &cmp    ($len,0x30);
    &movups ($inout2,&QWP(0x20,$inp));
    &je     (&label("ecb_dec_three"));
    &movups ($inout3,&QWP(0x30,$inp));
    &call   ("_aesni_decrypt4");
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &movups (&QWP(0x20,$out),$inout2);
    &movups (&QWP(0x30,$out),$inout3);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
    if ($inline)
    {   &aesni_inline_generate1("dec"); }
    else
    {   &call   ("_aesni_decrypt1");    }
    &movups (&QWP(0,$out),$inout0);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
    &call   ("_aesni_decrypt3");        # third lane's result is not stored
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
    &call   ("_aesni_decrypt3");
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &movups (&QWP(0x20,$out),$inout2);
                                        # falls through to ecb_ret
&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}

# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# Encryption runs one block at a time; a trailing partial block is
# copied to the output buffer, zero-padded to 16 bytes and encrypted in
# place.  Decryption runs three blocks at a time with a one-to-four
# block tail; a trailing partial block is written through a stack
# buffer.  The updated IV is stored back through ivp on exit.
&function_begin("${PREFIX}_cbc_encrypt");
    &mov    ($inp,&wparam(0));
    &mov    ($out,&wparam(1));
    &mov    ($len,&wparam(2));
    &mov    ($key,&wparam(3));
    &test   ($len,$len);
    &mov    ($key_,&wparam(4));         # ivp, until the backup below
    &jz     (&label("cbc_ret"));

    &cmp    (&wparam(5),0);             # flags consumed by the je below
    &movups ($ivec,&QWP(0,$key_));      # load IV
    &mov    ($rounds,&DWP(240,$key));
    &mov    ($key_,$key);               # backup $key
    &mov    ($rounds_,$rounds);         # backup $rounds
    &je     (&label("cbc_decrypt"));

    &movaps ($inout0,$ivec);
    &cmp    ($len,16);
    &jb     (&label("cbc_enc_tail"));
    &sub    ($len,16);
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movups ($ivec,&QWP(0,$inp));
    &lea    ($inp,&DWP(16,$inp));
    &pxor   ($inout0,$ivec);
    if ($inline)
    {   &aesni_inline_generate1("enc"); }
    else
    {   &call   ("_aesni_encrypt1");    }
    &sub    ($len,16);
    &lea    ($out,&DWP(16,$out));
    &mov    ($rounds,$rounds_);         # restore $rounds
    &mov    ($key,$key_);               # restore $key
    &movups (&QWP(-16,$out),$inout0);
    &jnc    (&label("cbc_enc_loop"));
    &add    ($len,16);
    &jnz    (&label("cbc_enc_tail"));
    &movaps ($ivec,$inout0);
    &jmp    (&label("cbc_ret"));

&set_label("cbc_enc_tail");
    &mov    ("ecx",$len);               # zaps $rounds
    &data_word(0xA4F3F689);             # rep movsb
    &mov    ("ecx",16);                 # zero tail
    &sub    ("ecx",$len);
    &xor    ("eax","eax");              # zaps $len
    &data_word(0xAAF3F689);             # rep stosb
    &lea    ($out,&DWP(-16,$out));      # rewind $out by 1 block
    &mov    ($rounds,$rounds_);         # restore $rounds
    &mov    ($inp,$out);                # $inp and $out are the same
    &mov    ($key,$key_);               # restore $key
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_decrypt",16);
    &sub    ($len,0x40);
    &jbe    (&label("cbc_dec_tail"));
    &jmp    (&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
    &movups ($inout0,&QWP(0,$inp));
    &movups ($inout1,&QWP(0x10,$inp));
    &movups ($inout2,&QWP(0x20,$inp));
    &movaps ($in0,$inout0);             # keep ciphertext for chaining
    &movaps ($in1,$inout1);
    &call   ("_aesni_decrypt3");
    &sub    ($len,0x30);
    &lea    ($inp,&DWP(0x30,$inp));
    &lea    ($out,&DWP(0x30,$out));
    &pxor   ($inout0,$ivec);
    &pxor   ($inout1,$in0);
    &movups ($ivec,&QWP(-0x10,$inp));   # last ciphertext block is next IV
    &pxor   ($inout2,$in1);
    &movups (&QWP(-0x30,$out),$inout0);
    &mov    ($rounds,$rounds_);         # restore $rounds
    &movups (&QWP(-0x20,$out),$inout1);
    &mov    ($key,$key_);               # restore $key
    &movups (&QWP(-0x10,$out),$inout2);
    &ja     (&label("cbc_dec_loop3"));

&set_label("cbc_dec_tail");
    &add    ($len,0x40);
    &jz     (&label("cbc_ret"));

    &movups ($inout0,&QWP(0,$inp));
    &cmp    ($len,0x10);
    &movaps ($in0,$inout0);
    &jbe    (&label("cbc_dec_one"));
    &movups ($inout1,&QWP(0x10,$inp));
    &cmp    ($len,0x20);
    &movaps ($in1,$inout1);
    &jbe    (&label("cbc_dec_two"));
    &movups ($inout2,&QWP(0x20,$inp));
    &cmp    ($len,0x30);
    &jbe    (&label("cbc_dec_three"));
    &movups ($inout3,&QWP(0x30,$inp));  # shares xmm7 with $in1
    &call   ("_aesni_decrypt4");
    &movups ($rndkey0,&QWP(0x10,$inp)); # re-read ciphertext for chaining
    &movups ($rndkey1,&QWP(0x20,$inp));
    &pxor   ($inout0,$ivec);
    &pxor   ($inout1,$in0);
    &movups ($ivec,&QWP(0x30,$inp));
    &movups (&QWP(0,$out),$inout0);
    &pxor   ($inout2,$rndkey0);
    &pxor   ($inout3,$rndkey1);
    &movups (&QWP(0x10,$out),$inout1);
    &movups (&QWP(0x20,$out),$inout2);
    &movaps ($inout0,$inout3);          # last block is stored by the tail code
    &lea    ($out,&DWP(0x30,$out));
    &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one");
    if ($inline)
    {   &aesni_inline_generate1("dec"); }
    else
    {   &call   ("_aesni_decrypt1");    }
    &pxor   ($inout0,$ivec);
    &movaps ($ivec,$in0);
    &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two");
    &call   ("_aesni_decrypt3");
    &pxor   ($inout0,$ivec);
    &pxor   ($inout1,$in0);
    &movups (&QWP(0,$out),$inout0);
    &movaps ($inout0,$inout1);
    &movaps ($ivec,$in1);
    &lea    ($out,&DWP(0x10,$out));
    &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three");
    &call   ("_aesni_decrypt3");
    &pxor   ($inout0,$ivec);
    &pxor   ($inout1,$in0);
    &pxor   ($inout2,$in1);
    &movups (&QWP(0,$out),$inout0);
    &movups (&QWP(0x10,$out),$inout1);
    &movaps ($inout0,$inout2);
    &movups ($ivec,&QWP(0x20,$inp));
    &lea    ($out,&DWP(0x20,$out));

&set_label("cbc_dec_tail_collected");
    &and    ($len,15);
    &jnz    (&label("cbc_dec_tail_partial"));
    &movups (&QWP(0,$out),$inout0);
    &jmp    (&label("cbc_ret"));

&set_label("cbc_dec_tail_partial");
    &mov    ($key_,"esp");              # save stack pointer
    &sub    ("esp",16);
    &and    ("esp",-16);                # 16-byte aligned scratch for movaps
    &movaps (&QWP(0,"esp"),$inout0);
    &mov    ($inp,"esp");
    &mov    ("ecx",$len);
    &data_word(0xA4F3F689);             # rep movsb
    &mov    ("esp",$key_);              # restore stack pointer

&set_label("cbc_ret");
    &mov    ($key_,&wparam(4));
    &movups (&QWP(0,$key_),$ivec);      # output IV
&function_end("${PREFIX}_cbc_encrypt");

# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#       "eax"           const unsigned char *userKey
#       $rounds         int bits
#       $key            AES_KEY *key
# output:
#       "eax"           return code
#       $rounds         rounds-1 [9, 11 or 13]

&function_begin_B("_aesni_set_encrypt_key");
    &test   ("eax","eax");
    &jz     (&label("bad_pointer"));
    &test   ($key,$key);
    &jz     (&label("bad_pointer"));

    &movups ("xmm0",&QWP(0,"eax"));     # pull first 128 bits of *userKey
    &pxor   ("xmm4","xmm4");            # low dword of xmm4 is assumed 0
    &lea    ($key,&DWP(16,$key));
    &cmp    ($rounds,256);
    &je     (&label("14rounds"));
    &cmp    ($rounds,192);
    &je     (&label("12rounds"));
    &cmp    ($rounds,128);
    &jne    (&label("bad_keybits"));

&set_label("10rounds",16);
    &mov            ($rounds,9);
    &$movekey       (&QWP(-16,$key),"xmm0");    # round 0
    &aeskeygenassist("xmm1","xmm0",0x01);       # round 1
    &call           (&label("key_128_cold"));
    &aeskeygenassist("xmm1","xmm0",0x2);        # round 2
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x04);       # round 3
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x08);       # round 4
    &call           (&label("key_128"));
# _aesni_set_encrypt_key, continued: remainder of the 128-bit schedule,
# the 192-bit schedule and the first part of the 256-bit schedule.
    &aeskeygenassist("xmm1","xmm0",0x10);       # round 5
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x20);       # round 6
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x40);       # round 7
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x80);       # round 8
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x1b);       # round 9
    &call           (&label("key_128"));
    &aeskeygenassist("xmm1","xmm0",0x36);       # round 10
    &call           (&label("key_128"));
    &$movekey       (&QWP(0,$key),"xmm0");
    &mov            (&DWP(80,$key),$rounds);    # $key has advanced; 80 past it
    &xor            ("eax","eax");              # return 0
    &ret();

# key_128 stores the round key produced so far and advances $key before
# deriving the next one; key_128_cold is the entry used for the very
# first derivation, when round 0 has already been stored by the caller.
&set_label("key_128",16);
    &$movekey       (&QWP(0,$key),"xmm0");
    &lea            ($key,&DWP(16,$key));
&set_label("key_128_cold");
    &shufps         ("xmm4","xmm0",0b00010000);
    &pxor           ("xmm0","xmm4");
    &shufps         ("xmm4","xmm0",0b10001100);
    &pxor           ("xmm0","xmm4");
    &pshufd         ("xmm1","xmm1",0b11111111); # critical path
    &pxor           ("xmm0","xmm1");
    &ret();

&set_label("12rounds",16);
    &movq           ("xmm2",&QWP(16,"eax"));    # remaining 1/3 of *userKey
    &mov            ($rounds,11);
    &$movekey       (&QWP(-16,$key),"xmm0");    # round 0
    &aeskeygenassist("xmm1","xmm2",0x01);       # round 1,2
    &call           (&label("key_192a_cold"));
    &aeskeygenassist("xmm1","xmm2",0x02);       # round 2,3
    &call           (&label("key_192b"));
    &aeskeygenassist("xmm1","xmm2",0x04);       # round 4,5
    &call           (&label("key_192a"));
    &aeskeygenassist("xmm1","xmm2",0x08);       # round 5,6
    &call           (&label("key_192b"));
    &aeskeygenassist("xmm1","xmm2",0x10);       # round 7,8
    &call           (&label("key_192a"));
    &aeskeygenassist("xmm1","xmm2",0x20);       # round 8,9
    &call           (&label("key_192b"));
    &aeskeygenassist("xmm1","xmm2",0x40);       # round 10,11
    &call           (&label("key_192a"));
    &aeskeygenassist("xmm1","xmm2",0x80);       # round 11,12
    &call           (&label("key_192b"));
    &$movekey       (&QWP(0,$key),"xmm0");
    &mov            (&DWP(48,$key),$rounds);    # $key has advanced; 48 past it
    &xor            ("eax","eax");              # return 0
    &ret();

# key_192a stores one 16-byte chunk and advances $key by 16; key_192b
# stores two chunks and advances $key by 32 before jumping into the
# shared derivation code at key_192b_warm.
&set_label("key_192a",16);
    &$movekey       (&QWP(0,$key),"xmm0");
    &lea            ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
    &movaps         ("xmm5","xmm2");
&set_label("key_192b_warm");
    &shufps         ("xmm4","xmm0",0b00010000);
    &movaps         ("xmm3","xmm2");
    &pxor           ("xmm0","xmm4");
    &shufps         ("xmm4","xmm0",0b10001100);
    &pslldq         ("xmm3",4);
    &pxor           ("xmm0","xmm4");
    &pshufd         ("xmm1","xmm1",0b01010101); # critical path
    &pxor           ("xmm2","xmm3");
    &pxor           ("xmm0","xmm1");
    &pshufd         ("xmm3","xmm0",0b11111111);
    &pxor           ("xmm2","xmm3");
    &ret();

&set_label("key_192b",16);
    &movaps         ("xmm3","xmm0");
    &shufps         ("xmm5","xmm0",0b01000100);
    &$movekey       (&QWP(0,$key),"xmm5");
    &shufps         ("xmm3","xmm2",0b01001110);
    &$movekey       (&QWP(16,$key),"xmm3");
    &lea            ($key,&DWP(32,$key));
    &jmp            (&label("key_192b_warm"));

&set_label("14rounds",16);
    &movups         ("xmm2",&QWP(16,"eax"));    # remaining half of *userKey
    &mov            ($rounds,13);
    &lea            ($key,&DWP(16,$key));
    &$movekey       (&QWP(-32,$key),"xmm0");    # round 0
    &$movekey       (&QWP(-16,$key),"xmm2");    # round 1
    &aeskeygenassist("xmm1","xmm2",0x01);       # round 2
    &call           (&label("key_256a_cold"));
    &aeskeygenassist("xmm1","xmm0",0x01);       # round 3
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x02);       # round 4
    &call           (&label("key_256a"));
    &aeskeygenassist("xmm1","xmm0",0x02);       # round 5
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x04);       # round 6
    &call           (&label("key_256a"));
    &aeskeygenassist("xmm1","xmm0",0x04);       # round 7
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x08);       # round 8
    &call           (&label("key_256a"));
    &aeskeygenassist("xmm1","xmm0",0x08);       # round 9
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x10);       # round 10
    &call           (&label("key_256a"));
    &aeskeygenassist("xmm1","xmm0",0x10);       # round 11
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x20);       # round 12
    &call           (&label("key_256a"));
# _aesni_set_encrypt_key, continued: end of the 256-bit schedule, its
# helper entry points and the error exits.
    &aeskeygenassist("xmm1","xmm0",0x20);       # round 13
    &call           (&label("key_256b"));
    &aeskeygenassist("xmm1","xmm2",0x40);       # round 14
    &call           (&label("key_256a"));
    &$movekey       (&QWP(0,$key),"xmm0");
    &mov            (&DWP(16,$key),$rounds);    # $key has advanced; 16 past it
    &xor            ("eax","eax");              # return 0
    &ret();

# key_256a stores xmm2 and derives the next value of xmm0; key_256b
# stores xmm0 and derives the next value of xmm2.  key_256a_cold skips
# the store because rounds 0 and 1 are written by the caller.
&set_label("key_256a",16);
    &$movekey       (&QWP(0,$key),"xmm2");
    &lea            ($key,&DWP(16,$key));
&set_label("key_256a_cold");
    &shufps         ("xmm4","xmm0",0b00010000);
    &pxor           ("xmm0","xmm4");
    &shufps         ("xmm4","xmm0",0b10001100);
    &pxor           ("xmm0","xmm4");
    &pshufd         ("xmm1","xmm1",0b11111111); # critical path
    &pxor           ("xmm0","xmm1");
    &ret();

&set_label("key_256b",16);
    &$movekey       (&QWP(0,$key),"xmm0");
    &lea            ($key,&DWP(16,$key));

    &shufps         ("xmm4","xmm2",0b00010000);
    &pxor           ("xmm2","xmm4");
    &shufps         ("xmm4","xmm2",0b10001100);
    &pxor           ("xmm2","xmm4");
    &pshufd         ("xmm1","xmm1",0b10101010); # critical path
    &pxor           ("xmm2","xmm1");
    &ret();

&set_label("bad_pointer",4);
    &mov    ("eax",-1);                 # userKey or key is NULL
    &ret    ();
&set_label("bad_keybits",4);
    &mov    ("eax",-2);                 # bits is not 128, 192 or 256
    &ret    ();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Loads the stack arguments into the registers expected by
# _aesni_set_encrypt_key and returns its result in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
    &mov    ("eax",&wparam(0));
    &mov    ($rounds,&wparam(1));
    &mov    ($key,&wparam(2));
    &call   ("_aesni_set_encrypt_key");
    &ret    ();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule first, then reverses the order of the
# round keys in place and applies aesimc to every key except the first
# and last.  A non-zero result from _aesni_set_encrypt_key is returned
# unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
    &mov    ("eax",&wparam(0));
    &mov    ($rounds,&wparam(1));
    &mov    ($key,&wparam(2));
    &call   ("_aesni_set_encrypt_key");
    &mov    ($key,&wparam(2));          # helper advanced $key; reload it
    &shl    ($rounds,4);                # rounds-1 after _aesni_set_encrypt_key
    &test   ("eax","eax");
    &jnz    (&label("dec_key_ret"));
    &lea    ("eax",&DWP(16,$key,$rounds));      # end of key schedule

    &$movekey   ("xmm0",&QWP(0,$key));  # just swap
    &$movekey   ("xmm1",&QWP(0,"eax"));
    &$movekey   (&QWP(0,"eax"),"xmm0");
    &$movekey   (&QWP(0,$key),"xmm1");
    &lea        ($key,&DWP(16,$key));
    &lea        ("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
    &$movekey   ("xmm0",&QWP(0,$key));  # swap and inverse
    &$movekey   ("xmm1",&QWP(0,"eax"));
    &aesimc     ("xmm0","xmm0");
    &aesimc     ("xmm1","xmm1");
    &lea        ($key,&DWP(16,$key));
    &lea        ("eax",&DWP(-16,"eax"));
    &cmp        ("eax",$key);           # flags consumed by the ja below
    &$movekey   (&QWP(16,"eax"),"xmm0");
    &$movekey   (&QWP(-16,$key),"xmm1");
    &ja         (&label("dec_key_inverse"));

    &$movekey   ("xmm0",&QWP(0,$key));  # inverse middle
    &aesimc     ("xmm0","xmm0");
    &$movekey   (&QWP(0,$key),"xmm0");

    &xor    ("eax","eax");              # return success
&set_label("dec_key_ret");
    &ret    ();
&function_end_B("${PREFIX}_set_decrypt_key");
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();