1#!/usr/bin/env perl 2 3################################################################### 4### AES-128 [originally in CTR mode] ### 5### bitsliced implementation for Intel Core 2 processors ### 6### requires support of SSE extensions up to SSSE3 ### 7### Author: Emilia Käsper and Peter Schwabe ### 8### Date: 2009-03-19 ### 9### Public domain ### 10### ### 11### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12### further information. ### 13################################################################### 14# 15# September 2011. 16# 17# Started as transliteration to "perlasm" the original code has 18# undergone following changes: 19# 20# - code was made position-independent; 21# - rounds were folded into a loop resulting in >5x size reduction 22# from 12.5KB to 2.2KB; 23# - above was possible thanks to mixcolumns() modification that 24# allowed to feed its output back to aesenc[last], this was 25# achieved at cost of two additional inter-registers moves; 26# - some instruction reordering and interleaving; 27# - this module doesn't implement key setup subroutine, instead it 28# relies on conversion of "conventional" key schedule as returned 29# by AES_set_encrypt_key (see discussion below); 30# - first and last round keys are treated differently, which allowed 31# to skip one shiftrows(), reduce bit-sliced key schedule and 32# speed-up conversion by 22%; 33# - support for 192- and 256-bit keys was added; 34# 35# Resulting performance in CPU cycles spent to encrypt one byte out 36# of 4096-byte buffer with 128-bit key is: 37# 38# Emilia's this(*) difference 39# 40# Core 2 9.30 8.69 +7% 41# Nehalem(**) 7.63 6.98 +9% 42# Atom 17.1 17.4 -2%(***) 43# 44# (*) Comparison is not completely fair, because "this" is ECB, 45# i.e. no extra processing such as counter values calculation 46# and xor-ing input as in Emilia's CTR implementation is 47# performed. However, the CTR calculations stand for not more 48# than 1% of total time, so comparison is *rather* fair. 49# 50# (**) Results were collected on Westmere, which is considered to 51# be equivalent to Nehalem for this code. 52# 53# (***) Slowdown on Atom is rather strange per se, because original 54# implementation has a number of 9+-bytes instructions, which 55# are bad for Atom front-end, and which I eliminated completely. 56# In attempt to address deterioration sbox() was tested in FP 57# SIMD "domain" (movaps instead of movdqa, xorps instead of 58# pxor, etc.). While it resulted in nominal 4% improvement on 59# Atom, it hurted Westmere by more than 2x factor. 60# 61# As for key schedule conversion subroutine. Interface to OpenSSL 62# relies on per-invocation on-the-fly conversion. This naturally 63# has impact on performance, especially for short inputs. Conversion 64# time in CPU cycles and its ratio to CPU cycles spent in 8x block 65# function is: 66# 67# conversion conversion/8x block 68# Core 2 240 0.22 69# Nehalem 180 0.20 70# Atom 430 0.19 71# 72# The ratio values mean that 128-byte blocks will be processed 73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 74# etc. Then keep in mind that input sizes not divisible by 128 are 75# *effectively* slower, especially shortest ones, e.g. consecutive 76# 144-byte blocks are processed 44% slower than one would expect, 77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 78# it's still faster than ["hyper-threading-safe" code path in] 79# aes-x86_64.pl on all lengths above 64 bytes... 80# 81# October 2011. 82# 83# Add decryption procedure. Performance in CPU cycles spent to decrypt 84# one byte out of 4096-byte buffer with 128-bit key is: 85# 86# Core 2 9.83 87# Nehalem 7.74 88# Atom 19.0 89# 90# November 2011. 91# 92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 93# suboptimal, but XTS is meant to be used with larger blocks... 94# 95# <appro@openssl.org> 96 97$flavour = shift; 98$output = shift; 99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 100 101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 102 103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 106die "can't locate x86_64-xlate.pl"; 107 108open OUT,"| \"$^X\" $xlate $flavour $output"; 109*STDOUT=*OUT; 110 111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 114 115{ 116my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 117 118sub Sbox { 119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 121my @b=@_[0..7]; 122my @t=@_[8..11]; 123my @s=@_[12..15]; 124 &InBasisChange (@b); 125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 127} 128 129sub InBasisChange { 130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 132my @b=@_[0..7]; 133$code.=<<___; 134 pxor @b[6], @b[5] 135 pxor @b[1], @b[2] 136 pxor @b[0], @b[3] 137 pxor @b[2], @b[6] 138 pxor @b[0], @b[5] 139 140 pxor @b[3], @b[6] 141 pxor @b[7], @b[3] 142 pxor @b[5], @b[7] 143 pxor @b[4], @b[3] 144 pxor @b[5], @b[4] 145 pxor @b[1], @b[3] 146 147 pxor @b[7], @b[2] 148 pxor @b[5], @b[1] 149___ 150} 151 152sub OutBasisChange { 153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 155my @b=@_[0..7]; 156$code.=<<___; 157 pxor @b[6], @b[0] 158 pxor @b[4], @b[1] 159 pxor @b[0], @b[2] 160 pxor @b[6], @b[4] 161 pxor @b[1], @b[6] 162 163 pxor @b[5], @b[1] 164 pxor @b[3], @b[5] 165 pxor @b[7], @b[3] 166 pxor @b[5], @b[7] 167 pxor @b[5], @b[2] 168 169 pxor @b[7], @b[4] 170___ 171} 172 173sub InvSbox { 174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 176my @b=@_[0..7]; 177my @t=@_[8..11]; 178my @s=@_[12..15]; 179 &InvInBasisChange (@b); 180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 182} 183 184sub InvInBasisChange { # OutBasisChange in reverse 185my @b=@_[5,1,2,6,3,7,0,4]; 186$code.=<<___ 187 pxor @b[7], @b[4] 188 189 pxor @b[5], @b[7] 190 pxor @b[5], @b[2] 191 pxor @b[7], @b[3] 192 pxor @b[3], @b[5] 193 pxor @b[5], @b[1] 194 195 pxor @b[1], @b[6] 196 pxor @b[0], @b[2] 197 pxor @b[6], @b[4] 198 pxor @b[6], @b[0] 199 pxor @b[4], @b[1] 200___ 201} 202 203sub InvOutBasisChange { # InBasisChange in reverse 204my @b=@_[2,5,7,3,6,1,0,4]; 205$code.=<<___; 206 pxor @b[5], @b[1] 207 pxor @b[7], @b[2] 208 209 pxor @b[1], @b[3] 210 pxor @b[5], @b[4] 211 pxor @b[5], @b[7] 212 pxor @b[4], @b[3] 213 pxor @b[0], @b[5] 214 pxor @b[7], @b[3] 215 pxor @b[2], @b[6] 216 pxor @b[1], @b[2] 217 pxor @b[3], @b[6] 218 219 pxor @b[0], @b[3] 220 pxor @b[6], @b[5] 221___ 222} 223 224sub Mul_GF4 { 225#;************************************************************* 226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 227#;************************************************************* 228my ($x0,$x1,$y0,$y1,$t0)=@_; 229$code.=<<___; 230 movdqa $y0, $t0 231 pxor $y1, $t0 232 pand $x0, $t0 233 pxor $x1, $x0 234 pand $y0, $x1 235 pand $y1, $x0 236 pxor $x1, $x0 237 pxor $t0, $x1 238___ 239} 240 241sub Mul_GF4_N { # not used, see next subroutine 242# multiply and scale by N 243my ($x0,$x1,$y0,$y1,$t0)=@_; 244$code.=<<___; 245 movdqa $y0, $t0 246 pxor $y1, $t0 247 pand $x0, $t0 248 pxor $x1, $x0 249 pand $y0, $x1 250 pand $y1, $x0 251 pxor $x0, $x1 252 pxor $t0, $x0 253___ 254} 255 256sub Mul_GF4_N_GF4 { 257# interleaved Mul_GF4_N and Mul_GF4 258my ($x0,$x1,$y0,$y1,$t0, 259 $x2,$x3,$y2,$y3,$t1)=@_; 260$code.=<<___; 261 movdqa $y0, $t0 262 movdqa $y2, $t1 263 pxor $y1, $t0 264 pxor $y3, $t1 265 pand $x0, $t0 266 pand $x2, $t1 267 pxor $x1, $x0 268 pxor $x3, $x2 269 pand $y0, $x1 270 pand $y2, $x3 271 pand $y1, $x0 272 pand $y3, $x2 273 pxor $x0, $x1 274 pxor $x3, $x2 275 pxor $t0, $x0 276 pxor $t1, $x3 277___ 278} 279sub Mul_GF16_2 { 280my @x=@_[0..7]; 281my @y=@_[8..11]; 282my @t=@_[12..15]; 283$code.=<<___; 284 movdqa @x[0], @t[0] 285 movdqa @x[1], @t[1] 286___ 287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 288$code.=<<___; 289 pxor @x[2], @t[0] 290 pxor @x[3], @t[1] 291 pxor @y[2], @y[0] 292 pxor @y[3], @y[1] 293___ 294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 295 @x[2], @x[3], @y[2], @y[3], @t[2]); 296$code.=<<___; 297 pxor @t[0], @x[0] 298 pxor @t[0], @x[2] 299 pxor @t[1], @x[1] 300 pxor @t[1], @x[3] 301 302 movdqa @x[4], @t[0] 303 movdqa @x[5], @t[1] 304 pxor @x[6], @t[0] 305 pxor @x[7], @t[1] 306___ 307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 308 @x[6], @x[7], @y[2], @y[3], @t[2]); 309$code.=<<___; 310 pxor @y[2], @y[0] 311 pxor @y[3], @y[1] 312___ 313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 314$code.=<<___; 315 pxor @t[0], @x[4] 316 pxor @t[0], @x[6] 317 pxor @t[1], @x[5] 318 pxor @t[1], @x[7] 319___ 320} 321sub Inv_GF256 { 322#;******************************************************************** 323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 324#;******************************************************************** 325my @x=@_[0..7]; 326my @t=@_[8..11]; 327my @s=@_[12..15]; 328# direct optimizations from hardware 329$code.=<<___; 330 movdqa @x[4], @t[3] 331 movdqa @x[5], @t[2] 332 movdqa @x[1], @t[1] 333 movdqa @x[7], @s[1] 334 movdqa @x[0], @s[0] 335 336 pxor @x[6], @t[3] 337 pxor @x[7], @t[2] 338 pxor @x[3], @t[1] 339 movdqa @t[3], @s[2] 340 pxor @x[6], @s[1] 341 movdqa @t[2], @t[0] 342 pxor @x[2], @s[0] 343 movdqa @t[3], @s[3] 344 345 por @t[1], @t[2] 346 por @s[0], @t[3] 347 pxor @t[0], @s[3] 348 pand @s[0], @s[2] 349 pxor @t[1], @s[0] 350 pand @t[1], @t[0] 351 pand @s[0], @s[3] 352 movdqa @x[3], @s[0] 353 pxor @x[2], @s[0] 354 pand @s[0], @s[1] 355 pxor @s[1], @t[3] 356 pxor @s[1], @t[2] 357 movdqa @x[4], @s[1] 358 movdqa @x[1], @s[0] 359 pxor @x[5], @s[1] 360 pxor @x[0], @s[0] 361 movdqa @s[1], @t[1] 362 pand @s[0], @s[1] 363 por @s[0], @t[1] 364 pxor @s[1], @t[0] 365 pxor @s[3], @t[3] 366 pxor @s[2], @t[2] 367 pxor @s[3], @t[1] 368 movdqa @x[7], @s[0] 369 pxor @s[2], @t[0] 370 movdqa @x[6], @s[1] 371 pxor @s[2], @t[1] 372 movdqa @x[5], @s[2] 373 pand @x[3], @s[0] 374 movdqa @x[4], @s[3] 375 pand @x[2], @s[1] 376 pand @x[1], @s[2] 377 por @x[0], @s[3] 378 pxor @s[0], @t[3] 379 pxor @s[1], @t[2] 380 pxor @s[2], @t[1] 381 pxor @s[3], @t[0] 382 383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 384 385 # new smaller inversion 386 387 movdqa @t[3], @s[0] 388 pand @t[1], @t[3] 389 pxor @t[2], @s[0] 390 391 movdqa @t[0], @s[2] 392 movdqa @s[0], @s[3] 393 pxor @t[3], @s[2] 394 pand @s[2], @s[3] 395 396 movdqa @t[1], @s[1] 397 pxor @t[2], @s[3] 398 pxor @t[0], @s[1] 399 400 pxor @t[2], @t[3] 401 402 pand @t[3], @s[1] 403 404 movdqa @s[2], @t[2] 405 pxor @t[0], @s[1] 406 407 pxor @s[1], @t[2] 408 pxor @s[1], @t[1] 409 410 pand @t[0], @t[2] 411 412 pxor @t[2], @s[2] 413 pxor @t[2], @t[1] 414 415 pand @s[3], @s[2] 416 417 pxor @s[0], @s[2] 418___ 419# output in s3, s2, s1, t1 420 421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 422 423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 425 426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 427} 428 429# AES linear components 430 431sub ShiftRows { 432my @x=@_[0..7]; 433my $mask=pop; 434$code.=<<___; 435 pxor 0x00($key),@x[0] 436 pxor 0x10($key),@x[1] 437 pshufb $mask,@x[0] 438 pxor 0x20($key),@x[2] 439 pshufb $mask,@x[1] 440 pxor 0x30($key),@x[3] 441 pshufb $mask,@x[2] 442 pxor 0x40($key),@x[4] 443 pshufb $mask,@x[3] 444 pxor 0x50($key),@x[5] 445 pshufb $mask,@x[4] 446 pxor 0x60($key),@x[6] 447 pshufb $mask,@x[5] 448 pxor 0x70($key),@x[7] 449 pshufb $mask,@x[6] 450 lea 0x80($key),$key 451 pshufb $mask,@x[7] 452___ 453} 454 455sub MixColumns { 456# modified to emit output in order suitable for feeding back to aesenc[last] 457my @x=@_[0..7]; 458my @t=@_[8..15]; 459my $inv=@_[16]; # optional 460$code.=<<___; 461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 462 pshufd \$0x93, @x[1], @t[1] 463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 464 pshufd \$0x93, @x[2], @t[2] 465 pxor @t[1], @x[1] 466 pshufd \$0x93, @x[3], @t[3] 467 pxor @t[2], @x[2] 468 pshufd \$0x93, @x[4], @t[4] 469 pxor @t[3], @x[3] 470 pshufd \$0x93, @x[5], @t[5] 471 pxor @t[4], @x[4] 472 pshufd \$0x93, @x[6], @t[6] 473 pxor @t[5], @x[5] 474 pshufd \$0x93, @x[7], @t[7] 475 pxor @t[6], @x[6] 476 pxor @t[7], @x[7] 477 478 pxor @x[0], @t[1] 479 pxor @x[7], @t[0] 480 pxor @x[7], @t[1] 481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 482 pxor @x[1], @t[2] 483 pshufd \$0x4E, @x[1], @x[1] 484 pxor @x[4], @t[5] 485 pxor @t[0], @x[0] 486 pxor @x[5], @t[6] 487 pxor @t[1], @x[1] 488 pxor @x[3], @t[4] 489 pshufd \$0x4E, @x[4], @t[0] 490 pxor @x[6], @t[7] 491 pshufd \$0x4E, @x[5], @t[1] 492 pxor @x[2], @t[3] 493 pshufd \$0x4E, @x[3], @x[4] 494 pxor @x[7], @t[3] 495 pshufd \$0x4E, @x[7], @x[5] 496 pxor @x[7], @t[4] 497 pshufd \$0x4E, @x[6], @x[3] 498 pxor @t[4], @t[0] 499 pshufd \$0x4E, @x[2], @x[6] 500 pxor @t[5], @t[1] 501___ 502$code.=<<___ if (!$inv); 503 pxor @t[3], @x[4] 504 pxor @t[7], @x[5] 505 pxor @t[6], @x[3] 506 movdqa @t[0], @x[2] 507 pxor @t[2], @x[6] 508 movdqa @t[1], @x[7] 509___ 510$code.=<<___ if ($inv); 511 pxor @x[4], @t[3] 512 pxor @t[7], @x[5] 513 pxor @x[3], @t[6] 514 movdqa @t[0], @x[3] 515 pxor @t[2], @x[6] 516 movdqa @t[6], @x[2] 517 movdqa @t[1], @x[7] 518 movdqa @x[6], @x[4] 519 movdqa @t[3], @x[6] 520___ 521} 522 523sub InvMixColumns_orig { 524my @x=@_[0..7]; 525my @t=@_[8..15]; 526 527$code.=<<___; 528 # multiplication by 0x0e 529 pshufd \$0x93, @x[7], @t[7] 530 movdqa @x[2], @t[2] 531 pxor @x[5], @x[7] # 7 5 532 pxor @x[5], @x[2] # 2 5 533 pshufd \$0x93, @x[0], @t[0] 534 movdqa @x[5], @t[5] 535 pxor @x[0], @x[5] # 5 0 [1] 536 pxor @x[1], @x[0] # 0 1 537 pshufd \$0x93, @x[1], @t[1] 538 pxor @x[2], @x[1] # 1 25 539 pxor @x[6], @x[0] # 01 6 [2] 540 pxor @x[3], @x[1] # 125 3 [4] 541 pshufd \$0x93, @x[3], @t[3] 542 pxor @x[0], @x[2] # 25 016 [3] 543 pxor @x[7], @x[3] # 3 75 544 pxor @x[6], @x[7] # 75 6 [0] 545 pshufd \$0x93, @x[6], @t[6] 546 movdqa @x[4], @t[4] 547 pxor @x[4], @x[6] # 6 4 548 pxor @x[3], @x[4] # 4 375 [6] 549 pxor @x[7], @x[3] # 375 756=36 550 pxor @t[5], @x[6] # 64 5 [7] 551 pxor @t[2], @x[3] # 36 2 552 pxor @t[4], @x[3] # 362 4 [5] 553 pshufd \$0x93, @t[5], @t[5] 554___ 555 my @y = @x[7,5,0,2,1,3,4,6]; 556$code.=<<___; 557 # multiplication by 0x0b 558 pxor @y[0], @y[1] 559 pxor @t[0], @y[0] 560 pxor @t[1], @y[1] 561 pshufd \$0x93, @t[2], @t[2] 562 pxor @t[5], @y[0] 563 pxor @t[6], @y[1] 564 pxor @t[7], @y[0] 565 pshufd \$0x93, @t[4], @t[4] 566 pxor @t[6], @t[7] # clobber t[7] 567 pxor @y[0], @y[1] 568 569 pxor @t[0], @y[3] 570 pshufd \$0x93, @t[0], @t[0] 571 pxor @t[1], @y[2] 572 pxor @t[1], @y[4] 573 pxor @t[2], @y[2] 574 pshufd \$0x93, @t[1], @t[1] 575 pxor @t[2], @y[3] 576 pxor @t[2], @y[5] 577 pxor @t[7], @y[2] 578 pshufd \$0x93, @t[2], @t[2] 579 pxor @t[3], @y[3] 580 pxor @t[3], @y[6] 581 pxor @t[3], @y[4] 582 pshufd \$0x93, @t[3], @t[3] 583 pxor @t[4], @y[7] 584 pxor @t[4], @y[5] 585 pxor @t[7], @y[7] 586 pxor @t[5], @y[3] 587 pxor @t[4], @y[4] 588 pxor @t[5], @t[7] # clobber t[7] even more 589 590 pxor @t[7], @y[5] 591 pshufd \$0x93, @t[4], @t[4] 592 pxor @t[7], @y[6] 593 pxor @t[7], @y[4] 594 595 pxor @t[5], @t[7] 596 pshufd \$0x93, @t[5], @t[5] 597 pxor @t[6], @t[7] # restore t[7] 598 599 # multiplication by 0x0d 600 pxor @y[7], @y[4] 601 pxor @t[4], @y[7] 602 pshufd \$0x93, @t[6], @t[6] 603 pxor @t[0], @y[2] 604 pxor @t[5], @y[7] 605 pxor @t[2], @y[2] 606 pshufd \$0x93, @t[7], @t[7] 607 608 pxor @y[1], @y[3] 609 pxor @t[1], @y[1] 610 pxor @t[0], @y[0] 611 pxor @t[0], @y[3] 612 pxor @t[5], @y[1] 613 pxor @t[5], @y[0] 614 pxor @t[7], @y[1] 615 pshufd \$0x93, @t[0], @t[0] 616 pxor @t[6], @y[0] 617 pxor @y[1], @y[3] 618 pxor @t[1], @y[4] 619 pshufd \$0x93, @t[1], @t[1] 620 621 pxor @t[7], @y[7] 622 pxor @t[2], @y[4] 623 pxor @t[2], @y[5] 624 pshufd \$0x93, @t[2], @t[2] 625 pxor @t[6], @y[2] 626 pxor @t[3], @t[6] # clobber t[6] 627 pxor @y[7], @y[4] 628 pxor @t[6], @y[3] 629 630 pxor @t[6], @y[6] 631 pxor @t[5], @y[5] 632 pxor @t[4], @y[6] 633 pshufd \$0x93, @t[4], @t[4] 634 pxor @t[6], @y[5] 635 pxor @t[7], @y[6] 636 pxor @t[3], @t[6] # restore t[6] 637 638 pshufd \$0x93, @t[5], @t[5] 639 pshufd \$0x93, @t[6], @t[6] 640 pshufd \$0x93, @t[7], @t[7] 641 pshufd \$0x93, @t[3], @t[3] 642 643 # multiplication by 0x09 644 pxor @y[1], @y[4] 645 pxor @y[1], @t[1] # t[1]=y[1] 646 pxor @t[5], @t[0] # clobber t[0] 647 pxor @t[5], @t[1] 648 pxor @t[0], @y[3] 649 pxor @y[0], @t[0] # t[0]=y[0] 650 pxor @t[6], @t[1] 651 pxor @t[7], @t[6] # clobber t[6] 652 pxor @t[1], @y[4] 653 pxor @t[4], @y[7] 654 pxor @y[4], @t[4] # t[4]=y[4] 655 pxor @t[3], @y[6] 656 pxor @y[3], @t[3] # t[3]=y[3] 657 pxor @t[2], @y[5] 658 pxor @y[2], @t[2] # t[2]=y[2] 659 pxor @t[7], @t[3] 660 pxor @y[5], @t[5] # t[5]=y[5] 661 pxor @t[6], @t[2] 662 pxor @t[6], @t[5] 663 pxor @y[6], @t[6] # t[6]=y[6] 664 pxor @y[7], @t[7] # t[7]=y[7] 665 666 movdqa @t[0],@XMM[0] 667 movdqa @t[1],@XMM[1] 668 movdqa @t[2],@XMM[2] 669 movdqa @t[3],@XMM[3] 670 movdqa @t[4],@XMM[4] 671 movdqa @t[5],@XMM[5] 672 movdqa @t[6],@XMM[6] 673 movdqa @t[7],@XMM[7] 674___ 675} 676 677sub InvMixColumns { 678my @x=@_[0..7]; 679my @t=@_[8..15]; 680 681# Thanks to Jussi Kivilinna for providing pointer to 682# 683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 687 688$code.=<<___; 689 # multiplication by 0x05-0x00-0x04-0x00 690 pshufd \$0x4E, @x[0], @t[0] 691 pshufd \$0x4E, @x[6], @t[6] 692 pxor @x[0], @t[0] 693 pshufd \$0x4E, @x[7], @t[7] 694 pxor @x[6], @t[6] 695 pshufd \$0x4E, @x[1], @t[1] 696 pxor @x[7], @t[7] 697 pshufd \$0x4E, @x[2], @t[2] 698 pxor @x[1], @t[1] 699 pshufd \$0x4E, @x[3], @t[3] 700 pxor @x[2], @t[2] 701 pxor @t[6], @x[0] 702 pxor @t[6], @x[1] 703 pshufd \$0x4E, @x[4], @t[4] 704 pxor @x[3], @t[3] 705 pxor @t[0], @x[2] 706 pxor @t[1], @x[3] 707 pshufd \$0x4E, @x[5], @t[5] 708 pxor @x[4], @t[4] 709 pxor @t[7], @x[1] 710 pxor @t[2], @x[4] 711 pxor @x[5], @t[5] 712 713 pxor @t[7], @x[2] 714 pxor @t[6], @x[3] 715 pxor @t[6], @x[4] 716 pxor @t[3], @x[5] 717 pxor @t[4], @x[6] 718 pxor @t[7], @x[4] 719 pxor @t[7], @x[5] 720 pxor @t[5], @x[7] 721___ 722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 723} 724 725sub aesenc { # not used 726my @b=@_[0..7]; 727my @t=@_[8..15]; 728$code.=<<___; 729 movdqa 0x30($const),@t[0] # .LSR 730___ 731 &ShiftRows (@b,@t[0]); 732 &Sbox (@b,@t); 733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 734} 735 736sub aesenclast { # not used 737my @b=@_[0..7]; 738my @t=@_[8..15]; 739$code.=<<___; 740 movdqa 0x40($const),@t[0] # .LSRM0 741___ 742 &ShiftRows (@b,@t[0]); 743 &Sbox (@b,@t); 744$code.=<<___ 745 pxor 0x00($key),@b[0] 746 pxor 0x10($key),@b[1] 747 pxor 0x20($key),@b[4] 748 pxor 0x30($key),@b[6] 749 pxor 0x40($key),@b[3] 750 pxor 0x50($key),@b[7] 751 pxor 0x60($key),@b[2] 752 pxor 0x70($key),@b[5] 753___ 754} 755 756sub swapmove { 757my ($a,$b,$n,$mask,$t)=@_; 758$code.=<<___; 759 movdqa $b,$t 760 psrlq \$$n,$b 761 pxor $a,$b 762 pand $mask,$b 763 pxor $b,$a 764 psllq \$$n,$b 765 pxor $t,$b 766___ 767} 768sub swapmove2x { 769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 770$code.=<<___; 771 movdqa $b0,$t0 772 psrlq \$$n,$b0 773 movdqa $b1,$t1 774 psrlq \$$n,$b1 775 pxor $a0,$b0 776 pxor $a1,$b1 777 pand $mask,$b0 778 pand $mask,$b1 779 pxor $b0,$a0 780 psllq \$$n,$b0 781 pxor $b1,$a1 782 psllq \$$n,$b1 783 pxor $t0,$b0 784 pxor $t1,$b1 785___ 786} 787 788sub bitslice { 789my @x=reverse(@_[0..7]); 790my ($t0,$t1,$t2,$t3)=@_[8..11]; 791$code.=<<___; 792 movdqa 0x00($const),$t0 # .LBS0 793 movdqa 0x10($const),$t1 # .LBS1 794___ 795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 797$code.=<<___; 798 movdqa 0x20($const),$t0 # .LBS2 799___ 800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 802 803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 805} 806 807$code.=<<___; 808.text 809 810.extern asm_AES_encrypt 811.extern asm_AES_decrypt 812 813.type _bsaes_encrypt8,\@abi-omnipotent 814.align 64 815_bsaes_encrypt8: 816 _CET_ENDBR 817 lea .LBS0(%rip), $const # constants table 818 819 movdqa ($key), @XMM[9] # round 0 key 820 lea 0x10($key), $key 821 movdqa 0x50($const), @XMM[8] # .LM0SR 822 pxor @XMM[9], @XMM[0] # xor with round0 key 823 pxor @XMM[9], @XMM[1] 824 pshufb @XMM[8], @XMM[0] 825 pxor @XMM[9], @XMM[2] 826 pshufb @XMM[8], @XMM[1] 827 pxor @XMM[9], @XMM[3] 828 pshufb @XMM[8], @XMM[2] 829 pxor @XMM[9], @XMM[4] 830 pshufb @XMM[8], @XMM[3] 831 pxor @XMM[9], @XMM[5] 832 pshufb @XMM[8], @XMM[4] 833 pxor @XMM[9], @XMM[6] 834 pshufb @XMM[8], @XMM[5] 835 pxor @XMM[9], @XMM[7] 836 pshufb @XMM[8], @XMM[6] 837 pshufb @XMM[8], @XMM[7] 838_bsaes_encrypt8_bitslice: 839___ 840 &bitslice (@XMM[0..7, 8..11]); 841$code.=<<___; 842 dec $rounds 843 jmp .Lenc_sbox 844.align 16 845.Lenc_loop: 846___ 847 &ShiftRows (@XMM[0..7, 8]); 848$code.=".Lenc_sbox:\n"; 849 &Sbox (@XMM[0..7, 8..15]); 850$code.=<<___; 851 dec $rounds 852 jl .Lenc_done 853___ 854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 855$code.=<<___; 856 movdqa 0x30($const), @XMM[8] # .LSR 857 jnz .Lenc_loop 858 movdqa 0x40($const), @XMM[8] # .LSRM0 859 jmp .Lenc_loop 860.align 16 861.Lenc_done: 862___ 863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 865$code.=<<___; 866 movdqa ($key), @XMM[8] # last round key 867 pxor @XMM[8], @XMM[4] 868 pxor @XMM[8], @XMM[6] 869 pxor @XMM[8], @XMM[3] 870 pxor @XMM[8], @XMM[7] 871 pxor @XMM[8], @XMM[2] 872 pxor @XMM[8], @XMM[5] 873 pxor @XMM[8], @XMM[0] 874 pxor @XMM[8], @XMM[1] 875 ret 876.size _bsaes_encrypt8,.-_bsaes_encrypt8 877 878.type _bsaes_decrypt8,\@abi-omnipotent 879.align 64 880_bsaes_decrypt8: 881 _CET_ENDBR 882 lea .LBS0(%rip), $const # constants table 883 884 movdqa ($key), @XMM[9] # round 0 key 885 lea 0x10($key), $key 886 movdqa -0x30($const), @XMM[8] # .LM0ISR 887 pxor @XMM[9], @XMM[0] # xor with round0 key 888 pxor @XMM[9], @XMM[1] 889 pshufb @XMM[8], @XMM[0] 890 pxor @XMM[9], @XMM[2] 891 pshufb @XMM[8], @XMM[1] 892 pxor @XMM[9], @XMM[3] 893 pshufb @XMM[8], @XMM[2] 894 pxor @XMM[9], @XMM[4] 895 pshufb @XMM[8], @XMM[3] 896 pxor @XMM[9], @XMM[5] 897 pshufb @XMM[8], @XMM[4] 898 pxor @XMM[9], @XMM[6] 899 pshufb @XMM[8], @XMM[5] 900 pxor @XMM[9], @XMM[7] 901 pshufb @XMM[8], @XMM[6] 902 pshufb @XMM[8], @XMM[7] 903___ 904 &bitslice (@XMM[0..7, 8..11]); 905$code.=<<___; 906 dec $rounds 907 jmp .Ldec_sbox 908.align 16 909.Ldec_loop: 910___ 911 &ShiftRows (@XMM[0..7, 8]); 912$code.=".Ldec_sbox:\n"; 913 &InvSbox (@XMM[0..7, 8..15]); 914$code.=<<___; 915 dec $rounds 916 jl .Ldec_done 917___ 918 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 919$code.=<<___; 920 movdqa -0x10($const), @XMM[8] # .LISR 921 jnz .Ldec_loop 922 movdqa -0x20($const), @XMM[8] # .LISRM0 923 jmp .Ldec_loop 924.align 16 925.Ldec_done: 926___ 927 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 928$code.=<<___; 929 movdqa ($key), @XMM[8] # last round key 930 pxor @XMM[8], @XMM[6] 931 pxor @XMM[8], @XMM[4] 932 pxor @XMM[8], @XMM[2] 933 pxor @XMM[8], @XMM[7] 934 pxor @XMM[8], @XMM[3] 935 pxor @XMM[8], @XMM[5] 936 pxor @XMM[8], @XMM[0] 937 pxor @XMM[8], @XMM[1] 938 ret 939.size _bsaes_decrypt8,.-_bsaes_decrypt8 940___ 941} 942{ 943my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 944 945sub bitslice_key { 946my @x=reverse(@_[0..7]); 947my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 948 949 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 950$code.=<<___; 951 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 952 movdqa @x[0], @x[2] 953 movdqa @x[1], @x[3] 954___ 955 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 956 957 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 958$code.=<<___; 959 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 960 movdqa @x[0], @x[4] 961 movdqa @x[2], @x[6] 962 movdqa @x[1], @x[5] 963 movdqa @x[3], @x[7] 964___ 965 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 966 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 967} 968 969$code.=<<___; 970.type _bsaes_key_convert,\@abi-omnipotent 971.align 16 972_bsaes_key_convert: 973 _CET_ENDBR 974 lea .Lmasks(%rip), $const 975 movdqu ($inp), %xmm7 # load round 0 key 976 lea 0x10($inp), $inp 977 movdqa 0x00($const), %xmm0 # 0x01... 978 movdqa 0x10($const), %xmm1 # 0x02... 979 movdqa 0x20($const), %xmm2 # 0x04... 980 movdqa 0x30($const), %xmm3 # 0x08... 981 movdqa 0x40($const), %xmm4 # .LM0 982 pcmpeqd %xmm5, %xmm5 # .LNOT 983 984 movdqu ($inp), %xmm6 # load round 1 key 985 movdqa %xmm7, ($out) # save round 0 key 986 lea 0x10($out), $out 987 dec $rounds 988 jmp .Lkey_loop 989.align 16 990.Lkey_loop: 991 pshufb %xmm4, %xmm6 # .LM0 992 993 movdqa %xmm0, %xmm8 994 movdqa %xmm1, %xmm9 995 996 pand %xmm6, %xmm8 997 pand %xmm6, %xmm9 998 movdqa %xmm2, %xmm10 999 pcmpeqb %xmm0, %xmm8 1000 psllq \$4, %xmm0 # 0x10... 1001 movdqa %xmm3, %xmm11 1002 pcmpeqb %xmm1, %xmm9 1003 psllq \$4, %xmm1 # 0x20... 1004 1005 pand %xmm6, %xmm10 1006 pand %xmm6, %xmm11 1007 movdqa %xmm0, %xmm12 1008 pcmpeqb %xmm2, %xmm10 1009 psllq \$4, %xmm2 # 0x40... 1010 movdqa %xmm1, %xmm13 1011 pcmpeqb %xmm3, %xmm11 1012 psllq \$4, %xmm3 # 0x80... 1013 1014 movdqa %xmm2, %xmm14 1015 movdqa %xmm3, %xmm15 1016 pxor %xmm5, %xmm8 # "pnot" 1017 pxor %xmm5, %xmm9 1018 1019 pand %xmm6, %xmm12 1020 pand %xmm6, %xmm13 1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1022 pcmpeqb %xmm0, %xmm12 1023 psrlq \$4, %xmm0 # 0x01... 1024 movdqa %xmm9, 0x10($out) 1025 pcmpeqb %xmm1, %xmm13 1026 psrlq \$4, %xmm1 # 0x02... 1027 lea 0x10($inp), $inp 1028 1029 pand %xmm6, %xmm14 1030 pand %xmm6, %xmm15 1031 movdqa %xmm10, 0x20($out) 1032 pcmpeqb %xmm2, %xmm14 1033 psrlq \$4, %xmm2 # 0x04... 1034 movdqa %xmm11, 0x30($out) 1035 pcmpeqb %xmm3, %xmm15 1036 psrlq \$4, %xmm3 # 0x08... 1037 movdqu ($inp), %xmm6 # load next round key 1038 1039 pxor %xmm5, %xmm13 # "pnot" 1040 pxor %xmm5, %xmm14 1041 movdqa %xmm12, 0x40($out) 1042 movdqa %xmm13, 0x50($out) 1043 movdqa %xmm14, 0x60($out) 1044 movdqa %xmm15, 0x70($out) 1045 lea 0x80($out),$out 1046 dec $rounds 1047 jnz .Lkey_loop 1048 1049 movdqa 0x50($const), %xmm7 # .L63 1050 #movdqa %xmm6, ($out) # don't save last round key 1051 ret 1052.size _bsaes_key_convert,.-_bsaes_key_convert 1053___ 1054} 1055 1056if (0 && !$win64) { # following four functions are unsupported interface 1057 # used for benchmarking... 1058$code.=<<___; 1059.globl bsaes_enc_key_convert 1060.type bsaes_enc_key_convert,\@function,2 1061.align 16 1062bsaes_enc_key_convert: 1063 _CET_ENDBR 1064 mov 240($inp),%r10d # pass rounds 1065 mov $inp,%rcx # pass key 1066 mov $out,%rax # pass key schedule 1067 call _bsaes_key_convert 1068 pxor %xmm6,%xmm7 # fix up last round key 1069 movdqa %xmm7,(%rax) # save last round key 1070 ret 1071.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1072 1073.globl bsaes_encrypt_128 1074.type bsaes_encrypt_128,\@function,4 1075.align 16 1076bsaes_encrypt_128: 1077.Lenc128_loop: 1078 _CET_ENDBR 1079 movdqu 0x00($inp), @XMM[0] # load input 1080 movdqu 0x10($inp), @XMM[1] 1081 movdqu 0x20($inp), @XMM[2] 1082 movdqu 0x30($inp), @XMM[3] 1083 movdqu 0x40($inp), @XMM[4] 1084 movdqu 0x50($inp), @XMM[5] 1085 movdqu 0x60($inp), @XMM[6] 1086 movdqu 0x70($inp), @XMM[7] 1087 mov $key, %rax # pass the $key 1088 lea 0x80($inp), $inp 1089 mov \$10,%r10d 1090 1091 call _bsaes_encrypt8 1092 1093 movdqu @XMM[0], 0x00($out) # write output 1094 movdqu @XMM[1], 0x10($out) 1095 movdqu @XMM[4], 0x20($out) 1096 movdqu @XMM[6], 0x30($out) 1097 movdqu @XMM[3], 0x40($out) 1098 movdqu @XMM[7], 0x50($out) 1099 movdqu @XMM[2], 0x60($out) 1100 movdqu @XMM[5], 0x70($out) 1101 lea 0x80($out), $out 1102 sub \$0x80,$len 1103 ja .Lenc128_loop 1104 ret 1105.size bsaes_encrypt_128,.-bsaes_encrypt_128 1106 1107.globl bsaes_dec_key_convert 1108.type bsaes_dec_key_convert,\@function,2 1109.align 16 1110bsaes_dec_key_convert: 1111 _CET_ENDBR 1112 mov 240($inp),%r10d # pass rounds 1113 mov $inp,%rcx # pass key 1114 mov $out,%rax # pass key schedule 1115 call _bsaes_key_convert 1116 pxor ($out),%xmm7 # fix up round 0 key 1117 movdqa %xmm6,(%rax) # save last round key 1118 movdqa %xmm7,($out) 1119 ret 1120.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1121 1122.globl bsaes_decrypt_128 1123.type bsaes_decrypt_128,\@function,4 1124.align 16 1125bsaes_decrypt_128: 1126 _CET_ENDBR 1127.Ldec128_loop: 1128 movdqu 0x00($inp), @XMM[0] # load input 1129 movdqu 0x10($inp), @XMM[1] 1130 movdqu 0x20($inp), @XMM[2] 1131 movdqu 0x30($inp), @XMM[3] 1132 movdqu 0x40($inp), @XMM[4] 1133 movdqu 0x50($inp), @XMM[5] 1134 movdqu 0x60($inp), @XMM[6] 1135 movdqu 0x70($inp), @XMM[7] 1136 mov $key, %rax # pass the $key 1137 lea 0x80($inp), $inp 1138 mov \$10,%r10d 1139 1140 call _bsaes_decrypt8 1141 1142 movdqu @XMM[0], 0x00($out) # write output 1143 movdqu @XMM[1], 0x10($out) 1144 movdqu @XMM[6], 0x20($out) 1145 movdqu @XMM[4], 0x30($out) 1146 movdqu @XMM[2], 0x40($out) 1147 movdqu @XMM[7], 0x50($out) 1148 movdqu @XMM[3], 0x60($out) 1149 movdqu @XMM[5], 0x70($out) 1150 lea 0x80($out), $out 1151 sub \$0x80,$len 1152 ja .Ldec128_loop 1153 ret 1154.size bsaes_decrypt_128,.-bsaes_decrypt_128 1155___ 1156} 1157{ 1158###################################################################### 1159# 1160# OpenSSL interface 1161# 1162my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1163 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1164my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1165 1166if ($ecb) { 1167$code.=<<___; 1168.globl bsaes_ecb_encrypt_blocks 1169.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1170.align 16 1171bsaes_ecb_encrypt_blocks: 1172 _CET_ENDBR 1173 mov %rsp, %rax 1174.Lecb_enc_prologue: 1175 push %rbp 1176 push %rbx 1177 push %r12 1178 push %r13 1179 push %r14 1180 push %r15 1181 lea -0x48(%rsp),%rsp 1182___ 1183$code.=<<___ if ($win64); 1184 lea -0xa0(%rsp), %rsp 1185 movaps %xmm6, 0x40(%rsp) 1186 movaps %xmm7, 0x50(%rsp) 1187 movaps %xmm8, 0x60(%rsp) 1188 movaps %xmm9, 0x70(%rsp) 1189 movaps %xmm10, 0x80(%rsp) 1190 movaps %xmm11, 0x90(%rsp) 1191 movaps %xmm12, 0xa0(%rsp) 1192 movaps %xmm13, 0xb0(%rsp) 1193 movaps %xmm14, 0xc0(%rsp) 1194 movaps %xmm15, 0xd0(%rsp) 1195.Lecb_enc_body: 1196___ 1197$code.=<<___; 1198 mov %rsp,%rbp # backup %rsp 1199 mov 240($arg4),%eax # rounds 1200 mov $arg1,$inp # backup arguments 1201 mov $arg2,$out 1202 mov $arg3,$len 1203 mov $arg4,$key 1204 cmp \$8,$arg3 1205 jb .Lecb_enc_short 1206 1207 mov %eax,%ebx # backup rounds 1208 shl \$7,%rax # 128 bytes per inner round key 1209 sub \$`128-32`,%rax # size of bit-sliced key schedule 1210 sub %rax,%rsp 1211 mov %rsp,%rax # pass key schedule 1212 mov $key,%rcx # pass key 1213 mov %ebx,%r10d # pass rounds 1214 call _bsaes_key_convert 1215 pxor %xmm6,%xmm7 # fix up last round key 1216 movdqa %xmm7,(%rax) # save last round key 1217 1218 sub \$8,$len 1219.Lecb_enc_loop: 1220 movdqu 0x00($inp), @XMM[0] # load input 1221 movdqu 0x10($inp), @XMM[1] 1222 movdqu 0x20($inp), @XMM[2] 1223 movdqu 0x30($inp), @XMM[3] 1224 movdqu 0x40($inp), @XMM[4] 1225 movdqu 0x50($inp), @XMM[5] 1226 mov %rsp, %rax # pass key schedule 1227 movdqu 0x60($inp), @XMM[6] 1228 mov %ebx,%r10d # pass rounds 1229 movdqu 0x70($inp), @XMM[7] 1230 lea 0x80($inp), $inp 1231 1232 call _bsaes_encrypt8 1233 1234 movdqu @XMM[0], 0x00($out) # write output 1235 movdqu @XMM[1], 0x10($out) 1236 movdqu @XMM[4], 0x20($out) 1237 movdqu @XMM[6], 0x30($out) 1238 movdqu @XMM[3], 0x40($out) 1239 movdqu @XMM[7], 0x50($out) 1240 movdqu @XMM[2], 0x60($out) 1241 movdqu @XMM[5], 0x70($out) 1242 lea 0x80($out), $out 1243 sub \$8,$len 1244 jnc .Lecb_enc_loop 1245 1246 add \$8,$len 1247 jz .Lecb_enc_done 1248 1249 movdqu 0x00($inp), @XMM[0] # load input 1250 mov %rsp, %rax # pass key schedule 1251 mov %ebx,%r10d # pass rounds 1252 cmp \$2,$len 1253 jb .Lecb_enc_one 1254 movdqu 0x10($inp), @XMM[1] 1255 je .Lecb_enc_two 1256 movdqu 0x20($inp), @XMM[2] 1257 cmp \$4,$len 1258 jb .Lecb_enc_three 1259 movdqu 0x30($inp), @XMM[3] 1260 je .Lecb_enc_four 1261 movdqu 0x40($inp), @XMM[4] 1262 cmp \$6,$len 1263 jb .Lecb_enc_five 1264 movdqu 0x50($inp), @XMM[5] 1265 je .Lecb_enc_six 1266 movdqu 0x60($inp), @XMM[6] 1267 call _bsaes_encrypt8 1268 movdqu @XMM[0], 0x00($out) # write output 1269 movdqu @XMM[1], 0x10($out) 1270 movdqu @XMM[4], 0x20($out) 1271 movdqu @XMM[6], 0x30($out) 1272 movdqu @XMM[3], 0x40($out) 1273 movdqu @XMM[7], 0x50($out) 1274 movdqu @XMM[2], 0x60($out) 1275 jmp .Lecb_enc_done 1276.align 16 1277.Lecb_enc_six: 1278 call _bsaes_encrypt8 1279 movdqu @XMM[0], 0x00($out) # write output 1280 movdqu @XMM[1], 0x10($out) 1281 movdqu @XMM[4], 0x20($out) 1282 movdqu @XMM[6], 0x30($out) 1283 movdqu @XMM[3], 0x40($out) 1284 movdqu @XMM[7], 0x50($out) 1285 jmp .Lecb_enc_done 1286.align 16 1287.Lecb_enc_five: 1288 call _bsaes_encrypt8 1289 movdqu @XMM[0], 0x00($out) # write output 1290 movdqu @XMM[1], 0x10($out) 1291 movdqu @XMM[4], 0x20($out) 1292 movdqu @XMM[6], 0x30($out) 1293 movdqu @XMM[3], 0x40($out) 1294 jmp .Lecb_enc_done 1295.align 16 1296.Lecb_enc_four: 1297 call _bsaes_encrypt8 1298 movdqu @XMM[0], 0x00($out) # write output 1299 movdqu @XMM[1], 0x10($out) 1300 movdqu @XMM[4], 0x20($out) 1301 movdqu @XMM[6], 0x30($out) 1302 jmp .Lecb_enc_done 1303.align 16 1304.Lecb_enc_three: 1305 call _bsaes_encrypt8 1306 movdqu @XMM[0], 0x00($out) # write output 1307 movdqu @XMM[1], 0x10($out) 1308 movdqu @XMM[4], 0x20($out) 1309 jmp .Lecb_enc_done 1310.align 16 1311.Lecb_enc_two: 1312 call _bsaes_encrypt8 1313 movdqu @XMM[0], 0x00($out) # write output 1314 movdqu @XMM[1], 0x10($out) 1315 jmp .Lecb_enc_done 1316.align 16 1317.Lecb_enc_one: 1318 call _bsaes_encrypt8 1319 movdqu @XMM[0], 0x00($out) # write output 1320 jmp .Lecb_enc_done 1321.align 16 1322.Lecb_enc_short: 1323 lea ($inp), $arg1 1324 lea ($out), $arg2 1325 lea ($key), $arg3 1326 call asm_AES_encrypt 1327 lea 16($inp), $inp 1328 lea 16($out), $out 1329 dec $len 1330 jnz .Lecb_enc_short 1331 1332.Lecb_enc_done: 1333 lea (%rsp),%rax 1334 pxor %xmm0, %xmm0 1335.Lecb_enc_bzero: # wipe key schedule [if any] 1336 movdqa %xmm0, 0x00(%rax) 1337 movdqa %xmm0, 0x10(%rax) 1338 lea 0x20(%rax), %rax 1339 cmp %rax, %rbp 1340 jb .Lecb_enc_bzero 1341 1342 lea (%rbp),%rsp # restore %rsp 1343___ 1344$code.=<<___ if ($win64); 1345 movaps 0x40(%rbp), %xmm6 1346 movaps 0x50(%rbp), %xmm7 1347 movaps 0x60(%rbp), %xmm8 1348 movaps 0x70(%rbp), %xmm9 1349 movaps 0x80(%rbp), %xmm10 1350 movaps 0x90(%rbp), %xmm11 1351 movaps 0xa0(%rbp), %xmm12 1352 movaps 0xb0(%rbp), %xmm13 1353 movaps 0xc0(%rbp), %xmm14 1354 movaps 0xd0(%rbp), %xmm15 1355 lea 0xa0(%rbp), %rsp 1356___ 1357$code.=<<___; 1358 mov 0x48(%rsp), %r15 1359 mov 0x50(%rsp), %r14 1360 mov 0x58(%rsp), %r13 1361 mov 0x60(%rsp), %r12 1362 mov 0x68(%rsp), %rbx 1363 mov 0x70(%rsp), %rax 1364 lea 0x78(%rsp), %rsp 1365 mov %rax, %rbp 1366.Lecb_enc_epilogue: 1367 ret 1368.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1369 1370.globl bsaes_ecb_decrypt_blocks 1371.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1372.align 16 1373bsaes_ecb_decrypt_blocks: 1374 _CET_ENDBR 1375 mov %rsp, %rax 1376.Lecb_dec_prologue: 1377 push %rbp 1378 push %rbx 1379 push %r12 1380 push %r13 1381 push %r14 1382 push %r15 1383 lea -0x48(%rsp),%rsp 1384___ 1385$code.=<<___ if ($win64); 1386 lea -0xa0(%rsp), %rsp 1387 movaps %xmm6, 0x40(%rsp) 1388 movaps %xmm7, 0x50(%rsp) 1389 movaps %xmm8, 0x60(%rsp) 1390 movaps %xmm9, 0x70(%rsp) 1391 movaps %xmm10, 0x80(%rsp) 1392 movaps %xmm11, 0x90(%rsp) 1393 movaps %xmm12, 0xa0(%rsp) 1394 movaps %xmm13, 0xb0(%rsp) 1395 movaps %xmm14, 0xc0(%rsp) 1396 movaps %xmm15, 0xd0(%rsp) 1397.Lecb_dec_body: 1398___ 1399$code.=<<___; 1400 mov %rsp,%rbp # backup %rsp 1401 mov 240($arg4),%eax # rounds 1402 mov $arg1,$inp # backup arguments 1403 mov $arg2,$out 1404 mov $arg3,$len 1405 mov $arg4,$key 1406 cmp \$8,$arg3 1407 jb .Lecb_dec_short 1408 1409 mov %eax,%ebx # backup rounds 1410 shl \$7,%rax # 128 bytes per inner round key 1411 sub \$`128-32`,%rax # size of bit-sliced key schedule 1412 sub %rax,%rsp 1413 mov %rsp,%rax # pass key schedule 1414 mov $key,%rcx # pass key 1415 mov %ebx,%r10d # pass rounds 1416 call _bsaes_key_convert 1417 pxor (%rsp),%xmm7 # fix up 0 round key 1418 movdqa %xmm6,(%rax) # save last round key 1419 movdqa %xmm7,(%rsp) 1420 1421 sub \$8,$len 1422.Lecb_dec_loop: 1423 movdqu 0x00($inp), @XMM[0] # load input 1424 movdqu 0x10($inp), @XMM[1] 1425 movdqu 0x20($inp), @XMM[2] 1426 movdqu 0x30($inp), @XMM[3] 1427 movdqu 0x40($inp), @XMM[4] 1428 movdqu 0x50($inp), @XMM[5] 1429 mov %rsp, %rax # pass key schedule 1430 movdqu 0x60($inp), @XMM[6] 1431 mov %ebx,%r10d # pass rounds 1432 movdqu 0x70($inp), @XMM[7] 1433 lea 0x80($inp), $inp 1434 1435 call _bsaes_decrypt8 1436 1437 movdqu @XMM[0], 0x00($out) # write output 1438 movdqu @XMM[1], 0x10($out) 1439 movdqu @XMM[6], 0x20($out) 1440 movdqu @XMM[4], 0x30($out) 1441 movdqu @XMM[2], 0x40($out) 1442 movdqu @XMM[7], 0x50($out) 1443 movdqu @XMM[3], 0x60($out) 1444 movdqu @XMM[5], 0x70($out) 1445 lea 0x80($out), $out 1446 sub \$8,$len 1447 jnc .Lecb_dec_loop 1448 1449 add \$8,$len 1450 jz .Lecb_dec_done 1451 1452 movdqu 0x00($inp), @XMM[0] # load input 1453 mov %rsp, %rax # pass key schedule 1454 mov %ebx,%r10d # pass rounds 1455 cmp \$2,$len 1456 jb .Lecb_dec_one 1457 movdqu 0x10($inp), @XMM[1] 1458 je .Lecb_dec_two 1459 movdqu 0x20($inp), @XMM[2] 1460 cmp \$4,$len 1461 jb .Lecb_dec_three 1462 movdqu 0x30($inp), @XMM[3] 1463 je .Lecb_dec_four 1464 movdqu 0x40($inp), @XMM[4] 1465 cmp \$6,$len 1466 jb .Lecb_dec_five 1467 movdqu 0x50($inp), @XMM[5] 1468 je .Lecb_dec_six 1469 movdqu 0x60($inp), @XMM[6] 1470 call _bsaes_decrypt8 1471 movdqu @XMM[0], 0x00($out) # write output 1472 movdqu @XMM[1], 0x10($out) 1473 movdqu @XMM[6], 0x20($out) 1474 movdqu @XMM[4], 0x30($out) 1475 movdqu @XMM[2], 0x40($out) 1476 movdqu @XMM[7], 0x50($out) 1477 movdqu @XMM[3], 0x60($out) 1478 jmp .Lecb_dec_done 1479.align 16 1480.Lecb_dec_six: 1481 call _bsaes_decrypt8 1482 movdqu @XMM[0], 0x00($out) # write output 1483 movdqu @XMM[1], 0x10($out) 1484 movdqu @XMM[6], 0x20($out) 1485 movdqu @XMM[4], 0x30($out) 1486 movdqu @XMM[2], 0x40($out) 1487 movdqu @XMM[7], 0x50($out) 1488 jmp .Lecb_dec_done 1489.align 16 1490.Lecb_dec_five: 1491 call _bsaes_decrypt8 1492 movdqu @XMM[0], 0x00($out) # write output 1493 movdqu @XMM[1], 0x10($out) 1494 movdqu @XMM[6], 0x20($out) 1495 movdqu @XMM[4], 0x30($out) 1496 movdqu @XMM[2], 0x40($out) 1497 jmp .Lecb_dec_done 1498.align 16 1499.Lecb_dec_four: 1500 call _bsaes_decrypt8 1501 movdqu @XMM[0], 0x00($out) # write output 1502 movdqu @XMM[1], 0x10($out) 1503 movdqu @XMM[6], 0x20($out) 1504 movdqu @XMM[4], 0x30($out) 1505 jmp .Lecb_dec_done 1506.align 16 1507.Lecb_dec_three: 1508 call _bsaes_decrypt8 1509 movdqu @XMM[0], 0x00($out) # write output 1510 movdqu @XMM[1], 0x10($out) 1511 movdqu @XMM[6], 0x20($out) 1512 jmp .Lecb_dec_done 1513.align 16 1514.Lecb_dec_two: 1515 call _bsaes_decrypt8 1516 movdqu @XMM[0], 0x00($out) # write output 1517 movdqu @XMM[1], 0x10($out) 1518 jmp .Lecb_dec_done 1519.align 16 1520.Lecb_dec_one: 1521 call _bsaes_decrypt8 1522 movdqu @XMM[0], 0x00($out) # write output 1523 jmp .Lecb_dec_done 1524.align 16 1525.Lecb_dec_short: 1526 lea ($inp), $arg1 1527 lea ($out), $arg2 1528 lea ($key), $arg3 1529 call asm_AES_decrypt 1530 lea 16($inp), $inp 1531 lea 16($out), $out 1532 dec $len 1533 jnz .Lecb_dec_short 1534 1535.Lecb_dec_done: 1536 lea (%rsp),%rax 1537 pxor %xmm0, %xmm0 1538.Lecb_dec_bzero: # wipe key schedule [if any] 1539 movdqa %xmm0, 0x00(%rax) 1540 movdqa %xmm0, 0x10(%rax) 1541 lea 0x20(%rax), %rax 1542 cmp %rax, %rbp 1543 jb .Lecb_dec_bzero 1544 1545 lea (%rbp),%rsp # restore %rsp 1546___ 1547$code.=<<___ if ($win64); 1548 movaps 0x40(%rbp), %xmm6 1549 movaps 0x50(%rbp), %xmm7 1550 movaps 0x60(%rbp), %xmm8 1551 movaps 0x70(%rbp), %xmm9 1552 movaps 0x80(%rbp), %xmm10 1553 movaps 0x90(%rbp), %xmm11 1554 movaps 0xa0(%rbp), %xmm12 1555 movaps 0xb0(%rbp), %xmm13 1556 movaps 0xc0(%rbp), %xmm14 1557 movaps 0xd0(%rbp), %xmm15 1558 lea 0xa0(%rbp), %rsp 1559___ 1560$code.=<<___; 1561 mov 0x48(%rsp), %r15 1562 mov 0x50(%rsp), %r14 1563 mov 0x58(%rsp), %r13 1564 mov 0x60(%rsp), %r12 1565 mov 0x68(%rsp), %rbx 1566 mov 0x70(%rsp), %rax 1567 lea 0x78(%rsp), %rsp 1568 mov %rax, %rbp 1569.Lecb_dec_epilogue: 1570 ret 1571.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1572___ 1573} 1574$code.=<<___; 1575.extern asm_AES_cbc_encrypt 1576.globl bsaes_cbc_encrypt 1577.type bsaes_cbc_encrypt,\@abi-omnipotent 1578.align 16 1579bsaes_cbc_encrypt: 1580 _CET_ENDBR 1581___ 1582$code.=<<___ if ($win64); 1583 mov 48(%rsp),$arg6 # pull direction flag 1584___ 1585$code.=<<___; 1586 cmp \$0,$arg6 1587 jne asm_AES_cbc_encrypt 1588 cmp \$128,$arg3 1589 jb asm_AES_cbc_encrypt 1590 1591 mov %rsp, %rax 1592.Lcbc_dec_prologue: 1593 push %rbp 1594 push %rbx 1595 push %r12 1596 push %r13 1597 push %r14 1598 push %r15 1599 lea -0x48(%rsp), %rsp 1600___ 1601$code.=<<___ if ($win64); 1602 mov 0xa0(%rsp),$arg5 # pull ivp 1603 lea -0xa0(%rsp), %rsp 1604 movaps %xmm6, 0x40(%rsp) 1605 movaps %xmm7, 0x50(%rsp) 1606 movaps %xmm8, 0x60(%rsp) 1607 movaps %xmm9, 0x70(%rsp) 1608 movaps %xmm10, 0x80(%rsp) 1609 movaps %xmm11, 0x90(%rsp) 1610 movaps %xmm12, 0xa0(%rsp) 1611 movaps %xmm13, 0xb0(%rsp) 1612 movaps %xmm14, 0xc0(%rsp) 1613 movaps %xmm15, 0xd0(%rsp) 1614.Lcbc_dec_body: 1615___ 1616$code.=<<___; 1617 mov %rsp, %rbp # backup %rsp 1618 mov 240($arg4), %eax # rounds 1619 mov $arg1, $inp # backup arguments 1620 mov $arg2, $out 1621 mov $arg3, $len 1622 mov $arg4, $key 1623 mov $arg5, %rbx 1624 shr \$4, $len # bytes to blocks 1625 1626 mov %eax, %edx # rounds 1627 shl \$7, %rax # 128 bytes per inner round key 1628 sub \$`128-32`, %rax # size of bit-sliced key schedule 1629 sub %rax, %rsp 1630 1631 mov %rsp, %rax # pass key schedule 1632 mov $key, %rcx # pass key 1633 mov %edx, %r10d # pass rounds 1634 call _bsaes_key_convert 1635 pxor (%rsp),%xmm7 # fix up 0 round key 1636 movdqa %xmm6,(%rax) # save last round key 1637 movdqa %xmm7,(%rsp) 1638 1639 movdqu (%rbx), @XMM[15] # load IV 1640 sub \$8,$len 1641.Lcbc_dec_loop: 1642 movdqu 0x00($inp), @XMM[0] # load input 1643 movdqu 0x10($inp), @XMM[1] 1644 movdqu 0x20($inp), @XMM[2] 1645 movdqu 0x30($inp), @XMM[3] 1646 movdqu 0x40($inp), @XMM[4] 1647 movdqu 0x50($inp), @XMM[5] 1648 mov %rsp, %rax # pass key schedule 1649 movdqu 0x60($inp), @XMM[6] 1650 mov %edx,%r10d # pass rounds 1651 movdqu 0x70($inp), @XMM[7] 1652 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1653 1654 call _bsaes_decrypt8 1655 1656 pxor 0x20(%rbp), @XMM[0] # ^= IV 1657 movdqu 0x00($inp), @XMM[8] # re-load input 1658 movdqu 0x10($inp), @XMM[9] 1659 pxor @XMM[8], @XMM[1] 1660 movdqu 0x20($inp), @XMM[10] 1661 pxor @XMM[9], @XMM[6] 1662 movdqu 0x30($inp), @XMM[11] 1663 pxor @XMM[10], @XMM[4] 1664 movdqu 0x40($inp), @XMM[12] 1665 pxor @XMM[11], @XMM[2] 1666 movdqu 0x50($inp), @XMM[13] 1667 pxor @XMM[12], @XMM[7] 1668 movdqu 0x60($inp), @XMM[14] 1669 pxor @XMM[13], @XMM[3] 1670 movdqu 0x70($inp), @XMM[15] # IV 1671 pxor @XMM[14], @XMM[5] 1672 movdqu @XMM[0], 0x00($out) # write output 1673 lea 0x80($inp), $inp 1674 movdqu @XMM[1], 0x10($out) 1675 movdqu @XMM[6], 0x20($out) 1676 movdqu @XMM[4], 0x30($out) 1677 movdqu @XMM[2], 0x40($out) 1678 movdqu @XMM[7], 0x50($out) 1679 movdqu @XMM[3], 0x60($out) 1680 movdqu @XMM[5], 0x70($out) 1681 lea 0x80($out), $out 1682 sub \$8,$len 1683 jnc .Lcbc_dec_loop 1684 1685 add \$8,$len 1686 jz .Lcbc_dec_done 1687 1688 movdqu 0x00($inp), @XMM[0] # load input 1689 mov %rsp, %rax # pass key schedule 1690 mov %edx, %r10d # pass rounds 1691 cmp \$2,$len 1692 jb .Lcbc_dec_one 1693 movdqu 0x10($inp), @XMM[1] 1694 je .Lcbc_dec_two 1695 movdqu 0x20($inp), @XMM[2] 1696 cmp \$4,$len 1697 jb .Lcbc_dec_three 1698 movdqu 0x30($inp), @XMM[3] 1699 je .Lcbc_dec_four 1700 movdqu 0x40($inp), @XMM[4] 1701 cmp \$6,$len 1702 jb .Lcbc_dec_five 1703 movdqu 0x50($inp), @XMM[5] 1704 je .Lcbc_dec_six 1705 movdqu 0x60($inp), @XMM[6] 1706 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1707 call _bsaes_decrypt8 1708 pxor 0x20(%rbp), @XMM[0] # ^= IV 1709 movdqu 0x00($inp), @XMM[8] # re-load input 1710 movdqu 0x10($inp), @XMM[9] 1711 pxor @XMM[8], @XMM[1] 1712 movdqu 0x20($inp), @XMM[10] 1713 pxor @XMM[9], @XMM[6] 1714 movdqu 0x30($inp), @XMM[11] 1715 pxor @XMM[10], @XMM[4] 1716 movdqu 0x40($inp), @XMM[12] 1717 pxor @XMM[11], @XMM[2] 1718 movdqu 0x50($inp), @XMM[13] 1719 pxor @XMM[12], @XMM[7] 1720 movdqu 0x60($inp), @XMM[15] # IV 1721 pxor @XMM[13], @XMM[3] 1722 movdqu @XMM[0], 0x00($out) # write output 1723 movdqu @XMM[1], 0x10($out) 1724 movdqu @XMM[6], 0x20($out) 1725 movdqu @XMM[4], 0x30($out) 1726 movdqu @XMM[2], 0x40($out) 1727 movdqu @XMM[7], 0x50($out) 1728 movdqu @XMM[3], 0x60($out) 1729 jmp .Lcbc_dec_done 1730.align 16 1731.Lcbc_dec_six: 1732 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1733 call _bsaes_decrypt8 1734 pxor 0x20(%rbp), @XMM[0] # ^= IV 1735 movdqu 0x00($inp), @XMM[8] # re-load input 1736 movdqu 0x10($inp), @XMM[9] 1737 pxor @XMM[8], @XMM[1] 1738 movdqu 0x20($inp), @XMM[10] 1739 pxor @XMM[9], @XMM[6] 1740 movdqu 0x30($inp), @XMM[11] 1741 pxor @XMM[10], @XMM[4] 1742 movdqu 0x40($inp), @XMM[12] 1743 pxor @XMM[11], @XMM[2] 1744 movdqu 0x50($inp), @XMM[15] # IV 1745 pxor @XMM[12], @XMM[7] 1746 movdqu @XMM[0], 0x00($out) # write output 1747 movdqu @XMM[1], 0x10($out) 1748 movdqu @XMM[6], 0x20($out) 1749 movdqu @XMM[4], 0x30($out) 1750 movdqu @XMM[2], 0x40($out) 1751 movdqu @XMM[7], 0x50($out) 1752 jmp .Lcbc_dec_done 1753.align 16 1754.Lcbc_dec_five: 1755 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1756 call _bsaes_decrypt8 1757 pxor 0x20(%rbp), @XMM[0] # ^= IV 1758 movdqu 0x00($inp), @XMM[8] # re-load input 1759 movdqu 0x10($inp), @XMM[9] 1760 pxor @XMM[8], @XMM[1] 1761 movdqu 0x20($inp), @XMM[10] 1762 pxor @XMM[9], @XMM[6] 1763 movdqu 0x30($inp), @XMM[11] 1764 pxor @XMM[10], @XMM[4] 1765 movdqu 0x40($inp), @XMM[15] # IV 1766 pxor @XMM[11], @XMM[2] 1767 movdqu @XMM[0], 0x00($out) # write output 1768 movdqu @XMM[1], 0x10($out) 1769 movdqu @XMM[6], 0x20($out) 1770 movdqu @XMM[4], 0x30($out) 1771 movdqu @XMM[2], 0x40($out) 1772 jmp .Lcbc_dec_done 1773.align 16 1774.Lcbc_dec_four: 1775 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1776 call _bsaes_decrypt8 1777 pxor 0x20(%rbp), @XMM[0] # ^= IV 1778 movdqu 0x00($inp), @XMM[8] # re-load input 1779 movdqu 0x10($inp), @XMM[9] 1780 pxor @XMM[8], @XMM[1] 1781 movdqu 0x20($inp), @XMM[10] 1782 pxor @XMM[9], @XMM[6] 1783 movdqu 0x30($inp), @XMM[15] # IV 1784 pxor @XMM[10], @XMM[4] 1785 movdqu @XMM[0], 0x00($out) # write output 1786 movdqu @XMM[1], 0x10($out) 1787 movdqu @XMM[6], 0x20($out) 1788 movdqu @XMM[4], 0x30($out) 1789 jmp .Lcbc_dec_done 1790.align 16 1791.Lcbc_dec_three: 1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1793 call _bsaes_decrypt8 1794 pxor 0x20(%rbp), @XMM[0] # ^= IV 1795 movdqu 0x00($inp), @XMM[8] # re-load input 1796 movdqu 0x10($inp), @XMM[9] 1797 pxor @XMM[8], @XMM[1] 1798 movdqu 0x20($inp), @XMM[15] # IV 1799 pxor @XMM[9], @XMM[6] 1800 movdqu @XMM[0], 0x00($out) # write output 1801 movdqu @XMM[1], 0x10($out) 1802 movdqu @XMM[6], 0x20($out) 1803 jmp .Lcbc_dec_done 1804.align 16 1805.Lcbc_dec_two: 1806 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1807 call _bsaes_decrypt8 1808 pxor 0x20(%rbp), @XMM[0] # ^= IV 1809 movdqu 0x00($inp), @XMM[8] # re-load input 1810 movdqu 0x10($inp), @XMM[15] # IV 1811 pxor @XMM[8], @XMM[1] 1812 movdqu @XMM[0], 0x00($out) # write output 1813 movdqu @XMM[1], 0x10($out) 1814 jmp .Lcbc_dec_done 1815.align 16 1816.Lcbc_dec_one: 1817 lea ($inp), $arg1 1818 lea 0x20(%rbp), $arg2 # buffer output 1819 lea ($key), $arg3 1820 call asm_AES_decrypt # doesn't touch %xmm 1821 pxor 0x20(%rbp), @XMM[15] # ^= IV 1822 movdqu @XMM[15], ($out) # write output 1823 movdqa @XMM[0], @XMM[15] # IV 1824 1825.Lcbc_dec_done: 1826 movdqu @XMM[15], (%rbx) # return IV 1827 lea (%rsp), %rax 1828 pxor %xmm0, %xmm0 1829.Lcbc_dec_bzero: # wipe key schedule [if any] 1830 movdqa %xmm0, 0x00(%rax) 1831 movdqa %xmm0, 0x10(%rax) 1832 lea 0x20(%rax), %rax 1833 cmp %rax, %rbp 1834 ja .Lcbc_dec_bzero 1835 1836 lea (%rbp),%rsp # restore %rsp 1837___ 1838$code.=<<___ if ($win64); 1839 movaps 0x40(%rbp), %xmm6 1840 movaps 0x50(%rbp), %xmm7 1841 movaps 0x60(%rbp), %xmm8 1842 movaps 0x70(%rbp), %xmm9 1843 movaps 0x80(%rbp), %xmm10 1844 movaps 0x90(%rbp), %xmm11 1845 movaps 0xa0(%rbp), %xmm12 1846 movaps 0xb0(%rbp), %xmm13 1847 movaps 0xc0(%rbp), %xmm14 1848 movaps 0xd0(%rbp), %xmm15 1849 lea 0xa0(%rbp), %rsp 1850___ 1851$code.=<<___; 1852 mov 0x48(%rsp), %r15 1853 mov 0x50(%rsp), %r14 1854 mov 0x58(%rsp), %r13 1855 mov 0x60(%rsp), %r12 1856 mov 0x68(%rsp), %rbx 1857 mov 0x70(%rsp), %rax 1858 lea 0x78(%rsp), %rsp 1859 mov %rax, %rbp 1860.Lcbc_dec_epilogue: 1861 ret 1862.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1863 1864.globl bsaes_ctr32_encrypt_blocks 1865.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1866.align 16 1867bsaes_ctr32_encrypt_blocks: 1868 _CET_ENDBR 1869 mov %rsp, %rax 1870.Lctr_enc_prologue: 1871 push %rbp 1872 push %rbx 1873 push %r12 1874 push %r13 1875 push %r14 1876 push %r15 1877 lea -0x48(%rsp), %rsp 1878___ 1879$code.=<<___ if ($win64); 1880 mov 0xa0(%rsp),$arg5 # pull ivp 1881 lea -0xa0(%rsp), %rsp 1882 movaps %xmm6, 0x40(%rsp) 1883 movaps %xmm7, 0x50(%rsp) 1884 movaps %xmm8, 0x60(%rsp) 1885 movaps %xmm9, 0x70(%rsp) 1886 movaps %xmm10, 0x80(%rsp) 1887 movaps %xmm11, 0x90(%rsp) 1888 movaps %xmm12, 0xa0(%rsp) 1889 movaps %xmm13, 0xb0(%rsp) 1890 movaps %xmm14, 0xc0(%rsp) 1891 movaps %xmm15, 0xd0(%rsp) 1892.Lctr_enc_body: 1893___ 1894$code.=<<___; 1895 mov %rsp, %rbp # backup %rsp 1896 movdqu ($arg5), %xmm0 # load counter 1897 mov 240($arg4), %eax # rounds 1898 mov $arg1, $inp # backup arguments 1899 mov $arg2, $out 1900 mov $arg3, $len 1901 mov $arg4, $key 1902 movdqa %xmm0, 0x20(%rbp) # copy counter 1903 cmp \$8, $arg3 1904 jb .Lctr_enc_short 1905 1906 mov %eax, %ebx # rounds 1907 shl \$7, %rax # 128 bytes per inner round key 1908 sub \$`128-32`, %rax # size of bit-sliced key schedule 1909 sub %rax, %rsp 1910 1911 mov %rsp, %rax # pass key schedule 1912 mov $key, %rcx # pass key 1913 mov %ebx, %r10d # pass rounds 1914 call _bsaes_key_convert 1915 pxor %xmm6,%xmm7 # fix up last round key 1916 movdqa %xmm7,(%rax) # save last round key 1917 1918 movdqa (%rsp), @XMM[9] # load round0 key 1919 lea .LADD1(%rip), %r11 1920 movdqa 0x20(%rbp), @XMM[0] # counter copy 1921 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1922 pshufb @XMM[8], @XMM[9] # byte swap upper part 1923 pshufb @XMM[8], @XMM[0] 1924 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1925 jmp .Lctr_enc_loop 1926.align 16 1927.Lctr_enc_loop: 1928 movdqa @XMM[0], 0x20(%rbp) # save counter 1929 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1930 movdqa @XMM[0], @XMM[2] 1931 paddd 0x00(%r11), @XMM[1] # .LADD1 1932 movdqa @XMM[0], @XMM[3] 1933 paddd 0x10(%r11), @XMM[2] # .LADD2 1934 movdqa @XMM[0], @XMM[4] 1935 paddd 0x20(%r11), @XMM[3] # .LADD3 1936 movdqa @XMM[0], @XMM[5] 1937 paddd 0x30(%r11), @XMM[4] # .LADD4 1938 movdqa @XMM[0], @XMM[6] 1939 paddd 0x40(%r11), @XMM[5] # .LADD5 1940 movdqa @XMM[0], @XMM[7] 1941 paddd 0x50(%r11), @XMM[6] # .LADD6 1942 paddd 0x60(%r11), @XMM[7] # .LADD7 1943 1944 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 1945 # to flip byte order in 32-bit counter 1946 movdqa (%rsp), @XMM[9] # round 0 key 1947 lea 0x10(%rsp), %rax # pass key schedule 1948 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 1949 pxor @XMM[9], @XMM[0] # xor with round0 key 1950 pxor @XMM[9], @XMM[1] 1951 pshufb @XMM[8], @XMM[0] 1952 pxor @XMM[9], @XMM[2] 1953 pshufb @XMM[8], @XMM[1] 1954 pxor @XMM[9], @XMM[3] 1955 pshufb @XMM[8], @XMM[2] 1956 pxor @XMM[9], @XMM[4] 1957 pshufb @XMM[8], @XMM[3] 1958 pxor @XMM[9], @XMM[5] 1959 pshufb @XMM[8], @XMM[4] 1960 pxor @XMM[9], @XMM[6] 1961 pshufb @XMM[8], @XMM[5] 1962 pxor @XMM[9], @XMM[7] 1963 pshufb @XMM[8], @XMM[6] 1964 lea .LBS0(%rip), %r11 # constants table 1965 pshufb @XMM[8], @XMM[7] 1966 mov %ebx,%r10d # pass rounds 1967 1968 call _bsaes_encrypt8_bitslice 1969 1970 sub \$8,$len 1971 jc .Lctr_enc_loop_done 1972 1973 movdqu 0x00($inp), @XMM[8] # load input 1974 movdqu 0x10($inp), @XMM[9] 1975 movdqu 0x20($inp), @XMM[10] 1976 movdqu 0x30($inp), @XMM[11] 1977 movdqu 0x40($inp), @XMM[12] 1978 movdqu 0x50($inp), @XMM[13] 1979 movdqu 0x60($inp), @XMM[14] 1980 movdqu 0x70($inp), @XMM[15] 1981 lea 0x80($inp),$inp 1982 pxor @XMM[0], @XMM[8] 1983 movdqa 0x20(%rbp), @XMM[0] # load counter 1984 pxor @XMM[9], @XMM[1] 1985 movdqu @XMM[8], 0x00($out) # write output 1986 pxor @XMM[10], @XMM[4] 1987 movdqu @XMM[1], 0x10($out) 1988 pxor @XMM[11], @XMM[6] 1989 movdqu @XMM[4], 0x20($out) 1990 pxor @XMM[12], @XMM[3] 1991 movdqu @XMM[6], 0x30($out) 1992 pxor @XMM[13], @XMM[7] 1993 movdqu @XMM[3], 0x40($out) 1994 pxor @XMM[14], @XMM[2] 1995 movdqu @XMM[7], 0x50($out) 1996 pxor @XMM[15], @XMM[5] 1997 movdqu @XMM[2], 0x60($out) 1998 lea .LADD1(%rip), %r11 1999 movdqu @XMM[5], 0x70($out) 2000 lea 0x80($out), $out 2001 paddd 0x70(%r11), @XMM[0] # .LADD8 2002 jnz .Lctr_enc_loop 2003 2004 jmp .Lctr_enc_done 2005.align 16 2006.Lctr_enc_loop_done: 2007 add \$8, $len 2008 movdqu 0x00($inp), @XMM[8] # load input 2009 pxor @XMM[8], @XMM[0] 2010 movdqu @XMM[0], 0x00($out) # write output 2011 cmp \$2,$len 2012 jb .Lctr_enc_done 2013 movdqu 0x10($inp), @XMM[9] 2014 pxor @XMM[9], @XMM[1] 2015 movdqu @XMM[1], 0x10($out) 2016 je .Lctr_enc_done 2017 movdqu 0x20($inp), @XMM[10] 2018 pxor @XMM[10], @XMM[4] 2019 movdqu @XMM[4], 0x20($out) 2020 cmp \$4,$len 2021 jb .Lctr_enc_done 2022 movdqu 0x30($inp), @XMM[11] 2023 pxor @XMM[11], @XMM[6] 2024 movdqu @XMM[6], 0x30($out) 2025 je .Lctr_enc_done 2026 movdqu 0x40($inp), @XMM[12] 2027 pxor @XMM[12], @XMM[3] 2028 movdqu @XMM[3], 0x40($out) 2029 cmp \$6,$len 2030 jb .Lctr_enc_done 2031 movdqu 0x50($inp), @XMM[13] 2032 pxor @XMM[13], @XMM[7] 2033 movdqu @XMM[7], 0x50($out) 2034 je .Lctr_enc_done 2035 movdqu 0x60($inp), @XMM[14] 2036 pxor @XMM[14], @XMM[2] 2037 movdqu @XMM[2], 0x60($out) 2038 jmp .Lctr_enc_done 2039 2040.align 16 2041.Lctr_enc_short: 2042 lea 0x20(%rbp), $arg1 2043 lea 0x30(%rbp), $arg2 2044 lea ($key), $arg3 2045 call asm_AES_encrypt 2046 movdqu ($inp), @XMM[1] 2047 lea 16($inp), $inp 2048 mov 0x2c(%rbp), %eax # load 32-bit counter 2049 bswap %eax 2050 pxor 0x30(%rbp), @XMM[1] 2051 inc %eax # increment 2052 movdqu @XMM[1], ($out) 2053 bswap %eax 2054 lea 16($out), $out 2055 mov %eax, 0x2c(%rsp) # save 32-bit counter 2056 dec $len 2057 jnz .Lctr_enc_short 2058 2059.Lctr_enc_done: 2060 lea (%rsp), %rax 2061 pxor %xmm0, %xmm0 2062.Lctr_enc_bzero: # wipe key schedule [if any] 2063 movdqa %xmm0, 0x00(%rax) 2064 movdqa %xmm0, 0x10(%rax) 2065 lea 0x20(%rax), %rax 2066 cmp %rax, %rbp 2067 ja .Lctr_enc_bzero 2068 2069 lea (%rbp),%rsp # restore %rsp 2070___ 2071$code.=<<___ if ($win64); 2072 movaps 0x40(%rbp), %xmm6 2073 movaps 0x50(%rbp), %xmm7 2074 movaps 0x60(%rbp), %xmm8 2075 movaps 0x70(%rbp), %xmm9 2076 movaps 0x80(%rbp), %xmm10 2077 movaps 0x90(%rbp), %xmm11 2078 movaps 0xa0(%rbp), %xmm12 2079 movaps 0xb0(%rbp), %xmm13 2080 movaps 0xc0(%rbp), %xmm14 2081 movaps 0xd0(%rbp), %xmm15 2082 lea 0xa0(%rbp), %rsp 2083___ 2084$code.=<<___; 2085 mov 0x48(%rsp), %r15 2086 mov 0x50(%rsp), %r14 2087 mov 0x58(%rsp), %r13 2088 mov 0x60(%rsp), %r12 2089 mov 0x68(%rsp), %rbx 2090 mov 0x70(%rsp), %rax 2091 lea 0x78(%rsp), %rsp 2092 mov %rax, %rbp 2093.Lctr_enc_epilogue: 2094 ret 2095.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2096___ 2097###################################################################### 2098# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2099# const AES_KEY *key1, const AES_KEY *key2, 2100# const unsigned char iv[16]); 2101# 2102my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2103$arg6=~s/d$//; 2104 2105$code.=<<___; 2106.globl bsaes_xts_encrypt 2107.type bsaes_xts_encrypt,\@abi-omnipotent 2108.align 16 2109bsaes_xts_encrypt: 2110 _CET_ENDBR 2111 mov %rsp, %rax 2112.Lxts_enc_prologue: 2113 push %rbp 2114 push %rbx 2115 push %r12 2116 push %r13 2117 push %r14 2118 push %r15 2119 lea -0x48(%rsp), %rsp 2120___ 2121$code.=<<___ if ($win64); 2122 mov 0xa0(%rsp),$arg5 # pull key2 2123 mov 0xa8(%rsp),$arg6 # pull ivp 2124 lea -0xa0(%rsp), %rsp 2125 movaps %xmm6, 0x40(%rsp) 2126 movaps %xmm7, 0x50(%rsp) 2127 movaps %xmm8, 0x60(%rsp) 2128 movaps %xmm9, 0x70(%rsp) 2129 movaps %xmm10, 0x80(%rsp) 2130 movaps %xmm11, 0x90(%rsp) 2131 movaps %xmm12, 0xa0(%rsp) 2132 movaps %xmm13, 0xb0(%rsp) 2133 movaps %xmm14, 0xc0(%rsp) 2134 movaps %xmm15, 0xd0(%rsp) 2135.Lxts_enc_body: 2136___ 2137$code.=<<___; 2138 mov %rsp, %rbp # backup %rsp 2139 mov $arg1, $inp # backup arguments 2140 mov $arg2, $out 2141 mov $arg3, $len 2142 mov $arg4, $key 2143 2144 lea ($arg6), $arg1 2145 lea 0x20(%rbp), $arg2 2146 lea ($arg5), $arg3 2147 call asm_AES_encrypt # generate initial tweak 2148 2149 mov 240($key), %eax # rounds 2150 mov $len, %rbx # backup $len 2151 2152 mov %eax, %edx # rounds 2153 shl \$7, %rax # 128 bytes per inner round key 2154 sub \$`128-32`, %rax # size of bit-sliced key schedule 2155 sub %rax, %rsp 2156 2157 mov %rsp, %rax # pass key schedule 2158 mov $key, %rcx # pass key 2159 mov %edx, %r10d # pass rounds 2160 call _bsaes_key_convert 2161 pxor %xmm6, %xmm7 # fix up last round key 2162 movdqa %xmm7, (%rax) # save last round key 2163 2164 and \$-16, $len 2165 sub \$0x80, %rsp # place for tweak[8] 2166 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2167 2168 pxor $twtmp, $twtmp 2169 movdqa .Lxts_magic(%rip), $twmask 2170 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2171 2172 sub \$0x80, $len 2173 jc .Lxts_enc_short 2174 jmp .Lxts_enc_loop 2175 2176.align 16 2177.Lxts_enc_loop: 2178___ 2179 for ($i=0;$i<7;$i++) { 2180 $code.=<<___; 2181 pshufd \$0x13, $twtmp, $twres 2182 pxor $twtmp, $twtmp 2183 movdqa @XMM[7], @XMM[$i] 2184 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2185 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2186 pand $twmask, $twres # isolate carry and residue 2187 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2188 pxor $twres, @XMM[7] 2189___ 2190 $code.=<<___ if ($i>=1); 2191 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2192___ 2193 $code.=<<___ if ($i>=2); 2194 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2195___ 2196 } 2197$code.=<<___; 2198 movdqu 0x60($inp), @XMM[8+6] 2199 pxor @XMM[8+5], @XMM[5] 2200 movdqu 0x70($inp), @XMM[8+7] 2201 lea 0x80($inp), $inp 2202 movdqa @XMM[7], 0x70(%rsp) 2203 pxor @XMM[8+6], @XMM[6] 2204 lea 0x80(%rsp), %rax # pass key schedule 2205 pxor @XMM[8+7], @XMM[7] 2206 mov %edx, %r10d # pass rounds 2207 2208 call _bsaes_encrypt8 2209 2210 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2211 pxor 0x10(%rsp), @XMM[1] 2212 movdqu @XMM[0], 0x00($out) # write output 2213 pxor 0x20(%rsp), @XMM[4] 2214 movdqu @XMM[1], 0x10($out) 2215 pxor 0x30(%rsp), @XMM[6] 2216 movdqu @XMM[4], 0x20($out) 2217 pxor 0x40(%rsp), @XMM[3] 2218 movdqu @XMM[6], 0x30($out) 2219 pxor 0x50(%rsp), @XMM[7] 2220 movdqu @XMM[3], 0x40($out) 2221 pxor 0x60(%rsp), @XMM[2] 2222 movdqu @XMM[7], 0x50($out) 2223 pxor 0x70(%rsp), @XMM[5] 2224 movdqu @XMM[2], 0x60($out) 2225 movdqu @XMM[5], 0x70($out) 2226 lea 0x80($out), $out 2227 2228 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2229 pxor $twtmp, $twtmp 2230 movdqa .Lxts_magic(%rip), $twmask 2231 pcmpgtd @XMM[7], $twtmp 2232 pshufd \$0x13, $twtmp, $twres 2233 pxor $twtmp, $twtmp 2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2235 pand $twmask, $twres # isolate carry and residue 2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2237 pxor $twres, @XMM[7] 2238 2239 sub \$0x80,$len 2240 jnc .Lxts_enc_loop 2241 2242.Lxts_enc_short: 2243 add \$0x80, $len 2244 jz .Lxts_enc_done 2245___ 2246 for ($i=0;$i<7;$i++) { 2247 $code.=<<___; 2248 pshufd \$0x13, $twtmp, $twres 2249 pxor $twtmp, $twtmp 2250 movdqa @XMM[7], @XMM[$i] 2251 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2252 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2253 pand $twmask, $twres # isolate carry and residue 2254 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2255 pxor $twres, @XMM[7] 2256___ 2257 $code.=<<___ if ($i>=1); 2258 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2259 cmp \$`0x10*$i`,$len 2260 je .Lxts_enc_$i 2261___ 2262 $code.=<<___ if ($i>=2); 2263 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2264___ 2265 } 2266$code.=<<___; 2267 movdqu 0x60($inp), @XMM[8+6] 2268 pxor @XMM[8+5], @XMM[5] 2269 movdqa @XMM[7], 0x70(%rsp) 2270 lea 0x70($inp), $inp 2271 pxor @XMM[8+6], @XMM[6] 2272 lea 0x80(%rsp), %rax # pass key schedule 2273 mov %edx, %r10d # pass rounds 2274 2275 call _bsaes_encrypt8 2276 2277 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2278 pxor 0x10(%rsp), @XMM[1] 2279 movdqu @XMM[0], 0x00($out) # write output 2280 pxor 0x20(%rsp), @XMM[4] 2281 movdqu @XMM[1], 0x10($out) 2282 pxor 0x30(%rsp), @XMM[6] 2283 movdqu @XMM[4], 0x20($out) 2284 pxor 0x40(%rsp), @XMM[3] 2285 movdqu @XMM[6], 0x30($out) 2286 pxor 0x50(%rsp), @XMM[7] 2287 movdqu @XMM[3], 0x40($out) 2288 pxor 0x60(%rsp), @XMM[2] 2289 movdqu @XMM[7], 0x50($out) 2290 movdqu @XMM[2], 0x60($out) 2291 lea 0x70($out), $out 2292 2293 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2294 jmp .Lxts_enc_done 2295.align 16 2296.Lxts_enc_6: 2297 pxor @XMM[8+4], @XMM[4] 2298 lea 0x60($inp), $inp 2299 pxor @XMM[8+5], @XMM[5] 2300 lea 0x80(%rsp), %rax # pass key schedule 2301 mov %edx, %r10d # pass rounds 2302 2303 call _bsaes_encrypt8 2304 2305 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2306 pxor 0x10(%rsp), @XMM[1] 2307 movdqu @XMM[0], 0x00($out) # write output 2308 pxor 0x20(%rsp), @XMM[4] 2309 movdqu @XMM[1], 0x10($out) 2310 pxor 0x30(%rsp), @XMM[6] 2311 movdqu @XMM[4], 0x20($out) 2312 pxor 0x40(%rsp), @XMM[3] 2313 movdqu @XMM[6], 0x30($out) 2314 pxor 0x50(%rsp), @XMM[7] 2315 movdqu @XMM[3], 0x40($out) 2316 movdqu @XMM[7], 0x50($out) 2317 lea 0x60($out), $out 2318 2319 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2320 jmp .Lxts_enc_done 2321.align 16 2322.Lxts_enc_5: 2323 pxor @XMM[8+3], @XMM[3] 2324 lea 0x50($inp), $inp 2325 pxor @XMM[8+4], @XMM[4] 2326 lea 0x80(%rsp), %rax # pass key schedule 2327 mov %edx, %r10d # pass rounds 2328 2329 call _bsaes_encrypt8 2330 2331 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2332 pxor 0x10(%rsp), @XMM[1] 2333 movdqu @XMM[0], 0x00($out) # write output 2334 pxor 0x20(%rsp), @XMM[4] 2335 movdqu @XMM[1], 0x10($out) 2336 pxor 0x30(%rsp), @XMM[6] 2337 movdqu @XMM[4], 0x20($out) 2338 pxor 0x40(%rsp), @XMM[3] 2339 movdqu @XMM[6], 0x30($out) 2340 movdqu @XMM[3], 0x40($out) 2341 lea 0x50($out), $out 2342 2343 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2344 jmp .Lxts_enc_done 2345.align 16 2346.Lxts_enc_4: 2347 pxor @XMM[8+2], @XMM[2] 2348 lea 0x40($inp), $inp 2349 pxor @XMM[8+3], @XMM[3] 2350 lea 0x80(%rsp), %rax # pass key schedule 2351 mov %edx, %r10d # pass rounds 2352 2353 call _bsaes_encrypt8 2354 2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2356 pxor 0x10(%rsp), @XMM[1] 2357 movdqu @XMM[0], 0x00($out) # write output 2358 pxor 0x20(%rsp), @XMM[4] 2359 movdqu @XMM[1], 0x10($out) 2360 pxor 0x30(%rsp), @XMM[6] 2361 movdqu @XMM[4], 0x20($out) 2362 movdqu @XMM[6], 0x30($out) 2363 lea 0x40($out), $out 2364 2365 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2366 jmp .Lxts_enc_done 2367.align 16 2368.Lxts_enc_3: 2369 pxor @XMM[8+1], @XMM[1] 2370 lea 0x30($inp), $inp 2371 pxor @XMM[8+2], @XMM[2] 2372 lea 0x80(%rsp), %rax # pass key schedule 2373 mov %edx, %r10d # pass rounds 2374 2375 call _bsaes_encrypt8 2376 2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2378 pxor 0x10(%rsp), @XMM[1] 2379 movdqu @XMM[0], 0x00($out) # write output 2380 pxor 0x20(%rsp), @XMM[4] 2381 movdqu @XMM[1], 0x10($out) 2382 movdqu @XMM[4], 0x20($out) 2383 lea 0x30($out), $out 2384 2385 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2386 jmp .Lxts_enc_done 2387.align 16 2388.Lxts_enc_2: 2389 pxor @XMM[8+0], @XMM[0] 2390 lea 0x20($inp), $inp 2391 pxor @XMM[8+1], @XMM[1] 2392 lea 0x80(%rsp), %rax # pass key schedule 2393 mov %edx, %r10d # pass rounds 2394 2395 call _bsaes_encrypt8 2396 2397 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2398 pxor 0x10(%rsp), @XMM[1] 2399 movdqu @XMM[0], 0x00($out) # write output 2400 movdqu @XMM[1], 0x10($out) 2401 lea 0x20($out), $out 2402 2403 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2404 jmp .Lxts_enc_done 2405.align 16 2406.Lxts_enc_1: 2407 pxor @XMM[0], @XMM[8] 2408 lea 0x10($inp), $inp 2409 movdqa @XMM[8], 0x20(%rbp) 2410 lea 0x20(%rbp), $arg1 2411 lea 0x20(%rbp), $arg2 2412 lea ($key), $arg3 2413 call asm_AES_encrypt # doesn't touch %xmm 2414 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2415 #pxor @XMM[8], @XMM[0] 2416 #lea 0x80(%rsp), %rax # pass key schedule 2417 #mov %edx, %r10d # pass rounds 2418 #call _bsaes_encrypt8 2419 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2420 movdqu @XMM[0], 0x00($out) # write output 2421 lea 0x10($out), $out 2422 2423 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2424 2425.Lxts_enc_done: 2426 and \$15, %ebx 2427 jz .Lxts_enc_ret 2428 mov $out, %rdx 2429 2430.Lxts_enc_steal: 2431 movzb ($inp), %eax 2432 movzb -16(%rdx), %ecx 2433 lea 1($inp), $inp 2434 mov %al, -16(%rdx) 2435 mov %cl, 0(%rdx) 2436 lea 1(%rdx), %rdx 2437 sub \$1,%ebx 2438 jnz .Lxts_enc_steal 2439 2440 movdqu -16($out), @XMM[0] 2441 lea 0x20(%rbp), $arg1 2442 pxor @XMM[7], @XMM[0] 2443 lea 0x20(%rbp), $arg2 2444 movdqa @XMM[0], 0x20(%rbp) 2445 lea ($key), $arg3 2446 call asm_AES_encrypt # doesn't touch %xmm 2447 pxor 0x20(%rbp), @XMM[7] 2448 movdqu @XMM[7], -16($out) 2449 2450.Lxts_enc_ret: 2451 lea (%rsp), %rax 2452 pxor %xmm0, %xmm0 2453.Lxts_enc_bzero: # wipe key schedule [if any] 2454 movdqa %xmm0, 0x00(%rax) 2455 movdqa %xmm0, 0x10(%rax) 2456 lea 0x20(%rax), %rax 2457 cmp %rax, %rbp 2458 ja .Lxts_enc_bzero 2459 2460 lea (%rbp),%rsp # restore %rsp 2461___ 2462$code.=<<___ if ($win64); 2463 movaps 0x40(%rbp), %xmm6 2464 movaps 0x50(%rbp), %xmm7 2465 movaps 0x60(%rbp), %xmm8 2466 movaps 0x70(%rbp), %xmm9 2467 movaps 0x80(%rbp), %xmm10 2468 movaps 0x90(%rbp), %xmm11 2469 movaps 0xa0(%rbp), %xmm12 2470 movaps 0xb0(%rbp), %xmm13 2471 movaps 0xc0(%rbp), %xmm14 2472 movaps 0xd0(%rbp), %xmm15 2473 lea 0xa0(%rbp), %rsp 2474___ 2475$code.=<<___; 2476 mov 0x48(%rsp), %r15 2477 mov 0x50(%rsp), %r14 2478 mov 0x58(%rsp), %r13 2479 mov 0x60(%rsp), %r12 2480 mov 0x68(%rsp), %rbx 2481 mov 0x70(%rsp), %rax 2482 lea 0x78(%rsp), %rsp 2483 mov %rax, %rbp 2484.Lxts_enc_epilogue: 2485 ret 2486.size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2487 2488.globl bsaes_xts_decrypt 2489.type bsaes_xts_decrypt,\@abi-omnipotent 2490.align 16 2491bsaes_xts_decrypt: 2492 _CET_ENDBR 2493 mov %rsp, %rax 2494.Lxts_dec_prologue: 2495 push %rbp 2496 push %rbx 2497 push %r12 2498 push %r13 2499 push %r14 2500 push %r15 2501 lea -0x48(%rsp), %rsp 2502___ 2503$code.=<<___ if ($win64); 2504 mov 0xa0(%rsp),$arg5 # pull key2 2505 mov 0xa8(%rsp),$arg6 # pull ivp 2506 lea -0xa0(%rsp), %rsp 2507 movaps %xmm6, 0x40(%rsp) 2508 movaps %xmm7, 0x50(%rsp) 2509 movaps %xmm8, 0x60(%rsp) 2510 movaps %xmm9, 0x70(%rsp) 2511 movaps %xmm10, 0x80(%rsp) 2512 movaps %xmm11, 0x90(%rsp) 2513 movaps %xmm12, 0xa0(%rsp) 2514 movaps %xmm13, 0xb0(%rsp) 2515 movaps %xmm14, 0xc0(%rsp) 2516 movaps %xmm15, 0xd0(%rsp) 2517.Lxts_dec_body: 2518___ 2519$code.=<<___; 2520 mov %rsp, %rbp # backup %rsp 2521 mov $arg1, $inp # backup arguments 2522 mov $arg2, $out 2523 mov $arg3, $len 2524 mov $arg4, $key 2525 2526 lea ($arg6), $arg1 2527 lea 0x20(%rbp), $arg2 2528 lea ($arg5), $arg3 2529 call asm_AES_encrypt # generate initial tweak 2530 2531 mov 240($key), %eax # rounds 2532 mov $len, %rbx # backup $len 2533 2534 mov %eax, %edx # rounds 2535 shl \$7, %rax # 128 bytes per inner round key 2536 sub \$`128-32`, %rax # size of bit-sliced key schedule 2537 sub %rax, %rsp 2538 2539 mov %rsp, %rax # pass key schedule 2540 mov $key, %rcx # pass key 2541 mov %edx, %r10d # pass rounds 2542 call _bsaes_key_convert 2543 pxor (%rsp), %xmm7 # fix up round 0 key 2544 movdqa %xmm6, (%rax) # save last round key 2545 movdqa %xmm7, (%rsp) 2546 2547 xor %eax, %eax # if ($len%16) len-=16; 2548 and \$-16, $len 2549 test \$15, %ebx 2550 setnz %al 2551 shl \$4, %rax 2552 sub %rax, $len 2553 2554 sub \$0x80, %rsp # place for tweak[8] 2555 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2556 2557 pxor $twtmp, $twtmp 2558 movdqa .Lxts_magic(%rip), $twmask 2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2560 2561 sub \$0x80, $len 2562 jc .Lxts_dec_short 2563 jmp .Lxts_dec_loop 2564 2565.align 16 2566.Lxts_dec_loop: 2567___ 2568 for ($i=0;$i<7;$i++) { 2569 $code.=<<___; 2570 pshufd \$0x13, $twtmp, $twres 2571 pxor $twtmp, $twtmp 2572 movdqa @XMM[7], @XMM[$i] 2573 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2574 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2575 pand $twmask, $twres # isolate carry and residue 2576 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2577 pxor $twres, @XMM[7] 2578___ 2579 $code.=<<___ if ($i>=1); 2580 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2581___ 2582 $code.=<<___ if ($i>=2); 2583 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2584___ 2585 } 2586$code.=<<___; 2587 movdqu 0x60($inp), @XMM[8+6] 2588 pxor @XMM[8+5], @XMM[5] 2589 movdqu 0x70($inp), @XMM[8+7] 2590 lea 0x80($inp), $inp 2591 movdqa @XMM[7], 0x70(%rsp) 2592 pxor @XMM[8+6], @XMM[6] 2593 lea 0x80(%rsp), %rax # pass key schedule 2594 pxor @XMM[8+7], @XMM[7] 2595 mov %edx, %r10d # pass rounds 2596 2597 call _bsaes_decrypt8 2598 2599 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2600 pxor 0x10(%rsp), @XMM[1] 2601 movdqu @XMM[0], 0x00($out) # write output 2602 pxor 0x20(%rsp), @XMM[6] 2603 movdqu @XMM[1], 0x10($out) 2604 pxor 0x30(%rsp), @XMM[4] 2605 movdqu @XMM[6], 0x20($out) 2606 pxor 0x40(%rsp), @XMM[2] 2607 movdqu @XMM[4], 0x30($out) 2608 pxor 0x50(%rsp), @XMM[7] 2609 movdqu @XMM[2], 0x40($out) 2610 pxor 0x60(%rsp), @XMM[3] 2611 movdqu @XMM[7], 0x50($out) 2612 pxor 0x70(%rsp), @XMM[5] 2613 movdqu @XMM[3], 0x60($out) 2614 movdqu @XMM[5], 0x70($out) 2615 lea 0x80($out), $out 2616 2617 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2618 pxor $twtmp, $twtmp 2619 movdqa .Lxts_magic(%rip), $twmask 2620 pcmpgtd @XMM[7], $twtmp 2621 pshufd \$0x13, $twtmp, $twres 2622 pxor $twtmp, $twtmp 2623 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2624 pand $twmask, $twres # isolate carry and residue 2625 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2626 pxor $twres, @XMM[7] 2627 2628 sub \$0x80,$len 2629 jnc .Lxts_dec_loop 2630 2631.Lxts_dec_short: 2632 add \$0x80, $len 2633 jz .Lxts_dec_done 2634___ 2635 for ($i=0;$i<7;$i++) { 2636 $code.=<<___; 2637 pshufd \$0x13, $twtmp, $twres 2638 pxor $twtmp, $twtmp 2639 movdqa @XMM[7], @XMM[$i] 2640 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2641 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2642 pand $twmask, $twres # isolate carry and residue 2643 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2644 pxor $twres, @XMM[7] 2645___ 2646 $code.=<<___ if ($i>=1); 2647 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2648 cmp \$`0x10*$i`,$len 2649 je .Lxts_dec_$i 2650___ 2651 $code.=<<___ if ($i>=2); 2652 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2653___ 2654 } 2655$code.=<<___; 2656 movdqu 0x60($inp), @XMM[8+6] 2657 pxor @XMM[8+5], @XMM[5] 2658 movdqa @XMM[7], 0x70(%rsp) 2659 lea 0x70($inp), $inp 2660 pxor @XMM[8+6], @XMM[6] 2661 lea 0x80(%rsp), %rax # pass key schedule 2662 mov %edx, %r10d # pass rounds 2663 2664 call _bsaes_decrypt8 2665 2666 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2667 pxor 0x10(%rsp), @XMM[1] 2668 movdqu @XMM[0], 0x00($out) # write output 2669 pxor 0x20(%rsp), @XMM[6] 2670 movdqu @XMM[1], 0x10($out) 2671 pxor 0x30(%rsp), @XMM[4] 2672 movdqu @XMM[6], 0x20($out) 2673 pxor 0x40(%rsp), @XMM[2] 2674 movdqu @XMM[4], 0x30($out) 2675 pxor 0x50(%rsp), @XMM[7] 2676 movdqu @XMM[2], 0x40($out) 2677 pxor 0x60(%rsp), @XMM[3] 2678 movdqu @XMM[7], 0x50($out) 2679 movdqu @XMM[3], 0x60($out) 2680 lea 0x70($out), $out 2681 2682 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2683 jmp .Lxts_dec_done 2684.align 16 2685.Lxts_dec_6: 2686 pxor @XMM[8+4], @XMM[4] 2687 lea 0x60($inp), $inp 2688 pxor @XMM[8+5], @XMM[5] 2689 lea 0x80(%rsp), %rax # pass key schedule 2690 mov %edx, %r10d # pass rounds 2691 2692 call _bsaes_decrypt8 2693 2694 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2695 pxor 0x10(%rsp), @XMM[1] 2696 movdqu @XMM[0], 0x00($out) # write output 2697 pxor 0x20(%rsp), @XMM[6] 2698 movdqu @XMM[1], 0x10($out) 2699 pxor 0x30(%rsp), @XMM[4] 2700 movdqu @XMM[6], 0x20($out) 2701 pxor 0x40(%rsp), @XMM[2] 2702 movdqu @XMM[4], 0x30($out) 2703 pxor 0x50(%rsp), @XMM[7] 2704 movdqu @XMM[2], 0x40($out) 2705 movdqu @XMM[7], 0x50($out) 2706 lea 0x60($out), $out 2707 2708 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2709 jmp .Lxts_dec_done 2710.align 16 2711.Lxts_dec_5: 2712 pxor @XMM[8+3], @XMM[3] 2713 lea 0x50($inp), $inp 2714 pxor @XMM[8+4], @XMM[4] 2715 lea 0x80(%rsp), %rax # pass key schedule 2716 mov %edx, %r10d # pass rounds 2717 2718 call _bsaes_decrypt8 2719 2720 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2721 pxor 0x10(%rsp), @XMM[1] 2722 movdqu @XMM[0], 0x00($out) # write output 2723 pxor 0x20(%rsp), @XMM[6] 2724 movdqu @XMM[1], 0x10($out) 2725 pxor 0x30(%rsp), @XMM[4] 2726 movdqu @XMM[6], 0x20($out) 2727 pxor 0x40(%rsp), @XMM[2] 2728 movdqu @XMM[4], 0x30($out) 2729 movdqu @XMM[2], 0x40($out) 2730 lea 0x50($out), $out 2731 2732 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2733 jmp .Lxts_dec_done 2734.align 16 2735.Lxts_dec_4: 2736 pxor @XMM[8+2], @XMM[2] 2737 lea 0x40($inp), $inp 2738 pxor @XMM[8+3], @XMM[3] 2739 lea 0x80(%rsp), %rax # pass key schedule 2740 mov %edx, %r10d # pass rounds 2741 2742 call _bsaes_decrypt8 2743 2744 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2745 pxor 0x10(%rsp), @XMM[1] 2746 movdqu @XMM[0], 0x00($out) # write output 2747 pxor 0x20(%rsp), @XMM[6] 2748 movdqu @XMM[1], 0x10($out) 2749 pxor 0x30(%rsp), @XMM[4] 2750 movdqu @XMM[6], 0x20($out) 2751 movdqu @XMM[4], 0x30($out) 2752 lea 0x40($out), $out 2753 2754 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2755 jmp .Lxts_dec_done 2756.align 16 2757.Lxts_dec_3: 2758 pxor @XMM[8+1], @XMM[1] 2759 lea 0x30($inp), $inp 2760 pxor @XMM[8+2], @XMM[2] 2761 lea 0x80(%rsp), %rax # pass key schedule 2762 mov %edx, %r10d # pass rounds 2763 2764 call _bsaes_decrypt8 2765 2766 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2767 pxor 0x10(%rsp), @XMM[1] 2768 movdqu @XMM[0], 0x00($out) # write output 2769 pxor 0x20(%rsp), @XMM[6] 2770 movdqu @XMM[1], 0x10($out) 2771 movdqu @XMM[6], 0x20($out) 2772 lea 0x30($out), $out 2773 2774 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2775 jmp .Lxts_dec_done 2776.align 16 2777.Lxts_dec_2: 2778 pxor @XMM[8+0], @XMM[0] 2779 lea 0x20($inp), $inp 2780 pxor @XMM[8+1], @XMM[1] 2781 lea 0x80(%rsp), %rax # pass key schedule 2782 mov %edx, %r10d # pass rounds 2783 2784 call _bsaes_decrypt8 2785 2786 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2787 pxor 0x10(%rsp), @XMM[1] 2788 movdqu @XMM[0], 0x00($out) # write output 2789 movdqu @XMM[1], 0x10($out) 2790 lea 0x20($out), $out 2791 2792 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2793 jmp .Lxts_dec_done 2794.align 16 2795.Lxts_dec_1: 2796 pxor @XMM[0], @XMM[8] 2797 lea 0x10($inp), $inp 2798 movdqa @XMM[8], 0x20(%rbp) 2799 lea 0x20(%rbp), $arg1 2800 lea 0x20(%rbp), $arg2 2801 lea ($key), $arg3 2802 call asm_AES_decrypt # doesn't touch %xmm 2803 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2804 #pxor @XMM[8], @XMM[0] 2805 #lea 0x80(%rsp), %rax # pass key schedule 2806 #mov %edx, %r10d # pass rounds 2807 #call _bsaes_decrypt8 2808 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2809 movdqu @XMM[0], 0x00($out) # write output 2810 lea 0x10($out), $out 2811 2812 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2813 2814.Lxts_dec_done: 2815 and \$15, %ebx 2816 jz .Lxts_dec_ret 2817 2818 pxor $twtmp, $twtmp 2819 movdqa .Lxts_magic(%rip), $twmask 2820 pcmpgtd @XMM[7], $twtmp 2821 pshufd \$0x13, $twtmp, $twres 2822 movdqa @XMM[7], @XMM[6] 2823 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2824 pand $twmask, $twres # isolate carry and residue 2825 movdqu ($inp), @XMM[0] 2826 pxor $twres, @XMM[7] 2827 2828 lea 0x20(%rbp), $arg1 2829 pxor @XMM[7], @XMM[0] 2830 lea 0x20(%rbp), $arg2 2831 movdqa @XMM[0], 0x20(%rbp) 2832 lea ($key), $arg3 2833 call asm_AES_decrypt # doesn't touch %xmm 2834 pxor 0x20(%rbp), @XMM[7] 2835 mov $out, %rdx 2836 movdqu @XMM[7], ($out) 2837 2838.Lxts_dec_steal: 2839 movzb 16($inp), %eax 2840 movzb (%rdx), %ecx 2841 lea 1($inp), $inp 2842 mov %al, (%rdx) 2843 mov %cl, 16(%rdx) 2844 lea 1(%rdx), %rdx 2845 sub \$1,%ebx 2846 jnz .Lxts_dec_steal 2847 2848 movdqu ($out), @XMM[0] 2849 lea 0x20(%rbp), $arg1 2850 pxor @XMM[6], @XMM[0] 2851 lea 0x20(%rbp), $arg2 2852 movdqa @XMM[0], 0x20(%rbp) 2853 lea ($key), $arg3 2854 call asm_AES_decrypt # doesn't touch %xmm 2855 pxor 0x20(%rbp), @XMM[6] 2856 movdqu @XMM[6], ($out) 2857 2858.Lxts_dec_ret: 2859 lea (%rsp), %rax 2860 pxor %xmm0, %xmm0 2861.Lxts_dec_bzero: # wipe key schedule [if any] 2862 movdqa %xmm0, 0x00(%rax) 2863 movdqa %xmm0, 0x10(%rax) 2864 lea 0x20(%rax), %rax 2865 cmp %rax, %rbp 2866 ja .Lxts_dec_bzero 2867 2868 lea (%rbp),%rsp # restore %rsp 2869___ 2870$code.=<<___ if ($win64); 2871 movaps 0x40(%rbp), %xmm6 2872 movaps 0x50(%rbp), %xmm7 2873 movaps 0x60(%rbp), %xmm8 2874 movaps 0x70(%rbp), %xmm9 2875 movaps 0x80(%rbp), %xmm10 2876 movaps 0x90(%rbp), %xmm11 2877 movaps 0xa0(%rbp), %xmm12 2878 movaps 0xb0(%rbp), %xmm13 2879 movaps 0xc0(%rbp), %xmm14 2880 movaps 0xd0(%rbp), %xmm15 2881 lea 0xa0(%rbp), %rsp 2882___ 2883$code.=<<___; 2884 mov 0x48(%rsp), %r15 2885 mov 0x50(%rsp), %r14 2886 mov 0x58(%rsp), %r13 2887 mov 0x60(%rsp), %r12 2888 mov 0x68(%rsp), %rbx 2889 mov 0x70(%rsp), %rax 2890 lea 0x78(%rsp), %rsp 2891 mov %rax, %rbp 2892.Lxts_dec_epilogue: 2893 ret 2894.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2895___ 2896} 2897$code.=<<___; 2898.section .rodata 2899.type _bsaes_const,\@object 2900.align 64 2901_bsaes_const: 2902.LM0ISR: # InvShiftRows constants 2903 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2904.LISRM0: 2905 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2906.LISR: 2907 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 2908.LBS0: # bit-slice constants 2909 .quad 0x5555555555555555, 0x5555555555555555 2910.LBS1: 2911 .quad 0x3333333333333333, 0x3333333333333333 2912.LBS2: 2913 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2914.LSR: # shiftrows constants 2915 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2916.LSRM0: 2917 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2918.LM0SR: 2919 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2920.LSWPUP: # byte-swap upper dword 2921 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2922.LSWPUPM0SR: 2923 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2924.LADD1: # counter increment constants 2925 .quad 0x0000000000000000, 0x0000000100000000 2926.LADD2: 2927 .quad 0x0000000000000000, 0x0000000200000000 2928.LADD3: 2929 .quad 0x0000000000000000, 0x0000000300000000 2930.LADD4: 2931 .quad 0x0000000000000000, 0x0000000400000000 2932.LADD5: 2933 .quad 0x0000000000000000, 0x0000000500000000 2934.LADD6: 2935 .quad 0x0000000000000000, 0x0000000600000000 2936.LADD7: 2937 .quad 0x0000000000000000, 0x0000000700000000 2938.LADD8: 2939 .quad 0x0000000000000000, 0x0000000800000000 2940.Lxts_magic: 2941 .long 0x87,0,1,0 2942.Lmasks: 2943 .quad 0x0101010101010101, 0x0101010101010101 2944 .quad 0x0202020202020202, 0x0202020202020202 2945 .quad 0x0404040404040404, 0x0404040404040404 2946 .quad 0x0808080808080808, 0x0808080808080808 2947.LM0: 2948 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2949.L63: 2950 .quad 0x6363636363636363, 0x6363636363636363 2951.align 64 2952.size _bsaes_const,.-_bsaes_const 2953.text 2954___ 2955 2956# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2957# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2958if ($win64) { 2959$rec="%rcx"; 2960$frame="%rdx"; 2961$context="%r8"; 2962$disp="%r9"; 2963 2964$code.=<<___; 2965.extern __imp_RtlVirtualUnwind 2966.type se_handler,\@abi-omnipotent 2967.align 16 2968se_handler: 2969 _CET_ENDBR 2970 push %rsi 2971 push %rdi 2972 push %rbx 2973 push %rbp 2974 push %r12 2975 push %r13 2976 push %r14 2977 push %r15 2978 pushfq 2979 sub \$64,%rsp 2980 2981 mov 120($context),%rax # pull context->Rax 2982 mov 248($context),%rbx # pull context->Rip 2983 2984 mov 8($disp),%rsi # disp->ImageBase 2985 mov 56($disp),%r11 # disp->HandlerData 2986 2987 mov 0(%r11),%r10d # HandlerData[0] 2988 lea (%rsi,%r10),%r10 # prologue label 2989 cmp %r10,%rbx # context->Rip<prologue label 2990 jb .Lin_prologue 2991 2992 mov 152($context),%rax # pull context->Rsp 2993 2994 mov 4(%r11),%r10d # HandlerData[1] 2995 lea (%rsi,%r10),%r10 # epilogue label 2996 cmp %r10,%rbx # context->Rip>=epilogue label 2997 jae .Lin_prologue 2998 2999 mov 160($context),%rax # pull context->Rbp 3000 3001 lea 0x40(%rax),%rsi # %xmm save area 3002 lea 512($context),%rdi # &context.Xmm6 3003 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3004 .long 0xa548f3fc # cld; rep movsq 3005 lea 0xa0(%rax),%rax # adjust stack pointer 3006 3007 mov 0x70(%rax),%rbp 3008 mov 0x68(%rax),%rbx 3009 mov 0x60(%rax),%r12 3010 mov 0x58(%rax),%r13 3011 mov 0x50(%rax),%r14 3012 mov 0x48(%rax),%r15 3013 lea 0x78(%rax),%rax # adjust stack pointer 3014 mov %rbx,144($context) # restore context->Rbx 3015 mov %rbp,160($context) # restore context->Rbp 3016 mov %r12,216($context) # restore context->R12 3017 mov %r13,224($context) # restore context->R13 3018 mov %r14,232($context) # restore context->R14 3019 mov %r15,240($context) # restore context->R15 3020 3021.Lin_prologue: 3022 mov %rax,152($context) # restore context->Rsp 3023 3024 mov 40($disp),%rdi # disp->ContextRecord 3025 mov $context,%rsi # context 3026 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3027 .long 0xa548f3fc # cld; rep movsq 3028 3029 mov $disp,%rsi 3030 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3031 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3032 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3033 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3034 mov 40(%rsi),%r10 # disp->ContextRecord 3035 lea 56(%rsi),%r11 # &disp->HandlerData 3036 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3037 mov %r10,32(%rsp) # arg5 3038 mov %r11,40(%rsp) # arg6 3039 mov %r12,48(%rsp) # arg7 3040 mov %rcx,56(%rsp) # arg8, (NULL) 3041 call *__imp_RtlVirtualUnwind(%rip) 3042 3043 mov \$1,%eax # ExceptionContinueSearch 3044 add \$64,%rsp 3045 popfq 3046 pop %r15 3047 pop %r14 3048 pop %r13 3049 pop %r12 3050 pop %rbp 3051 pop %rbx 3052 pop %rdi 3053 pop %rsi 3054 ret 3055.size se_handler,.-se_handler 3056 3057.section .pdata 3058.align 4 3059___ 3060$code.=<<___ if ($ecb); 3061 .rva .Lecb_enc_prologue 3062 .rva .Lecb_enc_epilogue 3063 .rva .Lecb_enc_info 3064 3065 .rva .Lecb_dec_prologue 3066 .rva .Lecb_dec_epilogue 3067 .rva .Lecb_dec_info 3068___ 3069$code.=<<___; 3070 .rva .Lcbc_dec_prologue 3071 .rva .Lcbc_dec_epilogue 3072 .rva .Lcbc_dec_info 3073 3074 .rva .Lctr_enc_prologue 3075 .rva .Lctr_enc_epilogue 3076 .rva .Lctr_enc_info 3077 3078 .rva .Lxts_enc_prologue 3079 .rva .Lxts_enc_epilogue 3080 .rva .Lxts_enc_info 3081 3082 .rva .Lxts_dec_prologue 3083 .rva .Lxts_dec_epilogue 3084 .rva .Lxts_dec_info 3085 3086.section .xdata 3087.align 8 3088___ 3089$code.=<<___ if ($ecb); 3090.Lecb_enc_info: 3091 .byte 9,0,0,0 3092 .rva se_handler 3093 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3094.Lecb_dec_info: 3095 .byte 9,0,0,0 3096 .rva se_handler 3097 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3098___ 3099$code.=<<___; 3100.Lcbc_dec_info: 3101 .byte 9,0,0,0 3102 .rva se_handler 3103 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3104.Lctr_enc_info: 3105 .byte 9,0,0,0 3106 .rva se_handler 3107 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3108.Lxts_enc_info: 3109 .byte 9,0,0,0 3110 .rva se_handler 3111 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3112.Lxts_dec_info: 3113 .byte 9,0,0,0 3114 .rva se_handler 3115 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3116___ 3117} 3118 3119$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3120 3121print $code; 3122 3123close STDOUT; 3124