#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm"; the original code has
# since undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
#   allowed to feed its output back to aesenc[last], this was
#   achieved at cost of two additional inter-registers moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement key setup subroutine, instead it
#   relies on conversion of "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   to skip one shiftrows(), reduce bit-sliced key schedule and
#   speed-up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#               Emilia's        this(*)         difference
#
# Core 2        9.30            8.69            +7%
# Nehalem(**)   7.63            6.98            +9%
# Atom          17.1            17.4            -2%(***)
#
# (*)   Comparison is not completely fair, because "this" is ECB,
#       i.e. no extra processing such as counter values calculation
#       and xor-ing input as in Emilia's CTR implementation is
#       performed. However, the CTR calculations stand for not more
#       than 1% of total time, so comparison is *rather* fair.
#
# (**)  Results were collected on Westmere, which is considered to
#       be equivalent to Nehalem for this code.
#
# (***) Slowdown on Atom is rather strange per se, because original
#       implementation has a number of 9+-bytes instructions, which
#       are bad for Atom front-end, and which I eliminated completely.
#       In an attempt to address the deterioration sbox() was tested
#       in FP SIMD "domain" (movaps instead of movdqa, xorps instead
#       of pxor, etc.). While it resulted in nominal 4% improvement on
#       Atom, it hurt Westmere by more than a factor of 2.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This naturally
# has impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in 8x block
# function is:
#
#               conversion      conversion/8x block
# Core 2        240             0.22
# Nehalem       180             0.20
# Atom          430             0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc.
# Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2        9.83
# Nehalem       7.74
# Atom          19.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#                                               <appro@openssl.org>

# Command line: [flavour] [output].  If the first argument contains a
# dot it is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set for a nasm/masm/mingw64 flavour, or when the output
# file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in the directory this
# script was invoked from, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the
# translator, which is run with the current perl interpreter ($^X).
# The translator path is quoted so that a directory name containing
# spaces does not split the command line, and a failure to start the
# pipe is fatal instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used in the generated code.  Only four values are
# supplied for the five names, so $ivp starts out undefined here.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
114ec07fdf1Sdjm 115ec07fdf1Sdjm{ 116ec07fdf1Sdjmmy ($key,$rounds,$const)=("%rax","%r10d","%r11"); 117ec07fdf1Sdjm 118ec07fdf1Sdjmsub Sbox { 119ec07fdf1Sdjm# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 120ec07fdf1Sdjm# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 121ec07fdf1Sdjmmy @b=@_[0..7]; 122ec07fdf1Sdjmmy @t=@_[8..11]; 123ec07fdf1Sdjmmy @s=@_[12..15]; 124ec07fdf1Sdjm &InBasisChange (@b); 125ec07fdf1Sdjm &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 126ec07fdf1Sdjm &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 127ec07fdf1Sdjm} 128ec07fdf1Sdjm 129ec07fdf1Sdjmsub InBasisChange { 130ec07fdf1Sdjm# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 131ec07fdf1Sdjm# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 132ec07fdf1Sdjmmy @b=@_[0..7]; 133ec07fdf1Sdjm$code.=<<___; 134ec07fdf1Sdjm pxor @b[6], @b[5] 135ec07fdf1Sdjm pxor @b[1], @b[2] 136ec07fdf1Sdjm pxor @b[0], @b[3] 137ec07fdf1Sdjm pxor @b[2], @b[6] 138ec07fdf1Sdjm pxor @b[0], @b[5] 139ec07fdf1Sdjm 140ec07fdf1Sdjm pxor @b[3], @b[6] 141ec07fdf1Sdjm pxor @b[7], @b[3] 142ec07fdf1Sdjm pxor @b[5], @b[7] 143ec07fdf1Sdjm pxor @b[4], @b[3] 144ec07fdf1Sdjm pxor @b[5], @b[4] 145ec07fdf1Sdjm pxor @b[1], @b[3] 146ec07fdf1Sdjm 147ec07fdf1Sdjm pxor @b[7], @b[2] 148ec07fdf1Sdjm pxor @b[5], @b[1] 149ec07fdf1Sdjm___ 150ec07fdf1Sdjm} 151ec07fdf1Sdjm 152ec07fdf1Sdjmsub OutBasisChange { 153ec07fdf1Sdjm# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 154ec07fdf1Sdjm# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 155ec07fdf1Sdjmmy @b=@_[0..7]; 156ec07fdf1Sdjm$code.=<<___; 157ec07fdf1Sdjm pxor @b[6], @b[0] 158ec07fdf1Sdjm pxor @b[4], @b[1] 159ec07fdf1Sdjm pxor @b[0], @b[2] 160ec07fdf1Sdjm pxor @b[6], @b[4] 161ec07fdf1Sdjm pxor @b[1], @b[6] 162ec07fdf1Sdjm 163ec07fdf1Sdjm pxor @b[5], @b[1] 164ec07fdf1Sdjm pxor @b[3], @b[5] 165ec07fdf1Sdjm pxor @b[7], @b[3] 166ec07fdf1Sdjm pxor @b[5], @b[7] 167ec07fdf1Sdjm pxor @b[5], @b[2] 168ec07fdf1Sdjm 169ec07fdf1Sdjm pxor @b[7], @b[4] 170ec07fdf1Sdjm___ 
171ec07fdf1Sdjm} 172ec07fdf1Sdjm 173ec07fdf1Sdjmsub InvSbox { 174ec07fdf1Sdjm# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 175ec07fdf1Sdjm# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 176ec07fdf1Sdjmmy @b=@_[0..7]; 177ec07fdf1Sdjmmy @t=@_[8..11]; 178ec07fdf1Sdjmmy @s=@_[12..15]; 179ec07fdf1Sdjm &InvInBasisChange (@b); 180ec07fdf1Sdjm &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 181ec07fdf1Sdjm &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 182ec07fdf1Sdjm} 183ec07fdf1Sdjm 184ec07fdf1Sdjmsub InvInBasisChange { # OutBasisChange in reverse 185ec07fdf1Sdjmmy @b=@_[5,1,2,6,3,7,0,4]; 186ec07fdf1Sdjm$code.=<<___ 187ec07fdf1Sdjm pxor @b[7], @b[4] 188ec07fdf1Sdjm 189ec07fdf1Sdjm pxor @b[5], @b[7] 190ec07fdf1Sdjm pxor @b[5], @b[2] 191ec07fdf1Sdjm pxor @b[7], @b[3] 192ec07fdf1Sdjm pxor @b[3], @b[5] 193ec07fdf1Sdjm pxor @b[5], @b[1] 194ec07fdf1Sdjm 195ec07fdf1Sdjm pxor @b[1], @b[6] 196ec07fdf1Sdjm pxor @b[0], @b[2] 197ec07fdf1Sdjm pxor @b[6], @b[4] 198ec07fdf1Sdjm pxor @b[6], @b[0] 199ec07fdf1Sdjm pxor @b[4], @b[1] 200ec07fdf1Sdjm___ 201ec07fdf1Sdjm} 202ec07fdf1Sdjm 203ec07fdf1Sdjmsub InvOutBasisChange { # InBasisChange in reverse 204ec07fdf1Sdjmmy @b=@_[2,5,7,3,6,1,0,4]; 205ec07fdf1Sdjm$code.=<<___; 206ec07fdf1Sdjm pxor @b[5], @b[1] 207ec07fdf1Sdjm pxor @b[7], @b[2] 208ec07fdf1Sdjm 209ec07fdf1Sdjm pxor @b[1], @b[3] 210ec07fdf1Sdjm pxor @b[5], @b[4] 211ec07fdf1Sdjm pxor @b[5], @b[7] 212ec07fdf1Sdjm pxor @b[4], @b[3] 213ec07fdf1Sdjm pxor @b[0], @b[5] 214ec07fdf1Sdjm pxor @b[7], @b[3] 215ec07fdf1Sdjm pxor @b[2], @b[6] 216ec07fdf1Sdjm pxor @b[1], @b[2] 217ec07fdf1Sdjm pxor @b[3], @b[6] 218ec07fdf1Sdjm 219ec07fdf1Sdjm pxor @b[0], @b[3] 220ec07fdf1Sdjm pxor @b[6], @b[5] 221ec07fdf1Sdjm___ 222ec07fdf1Sdjm} 223ec07fdf1Sdjm 224ec07fdf1Sdjmsub Mul_GF4 { 225ec07fdf1Sdjm#;************************************************************* 226ec07fdf1Sdjm#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 
227ec07fdf1Sdjm#;************************************************************* 228ec07fdf1Sdjmmy ($x0,$x1,$y0,$y1,$t0)=@_; 229ec07fdf1Sdjm$code.=<<___; 230ec07fdf1Sdjm movdqa $y0, $t0 231ec07fdf1Sdjm pxor $y1, $t0 232ec07fdf1Sdjm pand $x0, $t0 233ec07fdf1Sdjm pxor $x1, $x0 234ec07fdf1Sdjm pand $y0, $x1 235ec07fdf1Sdjm pand $y1, $x0 236ec07fdf1Sdjm pxor $x1, $x0 237ec07fdf1Sdjm pxor $t0, $x1 238ec07fdf1Sdjm___ 239ec07fdf1Sdjm} 240ec07fdf1Sdjm 241ec07fdf1Sdjmsub Mul_GF4_N { # not used, see next subroutine 242ec07fdf1Sdjm# multiply and scale by N 243ec07fdf1Sdjmmy ($x0,$x1,$y0,$y1,$t0)=@_; 244ec07fdf1Sdjm$code.=<<___; 245ec07fdf1Sdjm movdqa $y0, $t0 246ec07fdf1Sdjm pxor $y1, $t0 247ec07fdf1Sdjm pand $x0, $t0 248ec07fdf1Sdjm pxor $x1, $x0 249ec07fdf1Sdjm pand $y0, $x1 250ec07fdf1Sdjm pand $y1, $x0 251ec07fdf1Sdjm pxor $x0, $x1 252ec07fdf1Sdjm pxor $t0, $x0 253ec07fdf1Sdjm___ 254ec07fdf1Sdjm} 255ec07fdf1Sdjm 256ec07fdf1Sdjmsub Mul_GF4_N_GF4 { 257ec07fdf1Sdjm# interleaved Mul_GF4_N and Mul_GF4 258ec07fdf1Sdjmmy ($x0,$x1,$y0,$y1,$t0, 259ec07fdf1Sdjm $x2,$x3,$y2,$y3,$t1)=@_; 260ec07fdf1Sdjm$code.=<<___; 261ec07fdf1Sdjm movdqa $y0, $t0 262ec07fdf1Sdjm movdqa $y2, $t1 263ec07fdf1Sdjm pxor $y1, $t0 264ec07fdf1Sdjm pxor $y3, $t1 265ec07fdf1Sdjm pand $x0, $t0 266ec07fdf1Sdjm pand $x2, $t1 267ec07fdf1Sdjm pxor $x1, $x0 268ec07fdf1Sdjm pxor $x3, $x2 269ec07fdf1Sdjm pand $y0, $x1 270ec07fdf1Sdjm pand $y2, $x3 271ec07fdf1Sdjm pand $y1, $x0 272ec07fdf1Sdjm pand $y3, $x2 273ec07fdf1Sdjm pxor $x0, $x1 274ec07fdf1Sdjm pxor $x3, $x2 275ec07fdf1Sdjm pxor $t0, $x0 276ec07fdf1Sdjm pxor $t1, $x3 277ec07fdf1Sdjm___ 278ec07fdf1Sdjm} 279ec07fdf1Sdjmsub Mul_GF16_2 { 280ec07fdf1Sdjmmy @x=@_[0..7]; 281ec07fdf1Sdjmmy @y=@_[8..11]; 282ec07fdf1Sdjmmy @t=@_[12..15]; 283ec07fdf1Sdjm$code.=<<___; 284ec07fdf1Sdjm movdqa @x[0], @t[0] 285ec07fdf1Sdjm movdqa @x[1], @t[1] 286ec07fdf1Sdjm___ 287ec07fdf1Sdjm &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 288ec07fdf1Sdjm$code.=<<___; 289ec07fdf1Sdjm pxor @x[2], 
@t[0] 290ec07fdf1Sdjm pxor @x[3], @t[1] 291ec07fdf1Sdjm pxor @y[2], @y[0] 292ec07fdf1Sdjm pxor @y[3], @y[1] 293ec07fdf1Sdjm___ 294ec07fdf1Sdjm Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 295ec07fdf1Sdjm @x[2], @x[3], @y[2], @y[3], @t[2]); 296ec07fdf1Sdjm$code.=<<___; 297ec07fdf1Sdjm pxor @t[0], @x[0] 298ec07fdf1Sdjm pxor @t[0], @x[2] 299ec07fdf1Sdjm pxor @t[1], @x[1] 300ec07fdf1Sdjm pxor @t[1], @x[3] 301ec07fdf1Sdjm 302ec07fdf1Sdjm movdqa @x[4], @t[0] 303ec07fdf1Sdjm movdqa @x[5], @t[1] 304ec07fdf1Sdjm pxor @x[6], @t[0] 305ec07fdf1Sdjm pxor @x[7], @t[1] 306ec07fdf1Sdjm___ 307ec07fdf1Sdjm &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 308ec07fdf1Sdjm @x[6], @x[7], @y[2], @y[3], @t[2]); 309ec07fdf1Sdjm$code.=<<___; 310ec07fdf1Sdjm pxor @y[2], @y[0] 311ec07fdf1Sdjm pxor @y[3], @y[1] 312ec07fdf1Sdjm___ 313ec07fdf1Sdjm &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 314ec07fdf1Sdjm$code.=<<___; 315ec07fdf1Sdjm pxor @t[0], @x[4] 316ec07fdf1Sdjm pxor @t[0], @x[6] 317ec07fdf1Sdjm pxor @t[1], @x[5] 318ec07fdf1Sdjm pxor @t[1], @x[7] 319ec07fdf1Sdjm___ 320ec07fdf1Sdjm} 321ec07fdf1Sdjmsub Inv_GF256 { 322ec07fdf1Sdjm#;******************************************************************** 323ec07fdf1Sdjm#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 324ec07fdf1Sdjm#;******************************************************************** 325ec07fdf1Sdjmmy @x=@_[0..7]; 326ec07fdf1Sdjmmy @t=@_[8..11]; 327ec07fdf1Sdjmmy @s=@_[12..15]; 328ec07fdf1Sdjm# direct optimizations from hardware 329ec07fdf1Sdjm$code.=<<___; 330ec07fdf1Sdjm movdqa @x[4], @t[3] 331ec07fdf1Sdjm movdqa @x[5], @t[2] 332ec07fdf1Sdjm movdqa @x[1], @t[1] 333ec07fdf1Sdjm movdqa @x[7], @s[1] 334ec07fdf1Sdjm movdqa @x[0], @s[0] 335ec07fdf1Sdjm 336ec07fdf1Sdjm pxor @x[6], @t[3] 337ec07fdf1Sdjm pxor @x[7], @t[2] 338ec07fdf1Sdjm pxor @x[3], @t[1] 339ec07fdf1Sdjm movdqa @t[3], @s[2] 340ec07fdf1Sdjm pxor @x[6], @s[1] 341ec07fdf1Sdjm movdqa @t[2], @t[0] 342ec07fdf1Sdjm pxor @x[2], @s[0] 343ec07fdf1Sdjm 
movdqa @t[3], @s[3] 344ec07fdf1Sdjm 345ec07fdf1Sdjm por @t[1], @t[2] 346ec07fdf1Sdjm por @s[0], @t[3] 347ec07fdf1Sdjm pxor @t[0], @s[3] 348ec07fdf1Sdjm pand @s[0], @s[2] 349ec07fdf1Sdjm pxor @t[1], @s[0] 350ec07fdf1Sdjm pand @t[1], @t[0] 351ec07fdf1Sdjm pand @s[0], @s[3] 352ec07fdf1Sdjm movdqa @x[3], @s[0] 353ec07fdf1Sdjm pxor @x[2], @s[0] 354ec07fdf1Sdjm pand @s[0], @s[1] 355ec07fdf1Sdjm pxor @s[1], @t[3] 356ec07fdf1Sdjm pxor @s[1], @t[2] 357ec07fdf1Sdjm movdqa @x[4], @s[1] 358ec07fdf1Sdjm movdqa @x[1], @s[0] 359ec07fdf1Sdjm pxor @x[5], @s[1] 360ec07fdf1Sdjm pxor @x[0], @s[0] 361ec07fdf1Sdjm movdqa @s[1], @t[1] 362ec07fdf1Sdjm pand @s[0], @s[1] 363ec07fdf1Sdjm por @s[0], @t[1] 364ec07fdf1Sdjm pxor @s[1], @t[0] 365ec07fdf1Sdjm pxor @s[3], @t[3] 366ec07fdf1Sdjm pxor @s[2], @t[2] 367ec07fdf1Sdjm pxor @s[3], @t[1] 368ec07fdf1Sdjm movdqa @x[7], @s[0] 369ec07fdf1Sdjm pxor @s[2], @t[0] 370ec07fdf1Sdjm movdqa @x[6], @s[1] 371ec07fdf1Sdjm pxor @s[2], @t[1] 372ec07fdf1Sdjm movdqa @x[5], @s[2] 373ec07fdf1Sdjm pand @x[3], @s[0] 374ec07fdf1Sdjm movdqa @x[4], @s[3] 375ec07fdf1Sdjm pand @x[2], @s[1] 376ec07fdf1Sdjm pand @x[1], @s[2] 377ec07fdf1Sdjm por @x[0], @s[3] 378ec07fdf1Sdjm pxor @s[0], @t[3] 379ec07fdf1Sdjm pxor @s[1], @t[2] 380ec07fdf1Sdjm pxor @s[2], @t[1] 381ec07fdf1Sdjm pxor @s[3], @t[0] 382ec07fdf1Sdjm 383ec07fdf1Sdjm #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 384ec07fdf1Sdjm 385ec07fdf1Sdjm # new smaller inversion 386ec07fdf1Sdjm 387ec07fdf1Sdjm movdqa @t[3], @s[0] 388ec07fdf1Sdjm pand @t[1], @t[3] 389ec07fdf1Sdjm pxor @t[2], @s[0] 390ec07fdf1Sdjm 391ec07fdf1Sdjm movdqa @t[0], @s[2] 392ec07fdf1Sdjm movdqa @s[0], @s[3] 393ec07fdf1Sdjm pxor @t[3], @s[2] 394ec07fdf1Sdjm pand @s[2], @s[3] 395ec07fdf1Sdjm 396ec07fdf1Sdjm movdqa @t[1], @s[1] 397ec07fdf1Sdjm pxor @t[2], @s[3] 398ec07fdf1Sdjm pxor @t[0], @s[1] 399ec07fdf1Sdjm 400ec07fdf1Sdjm pxor @t[2], @t[3] 401ec07fdf1Sdjm 402ec07fdf1Sdjm pand @t[3], @s[1] 403ec07fdf1Sdjm 404ec07fdf1Sdjm movdqa @s[2], @t[2] 
405ec07fdf1Sdjm pxor @t[0], @s[1] 406ec07fdf1Sdjm 407ec07fdf1Sdjm pxor @s[1], @t[2] 408ec07fdf1Sdjm pxor @s[1], @t[1] 409ec07fdf1Sdjm 410ec07fdf1Sdjm pand @t[0], @t[2] 411ec07fdf1Sdjm 412ec07fdf1Sdjm pxor @t[2], @s[2] 413ec07fdf1Sdjm pxor @t[2], @t[1] 414ec07fdf1Sdjm 415ec07fdf1Sdjm pand @s[3], @s[2] 416ec07fdf1Sdjm 417ec07fdf1Sdjm pxor @s[0], @s[2] 418ec07fdf1Sdjm___ 419ec07fdf1Sdjm# output in s3, s2, s1, t1 420ec07fdf1Sdjm 421ec07fdf1Sdjm# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 422ec07fdf1Sdjm 423ec07fdf1Sdjm# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 424ec07fdf1Sdjm &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 425ec07fdf1Sdjm 426ec07fdf1Sdjm### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 427ec07fdf1Sdjm} 428ec07fdf1Sdjm 429ec07fdf1Sdjm# AES linear components 430ec07fdf1Sdjm 431ec07fdf1Sdjmsub ShiftRows { 432ec07fdf1Sdjmmy @x=@_[0..7]; 433ec07fdf1Sdjmmy $mask=pop; 434ec07fdf1Sdjm$code.=<<___; 435ec07fdf1Sdjm pxor 0x00($key),@x[0] 436ec07fdf1Sdjm pxor 0x10($key),@x[1] 437ec07fdf1Sdjm pshufb $mask,@x[0] 438ec07fdf1Sdjm pxor 0x20($key),@x[2] 439ec07fdf1Sdjm pshufb $mask,@x[1] 440ec07fdf1Sdjm pxor 0x30($key),@x[3] 441ec07fdf1Sdjm pshufb $mask,@x[2] 442ec07fdf1Sdjm pxor 0x40($key),@x[4] 443ec07fdf1Sdjm pshufb $mask,@x[3] 444ec07fdf1Sdjm pxor 0x50($key),@x[5] 445ec07fdf1Sdjm pshufb $mask,@x[4] 446ec07fdf1Sdjm pxor 0x60($key),@x[6] 447ec07fdf1Sdjm pshufb $mask,@x[5] 448ec07fdf1Sdjm pxor 0x70($key),@x[7] 449ec07fdf1Sdjm pshufb $mask,@x[6] 450ec07fdf1Sdjm lea 0x80($key),$key 451ec07fdf1Sdjm pshufb $mask,@x[7] 452ec07fdf1Sdjm___ 453ec07fdf1Sdjm} 454ec07fdf1Sdjm 455ec07fdf1Sdjmsub MixColumns { 456ec07fdf1Sdjm# modified to emit output in order suitable for feeding back to aesenc[last] 457ec07fdf1Sdjmmy @x=@_[0..7]; 458ec07fdf1Sdjmmy @t=@_[8..15]; 4599eac5592Smiodmy $inv=@_[16]; # optional 460ec07fdf1Sdjm$code.=<<___; 461ec07fdf1Sdjm pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 
462ec07fdf1Sdjm pshufd \$0x93, @x[1], @t[1] 463ec07fdf1Sdjm pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 464ec07fdf1Sdjm pshufd \$0x93, @x[2], @t[2] 465ec07fdf1Sdjm pxor @t[1], @x[1] 466ec07fdf1Sdjm pshufd \$0x93, @x[3], @t[3] 467ec07fdf1Sdjm pxor @t[2], @x[2] 468ec07fdf1Sdjm pshufd \$0x93, @x[4], @t[4] 469ec07fdf1Sdjm pxor @t[3], @x[3] 470ec07fdf1Sdjm pshufd \$0x93, @x[5], @t[5] 471ec07fdf1Sdjm pxor @t[4], @x[4] 472ec07fdf1Sdjm pshufd \$0x93, @x[6], @t[6] 473ec07fdf1Sdjm pxor @t[5], @x[5] 474ec07fdf1Sdjm pshufd \$0x93, @x[7], @t[7] 475ec07fdf1Sdjm pxor @t[6], @x[6] 476ec07fdf1Sdjm pxor @t[7], @x[7] 477ec07fdf1Sdjm 478ec07fdf1Sdjm pxor @x[0], @t[1] 479ec07fdf1Sdjm pxor @x[7], @t[0] 480ec07fdf1Sdjm pxor @x[7], @t[1] 481ec07fdf1Sdjm pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 482ec07fdf1Sdjm pxor @x[1], @t[2] 483ec07fdf1Sdjm pshufd \$0x4E, @x[1], @x[1] 484ec07fdf1Sdjm pxor @x[4], @t[5] 485ec07fdf1Sdjm pxor @t[0], @x[0] 486ec07fdf1Sdjm pxor @x[5], @t[6] 487ec07fdf1Sdjm pxor @t[1], @x[1] 488ec07fdf1Sdjm pxor @x[3], @t[4] 489ec07fdf1Sdjm pshufd \$0x4E, @x[4], @t[0] 490ec07fdf1Sdjm pxor @x[6], @t[7] 491ec07fdf1Sdjm pshufd \$0x4E, @x[5], @t[1] 492ec07fdf1Sdjm pxor @x[2], @t[3] 493ec07fdf1Sdjm pshufd \$0x4E, @x[3], @x[4] 494ec07fdf1Sdjm pxor @x[7], @t[3] 495ec07fdf1Sdjm pshufd \$0x4E, @x[7], @x[5] 496ec07fdf1Sdjm pxor @x[7], @t[4] 497ec07fdf1Sdjm pshufd \$0x4E, @x[6], @x[3] 498ec07fdf1Sdjm pxor @t[4], @t[0] 499ec07fdf1Sdjm pshufd \$0x4E, @x[2], @x[6] 500ec07fdf1Sdjm pxor @t[5], @t[1] 5019eac5592Smiod___ 5029eac5592Smiod$code.=<<___ if (!$inv); 503ec07fdf1Sdjm pxor @t[3], @x[4] 504ec07fdf1Sdjm pxor @t[7], @x[5] 505ec07fdf1Sdjm pxor @t[6], @x[3] 506ec07fdf1Sdjm movdqa @t[0], @x[2] 507ec07fdf1Sdjm pxor @t[2], @x[6] 508ec07fdf1Sdjm movdqa @t[1], @x[7] 509ec07fdf1Sdjm___ 5109eac5592Smiod$code.=<<___ if ($inv); 5119eac5592Smiod pxor @x[4], @t[3] 5129eac5592Smiod pxor @t[7], @x[5] 5139eac5592Smiod pxor @x[3], @t[6] 5149eac5592Smiod movdqa @t[0], @x[3] 5159eac5592Smiod pxor 
@t[2], @x[6] 5169eac5592Smiod movdqa @t[6], @x[2] 5179eac5592Smiod movdqa @t[1], @x[7] 5189eac5592Smiod movdqa @x[6], @x[4] 5199eac5592Smiod movdqa @t[3], @x[6] 5209eac5592Smiod___ 521ec07fdf1Sdjm} 522ec07fdf1Sdjm 5239eac5592Smiodsub InvMixColumns_orig { 524ec07fdf1Sdjmmy @x=@_[0..7]; 525ec07fdf1Sdjmmy @t=@_[8..15]; 526ec07fdf1Sdjm 527ec07fdf1Sdjm$code.=<<___; 528ec07fdf1Sdjm # multiplication by 0x0e 529ec07fdf1Sdjm pshufd \$0x93, @x[7], @t[7] 530ec07fdf1Sdjm movdqa @x[2], @t[2] 531ec07fdf1Sdjm pxor @x[5], @x[7] # 7 5 532ec07fdf1Sdjm pxor @x[5], @x[2] # 2 5 533ec07fdf1Sdjm pshufd \$0x93, @x[0], @t[0] 534ec07fdf1Sdjm movdqa @x[5], @t[5] 535ec07fdf1Sdjm pxor @x[0], @x[5] # 5 0 [1] 536ec07fdf1Sdjm pxor @x[1], @x[0] # 0 1 537ec07fdf1Sdjm pshufd \$0x93, @x[1], @t[1] 538ec07fdf1Sdjm pxor @x[2], @x[1] # 1 25 539ec07fdf1Sdjm pxor @x[6], @x[0] # 01 6 [2] 540ec07fdf1Sdjm pxor @x[3], @x[1] # 125 3 [4] 541ec07fdf1Sdjm pshufd \$0x93, @x[3], @t[3] 542ec07fdf1Sdjm pxor @x[0], @x[2] # 25 016 [3] 543ec07fdf1Sdjm pxor @x[7], @x[3] # 3 75 544ec07fdf1Sdjm pxor @x[6], @x[7] # 75 6 [0] 545ec07fdf1Sdjm pshufd \$0x93, @x[6], @t[6] 546ec07fdf1Sdjm movdqa @x[4], @t[4] 547ec07fdf1Sdjm pxor @x[4], @x[6] # 6 4 548ec07fdf1Sdjm pxor @x[3], @x[4] # 4 375 [6] 549ec07fdf1Sdjm pxor @x[7], @x[3] # 375 756=36 550ec07fdf1Sdjm pxor @t[5], @x[6] # 64 5 [7] 551ec07fdf1Sdjm pxor @t[2], @x[3] # 36 2 552ec07fdf1Sdjm pxor @t[4], @x[3] # 362 4 [5] 553ec07fdf1Sdjm pshufd \$0x93, @t[5], @t[5] 554ec07fdf1Sdjm___ 555ec07fdf1Sdjm my @y = @x[7,5,0,2,1,3,4,6]; 556ec07fdf1Sdjm$code.=<<___; 557ec07fdf1Sdjm # multiplication by 0x0b 558ec07fdf1Sdjm pxor @y[0], @y[1] 559ec07fdf1Sdjm pxor @t[0], @y[0] 560ec07fdf1Sdjm pxor @t[1], @y[1] 561ec07fdf1Sdjm pshufd \$0x93, @t[2], @t[2] 562ec07fdf1Sdjm pxor @t[5], @y[0] 563ec07fdf1Sdjm pxor @t[6], @y[1] 564ec07fdf1Sdjm pxor @t[7], @y[0] 565ec07fdf1Sdjm pshufd \$0x93, @t[4], @t[4] 566ec07fdf1Sdjm pxor @t[6], @t[7] # clobber t[7] 567ec07fdf1Sdjm pxor @y[0], @y[1] 568ec07fdf1Sdjm 
569ec07fdf1Sdjm pxor @t[0], @y[3] 570ec07fdf1Sdjm pshufd \$0x93, @t[0], @t[0] 571ec07fdf1Sdjm pxor @t[1], @y[2] 572ec07fdf1Sdjm pxor @t[1], @y[4] 573ec07fdf1Sdjm pxor @t[2], @y[2] 574ec07fdf1Sdjm pshufd \$0x93, @t[1], @t[1] 575ec07fdf1Sdjm pxor @t[2], @y[3] 576ec07fdf1Sdjm pxor @t[2], @y[5] 577ec07fdf1Sdjm pxor @t[7], @y[2] 578ec07fdf1Sdjm pshufd \$0x93, @t[2], @t[2] 579ec07fdf1Sdjm pxor @t[3], @y[3] 580ec07fdf1Sdjm pxor @t[3], @y[6] 581ec07fdf1Sdjm pxor @t[3], @y[4] 582ec07fdf1Sdjm pshufd \$0x93, @t[3], @t[3] 583ec07fdf1Sdjm pxor @t[4], @y[7] 584ec07fdf1Sdjm pxor @t[4], @y[5] 585ec07fdf1Sdjm pxor @t[7], @y[7] 586ec07fdf1Sdjm pxor @t[5], @y[3] 587ec07fdf1Sdjm pxor @t[4], @y[4] 588ec07fdf1Sdjm pxor @t[5], @t[7] # clobber t[7] even more 589ec07fdf1Sdjm 590ec07fdf1Sdjm pxor @t[7], @y[5] 591ec07fdf1Sdjm pshufd \$0x93, @t[4], @t[4] 592ec07fdf1Sdjm pxor @t[7], @y[6] 593ec07fdf1Sdjm pxor @t[7], @y[4] 594ec07fdf1Sdjm 595ec07fdf1Sdjm pxor @t[5], @t[7] 596ec07fdf1Sdjm pshufd \$0x93, @t[5], @t[5] 597ec07fdf1Sdjm pxor @t[6], @t[7] # restore t[7] 598ec07fdf1Sdjm 599ec07fdf1Sdjm # multiplication by 0x0d 600ec07fdf1Sdjm pxor @y[7], @y[4] 601ec07fdf1Sdjm pxor @t[4], @y[7] 602ec07fdf1Sdjm pshufd \$0x93, @t[6], @t[6] 603ec07fdf1Sdjm pxor @t[0], @y[2] 604ec07fdf1Sdjm pxor @t[5], @y[7] 605ec07fdf1Sdjm pxor @t[2], @y[2] 606ec07fdf1Sdjm pshufd \$0x93, @t[7], @t[7] 607ec07fdf1Sdjm 608ec07fdf1Sdjm pxor @y[1], @y[3] 609ec07fdf1Sdjm pxor @t[1], @y[1] 610ec07fdf1Sdjm pxor @t[0], @y[0] 611ec07fdf1Sdjm pxor @t[0], @y[3] 612ec07fdf1Sdjm pxor @t[5], @y[1] 613ec07fdf1Sdjm pxor @t[5], @y[0] 614ec07fdf1Sdjm pxor @t[7], @y[1] 615ec07fdf1Sdjm pshufd \$0x93, @t[0], @t[0] 616ec07fdf1Sdjm pxor @t[6], @y[0] 617ec07fdf1Sdjm pxor @y[1], @y[3] 618ec07fdf1Sdjm pxor @t[1], @y[4] 619ec07fdf1Sdjm pshufd \$0x93, @t[1], @t[1] 620ec07fdf1Sdjm 621ec07fdf1Sdjm pxor @t[7], @y[7] 622ec07fdf1Sdjm pxor @t[2], @y[4] 623ec07fdf1Sdjm pxor @t[2], @y[5] 624ec07fdf1Sdjm pshufd \$0x93, @t[2], @t[2] 625ec07fdf1Sdjm pxor @t[6], 
@y[2] 626ec07fdf1Sdjm pxor @t[3], @t[6] # clobber t[6] 627ec07fdf1Sdjm pxor @y[7], @y[4] 628ec07fdf1Sdjm pxor @t[6], @y[3] 629ec07fdf1Sdjm 630ec07fdf1Sdjm pxor @t[6], @y[6] 631ec07fdf1Sdjm pxor @t[5], @y[5] 632ec07fdf1Sdjm pxor @t[4], @y[6] 633ec07fdf1Sdjm pshufd \$0x93, @t[4], @t[4] 634ec07fdf1Sdjm pxor @t[6], @y[5] 635ec07fdf1Sdjm pxor @t[7], @y[6] 636ec07fdf1Sdjm pxor @t[3], @t[6] # restore t[6] 637ec07fdf1Sdjm 638ec07fdf1Sdjm pshufd \$0x93, @t[5], @t[5] 639ec07fdf1Sdjm pshufd \$0x93, @t[6], @t[6] 640ec07fdf1Sdjm pshufd \$0x93, @t[7], @t[7] 641ec07fdf1Sdjm pshufd \$0x93, @t[3], @t[3] 642ec07fdf1Sdjm 643ec07fdf1Sdjm # multiplication by 0x09 644ec07fdf1Sdjm pxor @y[1], @y[4] 645ec07fdf1Sdjm pxor @y[1], @t[1] # t[1]=y[1] 646ec07fdf1Sdjm pxor @t[5], @t[0] # clobber t[0] 647ec07fdf1Sdjm pxor @t[5], @t[1] 648ec07fdf1Sdjm pxor @t[0], @y[3] 649ec07fdf1Sdjm pxor @y[0], @t[0] # t[0]=y[0] 650ec07fdf1Sdjm pxor @t[6], @t[1] 651ec07fdf1Sdjm pxor @t[7], @t[6] # clobber t[6] 652ec07fdf1Sdjm pxor @t[1], @y[4] 653ec07fdf1Sdjm pxor @t[4], @y[7] 654ec07fdf1Sdjm pxor @y[4], @t[4] # t[4]=y[4] 655ec07fdf1Sdjm pxor @t[3], @y[6] 656ec07fdf1Sdjm pxor @y[3], @t[3] # t[3]=y[3] 657ec07fdf1Sdjm pxor @t[2], @y[5] 658ec07fdf1Sdjm pxor @y[2], @t[2] # t[2]=y[2] 659ec07fdf1Sdjm pxor @t[7], @t[3] 660ec07fdf1Sdjm pxor @y[5], @t[5] # t[5]=y[5] 661ec07fdf1Sdjm pxor @t[6], @t[2] 662ec07fdf1Sdjm pxor @t[6], @t[5] 663ec07fdf1Sdjm pxor @y[6], @t[6] # t[6]=y[6] 664ec07fdf1Sdjm pxor @y[7], @t[7] # t[7]=y[7] 665ec07fdf1Sdjm 666ec07fdf1Sdjm movdqa @t[0],@XMM[0] 667ec07fdf1Sdjm movdqa @t[1],@XMM[1] 668ec07fdf1Sdjm movdqa @t[2],@XMM[2] 669ec07fdf1Sdjm movdqa @t[3],@XMM[3] 670ec07fdf1Sdjm movdqa @t[4],@XMM[4] 671ec07fdf1Sdjm movdqa @t[5],@XMM[5] 672ec07fdf1Sdjm movdqa @t[6],@XMM[6] 673ec07fdf1Sdjm movdqa @t[7],@XMM[7] 674ec07fdf1Sdjm___ 675ec07fdf1Sdjm} 676ec07fdf1Sdjm 6779eac5592Smiodsub InvMixColumns { 6789eac5592Smiodmy @x=@_[0..7]; 6799eac5592Smiodmy @t=@_[8..15]; 6809eac5592Smiod 6819eac5592Smiod# Thanks 
to Jussi Kivilinna for providing pointer to 6829eac5592Smiod# 6839eac5592Smiod# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 6849eac5592Smiod# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 6859eac5592Smiod# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 6869eac5592Smiod# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 6879eac5592Smiod 6889eac5592Smiod$code.=<<___; 6899eac5592Smiod # multiplication by 0x05-0x00-0x04-0x00 6909eac5592Smiod pshufd \$0x4E, @x[0], @t[0] 6919eac5592Smiod pshufd \$0x4E, @x[6], @t[6] 6929eac5592Smiod pxor @x[0], @t[0] 6939eac5592Smiod pshufd \$0x4E, @x[7], @t[7] 6949eac5592Smiod pxor @x[6], @t[6] 6959eac5592Smiod pshufd \$0x4E, @x[1], @t[1] 6969eac5592Smiod pxor @x[7], @t[7] 6979eac5592Smiod pshufd \$0x4E, @x[2], @t[2] 6989eac5592Smiod pxor @x[1], @t[1] 6999eac5592Smiod pshufd \$0x4E, @x[3], @t[3] 7009eac5592Smiod pxor @x[2], @t[2] 7019eac5592Smiod pxor @t[6], @x[0] 7029eac5592Smiod pxor @t[6], @x[1] 7039eac5592Smiod pshufd \$0x4E, @x[4], @t[4] 7049eac5592Smiod pxor @x[3], @t[3] 7059eac5592Smiod pxor @t[0], @x[2] 7069eac5592Smiod pxor @t[1], @x[3] 7079eac5592Smiod pshufd \$0x4E, @x[5], @t[5] 7089eac5592Smiod pxor @x[4], @t[4] 7099eac5592Smiod pxor @t[7], @x[1] 7109eac5592Smiod pxor @t[2], @x[4] 7119eac5592Smiod pxor @x[5], @t[5] 7129eac5592Smiod 7139eac5592Smiod pxor @t[7], @x[2] 7149eac5592Smiod pxor @t[6], @x[3] 7159eac5592Smiod pxor @t[6], @x[4] 7169eac5592Smiod pxor @t[3], @x[5] 7179eac5592Smiod pxor @t[4], @x[6] 7189eac5592Smiod pxor @t[7], @x[4] 7199eac5592Smiod pxor @t[7], @x[5] 7209eac5592Smiod pxor @t[5], @x[7] 7219eac5592Smiod___ 7229eac5592Smiod &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 7239eac5592Smiod} 7249eac5592Smiod 725ec07fdf1Sdjmsub aesenc { # not used 726ec07fdf1Sdjmmy @b=@_[0..7]; 727ec07fdf1Sdjmmy @t=@_[8..15]; 728ec07fdf1Sdjm$code.=<<___; 729ec07fdf1Sdjm movdqa 0x30($const),@t[0] # .LSR 730ec07fdf1Sdjm___ 731ec07fdf1Sdjm &ShiftRows (@b,@t[0]); 732ec07fdf1Sdjm &Sbox (@b,@t); 733ec07fdf1Sdjm &MixColumns 
(@b[0,1,4,6,3,7,2,5],@t); 734ec07fdf1Sdjm} 735ec07fdf1Sdjm 736ec07fdf1Sdjmsub aesenclast { # not used 737ec07fdf1Sdjmmy @b=@_[0..7]; 738ec07fdf1Sdjmmy @t=@_[8..15]; 739ec07fdf1Sdjm$code.=<<___; 740ec07fdf1Sdjm movdqa 0x40($const),@t[0] # .LSRM0 741ec07fdf1Sdjm___ 742ec07fdf1Sdjm &ShiftRows (@b,@t[0]); 743ec07fdf1Sdjm &Sbox (@b,@t); 744ec07fdf1Sdjm$code.=<<___ 745ec07fdf1Sdjm pxor 0x00($key),@b[0] 746ec07fdf1Sdjm pxor 0x10($key),@b[1] 747ec07fdf1Sdjm pxor 0x20($key),@b[4] 748ec07fdf1Sdjm pxor 0x30($key),@b[6] 749ec07fdf1Sdjm pxor 0x40($key),@b[3] 750ec07fdf1Sdjm pxor 0x50($key),@b[7] 751ec07fdf1Sdjm pxor 0x60($key),@b[2] 752ec07fdf1Sdjm pxor 0x70($key),@b[5] 753ec07fdf1Sdjm___ 754ec07fdf1Sdjm} 755ec07fdf1Sdjm 756ec07fdf1Sdjmsub swapmove { 757ec07fdf1Sdjmmy ($a,$b,$n,$mask,$t)=@_; 758ec07fdf1Sdjm$code.=<<___; 759ec07fdf1Sdjm movdqa $b,$t 760ec07fdf1Sdjm psrlq \$$n,$b 761ec07fdf1Sdjm pxor $a,$b 762ec07fdf1Sdjm pand $mask,$b 763ec07fdf1Sdjm pxor $b,$a 764ec07fdf1Sdjm psllq \$$n,$b 765ec07fdf1Sdjm pxor $t,$b 766ec07fdf1Sdjm___ 767ec07fdf1Sdjm} 768ec07fdf1Sdjmsub swapmove2x { 769ec07fdf1Sdjmmy ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 770ec07fdf1Sdjm$code.=<<___; 771ec07fdf1Sdjm movdqa $b0,$t0 772ec07fdf1Sdjm psrlq \$$n,$b0 773ec07fdf1Sdjm movdqa $b1,$t1 774ec07fdf1Sdjm psrlq \$$n,$b1 775ec07fdf1Sdjm pxor $a0,$b0 776ec07fdf1Sdjm pxor $a1,$b1 777ec07fdf1Sdjm pand $mask,$b0 778ec07fdf1Sdjm pand $mask,$b1 779ec07fdf1Sdjm pxor $b0,$a0 780ec07fdf1Sdjm psllq \$$n,$b0 781ec07fdf1Sdjm pxor $b1,$a1 782ec07fdf1Sdjm psllq \$$n,$b1 783ec07fdf1Sdjm pxor $t0,$b0 784ec07fdf1Sdjm pxor $t1,$b1 785ec07fdf1Sdjm___ 786ec07fdf1Sdjm} 787ec07fdf1Sdjm 788ec07fdf1Sdjmsub bitslice { 789ec07fdf1Sdjmmy @x=reverse(@_[0..7]); 790ec07fdf1Sdjmmy ($t0,$t1,$t2,$t3)=@_[8..11]; 791ec07fdf1Sdjm$code.=<<___; 792ec07fdf1Sdjm movdqa 0x00($const),$t0 # .LBS0 793ec07fdf1Sdjm movdqa 0x10($const),$t1 # .LBS1 794ec07fdf1Sdjm___ 795ec07fdf1Sdjm &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 796ec07fdf1Sdjm 
&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 797ec07fdf1Sdjm$code.=<<___; 798ec07fdf1Sdjm movdqa 0x20($const),$t0 # .LBS2 799ec07fdf1Sdjm___ 800ec07fdf1Sdjm &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 801ec07fdf1Sdjm &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 802ec07fdf1Sdjm 803ec07fdf1Sdjm &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 804ec07fdf1Sdjm &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 805ec07fdf1Sdjm} 806ec07fdf1Sdjm 807ec07fdf1Sdjm$code.=<<___; 808ec07fdf1Sdjm.text 809ec07fdf1Sdjm 810ec07fdf1Sdjm.extern asm_AES_encrypt 811ec07fdf1Sdjm.extern asm_AES_decrypt 812ec07fdf1Sdjm 813ec07fdf1Sdjm.type _bsaes_encrypt8,\@abi-omnipotent 814ec07fdf1Sdjm.align 64 815ec07fdf1Sdjm_bsaes_encrypt8: 816*22787c51Stb _CET_ENDBR 817ec07fdf1Sdjm lea .LBS0(%rip), $const # constants table 818ec07fdf1Sdjm 819ec07fdf1Sdjm movdqa ($key), @XMM[9] # round 0 key 820ec07fdf1Sdjm lea 0x10($key), $key 821ec07fdf1Sdjm movdqa 0x50($const), @XMM[8] # .LM0SR 822ec07fdf1Sdjm pxor @XMM[9], @XMM[0] # xor with round0 key 823ec07fdf1Sdjm pxor @XMM[9], @XMM[1] 824ec07fdf1Sdjm pshufb @XMM[8], @XMM[0] 825ec07fdf1Sdjm pxor @XMM[9], @XMM[2] 826ec07fdf1Sdjm pshufb @XMM[8], @XMM[1] 827ec07fdf1Sdjm pxor @XMM[9], @XMM[3] 828ec07fdf1Sdjm pshufb @XMM[8], @XMM[2] 829ec07fdf1Sdjm pxor @XMM[9], @XMM[4] 830ec07fdf1Sdjm pshufb @XMM[8], @XMM[3] 831ec07fdf1Sdjm pxor @XMM[9], @XMM[5] 832ec07fdf1Sdjm pshufb @XMM[8], @XMM[4] 833ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 834ec07fdf1Sdjm pshufb @XMM[8], @XMM[5] 835ec07fdf1Sdjm pxor @XMM[9], @XMM[7] 836ec07fdf1Sdjm pshufb @XMM[8], @XMM[6] 837ec07fdf1Sdjm pshufb @XMM[8], @XMM[7] 838ec07fdf1Sdjm_bsaes_encrypt8_bitslice: 839ec07fdf1Sdjm___ 840ec07fdf1Sdjm &bitslice (@XMM[0..7, 8..11]); 841ec07fdf1Sdjm$code.=<<___; 842ec07fdf1Sdjm dec $rounds 843ec07fdf1Sdjm jmp .Lenc_sbox 844ec07fdf1Sdjm.align 16 845ec07fdf1Sdjm.Lenc_loop: 846ec07fdf1Sdjm___ 847ec07fdf1Sdjm &ShiftRows (@XMM[0..7, 8]); 848ec07fdf1Sdjm$code.=".Lenc_sbox:\n"; 849ec07fdf1Sdjm &Sbox (@XMM[0..7, 8..15]); 850ec07fdf1Sdjm$code.=<<___; 
851ec07fdf1Sdjm dec $rounds 852ec07fdf1Sdjm jl .Lenc_done 853ec07fdf1Sdjm___ 854ec07fdf1Sdjm &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 855ec07fdf1Sdjm$code.=<<___; 856ec07fdf1Sdjm movdqa 0x30($const), @XMM[8] # .LSR 857ec07fdf1Sdjm jnz .Lenc_loop 858ec07fdf1Sdjm movdqa 0x40($const), @XMM[8] # .LSRM0 859ec07fdf1Sdjm jmp .Lenc_loop 860ec07fdf1Sdjm.align 16 861ec07fdf1Sdjm.Lenc_done: 862ec07fdf1Sdjm___ 863ec07fdf1Sdjm # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 864ec07fdf1Sdjm &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 865ec07fdf1Sdjm$code.=<<___; 866ec07fdf1Sdjm movdqa ($key), @XMM[8] # last round key 867ec07fdf1Sdjm pxor @XMM[8], @XMM[4] 868ec07fdf1Sdjm pxor @XMM[8], @XMM[6] 869ec07fdf1Sdjm pxor @XMM[8], @XMM[3] 870ec07fdf1Sdjm pxor @XMM[8], @XMM[7] 871ec07fdf1Sdjm pxor @XMM[8], @XMM[2] 872ec07fdf1Sdjm pxor @XMM[8], @XMM[5] 873ec07fdf1Sdjm pxor @XMM[8], @XMM[0] 874ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 875ec07fdf1Sdjm ret 876ec07fdf1Sdjm.size _bsaes_encrypt8,.-_bsaes_encrypt8 877ec07fdf1Sdjm 878ec07fdf1Sdjm.type _bsaes_decrypt8,\@abi-omnipotent 879ec07fdf1Sdjm.align 64 880ec07fdf1Sdjm_bsaes_decrypt8: 881*22787c51Stb _CET_ENDBR 882ec07fdf1Sdjm lea .LBS0(%rip), $const # constants table 883ec07fdf1Sdjm 884ec07fdf1Sdjm movdqa ($key), @XMM[9] # round 0 key 885ec07fdf1Sdjm lea 0x10($key), $key 886ec07fdf1Sdjm movdqa -0x30($const), @XMM[8] # .LM0ISR 887ec07fdf1Sdjm pxor @XMM[9], @XMM[0] # xor with round0 key 888ec07fdf1Sdjm pxor @XMM[9], @XMM[1] 889ec07fdf1Sdjm pshufb @XMM[8], @XMM[0] 890ec07fdf1Sdjm pxor @XMM[9], @XMM[2] 891ec07fdf1Sdjm pshufb @XMM[8], @XMM[1] 892ec07fdf1Sdjm pxor @XMM[9], @XMM[3] 893ec07fdf1Sdjm pshufb @XMM[8], @XMM[2] 894ec07fdf1Sdjm pxor @XMM[9], @XMM[4] 895ec07fdf1Sdjm pshufb @XMM[8], @XMM[3] 896ec07fdf1Sdjm pxor @XMM[9], @XMM[5] 897ec07fdf1Sdjm pshufb @XMM[8], @XMM[4] 898ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 899ec07fdf1Sdjm pshufb @XMM[8], @XMM[5] 900ec07fdf1Sdjm pxor @XMM[9], @XMM[7] 901ec07fdf1Sdjm pshufb @XMM[8], @XMM[6] 902ec07fdf1Sdjm 
pshufb @XMM[8], @XMM[7] 903ec07fdf1Sdjm___ 904ec07fdf1Sdjm &bitslice (@XMM[0..7, 8..11]); 905ec07fdf1Sdjm$code.=<<___; 906ec07fdf1Sdjm dec $rounds 907ec07fdf1Sdjm jmp .Ldec_sbox 908ec07fdf1Sdjm.align 16 909ec07fdf1Sdjm.Ldec_loop: 910ec07fdf1Sdjm___ 911ec07fdf1Sdjm &ShiftRows (@XMM[0..7, 8]); 912ec07fdf1Sdjm$code.=".Ldec_sbox:\n"; 913ec07fdf1Sdjm &InvSbox (@XMM[0..7, 8..15]); 914ec07fdf1Sdjm$code.=<<___; 915ec07fdf1Sdjm dec $rounds 916ec07fdf1Sdjm jl .Ldec_done 917ec07fdf1Sdjm___ 918ec07fdf1Sdjm &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 919ec07fdf1Sdjm$code.=<<___; 920ec07fdf1Sdjm movdqa -0x10($const), @XMM[8] # .LISR 921ec07fdf1Sdjm jnz .Ldec_loop 922ec07fdf1Sdjm movdqa -0x20($const), @XMM[8] # .LISRM0 923ec07fdf1Sdjm jmp .Ldec_loop 924ec07fdf1Sdjm.align 16 925ec07fdf1Sdjm.Ldec_done: 926ec07fdf1Sdjm___ 927ec07fdf1Sdjm &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 928ec07fdf1Sdjm$code.=<<___; 929ec07fdf1Sdjm movdqa ($key), @XMM[8] # last round key 930ec07fdf1Sdjm pxor @XMM[8], @XMM[6] 931ec07fdf1Sdjm pxor @XMM[8], @XMM[4] 932ec07fdf1Sdjm pxor @XMM[8], @XMM[2] 933ec07fdf1Sdjm pxor @XMM[8], @XMM[7] 934ec07fdf1Sdjm pxor @XMM[8], @XMM[3] 935ec07fdf1Sdjm pxor @XMM[8], @XMM[5] 936ec07fdf1Sdjm pxor @XMM[8], @XMM[0] 937ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 938ec07fdf1Sdjm ret 939ec07fdf1Sdjm.size _bsaes_decrypt8,.-_bsaes_decrypt8 940ec07fdf1Sdjm___ 941ec07fdf1Sdjm} 942ec07fdf1Sdjm{ 943ec07fdf1Sdjmmy ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 944ec07fdf1Sdjm 945ec07fdf1Sdjmsub bitslice_key { 946ec07fdf1Sdjmmy @x=reverse(@_[0..7]); 947ec07fdf1Sdjmmy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 948ec07fdf1Sdjm 949ec07fdf1Sdjm &swapmove (@x[0,1],1,$bs0,$t2,$t3); 950ec07fdf1Sdjm$code.=<<___; 951ec07fdf1Sdjm #&swapmove(@x[2,3],1,$t0,$t2,$t3); 952ec07fdf1Sdjm movdqa @x[0], @x[2] 953ec07fdf1Sdjm movdqa @x[1], @x[3] 954ec07fdf1Sdjm___ 955ec07fdf1Sdjm #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 956ec07fdf1Sdjm 957ec07fdf1Sdjm &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 
958ec07fdf1Sdjm$code.=<<___; 959ec07fdf1Sdjm #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 960ec07fdf1Sdjm movdqa @x[0], @x[4] 961ec07fdf1Sdjm movdqa @x[2], @x[6] 962ec07fdf1Sdjm movdqa @x[1], @x[5] 963ec07fdf1Sdjm movdqa @x[3], @x[7] 964ec07fdf1Sdjm___ 965ec07fdf1Sdjm &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 966ec07fdf1Sdjm &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 967ec07fdf1Sdjm} 968ec07fdf1Sdjm 969ec07fdf1Sdjm$code.=<<___; 970ec07fdf1Sdjm.type _bsaes_key_convert,\@abi-omnipotent 971ec07fdf1Sdjm.align 16 972ec07fdf1Sdjm_bsaes_key_convert: 973*22787c51Stb _CET_ENDBR 974ec07fdf1Sdjm lea .Lmasks(%rip), $const 975ec07fdf1Sdjm movdqu ($inp), %xmm7 # load round 0 key 976ec07fdf1Sdjm lea 0x10($inp), $inp 977ec07fdf1Sdjm movdqa 0x00($const), %xmm0 # 0x01... 978ec07fdf1Sdjm movdqa 0x10($const), %xmm1 # 0x02... 979ec07fdf1Sdjm movdqa 0x20($const), %xmm2 # 0x04... 980ec07fdf1Sdjm movdqa 0x30($const), %xmm3 # 0x08... 981ec07fdf1Sdjm movdqa 0x40($const), %xmm4 # .LM0 982ec07fdf1Sdjm pcmpeqd %xmm5, %xmm5 # .LNOT 983ec07fdf1Sdjm 984ec07fdf1Sdjm movdqu ($inp), %xmm6 # load round 1 key 985ec07fdf1Sdjm movdqa %xmm7, ($out) # save round 0 key 986ec07fdf1Sdjm lea 0x10($out), $out 987ec07fdf1Sdjm dec $rounds 988ec07fdf1Sdjm jmp .Lkey_loop 989ec07fdf1Sdjm.align 16 990ec07fdf1Sdjm.Lkey_loop: 991ec07fdf1Sdjm pshufb %xmm4, %xmm6 # .LM0 992ec07fdf1Sdjm 993ec07fdf1Sdjm movdqa %xmm0, %xmm8 994ec07fdf1Sdjm movdqa %xmm1, %xmm9 995ec07fdf1Sdjm 996ec07fdf1Sdjm pand %xmm6, %xmm8 997ec07fdf1Sdjm pand %xmm6, %xmm9 998ec07fdf1Sdjm movdqa %xmm2, %xmm10 999ec07fdf1Sdjm pcmpeqb %xmm0, %xmm8 1000ec07fdf1Sdjm psllq \$4, %xmm0 # 0x10... 1001ec07fdf1Sdjm movdqa %xmm3, %xmm11 1002ec07fdf1Sdjm pcmpeqb %xmm1, %xmm9 1003ec07fdf1Sdjm psllq \$4, %xmm1 # 0x20... 1004ec07fdf1Sdjm 1005ec07fdf1Sdjm pand %xmm6, %xmm10 1006ec07fdf1Sdjm pand %xmm6, %xmm11 1007ec07fdf1Sdjm movdqa %xmm0, %xmm12 1008ec07fdf1Sdjm pcmpeqb %xmm2, %xmm10 1009ec07fdf1Sdjm psllq \$4, %xmm2 # 0x40... 
1010ec07fdf1Sdjm movdqa %xmm1, %xmm13 1011ec07fdf1Sdjm pcmpeqb %xmm3, %xmm11 1012ec07fdf1Sdjm psllq \$4, %xmm3 # 0x80... 1013ec07fdf1Sdjm 1014ec07fdf1Sdjm movdqa %xmm2, %xmm14 1015ec07fdf1Sdjm movdqa %xmm3, %xmm15 1016ec07fdf1Sdjm pxor %xmm5, %xmm8 # "pnot" 1017ec07fdf1Sdjm pxor %xmm5, %xmm9 1018ec07fdf1Sdjm 1019ec07fdf1Sdjm pand %xmm6, %xmm12 1020ec07fdf1Sdjm pand %xmm6, %xmm13 1021ec07fdf1Sdjm movdqa %xmm8, 0x00($out) # write bit-sliced round key 1022ec07fdf1Sdjm pcmpeqb %xmm0, %xmm12 1023ec07fdf1Sdjm psrlq \$4, %xmm0 # 0x01... 1024ec07fdf1Sdjm movdqa %xmm9, 0x10($out) 1025ec07fdf1Sdjm pcmpeqb %xmm1, %xmm13 1026ec07fdf1Sdjm psrlq \$4, %xmm1 # 0x02... 1027ec07fdf1Sdjm lea 0x10($inp), $inp 1028ec07fdf1Sdjm 1029ec07fdf1Sdjm pand %xmm6, %xmm14 1030ec07fdf1Sdjm pand %xmm6, %xmm15 1031ec07fdf1Sdjm movdqa %xmm10, 0x20($out) 1032ec07fdf1Sdjm pcmpeqb %xmm2, %xmm14 1033ec07fdf1Sdjm psrlq \$4, %xmm2 # 0x04... 1034ec07fdf1Sdjm movdqa %xmm11, 0x30($out) 1035ec07fdf1Sdjm pcmpeqb %xmm3, %xmm15 1036ec07fdf1Sdjm psrlq \$4, %xmm3 # 0x08... 1037ec07fdf1Sdjm movdqu ($inp), %xmm6 # load next round key 1038ec07fdf1Sdjm 1039ec07fdf1Sdjm pxor %xmm5, %xmm13 # "pnot" 1040ec07fdf1Sdjm pxor %xmm5, %xmm14 1041ec07fdf1Sdjm movdqa %xmm12, 0x40($out) 1042ec07fdf1Sdjm movdqa %xmm13, 0x50($out) 1043ec07fdf1Sdjm movdqa %xmm14, 0x60($out) 1044ec07fdf1Sdjm movdqa %xmm15, 0x70($out) 1045ec07fdf1Sdjm lea 0x80($out),$out 1046ec07fdf1Sdjm dec $rounds 1047ec07fdf1Sdjm jnz .Lkey_loop 1048ec07fdf1Sdjm 1049ec07fdf1Sdjm movdqa 0x50($const), %xmm7 # .L63 1050ec07fdf1Sdjm #movdqa %xmm6, ($out) # don't save last round key 1051ec07fdf1Sdjm ret 1052ec07fdf1Sdjm.size _bsaes_key_convert,.-_bsaes_key_convert 1053ec07fdf1Sdjm___ 1054ec07fdf1Sdjm} 1055ec07fdf1Sdjm 1056ec07fdf1Sdjmif (0 && !$win64) { # following four functions are unsupported interface 1057ec07fdf1Sdjm # used for benchmarking... 
# Unsupported benchmarking entry points.  This heredoc sits inside the
# enclosing "if (0 && !$win64)" and is therefore never emitted by default.
#
# $inp, $out, $len and $key are Perl variables declared outside this
# block; they interpolate to register names.
# NOTE(review): confirm which registers they name in this scope.
#
# bsaes_enc_key_convert
#	Reads the round count from 240($inp), then calls _bsaes_key_convert
#	with %rcx = $inp, %rax = $out and %r10d = rounds.  Afterwards it
#	xors %xmm6 into %xmm7 and stores the result at (%rax) as the last
#	round key.
#
# bsaes_dec_key_convert
#	Same call sequence.  Afterwards it xors the 16 bytes at ($out) into
#	%xmm7, stores %xmm6 at (%rax) as the last round key, and writes
#	%xmm7 back to ($out) as the fixed-up round 0 key.
#
# bsaes_encrypt_128 / bsaes_decrypt_128
#	Process 0x80 bytes (eight 16-byte blocks) per iteration with a
#	hard-coded round count of 10.  Each iteration loads @XMM[0..7] from
#	$inp, passes $key in %rax, calls _bsaes_encrypt8 / _bsaes_decrypt8
#	and stores the results to $out.  The results come back in permuted
#	register order: 0,1,4,6,3,7,2,5 for encryption and 0,1,6,4,2,7,3,5
#	for decryption.  The loop repeats while "sub 0x80,$len" leaves an
#	unsigned-above result (ja).
#
# In both *_128 functions _CET_ENDBR comes first and the loop label
# follows it, so the landing pad runs once on entry rather than on every
# pass through the loop.
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
	_CET_ENDBR
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
	_CET_ENDBR
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
1158ec07fdf1Sdjm###################################################################### 1159ec07fdf1Sdjm# 1160ec07fdf1Sdjm# OpenSSL interface 1161ec07fdf1Sdjm# 1162ec07fdf1Sdjmmy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1163ec07fdf1Sdjm : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1164ec07fdf1Sdjmmy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1165ec07fdf1Sdjm 1166ec07fdf1Sdjmif ($ecb) { 1167ec07fdf1Sdjm$code.=<<___; 1168ec07fdf1Sdjm.globl bsaes_ecb_encrypt_blocks 1169ec07fdf1Sdjm.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1170ec07fdf1Sdjm.align 16 1171ec07fdf1Sdjmbsaes_ecb_encrypt_blocks: 1172*22787c51Stb _CET_ENDBR 1173ec07fdf1Sdjm mov %rsp, %rax 1174ec07fdf1Sdjm.Lecb_enc_prologue: 1175ec07fdf1Sdjm push %rbp 1176ec07fdf1Sdjm push %rbx 1177ec07fdf1Sdjm push %r12 1178ec07fdf1Sdjm push %r13 1179ec07fdf1Sdjm push %r14 1180ec07fdf1Sdjm push %r15 1181ec07fdf1Sdjm lea -0x48(%rsp),%rsp 1182ec07fdf1Sdjm___ 1183ec07fdf1Sdjm$code.=<<___ if ($win64); 1184ec07fdf1Sdjm lea -0xa0(%rsp), %rsp 1185ec07fdf1Sdjm movaps %xmm6, 0x40(%rsp) 1186ec07fdf1Sdjm movaps %xmm7, 0x50(%rsp) 1187ec07fdf1Sdjm movaps %xmm8, 0x60(%rsp) 1188ec07fdf1Sdjm movaps %xmm9, 0x70(%rsp) 1189ec07fdf1Sdjm movaps %xmm10, 0x80(%rsp) 1190ec07fdf1Sdjm movaps %xmm11, 0x90(%rsp) 1191ec07fdf1Sdjm movaps %xmm12, 0xa0(%rsp) 1192ec07fdf1Sdjm movaps %xmm13, 0xb0(%rsp) 1193ec07fdf1Sdjm movaps %xmm14, 0xc0(%rsp) 1194ec07fdf1Sdjm movaps %xmm15, 0xd0(%rsp) 1195ec07fdf1Sdjm.Lecb_enc_body: 1196ec07fdf1Sdjm___ 1197ec07fdf1Sdjm$code.=<<___; 1198ec07fdf1Sdjm mov %rsp,%rbp # backup %rsp 1199ec07fdf1Sdjm mov 240($arg4),%eax # rounds 1200ec07fdf1Sdjm mov $arg1,$inp # backup arguments 1201ec07fdf1Sdjm mov $arg2,$out 1202ec07fdf1Sdjm mov $arg3,$len 1203ec07fdf1Sdjm mov $arg4,$key 1204ec07fdf1Sdjm cmp \$8,$arg3 1205ec07fdf1Sdjm jb .Lecb_enc_short 1206ec07fdf1Sdjm 1207ec07fdf1Sdjm mov %eax,%ebx # backup rounds 1208ec07fdf1Sdjm shl \$7,%rax # 128 bytes per inner round key 
1209ec07fdf1Sdjm sub \$`128-32`,%rax # size of bit-sliced key schedule 1210ec07fdf1Sdjm sub %rax,%rsp 1211ec07fdf1Sdjm mov %rsp,%rax # pass key schedule 1212ec07fdf1Sdjm mov $key,%rcx # pass key 1213ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1214ec07fdf1Sdjm call _bsaes_key_convert 1215ec07fdf1Sdjm pxor %xmm6,%xmm7 # fix up last round key 1216ec07fdf1Sdjm movdqa %xmm7,(%rax) # save last round key 1217ec07fdf1Sdjm 1218ec07fdf1Sdjm sub \$8,$len 1219ec07fdf1Sdjm.Lecb_enc_loop: 1220ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1221ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1222ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1223ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1224ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1225ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1226ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1227ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1228ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1229ec07fdf1Sdjm movdqu 0x70($inp), @XMM[7] 1230ec07fdf1Sdjm lea 0x80($inp), $inp 1231ec07fdf1Sdjm 1232ec07fdf1Sdjm call _bsaes_encrypt8 1233ec07fdf1Sdjm 1234ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1235ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1236ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1237ec07fdf1Sdjm movdqu @XMM[6], 0x30($out) 1238ec07fdf1Sdjm movdqu @XMM[3], 0x40($out) 1239ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1240ec07fdf1Sdjm movdqu @XMM[2], 0x60($out) 1241ec07fdf1Sdjm movdqu @XMM[5], 0x70($out) 1242ec07fdf1Sdjm lea 0x80($out), $out 1243ec07fdf1Sdjm sub \$8,$len 1244ec07fdf1Sdjm jnc .Lecb_enc_loop 1245ec07fdf1Sdjm 1246ec07fdf1Sdjm add \$8,$len 1247ec07fdf1Sdjm jz .Lecb_enc_done 1248ec07fdf1Sdjm 1249ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1250ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1251ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1252ec07fdf1Sdjm cmp \$2,$len 1253ec07fdf1Sdjm jb .Lecb_enc_one 1254ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1255ec07fdf1Sdjm je .Lecb_enc_two 1256ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1257ec07fdf1Sdjm cmp \$4,$len 
1258ec07fdf1Sdjm jb .Lecb_enc_three 1259ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1260ec07fdf1Sdjm je .Lecb_enc_four 1261ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1262ec07fdf1Sdjm cmp \$6,$len 1263ec07fdf1Sdjm jb .Lecb_enc_five 1264ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1265ec07fdf1Sdjm je .Lecb_enc_six 1266ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1267ec07fdf1Sdjm call _bsaes_encrypt8 1268ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1269ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1270ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1271ec07fdf1Sdjm movdqu @XMM[6], 0x30($out) 1272ec07fdf1Sdjm movdqu @XMM[3], 0x40($out) 1273ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1274ec07fdf1Sdjm movdqu @XMM[2], 0x60($out) 1275ec07fdf1Sdjm jmp .Lecb_enc_done 1276ec07fdf1Sdjm.align 16 1277ec07fdf1Sdjm.Lecb_enc_six: 1278ec07fdf1Sdjm call _bsaes_encrypt8 1279ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1280ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1281ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1282ec07fdf1Sdjm movdqu @XMM[6], 0x30($out) 1283ec07fdf1Sdjm movdqu @XMM[3], 0x40($out) 1284ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1285ec07fdf1Sdjm jmp .Lecb_enc_done 1286ec07fdf1Sdjm.align 16 1287ec07fdf1Sdjm.Lecb_enc_five: 1288ec07fdf1Sdjm call _bsaes_encrypt8 1289ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1290ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1291ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1292ec07fdf1Sdjm movdqu @XMM[6], 0x30($out) 1293ec07fdf1Sdjm movdqu @XMM[3], 0x40($out) 1294ec07fdf1Sdjm jmp .Lecb_enc_done 1295ec07fdf1Sdjm.align 16 1296ec07fdf1Sdjm.Lecb_enc_four: 1297ec07fdf1Sdjm call _bsaes_encrypt8 1298ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1299ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1300ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1301ec07fdf1Sdjm movdqu @XMM[6], 0x30($out) 1302ec07fdf1Sdjm jmp .Lecb_enc_done 1303ec07fdf1Sdjm.align 16 1304ec07fdf1Sdjm.Lecb_enc_three: 1305ec07fdf1Sdjm call _bsaes_encrypt8 1306ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 
1307ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1308ec07fdf1Sdjm movdqu @XMM[4], 0x20($out) 1309ec07fdf1Sdjm jmp .Lecb_enc_done 1310ec07fdf1Sdjm.align 16 1311ec07fdf1Sdjm.Lecb_enc_two: 1312ec07fdf1Sdjm call _bsaes_encrypt8 1313ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1314ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1315ec07fdf1Sdjm jmp .Lecb_enc_done 1316ec07fdf1Sdjm.align 16 1317ec07fdf1Sdjm.Lecb_enc_one: 1318ec07fdf1Sdjm call _bsaes_encrypt8 1319ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1320ec07fdf1Sdjm jmp .Lecb_enc_done 1321ec07fdf1Sdjm.align 16 1322ec07fdf1Sdjm.Lecb_enc_short: 1323ec07fdf1Sdjm lea ($inp), $arg1 1324ec07fdf1Sdjm lea ($out), $arg2 1325ec07fdf1Sdjm lea ($key), $arg3 1326ec07fdf1Sdjm call asm_AES_encrypt 1327ec07fdf1Sdjm lea 16($inp), $inp 1328ec07fdf1Sdjm lea 16($out), $out 1329ec07fdf1Sdjm dec $len 1330ec07fdf1Sdjm jnz .Lecb_enc_short 1331ec07fdf1Sdjm 1332ec07fdf1Sdjm.Lecb_enc_done: 1333ec07fdf1Sdjm lea (%rsp),%rax 1334ec07fdf1Sdjm pxor %xmm0, %xmm0 1335ec07fdf1Sdjm.Lecb_enc_bzero: # wipe key schedule [if any] 1336ec07fdf1Sdjm movdqa %xmm0, 0x00(%rax) 1337ec07fdf1Sdjm movdqa %xmm0, 0x10(%rax) 1338ec07fdf1Sdjm lea 0x20(%rax), %rax 1339ec07fdf1Sdjm cmp %rax, %rbp 1340ec07fdf1Sdjm jb .Lecb_enc_bzero 1341ec07fdf1Sdjm 1342ec07fdf1Sdjm lea (%rbp),%rsp # restore %rsp 1343ec07fdf1Sdjm___ 1344ec07fdf1Sdjm$code.=<<___ if ($win64); 1345ec07fdf1Sdjm movaps 0x40(%rbp), %xmm6 1346ec07fdf1Sdjm movaps 0x50(%rbp), %xmm7 1347ec07fdf1Sdjm movaps 0x60(%rbp), %xmm8 1348ec07fdf1Sdjm movaps 0x70(%rbp), %xmm9 1349ec07fdf1Sdjm movaps 0x80(%rbp), %xmm10 1350ec07fdf1Sdjm movaps 0x90(%rbp), %xmm11 1351ec07fdf1Sdjm movaps 0xa0(%rbp), %xmm12 1352ec07fdf1Sdjm movaps 0xb0(%rbp), %xmm13 1353ec07fdf1Sdjm movaps 0xc0(%rbp), %xmm14 1354ec07fdf1Sdjm movaps 0xd0(%rbp), %xmm15 1355ec07fdf1Sdjm lea 0xa0(%rbp), %rsp 1356ec07fdf1Sdjm___ 1357ec07fdf1Sdjm$code.=<<___; 1358ec07fdf1Sdjm mov 0x48(%rsp), %r15 1359ec07fdf1Sdjm mov 0x50(%rsp), %r14 1360ec07fdf1Sdjm mov 
0x58(%rsp), %r13 1361ec07fdf1Sdjm mov 0x60(%rsp), %r12 1362ec07fdf1Sdjm mov 0x68(%rsp), %rbx 1363ec07fdf1Sdjm mov 0x70(%rsp), %rax 1364ec07fdf1Sdjm lea 0x78(%rsp), %rsp 1365ec07fdf1Sdjm mov %rax, %rbp 1366ec07fdf1Sdjm.Lecb_enc_epilogue: 1367ec07fdf1Sdjm ret 1368ec07fdf1Sdjm.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1369ec07fdf1Sdjm 1370ec07fdf1Sdjm.globl bsaes_ecb_decrypt_blocks 1371ec07fdf1Sdjm.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1372ec07fdf1Sdjm.align 16 1373ec07fdf1Sdjmbsaes_ecb_decrypt_blocks: 1374*22787c51Stb _CET_ENDBR 1375ec07fdf1Sdjm mov %rsp, %rax 1376ec07fdf1Sdjm.Lecb_dec_prologue: 1377ec07fdf1Sdjm push %rbp 1378ec07fdf1Sdjm push %rbx 1379ec07fdf1Sdjm push %r12 1380ec07fdf1Sdjm push %r13 1381ec07fdf1Sdjm push %r14 1382ec07fdf1Sdjm push %r15 1383ec07fdf1Sdjm lea -0x48(%rsp),%rsp 1384ec07fdf1Sdjm___ 1385ec07fdf1Sdjm$code.=<<___ if ($win64); 1386ec07fdf1Sdjm lea -0xa0(%rsp), %rsp 1387ec07fdf1Sdjm movaps %xmm6, 0x40(%rsp) 1388ec07fdf1Sdjm movaps %xmm7, 0x50(%rsp) 1389ec07fdf1Sdjm movaps %xmm8, 0x60(%rsp) 1390ec07fdf1Sdjm movaps %xmm9, 0x70(%rsp) 1391ec07fdf1Sdjm movaps %xmm10, 0x80(%rsp) 1392ec07fdf1Sdjm movaps %xmm11, 0x90(%rsp) 1393ec07fdf1Sdjm movaps %xmm12, 0xa0(%rsp) 1394ec07fdf1Sdjm movaps %xmm13, 0xb0(%rsp) 1395ec07fdf1Sdjm movaps %xmm14, 0xc0(%rsp) 1396ec07fdf1Sdjm movaps %xmm15, 0xd0(%rsp) 1397ec07fdf1Sdjm.Lecb_dec_body: 1398ec07fdf1Sdjm___ 1399ec07fdf1Sdjm$code.=<<___; 1400ec07fdf1Sdjm mov %rsp,%rbp # backup %rsp 1401ec07fdf1Sdjm mov 240($arg4),%eax # rounds 1402ec07fdf1Sdjm mov $arg1,$inp # backup arguments 1403ec07fdf1Sdjm mov $arg2,$out 1404ec07fdf1Sdjm mov $arg3,$len 1405ec07fdf1Sdjm mov $arg4,$key 1406ec07fdf1Sdjm cmp \$8,$arg3 1407ec07fdf1Sdjm jb .Lecb_dec_short 1408ec07fdf1Sdjm 1409ec07fdf1Sdjm mov %eax,%ebx # backup rounds 1410ec07fdf1Sdjm shl \$7,%rax # 128 bytes per inner round key 1411ec07fdf1Sdjm sub \$`128-32`,%rax # size of bit-sliced key schedule 1412ec07fdf1Sdjm sub %rax,%rsp 1413ec07fdf1Sdjm mov %rsp,%rax # 
pass key schedule 1414ec07fdf1Sdjm mov $key,%rcx # pass key 1415ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1416ec07fdf1Sdjm call _bsaes_key_convert 1417ec07fdf1Sdjm pxor (%rsp),%xmm7 # fix up 0 round key 1418ec07fdf1Sdjm movdqa %xmm6,(%rax) # save last round key 1419ec07fdf1Sdjm movdqa %xmm7,(%rsp) 1420ec07fdf1Sdjm 1421ec07fdf1Sdjm sub \$8,$len 1422ec07fdf1Sdjm.Lecb_dec_loop: 1423ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1424ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1425ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1426ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1427ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1428ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1429ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1430ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1431ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1432ec07fdf1Sdjm movdqu 0x70($inp), @XMM[7] 1433ec07fdf1Sdjm lea 0x80($inp), $inp 1434ec07fdf1Sdjm 1435ec07fdf1Sdjm call _bsaes_decrypt8 1436ec07fdf1Sdjm 1437ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1438ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1439ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1440ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1441ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1442ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1443ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 1444ec07fdf1Sdjm movdqu @XMM[5], 0x70($out) 1445ec07fdf1Sdjm lea 0x80($out), $out 1446ec07fdf1Sdjm sub \$8,$len 1447ec07fdf1Sdjm jnc .Lecb_dec_loop 1448ec07fdf1Sdjm 1449ec07fdf1Sdjm add \$8,$len 1450ec07fdf1Sdjm jz .Lecb_dec_done 1451ec07fdf1Sdjm 1452ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1453ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1454ec07fdf1Sdjm mov %ebx,%r10d # pass rounds 1455ec07fdf1Sdjm cmp \$2,$len 1456ec07fdf1Sdjm jb .Lecb_dec_one 1457ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1458ec07fdf1Sdjm je .Lecb_dec_two 1459ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1460ec07fdf1Sdjm cmp \$4,$len 1461ec07fdf1Sdjm jb .Lecb_dec_three 1462ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1463ec07fdf1Sdjm je 
.Lecb_dec_four 1464ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1465ec07fdf1Sdjm cmp \$6,$len 1466ec07fdf1Sdjm jb .Lecb_dec_five 1467ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1468ec07fdf1Sdjm je .Lecb_dec_six 1469ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1470ec07fdf1Sdjm call _bsaes_decrypt8 1471ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1472ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1473ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1474ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1475ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1476ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1477ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 1478ec07fdf1Sdjm jmp .Lecb_dec_done 1479ec07fdf1Sdjm.align 16 1480ec07fdf1Sdjm.Lecb_dec_six: 1481ec07fdf1Sdjm call _bsaes_decrypt8 1482ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1483ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1484ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1485ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1486ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1487ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1488ec07fdf1Sdjm jmp .Lecb_dec_done 1489ec07fdf1Sdjm.align 16 1490ec07fdf1Sdjm.Lecb_dec_five: 1491ec07fdf1Sdjm call _bsaes_decrypt8 1492ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1493ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1494ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1495ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1496ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1497ec07fdf1Sdjm jmp .Lecb_dec_done 1498ec07fdf1Sdjm.align 16 1499ec07fdf1Sdjm.Lecb_dec_four: 1500ec07fdf1Sdjm call _bsaes_decrypt8 1501ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1502ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1503ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1504ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1505ec07fdf1Sdjm jmp .Lecb_dec_done 1506ec07fdf1Sdjm.align 16 1507ec07fdf1Sdjm.Lecb_dec_three: 1508ec07fdf1Sdjm call _bsaes_decrypt8 1509ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1510ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1511ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 
1512ec07fdf1Sdjm jmp .Lecb_dec_done 1513ec07fdf1Sdjm.align 16 1514ec07fdf1Sdjm.Lecb_dec_two: 1515ec07fdf1Sdjm call _bsaes_decrypt8 1516ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1517ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1518ec07fdf1Sdjm jmp .Lecb_dec_done 1519ec07fdf1Sdjm.align 16 1520ec07fdf1Sdjm.Lecb_dec_one: 1521ec07fdf1Sdjm call _bsaes_decrypt8 1522ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1523ec07fdf1Sdjm jmp .Lecb_dec_done 1524ec07fdf1Sdjm.align 16 1525ec07fdf1Sdjm.Lecb_dec_short: 1526ec07fdf1Sdjm lea ($inp), $arg1 1527ec07fdf1Sdjm lea ($out), $arg2 1528ec07fdf1Sdjm lea ($key), $arg3 1529ec07fdf1Sdjm call asm_AES_decrypt 1530ec07fdf1Sdjm lea 16($inp), $inp 1531ec07fdf1Sdjm lea 16($out), $out 1532ec07fdf1Sdjm dec $len 1533ec07fdf1Sdjm jnz .Lecb_dec_short 1534ec07fdf1Sdjm 1535ec07fdf1Sdjm.Lecb_dec_done: 1536ec07fdf1Sdjm lea (%rsp),%rax 1537ec07fdf1Sdjm pxor %xmm0, %xmm0 1538ec07fdf1Sdjm.Lecb_dec_bzero: # wipe key schedule [if any] 1539ec07fdf1Sdjm movdqa %xmm0, 0x00(%rax) 1540ec07fdf1Sdjm movdqa %xmm0, 0x10(%rax) 1541ec07fdf1Sdjm lea 0x20(%rax), %rax 1542ec07fdf1Sdjm cmp %rax, %rbp 1543ec07fdf1Sdjm jb .Lecb_dec_bzero 1544ec07fdf1Sdjm 1545ec07fdf1Sdjm lea (%rbp),%rsp # restore %rsp 1546ec07fdf1Sdjm___ 1547ec07fdf1Sdjm$code.=<<___ if ($win64); 1548ec07fdf1Sdjm movaps 0x40(%rbp), %xmm6 1549ec07fdf1Sdjm movaps 0x50(%rbp), %xmm7 1550ec07fdf1Sdjm movaps 0x60(%rbp), %xmm8 1551ec07fdf1Sdjm movaps 0x70(%rbp), %xmm9 1552ec07fdf1Sdjm movaps 0x80(%rbp), %xmm10 1553ec07fdf1Sdjm movaps 0x90(%rbp), %xmm11 1554ec07fdf1Sdjm movaps 0xa0(%rbp), %xmm12 1555ec07fdf1Sdjm movaps 0xb0(%rbp), %xmm13 1556ec07fdf1Sdjm movaps 0xc0(%rbp), %xmm14 1557ec07fdf1Sdjm movaps 0xd0(%rbp), %xmm15 1558ec07fdf1Sdjm lea 0xa0(%rbp), %rsp 1559ec07fdf1Sdjm___ 1560ec07fdf1Sdjm$code.=<<___; 1561ec07fdf1Sdjm mov 0x48(%rsp), %r15 1562ec07fdf1Sdjm mov 0x50(%rsp), %r14 1563ec07fdf1Sdjm mov 0x58(%rsp), %r13 1564ec07fdf1Sdjm mov 0x60(%rsp), %r12 1565ec07fdf1Sdjm mov 0x68(%rsp), %rbx 
1566ec07fdf1Sdjm mov 0x70(%rsp), %rax 1567ec07fdf1Sdjm lea 0x78(%rsp), %rsp 1568ec07fdf1Sdjm mov %rax, %rbp 1569ec07fdf1Sdjm.Lecb_dec_epilogue: 1570ec07fdf1Sdjm ret 1571ec07fdf1Sdjm.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1572ec07fdf1Sdjm___ 1573ec07fdf1Sdjm} 1574ec07fdf1Sdjm$code.=<<___; 1575ec07fdf1Sdjm.extern asm_AES_cbc_encrypt 1576ec07fdf1Sdjm.globl bsaes_cbc_encrypt 1577ec07fdf1Sdjm.type bsaes_cbc_encrypt,\@abi-omnipotent 1578ec07fdf1Sdjm.align 16 1579ec07fdf1Sdjmbsaes_cbc_encrypt: 1580*22787c51Stb _CET_ENDBR 1581ec07fdf1Sdjm___ 1582ec07fdf1Sdjm$code.=<<___ if ($win64); 1583ec07fdf1Sdjm mov 48(%rsp),$arg6 # pull direction flag 1584ec07fdf1Sdjm___ 1585ec07fdf1Sdjm$code.=<<___; 1586ec07fdf1Sdjm cmp \$0,$arg6 1587ec07fdf1Sdjm jne asm_AES_cbc_encrypt 1588ec07fdf1Sdjm cmp \$128,$arg3 1589ec07fdf1Sdjm jb asm_AES_cbc_encrypt 1590ec07fdf1Sdjm 1591ec07fdf1Sdjm mov %rsp, %rax 1592ec07fdf1Sdjm.Lcbc_dec_prologue: 1593ec07fdf1Sdjm push %rbp 1594ec07fdf1Sdjm push %rbx 1595ec07fdf1Sdjm push %r12 1596ec07fdf1Sdjm push %r13 1597ec07fdf1Sdjm push %r14 1598ec07fdf1Sdjm push %r15 1599ec07fdf1Sdjm lea -0x48(%rsp), %rsp 1600ec07fdf1Sdjm___ 1601ec07fdf1Sdjm$code.=<<___ if ($win64); 1602ec07fdf1Sdjm mov 0xa0(%rsp),$arg5 # pull ivp 1603ec07fdf1Sdjm lea -0xa0(%rsp), %rsp 1604ec07fdf1Sdjm movaps %xmm6, 0x40(%rsp) 1605ec07fdf1Sdjm movaps %xmm7, 0x50(%rsp) 1606ec07fdf1Sdjm movaps %xmm8, 0x60(%rsp) 1607ec07fdf1Sdjm movaps %xmm9, 0x70(%rsp) 1608ec07fdf1Sdjm movaps %xmm10, 0x80(%rsp) 1609ec07fdf1Sdjm movaps %xmm11, 0x90(%rsp) 1610ec07fdf1Sdjm movaps %xmm12, 0xa0(%rsp) 1611ec07fdf1Sdjm movaps %xmm13, 0xb0(%rsp) 1612ec07fdf1Sdjm movaps %xmm14, 0xc0(%rsp) 1613ec07fdf1Sdjm movaps %xmm15, 0xd0(%rsp) 1614ec07fdf1Sdjm.Lcbc_dec_body: 1615ec07fdf1Sdjm___ 1616ec07fdf1Sdjm$code.=<<___; 1617ec07fdf1Sdjm mov %rsp, %rbp # backup %rsp 1618ec07fdf1Sdjm mov 240($arg4), %eax # rounds 1619ec07fdf1Sdjm mov $arg1, $inp # backup arguments 1620ec07fdf1Sdjm mov $arg2, $out 1621ec07fdf1Sdjm mov 
$arg3, $len 1622ec07fdf1Sdjm mov $arg4, $key 1623ec07fdf1Sdjm mov $arg5, %rbx 1624ec07fdf1Sdjm shr \$4, $len # bytes to blocks 1625ec07fdf1Sdjm 1626ec07fdf1Sdjm mov %eax, %edx # rounds 1627ec07fdf1Sdjm shl \$7, %rax # 128 bytes per inner round key 1628ec07fdf1Sdjm sub \$`128-32`, %rax # size of bit-sliced key schedule 1629ec07fdf1Sdjm sub %rax, %rsp 1630ec07fdf1Sdjm 1631ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1632ec07fdf1Sdjm mov $key, %rcx # pass key 1633ec07fdf1Sdjm mov %edx, %r10d # pass rounds 1634ec07fdf1Sdjm call _bsaes_key_convert 1635ec07fdf1Sdjm pxor (%rsp),%xmm7 # fix up 0 round key 1636ec07fdf1Sdjm movdqa %xmm6,(%rax) # save last round key 1637ec07fdf1Sdjm movdqa %xmm7,(%rsp) 1638ec07fdf1Sdjm 1639ec07fdf1Sdjm movdqu (%rbx), @XMM[15] # load IV 1640ec07fdf1Sdjm sub \$8,$len 1641ec07fdf1Sdjm.Lcbc_dec_loop: 1642ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1643ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1644ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1645ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1646ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1647ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1648ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1649ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1650ec07fdf1Sdjm mov %edx,%r10d # pass rounds 1651ec07fdf1Sdjm movdqu 0x70($inp), @XMM[7] 1652ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1653ec07fdf1Sdjm 1654ec07fdf1Sdjm call _bsaes_decrypt8 1655ec07fdf1Sdjm 1656ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1657ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1658ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1659ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1660ec07fdf1Sdjm movdqu 0x20($inp), @XMM[10] 1661ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 1662ec07fdf1Sdjm movdqu 0x30($inp), @XMM[11] 1663ec07fdf1Sdjm pxor @XMM[10], @XMM[4] 1664ec07fdf1Sdjm movdqu 0x40($inp), @XMM[12] 1665ec07fdf1Sdjm pxor @XMM[11], @XMM[2] 1666ec07fdf1Sdjm movdqu 0x50($inp), @XMM[13] 1667ec07fdf1Sdjm pxor @XMM[12], @XMM[7] 1668ec07fdf1Sdjm movdqu 0x60($inp), 
@XMM[14] 1669ec07fdf1Sdjm pxor @XMM[13], @XMM[3] 1670ec07fdf1Sdjm movdqu 0x70($inp), @XMM[15] # IV 1671ec07fdf1Sdjm pxor @XMM[14], @XMM[5] 1672ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1673ec07fdf1Sdjm lea 0x80($inp), $inp 1674ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1675ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1676ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1677ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1678ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1679ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 1680ec07fdf1Sdjm movdqu @XMM[5], 0x70($out) 1681ec07fdf1Sdjm lea 0x80($out), $out 1682ec07fdf1Sdjm sub \$8,$len 1683ec07fdf1Sdjm jnc .Lcbc_dec_loop 1684ec07fdf1Sdjm 1685ec07fdf1Sdjm add \$8,$len 1686ec07fdf1Sdjm jz .Lcbc_dec_done 1687ec07fdf1Sdjm 1688ec07fdf1Sdjm movdqu 0x00($inp), @XMM[0] # load input 1689ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 1690ec07fdf1Sdjm mov %edx, %r10d # pass rounds 1691ec07fdf1Sdjm cmp \$2,$len 1692ec07fdf1Sdjm jb .Lcbc_dec_one 1693ec07fdf1Sdjm movdqu 0x10($inp), @XMM[1] 1694ec07fdf1Sdjm je .Lcbc_dec_two 1695ec07fdf1Sdjm movdqu 0x20($inp), @XMM[2] 1696ec07fdf1Sdjm cmp \$4,$len 1697ec07fdf1Sdjm jb .Lcbc_dec_three 1698ec07fdf1Sdjm movdqu 0x30($inp), @XMM[3] 1699ec07fdf1Sdjm je .Lcbc_dec_four 1700ec07fdf1Sdjm movdqu 0x40($inp), @XMM[4] 1701ec07fdf1Sdjm cmp \$6,$len 1702ec07fdf1Sdjm jb .Lcbc_dec_five 1703ec07fdf1Sdjm movdqu 0x50($inp), @XMM[5] 1704ec07fdf1Sdjm je .Lcbc_dec_six 1705ec07fdf1Sdjm movdqu 0x60($inp), @XMM[6] 1706ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1707ec07fdf1Sdjm call _bsaes_decrypt8 1708ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1709ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1710ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1711ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1712ec07fdf1Sdjm movdqu 0x20($inp), @XMM[10] 1713ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 1714ec07fdf1Sdjm movdqu 0x30($inp), @XMM[11] 1715ec07fdf1Sdjm pxor @XMM[10], @XMM[4] 1716ec07fdf1Sdjm movdqu 0x40($inp), @XMM[12] 1717ec07fdf1Sdjm pxor 
@XMM[11], @XMM[2] 1718ec07fdf1Sdjm movdqu 0x50($inp), @XMM[13] 1719ec07fdf1Sdjm pxor @XMM[12], @XMM[7] 1720ec07fdf1Sdjm movdqu 0x60($inp), @XMM[15] # IV 1721ec07fdf1Sdjm pxor @XMM[13], @XMM[3] 1722ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1723ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1724ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1725ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1726ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1727ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1728ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 1729ec07fdf1Sdjm jmp .Lcbc_dec_done 1730ec07fdf1Sdjm.align 16 1731ec07fdf1Sdjm.Lcbc_dec_six: 1732ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1733ec07fdf1Sdjm call _bsaes_decrypt8 1734ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1735ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1736ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1737ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1738ec07fdf1Sdjm movdqu 0x20($inp), @XMM[10] 1739ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 1740ec07fdf1Sdjm movdqu 0x30($inp), @XMM[11] 1741ec07fdf1Sdjm pxor @XMM[10], @XMM[4] 1742ec07fdf1Sdjm movdqu 0x40($inp), @XMM[12] 1743ec07fdf1Sdjm pxor @XMM[11], @XMM[2] 1744ec07fdf1Sdjm movdqu 0x50($inp), @XMM[15] # IV 1745ec07fdf1Sdjm pxor @XMM[12], @XMM[7] 1746ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1747ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1748ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1749ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1750ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1751ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 1752ec07fdf1Sdjm jmp .Lcbc_dec_done 1753ec07fdf1Sdjm.align 16 1754ec07fdf1Sdjm.Lcbc_dec_five: 1755ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1756ec07fdf1Sdjm call _bsaes_decrypt8 1757ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1758ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1759ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1760ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1761ec07fdf1Sdjm movdqu 0x20($inp), @XMM[10] 1762ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 
1763ec07fdf1Sdjm movdqu 0x30($inp), @XMM[11] 1764ec07fdf1Sdjm pxor @XMM[10], @XMM[4] 1765ec07fdf1Sdjm movdqu 0x40($inp), @XMM[15] # IV 1766ec07fdf1Sdjm pxor @XMM[11], @XMM[2] 1767ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1768ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1769ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1770ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1771ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 1772ec07fdf1Sdjm jmp .Lcbc_dec_done 1773ec07fdf1Sdjm.align 16 1774ec07fdf1Sdjm.Lcbc_dec_four: 1775ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1776ec07fdf1Sdjm call _bsaes_decrypt8 1777ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1778ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1779ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1780ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1781ec07fdf1Sdjm movdqu 0x20($inp), @XMM[10] 1782ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 1783ec07fdf1Sdjm movdqu 0x30($inp), @XMM[15] # IV 1784ec07fdf1Sdjm pxor @XMM[10], @XMM[4] 1785ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1786ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1787ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1788ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 1789ec07fdf1Sdjm jmp .Lcbc_dec_done 1790ec07fdf1Sdjm.align 16 1791ec07fdf1Sdjm.Lcbc_dec_three: 1792ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1793ec07fdf1Sdjm call _bsaes_decrypt8 1794ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= IV 1795ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1796ec07fdf1Sdjm movdqu 0x10($inp), @XMM[9] 1797ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1798ec07fdf1Sdjm movdqu 0x20($inp), @XMM[15] # IV 1799ec07fdf1Sdjm pxor @XMM[9], @XMM[6] 1800ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1801ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1802ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 1803ec07fdf1Sdjm jmp .Lcbc_dec_done 1804ec07fdf1Sdjm.align 16 1805ec07fdf1Sdjm.Lcbc_dec_two: 1806ec07fdf1Sdjm movdqa @XMM[15], 0x20(%rbp) # put aside IV 1807ec07fdf1Sdjm call _bsaes_decrypt8 1808ec07fdf1Sdjm pxor 
0x20(%rbp), @XMM[0] # ^= IV 1809ec07fdf1Sdjm movdqu 0x00($inp), @XMM[8] # re-load input 1810ec07fdf1Sdjm movdqu 0x10($inp), @XMM[15] # IV 1811ec07fdf1Sdjm pxor @XMM[8], @XMM[1] 1812ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 1813ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 1814ec07fdf1Sdjm jmp .Lcbc_dec_done 1815ec07fdf1Sdjm.align 16 1816ec07fdf1Sdjm.Lcbc_dec_one: 1817ec07fdf1Sdjm lea ($inp), $arg1 1818ec07fdf1Sdjm lea 0x20(%rbp), $arg2 # buffer output 1819ec07fdf1Sdjm lea ($key), $arg3 1820ec07fdf1Sdjm call asm_AES_decrypt # doesn't touch %xmm 1821ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[15] # ^= IV 1822ec07fdf1Sdjm movdqu @XMM[15], ($out) # write output 1823ec07fdf1Sdjm movdqa @XMM[0], @XMM[15] # IV 1824ec07fdf1Sdjm 1825ec07fdf1Sdjm.Lcbc_dec_done: 1826ec07fdf1Sdjm movdqu @XMM[15], (%rbx) # return IV 1827ec07fdf1Sdjm lea (%rsp), %rax 1828ec07fdf1Sdjm pxor %xmm0, %xmm0 1829ec07fdf1Sdjm.Lcbc_dec_bzero: # wipe key schedule [if any] 1830ec07fdf1Sdjm movdqa %xmm0, 0x00(%rax) 1831ec07fdf1Sdjm movdqa %xmm0, 0x10(%rax) 1832ec07fdf1Sdjm lea 0x20(%rax), %rax 1833ec07fdf1Sdjm cmp %rax, %rbp 1834ec07fdf1Sdjm ja .Lcbc_dec_bzero 1835ec07fdf1Sdjm 1836ec07fdf1Sdjm lea (%rbp),%rsp # restore %rsp 1837ec07fdf1Sdjm___ 1838ec07fdf1Sdjm$code.=<<___ if ($win64); 1839ec07fdf1Sdjm movaps 0x40(%rbp), %xmm6 1840ec07fdf1Sdjm movaps 0x50(%rbp), %xmm7 1841ec07fdf1Sdjm movaps 0x60(%rbp), %xmm8 1842ec07fdf1Sdjm movaps 0x70(%rbp), %xmm9 1843ec07fdf1Sdjm movaps 0x80(%rbp), %xmm10 1844ec07fdf1Sdjm movaps 0x90(%rbp), %xmm11 1845ec07fdf1Sdjm movaps 0xa0(%rbp), %xmm12 1846ec07fdf1Sdjm movaps 0xb0(%rbp), %xmm13 1847ec07fdf1Sdjm movaps 0xc0(%rbp), %xmm14 1848ec07fdf1Sdjm movaps 0xd0(%rbp), %xmm15 1849ec07fdf1Sdjm lea 0xa0(%rbp), %rsp 1850ec07fdf1Sdjm___ 1851ec07fdf1Sdjm$code.=<<___; 1852ec07fdf1Sdjm mov 0x48(%rsp), %r15 1853ec07fdf1Sdjm mov 0x50(%rsp), %r14 1854ec07fdf1Sdjm mov 0x58(%rsp), %r13 1855ec07fdf1Sdjm mov 0x60(%rsp), %r12 1856ec07fdf1Sdjm mov 0x68(%rsp), %rbx 1857ec07fdf1Sdjm mov 0x70(%rsp), 
%rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

# bsaes_ctr32_encrypt_blocks: counter-mode encryption.
#   arg1 - input, arg2 - output, arg3 - length in 16-byte blocks,
#   arg4 - AES key (the round count is read from offset 240),
#   arg5 - pointer to the 16-byte counter block; it is only read here,
#          the advanced counter is never stored back through it.
# Eight or more blocks: the key schedule is converted to bit-sliced
# form on the stack and blocks are processed eight at a time, with a
# trailing group of 1..7 blocks sharing the same 8-wide call.
# Fewer than eight blocks: each block goes through asm_AES_encrypt and
# the last 32-bit word of the counter copy is incremented as a
# big-endian integer.
# Before returning, the stack between rsp and rbp is overwritten with
# zeros.
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	# No key schedule was carved out of the stack on this path, so
	# %rsp still equals %rbp; the frame is addressed through %rbp
	# throughout so the code does not depend on that coincidence.
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Scratch registers used while advancing the XTS tweak: the .Lxts_magic
# constant, the isolated carry/residue, and the broadcast upper bits.
my ($twmask,$twres,$twtmp)=@XMM[13..15];
# Drop a trailing "d" from the sixth-argument register name.
# NOTE(review): $arg6 is defined outside this chunk; presumably this
# turns a 32-bit alias into the full 64-bit register -- confirm.
$arg6=~s/d$//;

# bsaes_xts_encrypt(inp, out, len, key1, key2, iv)
#
#  - iv is encrypted with key2 through asm_AES_encrypt; the result,
#    kept at 0x20(%rbp), is the initial tweak.
#  - key1 is converted by _bsaes_key_convert into a bit-sliced key
#    schedule on the stack; 0x80 bytes below it hold tweak[0..7].
#  - len is a byte count.  Whole 16-byte blocks are processed eight at
#    a time through _bsaes_encrypt8; a remainder of 2..7 blocks reuses
#    the 8-wide routine (.Lxts_enc_2 .. .Lxts_enc_6 and the fall-through
#    7-block case), and a single block goes through asm_AES_encrypt
#    (.Lxts_enc_1).
#  - If len is not a multiple of 16, .Lxts_enc_steal swaps the trailing
#    input bytes with the leading bytes of the last output block and
#    that block is encrypted once more with the next tweak.
#  - The stack between %rsp and %rbp is zeroed before returning.
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    # Full 8-block iteration: emit tweak[0..6] and, staggered behind
    # them, the loads of input[0..5] and the first five input^tweak xors.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    # Remainder of 1..7 blocks: same tweak/load sequence as above, but
    # after each load the remaining length is compared against 0x10*i
    # and control leaves to .Lxts_enc_<i> once all input is loaded.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	# Single block: xor with tweak[0], encrypt in place in the
	# scratch slot via asm_AES_encrypt, then xor with the tweak again.
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

	# Trailing partial block: byte by byte, move the start of the last
	# output block to the final output position and put the remaining
	# input bytes in its place.
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	# Encrypt the patched block once more with the next tweak.
	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
2507ec07fdf1Sdjm movaps %xmm6, 0x40(%rsp) 2508ec07fdf1Sdjm movaps %xmm7, 0x50(%rsp) 2509ec07fdf1Sdjm movaps %xmm8, 0x60(%rsp) 2510ec07fdf1Sdjm movaps %xmm9, 0x70(%rsp) 2511ec07fdf1Sdjm movaps %xmm10, 0x80(%rsp) 2512ec07fdf1Sdjm movaps %xmm11, 0x90(%rsp) 2513ec07fdf1Sdjm movaps %xmm12, 0xa0(%rsp) 2514ec07fdf1Sdjm movaps %xmm13, 0xb0(%rsp) 2515ec07fdf1Sdjm movaps %xmm14, 0xc0(%rsp) 2516ec07fdf1Sdjm movaps %xmm15, 0xd0(%rsp) 2517ec07fdf1Sdjm.Lxts_dec_body: 2518ec07fdf1Sdjm___ 2519ec07fdf1Sdjm$code.=<<___; 2520ec07fdf1Sdjm mov %rsp, %rbp # backup %rsp 2521ec07fdf1Sdjm mov $arg1, $inp # backup arguments 2522ec07fdf1Sdjm mov $arg2, $out 2523ec07fdf1Sdjm mov $arg3, $len 2524ec07fdf1Sdjm mov $arg4, $key 2525ec07fdf1Sdjm 2526ec07fdf1Sdjm lea ($arg6), $arg1 2527ec07fdf1Sdjm lea 0x20(%rbp), $arg2 2528ec07fdf1Sdjm lea ($arg5), $arg3 2529ec07fdf1Sdjm call asm_AES_encrypt # generate initial tweak 2530ec07fdf1Sdjm 2531ec07fdf1Sdjm mov 240($key), %eax # rounds 2532ec07fdf1Sdjm mov $len, %rbx # backup $len 2533ec07fdf1Sdjm 2534ec07fdf1Sdjm mov %eax, %edx # rounds 2535ec07fdf1Sdjm shl \$7, %rax # 128 bytes per inner round key 2536ec07fdf1Sdjm sub \$`128-32`, %rax # size of bit-sliced key schedule 2537ec07fdf1Sdjm sub %rax, %rsp 2538ec07fdf1Sdjm 2539ec07fdf1Sdjm mov %rsp, %rax # pass key schedule 2540ec07fdf1Sdjm mov $key, %rcx # pass key 2541ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2542ec07fdf1Sdjm call _bsaes_key_convert 2543ec07fdf1Sdjm pxor (%rsp), %xmm7 # fix up round 0 key 2544ec07fdf1Sdjm movdqa %xmm6, (%rax) # save last round key 2545ec07fdf1Sdjm movdqa %xmm7, (%rsp) 2546ec07fdf1Sdjm 2547ec07fdf1Sdjm xor %eax, %eax # if ($len%16) len-=16; 2548ec07fdf1Sdjm and \$-16, $len 2549ec07fdf1Sdjm test \$15, %ebx 2550ec07fdf1Sdjm setnz %al 2551ec07fdf1Sdjm shl \$4, %rax 2552ec07fdf1Sdjm sub %rax, $len 2553ec07fdf1Sdjm 2554ec07fdf1Sdjm sub \$0x80, %rsp # place for tweak[8] 2555ec07fdf1Sdjm movdqa 0x20(%rbp), @XMM[7] # initial tweak 2556ec07fdf1Sdjm 2557ec07fdf1Sdjm pxor $twtmp, $twtmp 
2558ec07fdf1Sdjm movdqa .Lxts_magic(%rip), $twmask 2559ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2560ec07fdf1Sdjm 2561ec07fdf1Sdjm sub \$0x80, $len 2562ec07fdf1Sdjm jc .Lxts_dec_short 2563ec07fdf1Sdjm jmp .Lxts_dec_loop 2564ec07fdf1Sdjm 2565ec07fdf1Sdjm.align 16 2566ec07fdf1Sdjm.Lxts_dec_loop: 2567ec07fdf1Sdjm___ 2568ec07fdf1Sdjm for ($i=0;$i<7;$i++) { 2569ec07fdf1Sdjm $code.=<<___; 2570ec07fdf1Sdjm pshufd \$0x13, $twtmp, $twres 2571ec07fdf1Sdjm pxor $twtmp, $twtmp 2572ec07fdf1Sdjm movdqa @XMM[7], @XMM[$i] 2573ec07fdf1Sdjm movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2574ec07fdf1Sdjm paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2575ec07fdf1Sdjm pand $twmask, $twres # isolate carry and residue 2576ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2577ec07fdf1Sdjm pxor $twres, @XMM[7] 2578ec07fdf1Sdjm___ 2579ec07fdf1Sdjm $code.=<<___ if ($i>=1); 2580ec07fdf1Sdjm movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2581ec07fdf1Sdjm___ 2582ec07fdf1Sdjm $code.=<<___ if ($i>=2); 2583ec07fdf1Sdjm pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2584ec07fdf1Sdjm___ 2585ec07fdf1Sdjm } 2586ec07fdf1Sdjm$code.=<<___; 2587ec07fdf1Sdjm movdqu 0x60($inp), @XMM[8+6] 2588ec07fdf1Sdjm pxor @XMM[8+5], @XMM[5] 2589ec07fdf1Sdjm movdqu 0x70($inp), @XMM[8+7] 2590ec07fdf1Sdjm lea 0x80($inp), $inp 2591ec07fdf1Sdjm movdqa @XMM[7], 0x70(%rsp) 2592ec07fdf1Sdjm pxor @XMM[8+6], @XMM[6] 2593ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2594ec07fdf1Sdjm pxor @XMM[8+7], @XMM[7] 2595ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2596ec07fdf1Sdjm 2597ec07fdf1Sdjm call _bsaes_decrypt8 2598ec07fdf1Sdjm 2599ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2600ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2601ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2602ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2603ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2604ec07fdf1Sdjm pxor 0x30(%rsp), @XMM[4] 2605ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2606ec07fdf1Sdjm pxor 0x40(%rsp), @XMM[2] 2607ec07fdf1Sdjm 
movdqu @XMM[4], 0x30($out) 2608ec07fdf1Sdjm pxor 0x50(%rsp), @XMM[7] 2609ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 2610ec07fdf1Sdjm pxor 0x60(%rsp), @XMM[3] 2611ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 2612ec07fdf1Sdjm pxor 0x70(%rsp), @XMM[5] 2613ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 2614ec07fdf1Sdjm movdqu @XMM[5], 0x70($out) 2615ec07fdf1Sdjm lea 0x80($out), $out 2616ec07fdf1Sdjm 2617ec07fdf1Sdjm movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2618ec07fdf1Sdjm pxor $twtmp, $twtmp 2619ec07fdf1Sdjm movdqa .Lxts_magic(%rip), $twmask 2620ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp 2621ec07fdf1Sdjm pshufd \$0x13, $twtmp, $twres 2622ec07fdf1Sdjm pxor $twtmp, $twtmp 2623ec07fdf1Sdjm paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2624ec07fdf1Sdjm pand $twmask, $twres # isolate carry and residue 2625ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2626ec07fdf1Sdjm pxor $twres, @XMM[7] 2627ec07fdf1Sdjm 2628ec07fdf1Sdjm sub \$0x80,$len 2629ec07fdf1Sdjm jnc .Lxts_dec_loop 2630ec07fdf1Sdjm 2631ec07fdf1Sdjm.Lxts_dec_short: 2632ec07fdf1Sdjm add \$0x80, $len 2633ec07fdf1Sdjm jz .Lxts_dec_done 2634ec07fdf1Sdjm___ 2635ec07fdf1Sdjm for ($i=0;$i<7;$i++) { 2636ec07fdf1Sdjm $code.=<<___; 2637ec07fdf1Sdjm pshufd \$0x13, $twtmp, $twres 2638ec07fdf1Sdjm pxor $twtmp, $twtmp 2639ec07fdf1Sdjm movdqa @XMM[7], @XMM[$i] 2640ec07fdf1Sdjm movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2641ec07fdf1Sdjm paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2642ec07fdf1Sdjm pand $twmask, $twres # isolate carry and residue 2643ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2644ec07fdf1Sdjm pxor $twres, @XMM[7] 2645ec07fdf1Sdjm___ 2646ec07fdf1Sdjm $code.=<<___ if ($i>=1); 2647ec07fdf1Sdjm movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2648ec07fdf1Sdjm cmp \$`0x10*$i`,$len 2649ec07fdf1Sdjm je .Lxts_dec_$i 2650ec07fdf1Sdjm___ 2651ec07fdf1Sdjm $code.=<<___ if ($i>=2); 2652ec07fdf1Sdjm pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2653ec07fdf1Sdjm___ 2654ec07fdf1Sdjm } 2655ec07fdf1Sdjm$code.=<<___; 
2656ec07fdf1Sdjm movdqu 0x60($inp), @XMM[8+6] 2657ec07fdf1Sdjm pxor @XMM[8+5], @XMM[5] 2658ec07fdf1Sdjm movdqa @XMM[7], 0x70(%rsp) 2659ec07fdf1Sdjm lea 0x70($inp), $inp 2660ec07fdf1Sdjm pxor @XMM[8+6], @XMM[6] 2661ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2662ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2663ec07fdf1Sdjm 2664ec07fdf1Sdjm call _bsaes_decrypt8 2665ec07fdf1Sdjm 2666ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2667ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2668ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2669ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2670ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2671ec07fdf1Sdjm pxor 0x30(%rsp), @XMM[4] 2672ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2673ec07fdf1Sdjm pxor 0x40(%rsp), @XMM[2] 2674ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 2675ec07fdf1Sdjm pxor 0x50(%rsp), @XMM[7] 2676ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 2677ec07fdf1Sdjm pxor 0x60(%rsp), @XMM[3] 2678ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 2679ec07fdf1Sdjm movdqu @XMM[3], 0x60($out) 2680ec07fdf1Sdjm lea 0x70($out), $out 2681ec07fdf1Sdjm 2682ec07fdf1Sdjm movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2683ec07fdf1Sdjm jmp .Lxts_dec_done 2684ec07fdf1Sdjm.align 16 2685ec07fdf1Sdjm.Lxts_dec_6: 2686ec07fdf1Sdjm pxor @XMM[8+4], @XMM[4] 2687ec07fdf1Sdjm lea 0x60($inp), $inp 2688ec07fdf1Sdjm pxor @XMM[8+5], @XMM[5] 2689ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2690ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2691ec07fdf1Sdjm 2692ec07fdf1Sdjm call _bsaes_decrypt8 2693ec07fdf1Sdjm 2694ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2695ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2696ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2697ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2698ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2699ec07fdf1Sdjm pxor 0x30(%rsp), @XMM[4] 2700ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2701ec07fdf1Sdjm pxor 0x40(%rsp), @XMM[2] 2702ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 2703ec07fdf1Sdjm pxor 0x50(%rsp), @XMM[7] 2704ec07fdf1Sdjm 
movdqu @XMM[2], 0x40($out) 2705ec07fdf1Sdjm movdqu @XMM[7], 0x50($out) 2706ec07fdf1Sdjm lea 0x60($out), $out 2707ec07fdf1Sdjm 2708ec07fdf1Sdjm movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2709ec07fdf1Sdjm jmp .Lxts_dec_done 2710ec07fdf1Sdjm.align 16 2711ec07fdf1Sdjm.Lxts_dec_5: 2712ec07fdf1Sdjm pxor @XMM[8+3], @XMM[3] 2713ec07fdf1Sdjm lea 0x50($inp), $inp 2714ec07fdf1Sdjm pxor @XMM[8+4], @XMM[4] 2715ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2716ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2717ec07fdf1Sdjm 2718ec07fdf1Sdjm call _bsaes_decrypt8 2719ec07fdf1Sdjm 2720ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2721ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2722ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2723ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2724ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2725ec07fdf1Sdjm pxor 0x30(%rsp), @XMM[4] 2726ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2727ec07fdf1Sdjm pxor 0x40(%rsp), @XMM[2] 2728ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 2729ec07fdf1Sdjm movdqu @XMM[2], 0x40($out) 2730ec07fdf1Sdjm lea 0x50($out), $out 2731ec07fdf1Sdjm 2732ec07fdf1Sdjm movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2733ec07fdf1Sdjm jmp .Lxts_dec_done 2734ec07fdf1Sdjm.align 16 2735ec07fdf1Sdjm.Lxts_dec_4: 2736ec07fdf1Sdjm pxor @XMM[8+2], @XMM[2] 2737ec07fdf1Sdjm lea 0x40($inp), $inp 2738ec07fdf1Sdjm pxor @XMM[8+3], @XMM[3] 2739ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2740ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2741ec07fdf1Sdjm 2742ec07fdf1Sdjm call _bsaes_decrypt8 2743ec07fdf1Sdjm 2744ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2745ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2746ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2747ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2748ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2749ec07fdf1Sdjm pxor 0x30(%rsp), @XMM[4] 2750ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2751ec07fdf1Sdjm movdqu @XMM[4], 0x30($out) 2752ec07fdf1Sdjm lea 0x40($out), $out 2753ec07fdf1Sdjm 2754ec07fdf1Sdjm movdqa 
0x40(%rsp), @XMM[7] # next iteration tweak 2755ec07fdf1Sdjm jmp .Lxts_dec_done 2756ec07fdf1Sdjm.align 16 2757ec07fdf1Sdjm.Lxts_dec_3: 2758ec07fdf1Sdjm pxor @XMM[8+1], @XMM[1] 2759ec07fdf1Sdjm lea 0x30($inp), $inp 2760ec07fdf1Sdjm pxor @XMM[8+2], @XMM[2] 2761ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2762ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2763ec07fdf1Sdjm 2764ec07fdf1Sdjm call _bsaes_decrypt8 2765ec07fdf1Sdjm 2766ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2767ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2768ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2769ec07fdf1Sdjm pxor 0x20(%rsp), @XMM[6] 2770ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2771ec07fdf1Sdjm movdqu @XMM[6], 0x20($out) 2772ec07fdf1Sdjm lea 0x30($out), $out 2773ec07fdf1Sdjm 2774ec07fdf1Sdjm movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2775ec07fdf1Sdjm jmp .Lxts_dec_done 2776ec07fdf1Sdjm.align 16 2777ec07fdf1Sdjm.Lxts_dec_2: 2778ec07fdf1Sdjm pxor @XMM[8+0], @XMM[0] 2779ec07fdf1Sdjm lea 0x20($inp), $inp 2780ec07fdf1Sdjm pxor @XMM[8+1], @XMM[1] 2781ec07fdf1Sdjm lea 0x80(%rsp), %rax # pass key schedule 2782ec07fdf1Sdjm mov %edx, %r10d # pass rounds 2783ec07fdf1Sdjm 2784ec07fdf1Sdjm call _bsaes_decrypt8 2785ec07fdf1Sdjm 2786ec07fdf1Sdjm pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2787ec07fdf1Sdjm pxor 0x10(%rsp), @XMM[1] 2788ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2789ec07fdf1Sdjm movdqu @XMM[1], 0x10($out) 2790ec07fdf1Sdjm lea 0x20($out), $out 2791ec07fdf1Sdjm 2792ec07fdf1Sdjm movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2793ec07fdf1Sdjm jmp .Lxts_dec_done 2794ec07fdf1Sdjm.align 16 2795ec07fdf1Sdjm.Lxts_dec_1: 2796ec07fdf1Sdjm pxor @XMM[0], @XMM[8] 2797ec07fdf1Sdjm lea 0x10($inp), $inp 2798ec07fdf1Sdjm movdqa @XMM[8], 0x20(%rbp) 2799ec07fdf1Sdjm lea 0x20(%rbp), $arg1 2800ec07fdf1Sdjm lea 0x20(%rbp), $arg2 2801ec07fdf1Sdjm lea ($key), $arg3 2802ec07fdf1Sdjm call asm_AES_decrypt # doesn't touch %xmm 2803ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 
2804ec07fdf1Sdjm #pxor @XMM[8], @XMM[0] 2805ec07fdf1Sdjm #lea 0x80(%rsp), %rax # pass key schedule 2806ec07fdf1Sdjm #mov %edx, %r10d # pass rounds 2807ec07fdf1Sdjm #call _bsaes_decrypt8 2808ec07fdf1Sdjm #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2809ec07fdf1Sdjm movdqu @XMM[0], 0x00($out) # write output 2810ec07fdf1Sdjm lea 0x10($out), $out 2811ec07fdf1Sdjm 2812ec07fdf1Sdjm movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2813ec07fdf1Sdjm 2814ec07fdf1Sdjm.Lxts_dec_done: 2815ec07fdf1Sdjm and \$15, %ebx 2816ec07fdf1Sdjm jz .Lxts_dec_ret 2817ec07fdf1Sdjm 2818ec07fdf1Sdjm pxor $twtmp, $twtmp 2819ec07fdf1Sdjm movdqa .Lxts_magic(%rip), $twmask 2820ec07fdf1Sdjm pcmpgtd @XMM[7], $twtmp 2821ec07fdf1Sdjm pshufd \$0x13, $twtmp, $twres 2822ec07fdf1Sdjm movdqa @XMM[7], @XMM[6] 2823ec07fdf1Sdjm paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2824ec07fdf1Sdjm pand $twmask, $twres # isolate carry and residue 2825ec07fdf1Sdjm movdqu ($inp), @XMM[0] 2826ec07fdf1Sdjm pxor $twres, @XMM[7] 2827ec07fdf1Sdjm 2828ec07fdf1Sdjm lea 0x20(%rbp), $arg1 2829ec07fdf1Sdjm pxor @XMM[7], @XMM[0] 2830ec07fdf1Sdjm lea 0x20(%rbp), $arg2 2831ec07fdf1Sdjm movdqa @XMM[0], 0x20(%rbp) 2832ec07fdf1Sdjm lea ($key), $arg3 2833ec07fdf1Sdjm call asm_AES_decrypt # doesn't touch %xmm 2834ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[7] 2835ec07fdf1Sdjm mov $out, %rdx 2836ec07fdf1Sdjm movdqu @XMM[7], ($out) 2837ec07fdf1Sdjm 2838ec07fdf1Sdjm.Lxts_dec_steal: 2839ec07fdf1Sdjm movzb 16($inp), %eax 2840ec07fdf1Sdjm movzb (%rdx), %ecx 2841ec07fdf1Sdjm lea 1($inp), $inp 2842ec07fdf1Sdjm mov %al, (%rdx) 2843ec07fdf1Sdjm mov %cl, 16(%rdx) 2844ec07fdf1Sdjm lea 1(%rdx), %rdx 2845ec07fdf1Sdjm sub \$1,%ebx 2846ec07fdf1Sdjm jnz .Lxts_dec_steal 2847ec07fdf1Sdjm 2848ec07fdf1Sdjm movdqu ($out), @XMM[0] 2849ec07fdf1Sdjm lea 0x20(%rbp), $arg1 2850ec07fdf1Sdjm pxor @XMM[6], @XMM[0] 2851ec07fdf1Sdjm lea 0x20(%rbp), $arg2 2852ec07fdf1Sdjm movdqa @XMM[0], 0x20(%rbp) 2853ec07fdf1Sdjm lea ($key), $arg3 2854ec07fdf1Sdjm call asm_AES_decrypt # doesn't touch %xmm 
2855ec07fdf1Sdjm pxor 0x20(%rbp), @XMM[6] 2856ec07fdf1Sdjm movdqu @XMM[6], ($out) 2857ec07fdf1Sdjm 2858ec07fdf1Sdjm.Lxts_dec_ret: 2859ec07fdf1Sdjm lea (%rsp), %rax 2860ec07fdf1Sdjm pxor %xmm0, %xmm0 2861ec07fdf1Sdjm.Lxts_dec_bzero: # wipe key schedule [if any] 2862ec07fdf1Sdjm movdqa %xmm0, 0x00(%rax) 2863ec07fdf1Sdjm movdqa %xmm0, 0x10(%rax) 2864ec07fdf1Sdjm lea 0x20(%rax), %rax 2865ec07fdf1Sdjm cmp %rax, %rbp 2866ec07fdf1Sdjm ja .Lxts_dec_bzero 2867ec07fdf1Sdjm 2868ec07fdf1Sdjm lea (%rbp),%rsp # restore %rsp 2869ec07fdf1Sdjm___ 2870ec07fdf1Sdjm$code.=<<___ if ($win64); 2871ec07fdf1Sdjm movaps 0x40(%rbp), %xmm6 2872ec07fdf1Sdjm movaps 0x50(%rbp), %xmm7 2873ec07fdf1Sdjm movaps 0x60(%rbp), %xmm8 2874ec07fdf1Sdjm movaps 0x70(%rbp), %xmm9 2875ec07fdf1Sdjm movaps 0x80(%rbp), %xmm10 2876ec07fdf1Sdjm movaps 0x90(%rbp), %xmm11 2877ec07fdf1Sdjm movaps 0xa0(%rbp), %xmm12 2878ec07fdf1Sdjm movaps 0xb0(%rbp), %xmm13 2879ec07fdf1Sdjm movaps 0xc0(%rbp), %xmm14 2880ec07fdf1Sdjm movaps 0xd0(%rbp), %xmm15 2881ec07fdf1Sdjm lea 0xa0(%rbp), %rsp 2882ec07fdf1Sdjm___ 2883ec07fdf1Sdjm$code.=<<___; 2884ec07fdf1Sdjm mov 0x48(%rsp), %r15 2885ec07fdf1Sdjm mov 0x50(%rsp), %r14 2886ec07fdf1Sdjm mov 0x58(%rsp), %r13 2887ec07fdf1Sdjm mov 0x60(%rsp), %r12 2888ec07fdf1Sdjm mov 0x68(%rsp), %rbx 2889ec07fdf1Sdjm mov 0x70(%rsp), %rax 2890ec07fdf1Sdjm lea 0x78(%rsp), %rsp 2891ec07fdf1Sdjm mov %rax, %rbp 2892ec07fdf1Sdjm.Lxts_dec_epilogue: 2893ec07fdf1Sdjm ret 2894ec07fdf1Sdjm.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2895ec07fdf1Sdjm___ 2896ec07fdf1Sdjm} 2897ec07fdf1Sdjm$code.=<<___; 2898eda85684Stb.section .rodata 2899ec07fdf1Sdjm.type _bsaes_const,\@object 2900ec07fdf1Sdjm.align 64 2901ec07fdf1Sdjm_bsaes_const: 2902ec07fdf1Sdjm.LM0ISR: # InvShiftRows constants 2903ec07fdf1Sdjm .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2904ec07fdf1Sdjm.LISRM0: 2905ec07fdf1Sdjm .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2906ec07fdf1Sdjm.LISR: 2907ec07fdf1Sdjm .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 
2908ec07fdf1Sdjm.LBS0: # bit-slice constants 2909ec07fdf1Sdjm .quad 0x5555555555555555, 0x5555555555555555 2910ec07fdf1Sdjm.LBS1: 2911ec07fdf1Sdjm .quad 0x3333333333333333, 0x3333333333333333 2912ec07fdf1Sdjm.LBS2: 2913ec07fdf1Sdjm .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2914ec07fdf1Sdjm.LSR: # shiftrows constants 2915ec07fdf1Sdjm .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2916ec07fdf1Sdjm.LSRM0: 2917ec07fdf1Sdjm .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2918ec07fdf1Sdjm.LM0SR: 2919ec07fdf1Sdjm .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2920ec07fdf1Sdjm.LSWPUP: # byte-swap upper dword 2921ec07fdf1Sdjm .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2922ec07fdf1Sdjm.LSWPUPM0SR: 2923ec07fdf1Sdjm .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2924ec07fdf1Sdjm.LADD1: # counter increment constants 2925ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000100000000 2926ec07fdf1Sdjm.LADD2: 2927ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000200000000 2928ec07fdf1Sdjm.LADD3: 2929ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000300000000 2930ec07fdf1Sdjm.LADD4: 2931ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000400000000 2932ec07fdf1Sdjm.LADD5: 2933ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000500000000 2934ec07fdf1Sdjm.LADD6: 2935ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000600000000 2936ec07fdf1Sdjm.LADD7: 2937ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000700000000 2938ec07fdf1Sdjm.LADD8: 2939ec07fdf1Sdjm .quad 0x0000000000000000, 0x0000000800000000 2940ec07fdf1Sdjm.Lxts_magic: 2941ec07fdf1Sdjm .long 0x87,0,1,0 2942ec07fdf1Sdjm.Lmasks: 2943ec07fdf1Sdjm .quad 0x0101010101010101, 0x0101010101010101 2944ec07fdf1Sdjm .quad 0x0202020202020202, 0x0202020202020202 2945ec07fdf1Sdjm .quad 0x0404040404040404, 0x0404040404040404 2946ec07fdf1Sdjm .quad 0x0808080808080808, 0x0808080808080808 2947ec07fdf1Sdjm.LM0: 2948ec07fdf1Sdjm .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2949ec07fdf1Sdjm.L63: 2950ec07fdf1Sdjm .quad 0x6363636363636363, 0x6363636363636363 
.align	64
.size	_bsaes_const,.-_bsaes_const
.text
___

# Win64 structured-exception-handling support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# se_handler is the single language-specific handler shared by every
# routine listed in the .pdata/.xdata tables emitted below.  HandlerData[]
# for each routine holds two image-relative labels: the end of its prologue
# (.L*_body) and its epilogue (.L*_epilogue).
#
#   - context->Rip before the first label: nothing has been saved yet that
#     the handler knows how to undo, so context->Rax is taken as the
#     caller's stack pointer.
#   - context->Rip at or past the second label: the frame has already been
#     torn down, so context->Rsp is used unchanged.
#   - otherwise: the frame is located through context->Rbp.  The ten saved
#     %xmm registers are copied from 0x40(frame) into the CONTEXT record
#     (offset 512), the saved %rbx/%rbp/%r12-%r15 are reloaded from the
#     frame, and the stack pointer is advanced past the frame.
#
# In all cases the updated CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is called, and the handler returns 1
# (ExceptionContinueSearch).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata: one (begin, end, unwind-info) triple of image-relative addresses
# per covered routine.  The ECB entries are emitted only when the ECB entry
# points themselves were generated ($ecb).
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: per-routine unwind info.  Each record names se_handler and carries
# the two HandlerData[] labels (.L*_body, .L*_epilogue) that the handler
# compares context->Rip against.
# NOTE(review): the leading ".byte 9,0,0,0" is presumably an UNWIND_INFO
# header (version 1 with the exception-handler flag, no unwind codes) --
# confirm against the Windows x64 exception-handling documentation.
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

# Evaluate every `...` span left in the generated text as a Perl
# expression (e.g. `1232/8`, `128-32`, `0x10*$i`) and substitute the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Output is buffered, so a failed write (full disk, broken pipe) may only
# surface when the handle is closed.  Fail loudly instead of exiting 0 with
# truncated assembly.  If STDOUT was redirected to a filter process earlier
# in the file (not visible from this section), a non-zero exit of that
# process is also reported here, through $?.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
