; ---------------------------------------------------------------------------
; Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
;
; The redistribution and use of this software (with or without changes)
; is allowed without the payment of fees or royalties provided that:
;
;   source code distributions include the above copyright notice, this
;   list of conditions and the following disclaimer;
;
;   binary distributions include the above copyright notice, this list
;   of conditions and the following disclaimer in their documentation.
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its operation, including, but not limited to, correctness
; and fitness for purpose.
; ---------------------------------------------------------------------------
; Issue Date: 20/12/2007
;
; I am grateful to Dag Arne Osvik for many discussions of the techniques that
; can be used to optimise AES assembler code on AMD64/EM64T architectures.
; Some of the techniques used in this implementation are the result of
; suggestions made by him for which I am most grateful.

; An AES implementation for AMD64 processors using the YASM assembler. This
; implementation provides only encryption, decryption and hence requires key
; scheduling support in C. It uses 8k bytes of tables but its encryption and
; decryption performance is very close to that obtained using large tables.
; It can use either Windows or Gnu/Linux calling conventions, which are as
; follows:
;                   windows             gnu/linux
;
;   in_blk          rcx                 rdi
;   out_blk         rdx                 rsi
;   context (cx)    r8                  rdx
;
;   preserved       rsi  -    +         rbx, rbp, rsp, r12, r13, r14 & r15
;   registers       rdi  -              on both
;
;   destroyed       -    rsi  +         rax, rcx, rdx, r8, r9, r10 & r11
;   registers       -    rdi            on both
;
; The default convention is that for windows, the gnu/linux convention being
; used if __GNUC__ is defined.
;
; To build for cryptlib (pcg):
;
;   yasm -Xvc -f win64 -D _SEH_ -o aescryptx64.obj aes_amd64.asm
;
; Define _SEH_ to include support for Win64 structured exception handling
; (this requires YASM version 0.6 or later).
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits). It has the same call
; interface as my C implementation. It uses the Microsoft C AMD64 calling
; conventions in which the three parameters are placed in rcx, rdx and r8
; respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
;
;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 192 or 256. In the last two calls the length can be in
; either bits or bytes.
;
; Comment in/out the following lines to obtain the desired subroutines.
These 81; selections MUST match those in the C header files aes.h and aesopt.h 82 83%define USE_INTEL_AES_IF_AVAILABLE 84 85%define AES_128 ; define if AES with 128 bit keys is needed 86%define AES_192 ; define if AES with 192 bit keys is needed 87%define AES_256 ; define if AES with 256 bit keys is needed 88%define AES_VAR ; define if a variable key size is needed 89%define ENCRYPTION ; define if encryption is needed 90%define DECRYPTION ; define if decryption is needed 91 92%ifdef USE_INTEL_AES_IF_AVAILABLE 93%define aes_ni(x) aes_ %+ x %+ _i 94%undef AES_REV_DKS 95%else 96%define aes_ni(x) aes_ %+ x 97%define AES_REV_DKS 98%endif 99 100%define LAST_ROUND_TABLES ; define for the faster version using extra tables 101 102; The encryption key schedule has the following in memory layout where N is the 103; number of rounds (10, 12 or 14): 104; 105; lo: | input key (round 0) | ; each round is four 32-bit words 106; | encryption round 1 | 107; | encryption round 2 | 108; .... 109; | encryption round N-1 | 110; hi: | encryption round N | 111; 112; The decryption key schedule is normally set up so that it has the same 113; layout as above by actually reversing the order of the encryption key 114; schedule in memory (this happens when AES_REV_DKS is set): 115; 116; lo: | decryption round 0 | = | encryption round N | 117; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] 118; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] 119; .... .... 
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
; hi: | decryption round N   | =              | input key (round 0)  |
;
; with rounds except the first and last modified using inv_mix_column()
; But if AES_REV_DKS is NOT set the order of keys is left as it is for
; encryption so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are done)
;
; lo: | decryption round 0   | =              | input key (round 0)  |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
;     ....                        ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N   | =              | encryption round N   |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
;
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
142 143;%define DLL_EXPORT 144 145; End of user defines 146 147%ifdef AES_VAR 148%ifndef AES_128 149%define AES_128 150%endif 151%ifndef AES_192 152%define AES_192 153%endif 154%ifndef AES_256 155%define AES_256 156%endif 157%endif 158 159%ifdef AES_VAR 160%define KS_LENGTH 60 161%elifdef AES_256 162%define KS_LENGTH 60 163%elifdef AES_192 164%define KS_LENGTH 52 165%else 166%define KS_LENGTH 44 167%endif 168 169%define r0 rax 170%define r1 rdx 171%define r2 rcx 172%define r3 rbx 173%define r4 rsi 174%define r5 rdi 175%define r6 rbp 176%define r7 rsp 177 178%define raxd eax 179%define rdxd edx 180%define rcxd ecx 181%define rbxd ebx 182%define rsid esi 183%define rdid edi 184%define rbpd ebp 185%define rspd esp 186 187%define raxb al 188%define rdxb dl 189%define rcxb cl 190%define rbxb bl 191%define rsib sil 192%define rdib dil 193%define rbpb bpl 194%define rspb spl 195 196%define r0h ah 197%define r1h dh 198%define r2h ch 199%define r3h bh 200 201%define r0d eax 202%define r1d edx 203%define r2d ecx 204%define r3d ebx 205 206; finite field multiplies by {02}, {04} and {08} 207 208%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) 209%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) 210%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) 211 212; finite field multiplies required in table generation 213 214%define f3(x) (f2(x) ^ x) 215%define f9(x) (f8(x) ^ x) 216%define fb(x) (f8(x) ^ f2(x) ^ x) 217%define fd(x) (f8(x) ^ f4(x) ^ x) 218%define fe(x) (f8(x) ^ f4(x) ^ f2(x)) 219 220; macro for expanding S-box data 221 222%macro enc_vals 1 223 db %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5) 224 db %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76) 225 db %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0) 226 db %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0) 227 db %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc) 228 db 
%1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15) 229 db %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a) 230 db %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75) 231 db %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0) 232 db %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84) 233 db %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b) 234 db %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf) 235 db %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85) 236 db %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8) 237 db %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5) 238 db %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2) 239 db %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17) 240 db %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73) 241 db %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88) 242 db %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb) 243 db %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c) 244 db %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79) 245 db %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9) 246 db %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08) 247 db %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6) 248 db %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a) 249 db %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e) 250 db %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e) 251 db %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94) 252 db %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf) 253 db 
%1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68) 254 db %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16) 255%endmacro 256 257%macro dec_vals 1 258 db %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38) 259 db %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb) 260 db %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87) 261 db %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb) 262 db %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d) 263 db %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e) 264 db %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2) 265 db %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25) 266 db %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16) 267 db %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92) 268 db %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda) 269 db %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84) 270 db %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a) 271 db %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06) 272 db %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02) 273 db %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b) 274 db %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea) 275 db %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73) 276 db %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85) 277 db %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e) 278 db %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89) 279 db %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b) 280 db 
%1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20) 281 db %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4) 282 db %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31) 283 db %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f) 284 db %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d) 285 db %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef) 286 db %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0) 287 db %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61) 288 db %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26) 289 db %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d) 290%endmacro 291 292%define u8(x) f2(x), x, x, f3(x), f2(x), x, x, f3(x) 293%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x 294%define w8(x) x, 0, 0, 0, x, 0, 0, 0 295 296%define tptr rbp ; table pointer 297%define kptr r8 ; key schedule pointer 298%define fofs 128 ; adjust offset in key schedule to keep |disp| < 128 299%define fk_ref(x,y) [kptr-16*x+fofs+4*y] 300%ifdef AES_REV_DKS 301%define rofs 128 302%define ik_ref(x,y) [kptr-16*x+rofs+4*y] 303%else 304%define rofs -128 305%define ik_ref(x,y) [kptr+16*x+rofs+4*y] 306%endif 307 308%define tab_0(x) [tptr+8*x] 309%define tab_1(x) [tptr+8*x+3] 310%define tab_2(x) [tptr+8*x+2] 311%define tab_3(x) [tptr+8*x+1] 312%define tab_f(x) byte [tptr+8*x+1] 313%define tab_i(x) byte [tptr+8*x+7] 314%define t_ref(x,r) tab_ %+ x(r) 315 316%macro ff_rnd 5 ; normal forward round 317 mov %1d, fk_ref(%5,0) 318 mov %2d, fk_ref(%5,1) 319 mov %3d, fk_ref(%5,2) 320 mov %4d, fk_ref(%5,3) 321 322 movzx esi, al 323 movzx edi, ah 324 shr eax, 16 325 xor %1d, t_ref(0,rsi) 326 xor %4d, t_ref(1,rdi) 327 movzx esi, al 328 movzx edi, ah 329 xor %3d, t_ref(2,rsi) 330 xor %2d, t_ref(3,rdi) 331 332 movzx esi, bl 333 movzx edi, bh 334 shr ebx, 16 335 xor %2d, 
t_ref(0,rsi) 336 xor %1d, t_ref(1,rdi) 337 movzx esi, bl 338 movzx edi, bh 339 xor %4d, t_ref(2,rsi) 340 xor %3d, t_ref(3,rdi) 341 342 movzx esi, cl 343 movzx edi, ch 344 shr ecx, 16 345 xor %3d, t_ref(0,rsi) 346 xor %2d, t_ref(1,rdi) 347 movzx esi, cl 348 movzx edi, ch 349 xor %1d, t_ref(2,rsi) 350 xor %4d, t_ref(3,rdi) 351 352 movzx esi, dl 353 movzx edi, dh 354 shr edx, 16 355 xor %4d, t_ref(0,rsi) 356 xor %3d, t_ref(1,rdi) 357 movzx esi, dl 358 movzx edi, dh 359 xor %2d, t_ref(2,rsi) 360 xor %1d, t_ref(3,rdi) 361 362 mov eax,%1d 363 mov ebx,%2d 364 mov ecx,%3d 365 mov edx,%4d 366%endmacro 367 368%ifdef LAST_ROUND_TABLES 369 370%macro fl_rnd 5 ; last forward round 371 add tptr, 2048 372 mov %1d, fk_ref(%5,0) 373 mov %2d, fk_ref(%5,1) 374 mov %3d, fk_ref(%5,2) 375 mov %4d, fk_ref(%5,3) 376 377 movzx esi, al 378 movzx edi, ah 379 shr eax, 16 380 xor %1d, t_ref(0,rsi) 381 xor %4d, t_ref(1,rdi) 382 movzx esi, al 383 movzx edi, ah 384 xor %3d, t_ref(2,rsi) 385 xor %2d, t_ref(3,rdi) 386 387 movzx esi, bl 388 movzx edi, bh 389 shr ebx, 16 390 xor %2d, t_ref(0,rsi) 391 xor %1d, t_ref(1,rdi) 392 movzx esi, bl 393 movzx edi, bh 394 xor %4d, t_ref(2,rsi) 395 xor %3d, t_ref(3,rdi) 396 397 movzx esi, cl 398 movzx edi, ch 399 shr ecx, 16 400 xor %3d, t_ref(0,rsi) 401 xor %2d, t_ref(1,rdi) 402 movzx esi, cl 403 movzx edi, ch 404 xor %1d, t_ref(2,rsi) 405 xor %4d, t_ref(3,rdi) 406 407 movzx esi, dl 408 movzx edi, dh 409 shr edx, 16 410 xor %4d, t_ref(0,rsi) 411 xor %3d, t_ref(1,rdi) 412 movzx esi, dl 413 movzx edi, dh 414 xor %2d, t_ref(2,rsi) 415 xor %1d, t_ref(3,rdi) 416%endmacro 417 418%else 419 420%macro fl_rnd 5 ; last forward round 421 mov %1d, fk_ref(%5,0) 422 mov %2d, fk_ref(%5,1) 423 mov %3d, fk_ref(%5,2) 424 mov %4d, fk_ref(%5,3) 425 426 movzx esi, al 427 movzx edi, ah 428 shr eax, 16 429 movzx esi, t_ref(f,rsi) 430 movzx edi, t_ref(f,rdi) 431 xor %1d, esi 432 rol edi, 8 433 xor %4d, edi 434 movzx esi, al 435 movzx edi, ah 436 movzx esi, t_ref(f,rsi) 437 movzx edi, 
t_ref(f,rdi) 438 rol esi, 16 439 rol edi, 24 440 xor %3d, esi 441 xor %2d, edi 442 443 movzx esi, bl 444 movzx edi, bh 445 shr ebx, 16 446 movzx esi, t_ref(f,rsi) 447 movzx edi, t_ref(f,rdi) 448 xor %2d, esi 449 rol edi, 8 450 xor %1d, edi 451 movzx esi, bl 452 movzx edi, bh 453 movzx esi, t_ref(f,rsi) 454 movzx edi, t_ref(f,rdi) 455 rol esi, 16 456 rol edi, 24 457 xor %4d, esi 458 xor %3d, edi 459 460 movzx esi, cl 461 movzx edi, ch 462 movzx esi, t_ref(f,rsi) 463 movzx edi, t_ref(f,rdi) 464 shr ecx, 16 465 xor %3d, esi 466 rol edi, 8 467 xor %2d, edi 468 movzx esi, cl 469 movzx edi, ch 470 movzx esi, t_ref(f,rsi) 471 movzx edi, t_ref(f,rdi) 472 rol esi, 16 473 rol edi, 24 474 xor %1d, esi 475 xor %4d, edi 476 477 movzx esi, dl 478 movzx edi, dh 479 movzx esi, t_ref(f,rsi) 480 movzx edi, t_ref(f,rdi) 481 shr edx, 16 482 xor %4d, esi 483 rol edi, 8 484 xor %3d, edi 485 movzx esi, dl 486 movzx edi, dh 487 movzx esi, t_ref(f,rsi) 488 movzx edi, t_ref(f,rdi) 489 rol esi, 16 490 rol edi, 24 491 xor %2d, esi 492 xor %1d, edi 493%endmacro 494 495%endif 496 497%macro ii_rnd 5 ; normal inverse round 498 mov %1d, ik_ref(%5,0) 499 mov %2d, ik_ref(%5,1) 500 mov %3d, ik_ref(%5,2) 501 mov %4d, ik_ref(%5,3) 502 503 movzx esi, al 504 movzx edi, ah 505 shr eax, 16 506 xor %1d, t_ref(0,rsi) 507 xor %2d, t_ref(1,rdi) 508 movzx esi, al 509 movzx edi, ah 510 xor %3d, t_ref(2,rsi) 511 xor %4d, t_ref(3,rdi) 512 513 movzx esi, bl 514 movzx edi, bh 515 shr ebx, 16 516 xor %2d, t_ref(0,rsi) 517 xor %3d, t_ref(1,rdi) 518 movzx esi, bl 519 movzx edi, bh 520 xor %4d, t_ref(2,rsi) 521 xor %1d, t_ref(3,rdi) 522 523 movzx esi, cl 524 movzx edi, ch 525 shr ecx, 16 526 xor %3d, t_ref(0,rsi) 527 xor %4d, t_ref(1,rdi) 528 movzx esi, cl 529 movzx edi, ch 530 xor %1d, t_ref(2,rsi) 531 xor %2d, t_ref(3,rdi) 532 533 movzx esi, dl 534 movzx edi, dh 535 shr edx, 16 536 xor %4d, t_ref(0,rsi) 537 xor %1d, t_ref(1,rdi) 538 movzx esi, dl 539 movzx edi, dh 540 xor %2d, t_ref(2,rsi) 541 xor %3d, t_ref(3,rdi) 
542 543 mov eax,%1d 544 mov ebx,%2d 545 mov ecx,%3d 546 mov edx,%4d 547%endmacro 548 549%ifdef LAST_ROUND_TABLES 550 551%macro il_rnd 5 ; last inverse round 552 add tptr, 2048 553 mov %1d, ik_ref(%5,0) 554 mov %2d, ik_ref(%5,1) 555 mov %3d, ik_ref(%5,2) 556 mov %4d, ik_ref(%5,3) 557 558 movzx esi, al 559 movzx edi, ah 560 shr eax, 16 561 xor %1d, t_ref(0,rsi) 562 xor %2d, t_ref(1,rdi) 563 movzx esi, al 564 movzx edi, ah 565 xor %3d, t_ref(2,rsi) 566 xor %4d, t_ref(3,rdi) 567 568 movzx esi, bl 569 movzx edi, bh 570 shr ebx, 16 571 xor %2d, t_ref(0,rsi) 572 xor %3d, t_ref(1,rdi) 573 movzx esi, bl 574 movzx edi, bh 575 xor %4d, t_ref(2,rsi) 576 xor %1d, t_ref(3,rdi) 577 578 movzx esi, cl 579 movzx edi, ch 580 shr ecx, 16 581 xor %3d, t_ref(0,rsi) 582 xor %4d, t_ref(1,rdi) 583 movzx esi, cl 584 movzx edi, ch 585 xor %1d, t_ref(2,rsi) 586 xor %2d, t_ref(3,rdi) 587 588 movzx esi, dl 589 movzx edi, dh 590 shr edx, 16 591 xor %4d, t_ref(0,rsi) 592 xor %1d, t_ref(1,rdi) 593 movzx esi, dl 594 movzx edi, dh 595 xor %2d, t_ref(2,rsi) 596 xor %3d, t_ref(3,rdi) 597%endmacro 598 599%else 600 601%macro il_rnd 5 ; last inverse round 602 mov %1d, ik_ref(%5,0) 603 mov %2d, ik_ref(%5,1) 604 mov %3d, ik_ref(%5,2) 605 mov %4d, ik_ref(%5,3) 606 607 movzx esi, al 608 movzx edi, ah 609 movzx esi, t_ref(i,rsi) 610 movzx edi, t_ref(i,rdi) 611 shr eax, 16 612 xor %1d, esi 613 rol edi, 8 614 xor %2d, edi 615 movzx esi, al 616 movzx edi, ah 617 movzx esi, t_ref(i,rsi) 618 movzx edi, t_ref(i,rdi) 619 rol esi, 16 620 rol edi, 24 621 xor %3d, esi 622 xor %4d, edi 623 624 movzx esi, bl 625 movzx edi, bh 626 movzx esi, t_ref(i,rsi) 627 movzx edi, t_ref(i,rdi) 628 shr ebx, 16 629 xor %2d, esi 630 rol edi, 8 631 xor %3d, edi 632 movzx esi, bl 633 movzx edi, bh 634 movzx esi, t_ref(i,rsi) 635 movzx edi, t_ref(i,rdi) 636 rol esi, 16 637 rol edi, 24 638 xor %4d, esi 639 xor %1d, edi 640 641 movzx esi, cl 642 movzx edi, ch 643 movzx esi, t_ref(i,rsi) 644 movzx edi, t_ref(i,rdi) 645 shr ecx, 16 646 xor 
%3d, esi 647 rol edi, 8 648 xor %4d, edi 649 movzx esi, cl 650 movzx edi, ch 651 movzx esi, t_ref(i,rsi) 652 movzx edi, t_ref(i,rdi) 653 rol esi, 16 654 rol edi, 24 655 xor %1d, esi 656 xor %2d, edi 657 658 movzx esi, dl 659 movzx edi, dh 660 movzx esi, t_ref(i,rsi) 661 movzx edi, t_ref(i,rdi) 662 shr edx, 16 663 xor %4d, esi 664 rol edi, 8 665 xor %1d, edi 666 movzx esi, dl 667 movzx edi, dh 668 movzx esi, t_ref(i,rsi) 669 movzx edi, t_ref(i,rdi) 670 rol esi, 16 671 rol edi, 24 672 xor %2d, esi 673 xor %3d, edi 674%endmacro 675 676%endif 677 678%ifdef ENCRYPTION 679 680 global aes_ni(encrypt) 681%ifdef DLL_EXPORT 682 export aes_ni(encrypt) 683%endif 684 685 section .data align=64 686 align 64 687enc_tab: 688 enc_vals u8 689%ifdef LAST_ROUND_TABLES 690 enc_vals w8 691%endif 692 693 section .text align=16 694 align 16 695 696%ifdef _SEH_ 697proc_frame aes_ni(encrypt) 698 alloc_stack 7*8 ; 7 to align stack to 16 bytes 699 save_reg rsi,4*8 700 save_reg rdi,5*8 701 save_reg rbx,1*8 702 save_reg rbp,2*8 703 save_reg r12,3*8 704end_prologue 705 mov rdi, rcx ; input pointer 706 mov [rsp+0*8], rdx ; output pointer 707%else 708 aes_ni(encrypt): 709 %ifdef __GNUC__ 710 sub rsp, 4*8 ; gnu/linux binary interface 711 mov [rsp+0*8], rsi ; output pointer 712 mov r8, rdx ; context 713 %else 714 sub rsp, 6*8 ; windows binary interface 715 mov [rsp+4*8], rsi 716 mov [rsp+5*8], rdi 717 mov rdi, rcx ; input pointer 718 mov [rsp+0*8], rdx ; output pointer 719 %endif 720 mov [rsp+1*8], rbx ; input pointer in rdi 721 mov [rsp+2*8], rbp ; output pointer in [rsp] 722 mov [rsp+3*8], r12 ; context in r8 723%endif 724 725 movzx esi, byte [kptr+4*KS_LENGTH] 726 lea tptr, [rel enc_tab] 727 sub kptr, fofs 728 729 mov eax, [rdi+0*4] 730 mov ebx, [rdi+1*4] 731 mov ecx, [rdi+2*4] 732 mov edx, [rdi+3*4] 733 734 xor eax, [kptr+fofs] 735 xor ebx, [kptr+fofs+4] 736 xor ecx, [kptr+fofs+8] 737 xor edx, [kptr+fofs+12] 738 739 lea kptr,[kptr+rsi] 740 cmp esi, 10*16 741 je .3 742 cmp esi, 12*16 743 je .2 
744 cmp esi, 14*16 745 je .1 746 mov rax, -1 747 jmp .4 748 749.1: ff_rnd r9, r10, r11, r12, 13 750 ff_rnd r9, r10, r11, r12, 12 751.2: ff_rnd r9, r10, r11, r12, 11 752 ff_rnd r9, r10, r11, r12, 10 753.3: ff_rnd r9, r10, r11, r12, 9 754 ff_rnd r9, r10, r11, r12, 8 755 ff_rnd r9, r10, r11, r12, 7 756 ff_rnd r9, r10, r11, r12, 6 757 ff_rnd r9, r10, r11, r12, 5 758 ff_rnd r9, r10, r11, r12, 4 759 ff_rnd r9, r10, r11, r12, 3 760 ff_rnd r9, r10, r11, r12, 2 761 ff_rnd r9, r10, r11, r12, 1 762 fl_rnd r9, r10, r11, r12, 0 763 764 mov rbx, [rsp] 765 mov [rbx], r9d 766 mov [rbx+4], r10d 767 mov [rbx+8], r11d 768 mov [rbx+12], r12d 769 xor rax, rax 770.4: 771 mov rbx, [rsp+1*8] 772 mov rbp, [rsp+2*8] 773 mov r12, [rsp+3*8] 774%ifdef __GNUC__ 775 add rsp, 4*8 776 ret 777%else 778 mov rsi, [rsp+4*8] 779 mov rdi, [rsp+5*8] 780 %ifdef _SEH_ 781 add rsp, 7*8 782 ret 783 endproc_frame 784 %else 785 add rsp, 6*8 786 ret 787 %endif 788%endif 789 790%endif 791 792%ifdef DECRYPTION 793 794 global aes_ni(decrypt) 795%ifdef DLL_EXPORT 796 export aes_ni(decrypt) 797%endif 798 799 section .data 800 align 64 801dec_tab: 802 dec_vals v8 803%ifdef LAST_ROUND_TABLES 804 dec_vals w8 805%endif 806 807 section .text 808 align 16 809 810%ifdef _SEH_ 811proc_frame aes_ni(decrypt) 812 alloc_stack 7*8 ; 7 to align stack to 16 bytes 813 save_reg rsi,4*8 814 save_reg rdi,5*8 815 save_reg rbx,1*8 816 save_reg rbp,2*8 817 save_reg r12,3*8 818end_prologue 819 mov rdi, rcx ; input pointer 820 mov [rsp+0*8], rdx ; output pointer 821%else 822 aes_ni(decrypt): 823 %ifdef __GNUC__ 824 sub rsp, 4*8 ; gnu/linux binary interface 825 mov [rsp+0*8], rsi ; output pointer 826 mov r8, rdx ; context 827 %else 828 sub rsp, 6*8 ; windows binary interface 829 mov [rsp+4*8], rsi 830 mov [rsp+5*8], rdi 831 mov rdi, rcx ; input pointer 832 mov [rsp+0*8], rdx ; output pointer 833 %endif 834 mov [rsp+1*8], rbx ; input pointer in rdi 835 mov [rsp+2*8], rbp ; output pointer in [rsp] 836 mov [rsp+3*8], r12 ; context in r8 
837%endif 838 839 movzx esi, byte[kptr+4*KS_LENGTH] 840 lea tptr, [rel dec_tab] 841 sub kptr, rofs 842 843 mov eax, [rdi+0*4] 844 mov ebx, [rdi+1*4] 845 mov ecx, [rdi+2*4] 846 mov edx, [rdi+3*4] 847 848%ifdef AES_REV_DKS 849 mov rdi, kptr 850 lea kptr,[kptr+rsi] 851%else 852 lea rdi,[kptr+rsi] 853%endif 854 855 xor eax, [rdi+rofs] 856 xor ebx, [rdi+rofs+4] 857 xor ecx, [rdi+rofs+8] 858 xor edx, [rdi+rofs+12] 859 860 cmp esi, 10*16 861 je .3 862 cmp esi, 12*16 863 je .2 864 cmp esi, 14*16 865 je .1 866 mov rax, -1 867 jmp .4 868 869.1: ii_rnd r9, r10, r11, r12, 13 870 ii_rnd r9, r10, r11, r12, 12 871.2: ii_rnd r9, r10, r11, r12, 11 872 ii_rnd r9, r10, r11, r12, 10 873.3: ii_rnd r9, r10, r11, r12, 9 874 ii_rnd r9, r10, r11, r12, 8 875 ii_rnd r9, r10, r11, r12, 7 876 ii_rnd r9, r10, r11, r12, 6 877 ii_rnd r9, r10, r11, r12, 5 878 ii_rnd r9, r10, r11, r12, 4 879 ii_rnd r9, r10, r11, r12, 3 880 ii_rnd r9, r10, r11, r12, 2 881 ii_rnd r9, r10, r11, r12, 1 882 il_rnd r9, r10, r11, r12, 0 883 884 mov rbx, [rsp] 885 mov [rbx], r9d 886 mov [rbx+4], r10d 887 mov [rbx+8], r11d 888 mov [rbx+12], r12d 889 xor rax, rax 890.4: mov rbx, [rsp+1*8] 891 mov rbp, [rsp+2*8] 892 mov r12, [rsp+3*8] 893%ifdef __GNUC__ 894 add rsp, 4*8 895 ret 896%else 897 mov rsi, [rsp+4*8] 898 mov rdi, [rsp+5*8] 899 %ifdef _SEH_ 900 add rsp, 7*8 901 ret 902 endproc_frame 903 %else 904 add rsp, 6*8 905 ret 906 %endif 907%endif 908 909%endif 910 911 end 912