; ---------------------------------------------------------------------------
; Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
;
; The redistribution and use of this software (with or without changes)
; is allowed without the payment of fees or royalties provided that:
;
;   source code distributions include the above copyright notice, this
;   list of conditions and the following disclaimer;
;
;   binary distributions include the above copyright notice, this list
;   of conditions and the following disclaimer in their documentation.
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its operation, including, but not limited to, correctness
; and fitness for purpose.
; ---------------------------------------------------------------------------
; Issue Date: 20/12/2007
;
; I am grateful to Dag Arne Osvik for many discussions of the techniques that
; can be used to optimise AES assembler code on AMD64/EM64T architectures.
; Some of the techniques used in this implementation are the result of
; suggestions made by him, for which I am most grateful.

; An AES implementation for AMD64 processors using the YASM assembler.  This
; implementation provides only encryption and decryption and hence requires
; key scheduling support in C.  It uses 8k bytes of tables but its encryption
; and decryption performance is very close to that obtained using large
; tables.  It can use either Windows or Gnu/Linux calling conventions, which
; are as follows:
;               windows  gnu/linux
;
;   in_blk          rcx     rdi
;   out_blk         rdx     rsi
;   context (cx)     r8     rdx
;
;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
;   registers       rdi      -      on both
;
;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
;   registers        -      rdi     on both
;
; The default convention is that for Windows; the Gnu/Linux convention is
; used if __GNUC__ is defined.
;
; To build for cryptlib (pcg):
;
;   yasm -Xvc -f win64 -D _SEH_ -o aescryptx64.obj aes_amd64.asm
;
; Define _SEH_ to include support for Win64 structured exception handling
; (this requires YASM version 0.6 or later).
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits).  It has the same call
; interface as my C implementation.  It uses the Microsoft C AMD64 calling
; conventions in which the three parameters are placed in rcx, rdx and r8
; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
;
;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
; either bits or bytes.
;
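; As an illustration only (not part of this file), a C caller might drive the
; fixed key size interface roughly as follows; this is a minimal sketch that
; assumes the declarations and the AES_BLOCK_SIZE constant from aes.h:
;
;     #include "aes.h"
;
;     unsigned char key[16], pt[AES_BLOCK_SIZE], ct[AES_BLOCK_SIZE];
;     aes_encrypt_ctx ecx[1];
;
;     aes_encrypt_key128(key, ecx);   /* key schedule is built in C        */
;     aes_encrypt(pt, ct, ecx);       /* one 16-byte block, assembled here */
;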
; Comment in/out the following lines to obtain the desired subroutines. These
; selections MUST match those in the C header files aes.h and aesopt.h

%define USE_INTEL_AES_IF_AVAILABLE

%define AES_128                 ; define if AES with 128 bit keys is needed
%define AES_192                 ; define if AES with 192 bit keys is needed
%define AES_256                 ; define if AES with 256 bit keys is needed
%define AES_VAR                 ; define if a variable key size is needed
%define ENCRYPTION              ; define if encryption is needed
%define DECRYPTION              ; define if decryption is needed

%ifdef USE_INTEL_AES_IF_AVAILABLE
%define aes_ni(x) aes_ %+ x %+ _i
%undef  AES_REV_DKS
%else
%define aes_ni(x) aes_ %+ x
%define AES_REV_DKS
%endif

%define LAST_ROUND_TABLES       ; define for the faster version using extra tables

; The encryption key schedule has the following layout in memory, where N is
; the number of rounds (10, 12 or 14):
;
; lo: | input key (round 0)  |  ; each round is four 32-bit words
;     | encryption round 1   |
;     | encryption round 2   |
;     ....
;     | encryption round N-1 |
; hi: | encryption round N   |
;
; The decryption key schedule is normally set up so that it has the same
; layout as above by actually reversing the order of the encryption key
; schedule in memory (this happens when AES_REV_DKS is set):
;
; lo: | decryption round 0   | =              | encryption round N   |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
; hi: | decryption round N   | =              | input key (round 0)  |
;
; with all rounds except the first and last modified using inv_mix_column().
; But if AES_REV_DKS is NOT set, the order of keys is left as it is for
; encryption, so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are still done):
;
; lo: | decryption round 0   | =              | input key (round 0)  |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N   | =              | encryption round N   |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
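;
; For example, with a 128-bit key (N = 10) the schedule holds 11 round keys of
; 16 bytes each.  With AES_REV_DKS set, decryption round r is read from byte
; offset 16*r, exactly as for encryption; without it, decryption round r is
; read from offset 16*(N-r), i.e. the schedule is walked from the top down.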
;
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.

;%define DLL_EXPORT

; End of user defines

%ifdef AES_VAR
%ifndef AES_128
%define AES_128
%endif
%ifndef AES_192
%define AES_192
%endif
%ifndef AES_256
%define AES_256
%endif
%endif

%ifdef AES_VAR
%define KS_LENGTH       60
%elifdef AES_256
%define KS_LENGTH       60
%elifdef AES_192
%define KS_LENGTH       52
%else
%define KS_LENGTH       44
%endif
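; KS_LENGTH is the length of the key schedule in 32-bit words: 4*(N+1) words
; for N rounds, hence 44, 52 or 60 for 10, 12 or 14 rounds (the variable key
; size build reserves space for the largest schedule).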
%define     r0  rax
%define     r1  rdx
%define     r2  rcx
%define     r3  rbx
%define     r4  rsi
%define     r5  rdi
%define     r6  rbp
%define     r7  rsp

%define     raxd    eax
%define     rdxd    edx
%define     rcxd    ecx
%define     rbxd    ebx
%define     rsid    esi
%define     rdid    edi
%define     rbpd    ebp
%define     rspd    esp

%define     raxb    al
%define     rdxb    dl
%define     rcxb    cl
%define     rbxb    bl
%define     rsib    sil
%define     rdib    dil
%define     rbpb    bpl
%define     rspb    spl

%define     r0h ah
%define     r1h dh
%define     r2h ch
%define     r3h bh

%define     r0d eax
%define     r1d edx
%define     r2d ecx
%define     r3d ebx

; finite field multiplies by {02}, {04} and {08}

%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

; finite field multiplies required in table generation

%define f3(x)   (f2(x) ^ x)
%define f9(x)   (f8(x) ^ x)
%define fb(x)   (f8(x) ^ f2(x) ^ x)
%define fd(x)   (f8(x) ^ f4(x) ^ x)
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
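; For example, f2(0x57) = 0xae (no reduction, bit 7 of 0x57 is clear) while
; f2(0xae) = 0x15c ^ 0x11b = 0x47; likewise f4(0x57) = 0x47 and
; f8(0x57) = 0x8e, matching the {02}, {04} and {08} products in FIPS-197.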
; macro for expanding S-box data

%macro enc_vals 1
    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
%endmacro

%macro dec_vals 1
    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
%endmacro

%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
%define w8(x)   x, 0, 0, 0, x, 0, 0, 0
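; u8, v8 and w8 expand a single (inverse) S-box byte x into one 8-byte table
; entry: u8 gives the forward column pattern (2x, x, x, 3x) stored twice, v8
; the inverse pattern (Ex, 9x, Dx, Bx) with the plain byte x in the final
; position, and w8 just the byte itself for the unmixed last-round tables.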
%define tptr    rbp     ; table pointer
%define kptr    r8      ; key schedule pointer
%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
%define fk_ref(x,y) [kptr-16*x+fofs+4*y]
%ifdef  AES_REV_DKS
%define rofs    128
%define ik_ref(x,y) [kptr-16*x+rofs+4*y]
%else
%define rofs    -128
%define ik_ref(x,y) [kptr+16*x+rofs+4*y]
%endif
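; The +/-128 offsets above keep the displacement in the key schedule
; references within the signed byte range -128..+127, so the shorter
; one-byte displacement form of the addressing mode can be used throughout.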
%define tab_0(x)   [tptr+8*x]
%define tab_1(x)   [tptr+8*x+3]
%define tab_2(x)   [tptr+8*x+2]
%define tab_3(x)   [tptr+8*x+1]
%define tab_f(x)   byte [tptr+8*x+1]
%define tab_i(x)   byte [tptr+8*x+7]
%define t_ref(x,r) tab_ %+ x(r)
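; Each table entry is 8 bytes, so tab_0..tab_3 read the same duplicated 4-byte
; column at byte offsets 0, 3, 2 and 1, giving the four byte rotations needed
; for the four positions in a column.  tab_f and tab_i pick out the single
; byte within an entry that holds the plain (forward or inverse) S-box value,
; which is what the non-table last round macros below use.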
%macro ff_rnd 5                 ; normal forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    mov     eax,%1d
    mov     ebx,%2d
    mov     ecx,%3d
    mov     edx,%4d
%endmacro

%ifdef LAST_ROUND_TABLES

%macro fl_rnd 5                 ; last forward round
    add     tptr, 2048
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)
%endmacro

%else

%macro fl_rnd 5                 ; last forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %1d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %2d, edi

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %2d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %3d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %4d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %1d, edi
%endmacro

%endif

%macro ii_rnd 5                 ; normal inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    mov     eax,%1d
    mov     ebx,%2d
    mov     ecx,%3d
    mov     edx,%4d
%endmacro

%ifdef LAST_ROUND_TABLES

%macro il_rnd 5                 ; last inverse round
    add     tptr, 2048
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)
%endmacro

%else

%macro il_rnd 5                 ; last inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     eax, 16
    xor     %1d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %4d, edi

    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ebx, 16
    xor     %2d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %1d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %2d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %3d, edi
%endmacro

%endif
%ifdef ENCRYPTION

    global  aes_ni(encrypt)
%ifdef DLL_EXPORT
    export  aes_ni(encrypt)
%endif

    section .data align=64
    align   64
enc_tab:
    enc_vals u8
%ifdef LAST_ROUND_TABLES
    enc_vals w8
%endif

    section .text align=16
    align   16

%ifdef _SEH_
proc_frame aes_ni(encrypt)
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg    rsi,4*8
    save_reg    rdi,5*8
    save_reg    rbx,1*8
    save_reg    rbp,2*8
    save_reg    r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
    aes_ni(encrypt):
    %ifdef __GNUC__
        sub     rsp, 4*8        ; gnu/linux binary interface
        mov     [rsp+0*8], rsi  ; output pointer
        mov     r8, rdx         ; context
    %else
        sub     rsp, 6*8        ; windows binary interface
        mov     [rsp+4*8], rsi
        mov     [rsp+5*8], rdi
        mov     rdi, rcx        ; input pointer
        mov     [rsp+0*8], rdx  ; output pointer
    %endif
        mov     [rsp+1*8], rbx  ; input pointer in rdi
        mov     [rsp+2*8], rbp  ; output pointer in [rsp]
        mov     [rsp+3*8], r12  ; context in r8
%endif

    movzx   esi, byte [kptr+4*KS_LENGTH]
    lea     tptr, [rel enc_tab]
    sub     kptr, fofs

    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

    xor     eax, [kptr+fofs]
    xor     ebx, [kptr+fofs+4]
    xor     ecx, [kptr+fofs+8]
    xor     edx, [kptr+fofs+12]

    lea     kptr,[kptr+rsi]
    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1
    jmp     .4

.1: ff_rnd  r9, r10, r11, r12, 13
    ff_rnd  r9, r10, r11, r12, 12
.2: ff_rnd  r9, r10, r11, r12, 11
    ff_rnd  r9, r10, r11, r12, 10
.3: ff_rnd  r9, r10, r11, r12, 9
    ff_rnd  r9, r10, r11, r12, 8
    ff_rnd  r9, r10, r11, r12, 7
    ff_rnd  r9, r10, r11, r12, 6
    ff_rnd  r9, r10, r11, r12, 5
    ff_rnd  r9, r10, r11, r12, 4
    ff_rnd  r9, r10, r11, r12, 3
    ff_rnd  r9, r10, r11, r12, 2
    ff_rnd  r9, r10, r11, r12, 1
    fl_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax
.4:
    mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    %ifdef _SEH_
        add     rsp, 7*8
        ret
    endproc_frame
    %else
        add     rsp, 6*8
        ret
    %endif
%endif

%endif

%ifdef DECRYPTION

    global  aes_ni(decrypt)
%ifdef DLL_EXPORT
    export  aes_ni(decrypt)
%endif

    section .data
    align   64
dec_tab:
    dec_vals v8
%ifdef LAST_ROUND_TABLES
    dec_vals w8
%endif

    section .text
    align   16

%ifdef _SEH_
proc_frame aes_ni(decrypt)
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg    rsi,4*8
    save_reg    rdi,5*8
    save_reg    rbx,1*8
    save_reg    rbp,2*8
    save_reg    r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
    aes_ni(decrypt):
    %ifdef __GNUC__
        sub     rsp, 4*8        ; gnu/linux binary interface
        mov     [rsp+0*8], rsi  ; output pointer
        mov     r8, rdx         ; context
    %else
        sub     rsp, 6*8        ; windows binary interface
        mov     [rsp+4*8], rsi
        mov     [rsp+5*8], rdi
        mov     rdi, rcx        ; input pointer
        mov     [rsp+0*8], rdx  ; output pointer
    %endif
        mov     [rsp+1*8], rbx  ; input pointer in rdi
        mov     [rsp+2*8], rbp  ; output pointer in [rsp]
        mov     [rsp+3*8], r12  ; context in r8
%endif

    movzx   esi, byte [kptr+4*KS_LENGTH]
    lea     tptr, [rel dec_tab]
    sub     kptr, rofs

    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

%ifdef      AES_REV_DKS
    mov     rdi, kptr
    lea     kptr,[kptr+rsi]
%else
    lea     rdi,[kptr+rsi]
%endif

    xor     eax, [rdi+rofs]
    xor     ebx, [rdi+rofs+4]
    xor     ecx, [rdi+rofs+8]
    xor     edx, [rdi+rofs+12]

    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1
    jmp     .4

.1: ii_rnd  r9, r10, r11, r12, 13
    ii_rnd  r9, r10, r11, r12, 12
.2: ii_rnd  r9, r10, r11, r12, 11
    ii_rnd  r9, r10, r11, r12, 10
.3: ii_rnd  r9, r10, r11, r12, 9
    ii_rnd  r9, r10, r11, r12, 8
    ii_rnd  r9, r10, r11, r12, 7
    ii_rnd  r9, r10, r11, r12, 6
    ii_rnd  r9, r10, r11, r12, 5
    ii_rnd  r9, r10, r11, r12, 4
    ii_rnd  r9, r10, r11, r12, 3
    ii_rnd  r9, r10, r11, r12, 2
    ii_rnd  r9, r10, r11, r12, 1
    il_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax
.4: mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    %ifdef _SEH_
        add     rsp, 7*8
        ret
    endproc_frame
    %else
        add     rsp, 6*8
        ret
    %endif
%endif

%endif

    end