1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5	.text
6
7#define IV_OFFSET 256
8
/*
 * Warning: the length values used in this module are "unsigned int"
 * in C, which is 32-bit.  When they're passed in registers, use only
 * the low 32 bits, because the top half is unspecified.
 *
 * This is called from C code, so the contents of those bits can
 * depend on the C compiler's optimization decisions.  This means that
 * mistakes might not be obvious in testing if those bits happen to be
 * zero in your build.
 *
 * Exception: 32-bit lea instructions use a 64-bit address because the
 * address size doesn't affect the result, and that form is more
 * compactly encoded and preferred by compilers over a 32-bit address.
 */
23
/*
 * intel_aes_encrypt_init_128
 *
 * Expands a 16-byte AES-128 key into the 11 round keys (176 bytes) used
 * for encryption.
 *
 * in %rdi : the key
 * in %rsi : buffer for expanded key
 *
 * Round key 0 is the key itself.  Each aeskeygenassist/key_expansion128
 * pair below derives the next round key from %xmm1, stores it at (%rsi)
 * and advances %rsi by 16.  The immediates 0x01 .. 0x36 are the AES round
 * constants for rounds 1..10.
 *
 * %eax is zeroed up front because key_expansion128 reads it to obtain an
 * all-zero %xmm3; it is still 0 on return.
 * Clobbers: %rax, %rsi, %xmm1, %xmm2, %xmm3, flags.
 *
 * The AES instructions are emitted as raw .byte sequences (presumably so
 * that assemblers lacking the AES-NI mnemonics can still build this file).
 */
	.type intel_aes_encrypt_init_128,@function
	.globl intel_aes_encrypt_init_128
	.align	16
intel_aes_encrypt_init_128:
	movups	(%rdi), %xmm1		/* %xmm1 = user key = round key 0 */
	movups	%xmm1, (%rsi)
	leaq	16(%rsi), %rsi		/* %rsi -> slot for round key 1 */
	xorl	%eax, %eax		/* key_expansion128 requires %eax == 0 */

	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
	call key_expansion128

	ret
	.size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128
59
60
/*
 * intel_aes_decrypt_init_128
 *
 * Expands a 16-byte AES-128 key into the 11 round keys (176 bytes) in the
 * form consumed by aesdec/aesdeclast.
 *
 * in %rdi : the key
 * in %rsi : buffer for expanded key
 *
 * The expansion itself is the same as for encryption.  After each of the
 * first nine key_expansion128 calls, the round key that was just stored
 * (now at -16(%rsi), because the helper advanced %rsi) is overwritten
 * with its aesimc (InvMixColumns) transform.  %xmm1 keeps the
 * untransformed key so the next expansion step is unaffected.  Round
 * keys 0 and 10 are stored untransformed.
 *
 * %eax is zeroed up front because key_expansion128 reads it to obtain an
 * all-zero %xmm3; it is still 0 on return.
 * Clobbers: %rax, %rsi, %xmm1, %xmm2, %xmm3, flags.
 */
	.type intel_aes_decrypt_init_128,@function
	.globl intel_aes_decrypt_init_128
	.align	16
intel_aes_decrypt_init_128:
	movups	(%rdi), %xmm1		/* %xmm1 = user key = round key 0 */
	movups	%xmm1, (%rsi)
	leaq	16(%rsi), %rsi		/* %rsi -> slot for round key 1 */
	xorl	%eax, %eax		/* key_expansion128 requires %eax == 0 */

	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)	/* replace round key 1 with its aesimc form */
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
	call key_expansion128
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
	call key_expansion128		/* round key 10: no aesimc */

	ret
	.size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128
114
115
/*
 * key_expansion128 - one step of the AES-128 key schedule.
 *
 * File-local helper with a register-based (non-C) interface:
 *
 * in  %xmm1 : previous round key, words k0..k3 (k0 in the low dword)
 * in  %xmm2 : aeskeygenassist result for %xmm1; only its top dword (t)
 *             is used
 * in  %eax  : must be 0 (used to clear %xmm3)
 * in  %rsi  : where to store the new round key
 * out %xmm1 : new round key
 *             (k0^t, k0^k1^t, k0^k1^k2^t, k0^k1^k2^k3^t)
 * out %rsi  : advanced by 16
 *
 * Clobbers %xmm2, %xmm3 and the flags.  Uses no stack.
 */
	.type key_expansion128,@function
	.align	16
key_expansion128:
	movd	%eax, %xmm3		/* %xmm3 = 0 (movd clears the upper bits) */
	pshufd	$0xff, %xmm2, %xmm2	/* broadcast t to all four dwords */
	shufps	$0x10, %xmm1, %xmm3	/* %xmm3 = (0, 0, k1, k0) */
	pxor	%xmm3, %xmm1		/* %xmm1 = (k0, k1, k1^k2, k0^k3) */
	shufps	$0x8c, %xmm1, %xmm3	/* %xmm3 = (0, k0, k0, k1^k2) */
	pxor	%xmm2, %xmm1		/* fold t into every word */
	pxor	%xmm3, %xmm1		/* finish the running XOR of k0..k3 */
	movdqu	%xmm1, (%rsi)
	addq	$16, %rsi
	ret
	.size key_expansion128, .-key_expansion128
130
131
/*
 * intel_aes_encrypt_ecb_128 - AES-128 ECB encryption.
 *
 * in %rdi : cx - context
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * The eleven round keys are read from 0(%rdi) .. 160(%rdi), 16 bytes
 * each.  %rdx and %ecx are not referenced here.  Input is consumed eight
 * blocks (128 bytes) at a time while that much remains, then one block at
 * a time.  inputLen must be a multiple of 16: the tail loop stops only
 * when the running offset equals inputLen exactly.
 *
 * %eax is the running byte offset; it is only ever written with 32-bit
 * operations, so %rax is a valid zero-extended index.
 * Clobbers: %rax, %r11, %xmm1-%xmm12, flags.
 */
	.type intel_aes_encrypt_ecb_128,@function
	.globl intel_aes_encrypt_ecb_128
	.align	16
intel_aes_encrypt_ecb_128:
	movdqu	(%rdi), %xmm2		/* round key 0 (initial whitening) */
	movdqu	160(%rdi), %xmm12	/* round key 10 (final round) */
	xorl	%eax, %eax		/* offset = 0 */
	cmpl	$128, %r9d		/* 128 = 8 blocks * 16 bytes */
	jb	1f			/* fewer than 8 blocks: tail loop only */
	leal	-128(%r9), %r11d	/* last offset where 8 blocks still fit */
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

	/* Rounds fully unrolled; %xmm1/%xmm11 hold two round keys per group. */
	movdqu 16(%rdi), %xmm1		/* round keys 1 and 2 */
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1		/* round keys 3 and 4 */
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1		/* round keys 5 and 6 */
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1		/* round keys 7 and 8 */
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	movdqu 144(%rdi), %xmm1		/* round key 9, then final round */
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdc	/* aesenclast	%xmm12, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe4	/* aesenclast	%xmm12, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xec	/* aesenclast	%xmm12, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf4	/* aesenclast	%xmm12, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfc	/* aesenclast	%xmm12, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc4	/* aesenclast	%xmm12, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcc	/* aesenclast	%xmm12, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd4	/* aesenclast	%xmm12, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	addl	$128, %eax		/* advance by 8 blocks */
	cmpl	%r11d, %eax
	jbe	2b			/* unsigned: another 8 blocks still fit */
1:	cmpl	%eax, %r9d
	je	5f			/* nothing left for the tail loop */

	/* Tail: keep round keys 1..9 in registers, one block per pass. */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xorl	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
311
312
/*
 * intel_aes_decrypt_ecb_128 - AES-128 ECB decryption.
 *
 * in %rdi : cx - context
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * The eleven round keys are read from 0(%rdi) .. 160(%rdi) and applied
 * in reverse order: key 10 is the initial whitening, keys 9..1 feed
 * aesdec, key 0 feeds aesdeclast.  Because aesdec is used, keys 1..9 are
 * expected to be in the aesimc-transformed form that
 * intel_aes_decrypt_init_128 stores.  %rdx and %ecx are not referenced
 * here.  Input is consumed eight blocks (128 bytes) at a time while that
 * much remains, then one block at a time.  inputLen must be a multiple
 * of 16: the tail loop stops only when the running offset equals
 * inputLen exactly.
 *
 * %eax is the running byte offset; it is only ever written with 32-bit
 * operations, so %rax is a valid zero-extended index.
 * Clobbers: %rax, %r11, %xmm1-%xmm12, flags.
 */
	.type intel_aes_decrypt_ecb_128,@function
	.globl intel_aes_decrypt_ecb_128
	.align	16
intel_aes_decrypt_ecb_128:
	movdqu	(%rdi), %xmm2		/* round key 0 (used by aesdeclast) */
	movdqu	160(%rdi), %xmm12	/* round key 10 (initial whitening) */
	xorl	%eax, %eax		/* offset = 0 */
	cmpl	$128, %r9d		/* 128 = 8 blocks * 16 bytes */
	jb	1f			/* fewer than 8 blocks: tail loop only */
	leal	-128(%r9), %r11d	/* last offset where 8 blocks still fit */
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm12, %xmm3
	pxor	%xmm12, %xmm4
	pxor	%xmm12, %xmm5
	pxor	%xmm12, %xmm6
	pxor	%xmm12, %xmm7
	pxor	%xmm12, %xmm8
	pxor	%xmm12, %xmm9
	pxor	%xmm12, %xmm10

	/* Rounds fully unrolled; %xmm1/%xmm11 hold two round keys per group. */
	movdqu 144(%rdi), %xmm1		/* round keys 9 and 8 */
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1		/* round keys 7 and 6 */
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1		/* round keys 5 and 4 */
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1		/* round keys 3 and 2 */
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1		/* round key 1, then final round */
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	addl	$128, %eax		/* advance by 8 blocks */
	cmpl	%r11d, %eax
	jbe	2b			/* unsigned: another 8 blocks still fit */
1:	cmpl	%eax, %r9d
	je	5f			/* nothing left for the tail loop */

	/* Tail: keep round keys 1..9 in registers, one block per pass. */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm12, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xorl	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
492
493
/*
 * intel_aes_encrypt_cbc_128 - AES-128 CBC encryption.
 *
 * in %rdi : cx - context
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * The eleven round keys are read from 0(%rdi) .. 160(%rdi) and the
 * chaining value (IV) from 256(%rdi).  Each plaintext block is XORed
 * with the previous ciphertext block (the IV for the first one) before
 * being encrypted, so blocks are processed strictly one at a time.  After
 * the last block the final ciphertext block is written back to 256(%rdi)
 * as the IV for a following call.  A zero inputLen returns immediately
 * and leaves the IV untouched; otherwise inputLen must be a multiple of
 * 16, because the loop stops only when the offset equals it exactly.
 *
 * The incoming %rdx (outputLen) is not used; %rdx is overwritten with the
 * address of the IV.  %ecx is not referenced.
 * Clobbers: %rax, %rdx, %xmm0-%xmm12, flags.
 */
	.type intel_aes_encrypt_cbc_128,@function
	.globl intel_aes_encrypt_cbc_128
	.align	16
intel_aes_encrypt_cbc_128:
	testl	%r9d, %r9d
	je	2f			/* nothing to do */

	leaq	256(%rdi), %rdx		/* %rdx -> IV (offset 256 in the context) */

	movdqu	(%rdx), %xmm0		/* chaining value */
	movdqu	(%rdi), %xmm2		/* round keys 0..10 -> %xmm2..%xmm12 */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12

	xorl	%eax, %eax		/* offset = 0 */
1:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm0, %xmm1		/* CBC: XOR with previous ciphertext / IV */
	pxor	%xmm2, %xmm1		/* initial whitening with round key 0 */
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm1, %xmm0		/* ciphertext becomes the next chaining value */
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	1b

	movdqu	%xmm0, (%rdx)		/* save the IV for the next call */

2:	xorl	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
552
553
/*
 * intel_aes_decrypt_cbc_128 - AES-128 CBC decryption.
 *
 * in %rdi : cx - context
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * The eleven round keys are read from 0(%rdi) .. 160(%rdi) and applied
 * in reverse order (key 10 whitening, keys 9..1 aesdec, key 0
 * aesdeclast); because aesdec is used, keys 1..9 are expected in the
 * aesimc-transformed form that intel_aes_decrypt_init_128 stores.  The
 * chaining value (IV) lives at 256(%rdi).
 *
 * Each decrypted block is XORed with the previous ciphertext block (the
 * IV for the first one).  Decryption of different blocks is independent,
 * so eight blocks are decrypted per pass while 128 bytes remain; the rest
 * is handled one block at a time.  %xmm0 always carries the most recent
 * ciphertext block and is written back to 256(%rdi) on exit (with
 * inputLen == 0 the IV is simply rewritten unchanged).  inputLen must be
 * a multiple of 16: the tail loop stops only when the offset equals it
 * exactly.
 *
 * The incoming %rdx (outputLen) is not used; %rdx is overwritten with the
 * address of the IV.  %ecx is not referenced.
 * Clobbers: %rax, %rdx, %r11, %xmm0-%xmm13, flags.
 */
	.type intel_aes_decrypt_cbc_128,@function
	.globl intel_aes_decrypt_cbc_128
	.align	16
intel_aes_decrypt_cbc_128:
	leaq	256(%rdi), %rdx		/* %rdx -> IV (offset 256 in the context) */

	movdqu	(%rdx), %xmm0   /* iv */
	movdqu	(%rdi), %xmm2   /* first key block */
	movdqu	160(%rdi), %xmm12 /* last key block */
	xorl	%eax, %eax		/* offset = 0 */
	cmpl	$128, %r9d		/* 128 = 8 blocks * 16 bytes */
	jb	1f			/* fewer than 8 blocks: tail loop only */
	leal	-128(%r9), %r11d	/* last offset where 8 blocks still fit */
2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
	movdqu	16(%r8, %rax), %xmm4 /* 2nd data block */
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm12, %xmm3
	pxor	%xmm12, %xmm4
	pxor	%xmm12, %xmm5
	pxor	%xmm12, %xmm6
	pxor	%xmm12, %xmm7
	pxor	%xmm12, %xmm8
	pxor	%xmm12, %xmm9
	pxor	%xmm12, %xmm10

	/* Rounds fully unrolled; %xmm1/%xmm11 hold two round keys per group. */
	movdqu 144(%rdi), %xmm1		/* round keys 9 and 8 */
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 112(%rdi), %xmm1		/* round keys 7 and 6 */
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 80(%rdi), %xmm1		/* round keys 5 and 4 */
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 48(%rdi), %xmm1		/* round keys 3 and 2 */
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	movdqu 16(%rdi), %xmm1		/* round key 1, then final round */
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast	%xmm2, %xmm10 */

	/*
	 * CBC chaining: XOR block i with ciphertext block i-1.  The ciphertext
	 * is reloaded from the input, and every reload happens before the
	 * first store below, so the reloads still see ciphertext when output
	 * and input are the same buffer.
	 */
	pxor	%xmm0, %xmm3		/* block 0 ^ carried-in chaining value */
	movdqu	(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm4
	movdqu	16(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm5
	movdqu	32(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm6
	movdqu	48(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm7
	movdqu	64(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm8
	movdqu	80(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm9
	movdqu	96(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm10
	movdqu	112(%r8, %rax), %xmm0	/* last ciphertext block: carried forward */
	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	addl	$128, %eax		/* advance by 8 blocks */
	cmpl	%r11d, %eax
	jbe	2b			/* unsigned: another 8 blocks still fit */
1:	cmpl	%eax, %r9d
	je	5f			/* nothing left for the tail loop */

	/* Tail: keep round keys 1..9 in registers, one block per pass. */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11

4:	movdqu	(%r8, %rax), %xmm1
	movdqa	%xmm1, %xmm13		/* keep the ciphertext for chaining */
	pxor	%xmm12, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	pxor	%xmm0, %xmm1		/* XOR with previous ciphertext / IV */
	movdqu	%xmm1, (%rsi, %rax)
	movdqa	%xmm13, %xmm0		/* this ciphertext chains into the next block */
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	movdqu	%xmm0, (%rdx)		/* save the IV for the next call */

	xorl	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
755
/* intel_aes_encrypt_init_192: build the AES-192 encryption key schedule.

   in %rdi : the key (24 bytes are read)
   in %rsi : buffer for expanded key

   The 24 key bytes are copied to the start of the buffer and each of
   the eight key_expansion192 calls appends 24 more, so 216 bytes are
   written in total.  The 192-bit ECB routines in this file load round
   keys from offsets 0 through 192 only (13 round keys, 208 bytes).

   Clobbers %rsi, %eax (zeroed by key_expansion192) and %xmm1-%xmm5.
*/
	.type intel_aes_encrypt_init_192,@function
	.globl intel_aes_encrypt_init_192
	.align	16
intel_aes_encrypt_init_192:
	/* xmm1 = key words 0-3, low half of xmm3 = key words 4-5; this is
	   the register state key_expansion192 carries from call to call. */
	movdqu	(%rdi), %xmm1
	movq	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movq	%xmm3, 16(%rsi)
	/* rsi = where the next 24 bytes of the schedule go */
	leaq	24(%rsi), %rsi

	/* One expansion step per aeskeygenassist immediate (round constant). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	call key_expansion192
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
	call key_expansion192

	ret
	.size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192
788
789
/* intel_aes_decrypt_init_192: build the AES-192 decryption key schedule.

   in %rdi : the key (24 bytes are read)
   in %rsi : buffer for expanded key

   Runs the same expansion as the encryption schedule, then replaces
   round keys 1 through 11 with their aesimc (InvMixColumns) transform,
   which is the form the aesdec instruction works with.  Round keys 0
   and 12 are left unchanged.

   Each key_expansion192 call produces 24 bytes, i.e. one and a half
   round keys, so the fix-up alternates:
     - after an odd-numbered call the two round keys at -32(%rsi) and
       -16(%rsi) are complete and are converted in place;
     - after an even-numbered call the round key at -24(%rsi) is
       complete; it is still in %xmm1, so it is converted from the
       register and stored over the copy key_expansion192 wrote.
   No fix-up follows the eighth call, which completes round key 12.
   The conversion only rewrites memory; %xmm1 and %xmm3, which carry
   the expansion state between calls, are not modified by it.

   Clobbers %rsi, %eax (zeroed by key_expansion192) and %xmm1-%xmm5.
*/
	.type intel_aes_decrypt_init_192,@function
	.globl intel_aes_decrypt_init_192
	.align	16
intel_aes_decrypt_init_192:
	/* xmm1 = key words 0-3, low half of xmm3 = key words 4-5 */
	movdqu	(%rdi), %xmm1
	movq	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movq	%xmm3, 16(%rsi)
	leaq	24(%rsi), %rsi

	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round keys 1 and 2 (buffer offsets 16 and 32) */
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round key 3 (buffer offset 48), taken from xmm1 */
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round keys 4 and 5 (buffer offsets 64 and 80) */
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round key 6 (buffer offset 96), taken from xmm1 */
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round keys 7 and 8 (buffer offsets 112 and 128) */
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round key 9 (buffer offset 144), taken from xmm1 */
	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
	movups	%xmm2, -24(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	call key_expansion192
	/* convert round keys 10 and 11 (buffer offsets 160 and 176) */
	movups	-32(%rsi), %xmm2
	movups	-16(%rsi), %xmm4
	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
	movups	%xmm2, -32(%rsi)
	movups	%xmm4, -16(%rsi)
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
	/* completes round key 12 (buffer offset 192), which stays unconverted */
	call key_expansion192

	ret
	.size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192
852
853
/* key_expansion192: one step of the AES-192 key expansion (file-local
   helper; it is not declared .globl).

   in  %xmm1 : the four oldest of the previous six key words (a0..a3)
   in  %xmm3 : the two newest of the previous six (b0, b1) in its low
               64 bits
   in  %xmm2 : aeskeygenassist result computed from %xmm3
   in  %rsi  : where to store the six new key words

   Stores 16 bytes from the updated %xmm1 and then 8 bytes from the low
   half of the updated %xmm3 at (%rsi), and advances %rsi by 24.  On
   return %xmm1 and the low half of %xmm3 hold the new words, ready for
   the next step; the upper half of %xmm3 is scratch.

   Clobbers %eax (set to 0), %xmm2, %xmm4 and %xmm5.
   Lane lists below are written lowest dword first.
*/
	.type key_expansion192,@function
	.align	16
key_expansion192:
	/* xmm2 = T in every lane, T being dword 1 of the keygenassist result */
	pshufd	$0x55, %xmm2, %xmm2
	xor	%eax, %eax
	/* xmm4 = 0 */
	movd	%eax, %xmm4
	/* xmm4 = [0, 0, a1, a0] */
	shufps	$0x10, %xmm1, %xmm4
	/* xmm1 = [a0, a1, a1^a2, a0^a3] */
	pxor	%xmm4, %xmm1
	/* xmm4 = [0, a0, a0, a1^a2] */
	shufps	$0x8c, %xmm1, %xmm4
	pxor	%xmm2, %xmm1
	/* xmm1 = [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3], each lane also ^ T */
	pxor	%xmm4, %xmm1
	movdqu	%xmm1, (%rsi)
	addq	$16, %rsi

	/* xmm4 = last new word of xmm1 in every lane */
	pshufd	$0xff, %xmm1, %xmm4
	/* xmm5 = 0 */
	movd	%eax, %xmm5
	/* xmm5 = [0, 0, b0, b0] */
	shufps	$0x00, %xmm3, %xmm5
	/* xmm5 = [0, b0, b0, b0] */
	shufps	$0x08, %xmm3, %xmm5
	pxor	%xmm4, %xmm3
	/* low half of xmm3 = [b0, b0^b1], each lane also ^ last word of xmm1 */
	pxor	%xmm5, %xmm3
	movq	%xmm3, (%rsi)
	addq	$8, %rsi
	ret
	.size key_expansion192, .-key_expansion192
878
879
/* intel_aes_encrypt_ecb_192: AES-192 ECB encryption.

   in %rdi : cx - context; round keys 0-12 are read from offsets 0-192
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller; not referenced here)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller; not referenced here)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
   out %eax: always 0

   %eax is the byte offset into both buffers.  While at least 128 bytes
   remain, eight blocks are processed per iteration in %xmm3-%xmm10 with
   each round applied to all eight before the next; the remainder is
   handled one block at a time.  The one-block loop exits only when
   %eax equals %r9d exactly, so inputLen has to be a multiple of 16
   (presumably guaranteed by the caller - confirm).

   Clobbers %rax, %r11 and %xmm1-%xmm14.
*/
	.type intel_aes_encrypt_ecb_192,@function
	.globl intel_aes_encrypt_ecb_192
	.align	16
intel_aes_encrypt_ecb_192:
	/* xmm2 = round key 0, xmm14 = round key 12 (final round) */
	movdqu	(%rdi), %xmm2
	movdqu	192(%rdi), %xmm14
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
	/* r11d = largest offset at which a full 8-block group still fits */
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	/* initial AddRoundKey with round key 0 */
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

// complete loop unrolling
	/* rounds 1 and 2 */
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* rounds 3 and 4 */
	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* rounds 5 and 6 */
	movdqu 80(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* rounds 7 and 8 */
	movdqu 112(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* rounds 9 and 10 */
	movdqu 144(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* round 11, then the final round with round key 12 */
	movdqu 176(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xde	/* aesenclast 	%xmm14, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe6	/* aesenclast 	%xmm14, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xee	/* aesenclast 	%xmm14, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf6	/* aesenclast 	%xmm14, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfe	/* aesenclast 	%xmm14, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc6	/* aesenclast 	%xmm14, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xce	/* aesenclast 	%xmm14, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd6	/* aesenclast 	%xmm14, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	/* loop again while eax <= inputLen - 128 (unsigned) */
	cmpl	%r11d, %eax
	jbe	2b
	/* done if the 8-block loop consumed everything */
1:	cmpl	%eax, %r9d
	je	5f

	/* keep round keys 1-11 in registers for the one-block loop */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

	/* return 0 */
5:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192
1082
1083
/* intel_aes_decrypt_ecb_192: AES-192 ECB decryption.

   in %rdi : cx - context; round keys 0-12 are read from offsets 0-192
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller; not referenced here)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller; not referenced here)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
   out %eax: always 0

   Round keys are applied in reverse order: key 12 is XORed in first,
   keys 11 down to 1 go through aesdec, and key 0 through aesdeclast.

   %eax is the byte offset into both buffers.  While at least 128 bytes
   remain, eight blocks are processed per iteration in %xmm3-%xmm10; the
   remainder is handled one block at a time.  The one-block loop exits
   only when %eax equals %r9d exactly, so inputLen has to be a multiple
   of 16 (presumably guaranteed by the caller - confirm).

   Clobbers %rax, %r11 and %xmm1-%xmm14.
*/
	.type intel_aes_decrypt_ecb_192,@function
	.globl intel_aes_decrypt_ecb_192
	.align	16
intel_aes_decrypt_ecb_192:
	/* xmm2 = round key 0 (final round), xmm14 = round key 12 (applied first) */
	movdqu	(%rdi), %xmm2
	movdqu	192(%rdi), %xmm14
	xorl	%eax, %eax
//	cmpl	$8*16, %r9d
	cmpl	$128, %r9d
	jb	1f
	/* r11d = largest offset at which a full 8-block group still fits */
//	leal	-8*16(%r9), %r11d
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	/* initial AddRoundKey with round key 12 */
	pxor	%xmm14, %xmm3
	pxor	%xmm14, %xmm4
	pxor	%xmm14, %xmm5
	pxor	%xmm14, %xmm6
	pxor	%xmm14, %xmm7
	pxor	%xmm14, %xmm8
	pxor	%xmm14, %xmm9
	pxor	%xmm14, %xmm10

// complete loop unrolling
	/* round keys 11 and 10 */
	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* round keys 9 and 8 */
	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* round keys 7 and 6 */
	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* round keys 5 and 4 */
	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* round keys 3 and 2 */
	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* round key 1, then the final round with round key 0 */
	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
//	addl	$8*16, %eax
	addl	$128, %eax
	/* loop again while eax <= inputLen - 128 (unsigned) */
	cmpl	%r11d, %eax
	jbe	2b
	/* done if the 8-block loop consumed everything */
1:	cmpl	%eax, %r9d
	je	5f

	/* keep round keys 1-11 in registers for the one-block loop */
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm14, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

	/* return 0 */
5:	xor	%eax, %eax
	ret
	.size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192
1286
1287
/* intel_aes_encrypt_cbc_192: AES-192 CBC encryption.

   in %rdi : cx - context; round keys 0-12 are read from offsets 0-192
             and the IV is read from and written back to offset 256
   in %rsi : output - pointer to output buffer
   in %rdx : outputLen - pointer to variable for length of output
             (already filled in by caller; the incoming value is not
             referenced, the register is reused as the IV pointer)
   in %ecx : maxOutputLen - length of output buffer
             (already checked by caller; not referenced here)
   in %r8  : input - pointer to input buffer
   in %r9d : inputLen - length of input buffer
   on stack: blocksize - AES blocksize (always 16, unused)
   out %eax: always 0

   Blocks are processed strictly one at a time because every block is
   XORed with the previous ciphertext block before encryption.  Returns
   at once, leaving the IV untouched, when inputLen is 0.  The loop
   exits only when the offset in %eax equals %r9d exactly, so inputLen
   has to be a multiple of 16 (presumably guaranteed by the caller -
   confirm).

   Clobbers %rax, %rdx and %xmm0-%xmm14.
*/
	.type intel_aes_encrypt_cbc_192,@function
	.globl intel_aes_encrypt_cbc_192
	.align	16
intel_aes_encrypt_cbc_192:
	/* nothing to do for empty input */
	testl	%r9d, %r9d
	je	2f

	/* rdx = address of the IV inside the context (256 is IV_OFFSET) */
//	leaq	IV_OFFSET(%rdi), %rdx
	leaq	256(%rdi), %rdx

	/* xmm0 = chaining value, xmm2-xmm14 = round keys 0-12 */
	movdqu	(%rdx), %xmm0
	movdqu	(%rdi), %xmm2
	movdqu	16(%rdi), %xmm3
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm5
	movdqu	64(%rdi), %xmm6
	movdqu	80(%rdi), %xmm7
	movdqu	96(%rdi), %xmm8
	movdqu	112(%rdi), %xmm9
	movdqu	128(%rdi), %xmm10
	movdqu	144(%rdi), %xmm11
	movdqu	160(%rdi), %xmm12
	movdqu	176(%rdi), %xmm13
	movdqu	192(%rdi), %xmm14

	xorl	%eax, %eax
1:	movdqu	(%r8, %rax), %xmm1
	/* XOR in the previous ciphertext block (or the IV), then round key 0 */
	pxor	%xmm0, %xmm1
	pxor	%xmm2, %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	/* this ciphertext block is the chaining value for the next one */
	movdqa	%xmm1, %xmm0
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	1b

	/* save the last ciphertext block as the IV for the next call */
	movdqu	%xmm0, (%rdx)

	/* return 0 */
2:	xor	%eax, %eax
	ret
	.size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
1350
1351
1352/* in %rdi : cx - context
1353   in %rsi : output - pointer to output buffer
1354   in %rdx : outputLen - pointer to variable for length of output
1355             (already filled in by caller)
   in %ecx : maxOutputLen - length of output buffer
1357             (already checked by caller)
1358   in %r8  : input - pointer to input buffer
1359   in %r9d : inputLen - length of input buffer
1360   on stack: blocksize - AES blocksize (always 16, unused)
1361*/
1362	.type intel_aes_decrypt_cbc_192,@function
1363	.globl intel_aes_decrypt_cbc_192
1364	.align	16
1365intel_aes_decrypt_cbc_192:
1366//	leaq	IV_OFFSET(%rdi), %rdx
1367	leaq	256(%rdi), %rdx
1368
1369	movdqu	(%rdx), %xmm0
1370	movdqu	(%rdi), %xmm2
1371	movdqu	192(%rdi), %xmm14
1372	xorl	%eax, %eax
1373	cmpl	$128, %r9d
1374	jb	1f
1375	leal	-128(%r9), %r11d
13762:	movdqu	(%r8, %rax), %xmm3
1377	movdqu	16(%r8, %rax), %xmm4
1378	movdqu	32(%r8, %rax), %xmm5
1379	movdqu	48(%r8, %rax), %xmm6
1380	movdqu	64(%r8, %rax), %xmm7
1381	movdqu	80(%r8, %rax), %xmm8
1382	movdqu	96(%r8, %rax), %xmm9
1383	movdqu	112(%r8, %rax), %xmm10
1384	pxor	%xmm14, %xmm3
1385	pxor	%xmm14, %xmm4
1386	pxor	%xmm14, %xmm5
1387	pxor	%xmm14, %xmm6
1388	pxor	%xmm14, %xmm7
1389	pxor	%xmm14, %xmm8
1390	pxor	%xmm14, %xmm9
1391	pxor	%xmm14, %xmm10
1392
1393// complete loop unrolling
1394	movdqu 176(%rdi), %xmm1
1395	movdqu 160(%rdi), %xmm11
1396	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1397	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1398	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1399	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1400	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1401	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1402	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1403	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1404	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
1405	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
1406	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
1407	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
1408	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
1409	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
1410	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
1411	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
1412
1413	movdqu 144(%rdi), %xmm1
1414	movdqu 128(%rdi), %xmm11
1415	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1416	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1417	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1418	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1419	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1420	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1421	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1422	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1423	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
1424	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
1425	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
1426	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
1427	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
1428	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
1429	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
1430	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
1431
1432	movdqu 112(%rdi), %xmm1
1433	movdqu 96(%rdi), %xmm11
1434	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1435	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1436	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1437	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1438	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1439	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1440	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1441	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1442	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
1443	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
1444	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
1445	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
1446	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
1447	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
1448	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
1449	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
1450
1451	movdqu 80(%rdi), %xmm1
1452	movdqu 64(%rdi), %xmm11
1453	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1454	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1455	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1456	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1457	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1458	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1459	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1460	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1461	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
1462	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
1463	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
1464	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
1465	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
1466	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
1467	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
1468	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
1469
1470	movdqu 48(%rdi), %xmm1
1471	movdqu 32(%rdi), %xmm11
1472	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1473	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1474	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1475	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1476	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1477	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1478	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1479	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1480	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
1481	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
1482	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
1483	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
1484	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
1485	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
1486	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
1487	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
1488
1489	movdqu 16(%rdi), %xmm1
1490	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
1491	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
1492	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
1493	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
1494	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
1495	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
1496	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
1497	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
1498	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
1499	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
1500	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
1501	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
1502	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
1503	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
1504	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
1505	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
1506
1507 	pxor	%xmm0, %xmm3
1508	movdqu	(%r8, %rax), %xmm0
1509	pxor	%xmm0, %xmm4
1510	movdqu	16(%r8, %rax), %xmm0
1511	pxor	%xmm0, %xmm5
1512	movdqu	32(%r8, %rax), %xmm0
1513	pxor	%xmm0, %xmm6
1514	movdqu	48(%r8, %rax), %xmm0
1515	pxor	%xmm0, %xmm7
1516	movdqu	64(%r8, %rax), %xmm0
1517	pxor	%xmm0, %xmm8
1518	movdqu	80(%r8, %rax), %xmm0
1519	pxor	%xmm0, %xmm9
1520	movdqu	96(%r8, %rax), %xmm0
1521	pxor	%xmm0, %xmm10
1522	movdqu	112(%r8, %rax), %xmm0
1523	movdqu	%xmm3, (%rsi, %rax)
1524	movdqu	%xmm4, 16(%rsi, %rax)
1525	movdqu	%xmm5, 32(%rsi, %rax)
1526	movdqu	%xmm6, 48(%rsi, %rax)
1527	movdqu	%xmm7, 64(%rsi, %rax)
1528	movdqu	%xmm8, 80(%rsi, %rax)
1529	movdqu	%xmm9, 96(%rsi, %rax)
1530	movdqu	%xmm10, 112(%rsi, %rax)
1531	addl	$128, %eax
1532	cmpl	%r11d, %eax
1533	jbe	2b
15341:	cmpl	%eax, %r9d
1535	je	5f
1536
1537	movdqu	16(%rdi), %xmm3
1538	movdqu	32(%rdi), %xmm4
1539	movdqu	48(%rdi), %xmm5
1540	movdqu	64(%rdi), %xmm6
1541	movdqu	80(%rdi), %xmm7
1542	movdqu	96(%rdi), %xmm8
1543	movdqu	112(%rdi), %xmm9
1544	movdqu	128(%rdi), %xmm10
1545	movdqu	144(%rdi), %xmm11
1546	movdqu	160(%rdi), %xmm12
1547	movdqu	176(%rdi), %xmm13
1548
15494:	movdqu	(%r8, %rax), %xmm1
1550	movdqa	%xmm1, %xmm15
1551	pxor	%xmm14, %xmm1
1552	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
1553	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
1554	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
1555	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
1556	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
1557	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
1558	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
1559	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
1560	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
1561	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
1562	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
1563	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
1564	pxor	%xmm0, %xmm1
1565	movdqu	%xmm1, (%rsi, %rax)
1566	movdqa	%xmm15, %xmm0
1567	addl	$16, %eax
1568	cmpl	%eax, %r9d
1569	jne	4b
1570
15715:	movdqu	%xmm0, (%rdx)
1572
1573	xor	%eax, %eax
1574	ret
1575	.size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
1576
/*
 * intel_aes_encrypt_init_256
 *
 * Expand a 256-bit AES key into the 15 round keys (240 bytes) that the
 * AES-256 encryption routines in this file read from the context.
 *
 * in %rdi : the key (32 bytes, read with unaligned loads)
 * in %rsi : buffer for expanded key (240 bytes are written with
 *           unaligned stores)
 *
 * Round keys 0 and 1 are the two halves of the raw key.  Each call to
 * key_expansion256 derives and stores the next two round keys and
 * advances %rsi by 32.  The 15th key (round 14) needs only the first
 * half of that step, so it is computed inline at the end.
 *
 * Register use: %xmm1 / %xmm3 hold the most recent even / odd round key,
 * %xmm2 receives each aeskeygenassist result, and %eax stays zero for
 * the whole function because key_expansion256 uses it to clear %xmm6.
 *
 * Clobbers: %rsi, %eax, %xmm1-%xmm4, %xmm6, flags.
 *
 * The AES-NI instructions are emitted as hand-encoded .byte sequences;
 * the mnemonic each one stands for is in the trailing comment.
 */
	.type intel_aes_encrypt_init_256,@function
	.globl intel_aes_encrypt_init_256
	.align	16
intel_aes_encrypt_init_256:
	/* Round keys 0 and 1: the raw key, copied through unchanged. */
	movdqu	(%rdi), %xmm1
	movdqu	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	movdqu	%xmm3, 16(%rsi)
	leaq	32(%rsi), %rsi
	/* key_expansion256 requires %eax == 0 on every call. */
	xor	%eax, %eax

	/* Round keys 2..13: one call per pair, round constants 0x01..0x20. */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion256

	/*
	 * Round key 14 (round constant 0x40).  Same instruction sequence as
	 * the first half of key_expansion256, except that %xmm6 is cleared
	 * with pxor instead of movd; %rsi is not advanced afterwards.
	 */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	pxor	%xmm6, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	ret
	.size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256


/*
 * intel_aes_decrypt_init_256
 *
 * Expand a 256-bit AES key into the 15 round keys (240 bytes) in the
 * form consumed by the aesdec / aesdeclast based routines in this file.
 *
 * in %rdi : the key (32 bytes, read with unaligned loads)
 * in %rsi : buffer for expanded key (240 bytes are written with
 *           unaligned stores)
 *
 * The key schedule itself is the same one the encrypt variant computes.
 * The difference is what gets stored:
 *   - round key 0 and round key 14 are stored unchanged;
 *   - round keys 1..13 are stored after aesimc (InvMixColumns).
 * key_expansion256 first stores each new pair untransformed; this
 * function then overwrites that pair, at -32(%rsi) and -16(%rsi) once
 * %rsi has been advanced, with the aesimc results.  %xmm1 / %xmm3 keep
 * the untransformed keys because the next expansion step needs them.
 *
 * %eax stays zero for the whole function because key_expansion256 uses
 * it to clear %xmm6.
 *
 * Clobbers: %rsi, %eax, %xmm1-%xmm6, flags.
 *
 * The AES-NI instructions are emitted as hand-encoded .byte sequences;
 * the mnemonic each one stands for is in the trailing comment.
 */
	.type intel_aes_decrypt_init_256,@function
	.globl intel_aes_decrypt_init_256
	.align	16
intel_aes_decrypt_init_256:
	/* Round key 0 is stored raw, round key 1 after aesimc. */
	movdqu	(%rdi), %xmm1
	movdqu	16(%rdi), %xmm3
	movdqu	%xmm1, (%rsi)
	.byte 0x66,0x0f,0x38,0xdb,0xe3	/* aesimc	%xmm3, %xmm4 */
	movdqu	%xmm4, 16(%rsi)
	leaq	32(%rsi), %rsi
	/* key_expansion256 requires %eax == 0 on every call. */
	xor	%eax, %eax

	/* Round keys 2 and 3 (round constant 0x01). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	/* Round keys 4 and 5 (round constant 0x02). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	/* Round keys 6 and 7 (round constant 0x04). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	/* Round keys 8 and 9 (round constant 0x08). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	/* Round keys 10 and 11 (round constant 0x10). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)
	/* Round keys 12 and 13 (round constant 0x20). */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
	call key_expansion256
	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
	movdqu	%xmm4, -32(%rsi)
	movdqu	%xmm5, -16(%rsi)

	/*
	 * Round key 14 (round constant 0x40), stored without aesimc.  Same
	 * instruction sequence as the first half of key_expansion256, except
	 * that %xmm6 is cleared with pxor instead of movd.
	 */
	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
	pxor	%xmm6, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	ret
	.size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256


/*
 * key_expansion256
 *
 * File-local helper (no .globl) for the AES-256 key schedule: derives
 * the next two round keys and stores them.  It does NOT follow the C
 * calling convention; everything is passed in fixed registers.
 *
 * in  %eax  : must be 0 (moved into %xmm6 to clear it)
 * in  %xmm1 : most recent even-numbered round key
 * in  %xmm3 : most recent odd-numbered round key
 * in  %xmm2 : result of aeskeygenassist $rcon, %xmm3
 * in  %rsi  : where to store the two new round keys (32 bytes)
 *
 * out %xmm1 : new even-numbered round key, also stored at (%rsi) on entry
 * out %xmm3 : new odd-numbered round key, also stored at 16(%rsi) on entry
 * out %rsi  : advanced by 32
 *
 * Clobbers: %xmm2, %xmm4, %xmm6, flags.  Uses no stack beyond the
 * return address.
 *
 * Each shufps/pxor pair folds the running XOR of the old key's words
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3) into the key register.  That only
 * requires lane 0 of %xmm6 to be zero, which both shuffles preserve, so
 * %xmm6 is not cleared again before the second half.
 */
	.type key_expansion256,@function
	.align	16
key_expansion256:
	/* First half: new even key from %xmm1 and lane 3 of %xmm2. */
	movd	%eax, %xmm6
	pshufd	$0xff, %xmm2, %xmm2
	shufps	$0x10, %xmm1, %xmm6
	pxor	%xmm6, %xmm1
	shufps	$0x8c, %xmm1, %xmm6
	pxor	%xmm2, %xmm1
	pxor	%xmm6, %xmm1
	movdqu	%xmm1, (%rsi)

	/*
	 * Second half: new odd key from %xmm3 and lane 2 of
	 * aeskeygenassist $0 applied to the key just produced.
	 */
	addq	$16, %rsi
	.byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00	/* aeskeygenassist $0, %xmm1, %xmm4 */
	pshufd	$0xaa, %xmm4, %xmm4
	shufps	$0x10, %xmm3, %xmm6
	pxor	%xmm6, %xmm3
	shufps	$0x8c, %xmm3, %xmm6
	pxor	%xmm4, %xmm3
	pxor	%xmm6, %xmm3
	movdqu	%xmm3, (%rsi)
	addq	$16, %rsi
	ret
	.size key_expansion256, .-key_expansion256


/*
 * intel_aes_encrypt_ecb_256
 *
 * AES-256 ECB encryption of inputLen bytes from input to output.
 *
 * in %rdi : cx - context; the 15 round keys are read from byte offsets
 *           0, 16, ..., 224 of it
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller; not touched here)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller; not touched here)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer in bytes.  Must be a
 *           multiple of 16: the single-block loop stops only when the
 *           running offset equals inputLen exactly.
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * While at least 128 bytes remain, eight blocks are processed per
 * iteration with their rounds interleaved (label 2).  Whatever is left
 * is processed one block at a time with the round keys held in
 * registers (label 4).
 *
 * %eax is the running byte offset.  All length arithmetic is 32-bit, and
 * writing %eax zero-extends into %rax, which is what the addressing
 * modes use as the index.
 *
 * Clobbers: %rax, %r11, %xmm1-%xmm15, flags.  All loads and stores are
 * unaligned (movdqu).
 */
	.type intel_aes_encrypt_ecb_256,@function
	.globl intel_aes_encrypt_ecb_256
	.align	16
intel_aes_encrypt_ecb_256:
	movdqu	(%rdi), %xmm2		/* round key 0 */
	movdqu	224(%rdi), %xmm15	/* round key 14 (last) */
	xorl	%eax, %eax
	/* Fewer than 8 blocks (8*16 = 128 bytes)?  Skip the 8-wide loop. */
	cmpl	$128, %r9d
	jb	1f
	/* %r11d = inputLen - 128: the last offset a full 8-block group can start at. */
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	/* Round 0: XOR with round key 0. */
	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm4
	pxor	%xmm2, %xmm5
	pxor	%xmm2, %xmm6
	pxor	%xmm2, %xmm7
	pxor	%xmm2, %xmm8
	pxor	%xmm2, %xmm9
	pxor	%xmm2, %xmm10

	/*
	 * Rounds 1..13, completely unrolled.  Round keys are reloaded two
	 * at a time into %xmm1 / %xmm11 and applied to all eight blocks.
	 */
	/* Rounds 1 and 2. */
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Rounds 3 and 4. */
	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Rounds 5 and 6. */
	movdqu 80(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Rounds 7 and 8. */
	movdqu 112(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Rounds 9 and 10. */
	movdqu 144(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Rounds 11 and 12. */
	movdqu 176(%rdi), %xmm1
	movdqu 192(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */

	/* Round 13, then the final round with the key kept in %xmm15. */
	movdqu 208(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdf	/* aesenclast 	%xmm15, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe7	/* aesenclast 	%xmm15, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xef	/* aesenclast 	%xmm15, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf7	/* aesenclast 	%xmm15, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xff	/* aesenclast 	%xmm15, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc7	/* aesenclast 	%xmm15, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcf	/* aesenclast 	%xmm15, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd7	/* aesenclast 	%xmm15, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	/* Advance by 8 blocks; loop while another full group fits (unsigned compare). */
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
	/* Nothing left over?  Then we are done. */
1:	cmpl	%eax, %r9d
	je	5f

	/*
	 * Single-block tail.  Round keys 0..6 and 8..13 go into registers
	 * (round key 14 is still in %xmm15).  Round key 7, at 112(%rdi), has
	 * no register of its own: %xmm8 alternates between round key 0 and
	 * round key 7 inside the loop.
	 */
	movdqu	(%rdi), %xmm8
	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm8, %xmm1		/* round 0 */
	movdqu	112(%rdi), %xmm8	/* %xmm8 = round key 7 */
	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8		/* %xmm8 = round key 0 again, for the next block */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256


/*
 * intel_aes_decrypt_ecb_256
 *
 * AES-256 ECB decryption of inputLen bytes from input to output.
 *
 * in %rdi : cx - context; the 15 round keys are read from byte offsets
 *           0, 16, ..., 224 of it and applied from offset 224 down to 0
 *           with aesdec / aesdeclast, so the schedule must be in the
 *           form those instructions expect (the form written by
 *           intel_aes_decrypt_init_256 in this file)
 * in %rsi : output - pointer to output buffer
 * in %rdx : outputLen - pointer to variable for length of output
 *           (already filled in by caller; not touched here)
 * in %ecx : maxOutputLen - length of output buffer
 *           (already checked by caller; not touched here)
 * in %r8  : input - pointer to input buffer
 * in %r9d : inputLen - length of input buffer in bytes.  Must be a
 *           multiple of 16: the single-block loop stops only when the
 *           running offset equals inputLen exactly.
 * on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.
 *
 * While at least 128 bytes remain, eight blocks are processed per
 * iteration with their rounds interleaved (label 2).  Whatever is left
 * is processed one block at a time with the round keys held in
 * registers (label 4).
 *
 * %eax is the running byte offset.  All length arithmetic is 32-bit, and
 * writing %eax zero-extends into %rax, which is what the addressing
 * modes use as the index.
 *
 * Clobbers: %rax, %r11, %xmm1-%xmm15, flags.  All loads and stores are
 * unaligned (movdqu).
 */
	.type intel_aes_decrypt_ecb_256,@function
	.globl intel_aes_decrypt_ecb_256
	.align	16
intel_aes_decrypt_ecb_256:
	movdqu	(%rdi), %xmm2		/* key at offset 0: used last */
	movdqu	224(%rdi), %xmm15	/* key at offset 224: used first */
	xorl	%eax, %eax
	/* Fewer than 8 blocks (8*16 = 128 bytes)?  Skip the 8-wide loop. */
	cmpl	$128, %r9d
	jb	1f
	/* %r11d = inputLen - 128: the last offset a full 8-block group can start at. */
	leal	-128(%r9), %r11d
2:	movdqu	(%r8, %rax), %xmm3
	movdqu	16(%r8, %rax), %xmm4
	movdqu	32(%r8, %rax), %xmm5
	movdqu	48(%r8, %rax), %xmm6
	movdqu	64(%r8, %rax), %xmm7
	movdqu	80(%r8, %rax), %xmm8
	movdqu	96(%r8, %rax), %xmm9
	movdqu	112(%r8, %rax), %xmm10
	/* Initial XOR with the key at offset 224. */
	pxor	%xmm15, %xmm3
	pxor	%xmm15, %xmm4
	pxor	%xmm15, %xmm5
	pxor	%xmm15, %xmm6
	pxor	%xmm15, %xmm7
	pxor	%xmm15, %xmm8
	pxor	%xmm15, %xmm9
	pxor	%xmm15, %xmm10

	/*
	 * The 13 aesdec rounds, completely unrolled.  Round keys are
	 * reloaded two at a time into %xmm1 / %xmm11, walking the schedule
	 * downwards, and applied to all eight blocks.
	 */
	/* Keys at offsets 208 and 192. */
	movdqu 208(%rdi), %xmm1
	movdqu 192(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Keys at offsets 176 and 160. */
	movdqu 176(%rdi), %xmm1
	movdqu 160(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Keys at offsets 144 and 128. */
	movdqu 144(%rdi), %xmm1
	movdqu 128(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Keys at offsets 112 and 96. */
	movdqu 112(%rdi), %xmm1
	movdqu 96(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Keys at offsets 80 and 64. */
	movdqu 80(%rdi), %xmm1
	movdqu 64(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Keys at offsets 48 and 32. */
	movdqu 48(%rdi), %xmm1
	movdqu 32(%rdi), %xmm11
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */

	/* Key at offset 16, then the final round with the key kept in %xmm2. */
	movdqu 16(%rdi), %xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)
	/* Advance by 8 blocks; loop while another full group fits (unsigned compare). */
	addl	$128, %eax
	cmpl	%r11d, %eax
	jbe	2b
	/* Nothing left over?  Then we are done. */
1:	cmpl	%eax, %r9d
	je	5f

	/*
	 * Single-block tail.  The keys at offsets 16..208 go into
	 * %xmm2..%xmm14 (the key at offset 224 is still in %xmm15).  The key
	 * at offset 0 has no register of its own: %xmm8 alternates between
	 * the key at offset 112 and the key at offset 0 inside the loop.
	 */
	movdqu	16(%rdi), %xmm2
	movdqu	32(%rdi), %xmm3
	movdqu	48(%rdi), %xmm4
	movdqu	64(%rdi), %xmm5
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm7
	movdqu	112(%rdi), %xmm8
	movdqu	128(%rdi), %xmm9
	movdqu	144(%rdi), %xmm10
	movdqu	160(%rdi), %xmm11
	movdqu	176(%rdi), %xmm12
	movdqu	192(%rdi), %xmm13
	movdqu	208(%rdi), %xmm14

4:	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm15, %xmm1		/* initial XOR with the key at offset 224 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8		/* %xmm8 = key at offset 0, for aesdeclast */
	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
	movdqu	112(%rdi), %xmm8	/* %xmm8 = key at offset 112 again, for the next block */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%eax, %r9d
	jne	4b

5:	xor	%eax, %eax		/* return 0 */
	ret
	.size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256


/*
 * intel_aes_encrypt_cbc_256 -- AES-256 encryption in CBC mode.
 *
 * Arguments, as they arrive from the C caller:
 *   in %rdi : cx - context.  This routine reads fifteen 16-byte round
 *             keys at offsets 0..224 and the chaining value (IV) at
 *             offset 256 (IV_OFFSET).
 *   in %rsi : output - pointer to output buffer
 *   in %rdx : outputLen - pointer to variable for length of output
 *             (already filled in by caller).  It is never dereferenced
 *             here; the register is reused as the pointer to the IV.
 *   in %ecx : maxOutputLen - length of output buffer
 *             (already checked by caller)
 *   in %r8  : input - pointer to input buffer
 *   in %r9d : inputLen - length of input buffer (low 32 bits only)
 *   on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.  Overwrites %rax, %rdx and %xmm0-%xmm15 without
 * saving them.  When inputLen is non-zero the last ciphertext block is
 * written back to cx+256 so that a following call continues the chain.
 *
 * The loop leaves only when the running byte offset equals inputLen,
 * so inputLen is expected to be a multiple of 16 (presumably enforced
 * by the caller).
 *
 * Register budget: fifteen round keys + chaining value + data block is
 * seventeen values for sixteen XMM registers.  %xmm8 therefore does
 * double duty: it holds round key 0 at the top of every iteration and
 * round key 7 (offset 112) in the middle of it.
 */
	.type intel_aes_encrypt_cbc_256,@function
	.globl intel_aes_encrypt_cbc_256
	.align	16
intel_aes_encrypt_cbc_256:
	testl	%r9d, %r9d
	jz	.Lenc_cbc256_ret		/* empty input: IV is left untouched */

	/* %rdx = address of the IV; IV_OFFSET is spelled out as a literal. */
	leaq	256(%rdi), %rdx

	/* Round keys that stay resident for the whole call. */
	movdqu	224(%rdi), %xmm15
	movdqu	208(%rdi), %xmm14
	movdqu	192(%rdi), %xmm13
	movdqu	176(%rdi), %xmm12
	movdqu	160(%rdi), %xmm11
	movdqu	144(%rdi), %xmm10
	movdqu	128(%rdi), %xmm9
	movdqu	96(%rdi), %xmm7
	movdqu	80(%rdi), %xmm6
	movdqu	64(%rdi), %xmm5
	movdqu	48(%rdi), %xmm4
	movdqu	32(%rdi), %xmm3
	movdqu	16(%rdi), %xmm2
	movdqu	(%rdi), %xmm8			/* round key 0, see header */
	movdqu	(%rdx), %xmm0			/* current chaining value */

	xorl	%eax, %eax			/* %rax = byte offset of the block */
.Lenc_cbc256_block:
	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm8, %xmm1			/* whitening with round key 0 */
	pxor	%xmm0, %xmm1			/* CBC: fold in previous ciphertext */
	movdqu	112(%rdi), %xmm8		/* %xmm8 becomes round key 7 */
	.byte 0x66,0x0f,0x38,0xdc,0xca		/* round  1: aesenc %xmm2, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcb		/* round  2: aesenc %xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcc		/* round  3: aesenc %xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcd		/* round  4: aesenc %xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xce		/* round  5: aesenc %xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xdc,0xcf		/* round  6: aesenc %xmm7, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* round  7: aesenc %xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8			/* back to round key 0 for next block */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* round  8: aesenc %xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* round  9: aesenc %xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* round 10: aesenc %xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* round 11: aesenc %xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* round 12: aesenc %xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* round 13: aesenc %xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* round 14: aesenclast %xmm15, %xmm1 */
	movdqa	%xmm1, %xmm0			/* ciphertext chains into next block */
	movdqu	%xmm1, (%rsi, %rax)
	addl	$16, %eax
	cmpl	%r9d, %eax
	jne	.Lenc_cbc256_block

	movdqu	%xmm0, (%rdx)			/* persist the chaining value in cx */

.Lenc_cbc256_ret:
	xorl	%eax, %eax
	ret
	.size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
2233
2234
/*
 * intel_aes_decrypt_cbc_256 -- AES-256 decryption in CBC mode.
 *
 * Arguments, as they arrive from the C caller:
 *   in %rdi : cx - context.  This routine reads fifteen 16-byte round
 *             keys at offsets 0..224 (applied from offset 224 down to
 *             offset 0) and the chaining value (IV) at offset 256
 *             (IV_OFFSET).
 *   in %rsi : output - pointer to output buffer
 *   in %rdx : outputLen - pointer to variable for length of output
 *             (already filled in by caller).  It is never dereferenced
 *             here; the register is reused as the pointer to the IV.
 *   in %ecx : maxOutputLen - length of output buffer
 *             (already checked by caller)
 *   in %r8  : input - pointer to input buffer
 *   in %r9d : inputLen - length of input buffer (low 32 bits only)
 *   on stack: blocksize - AES blocksize (always 16, unused)
 *
 * Returns 0 in %eax.  Overwrites %rax, %rdx, %r11 and %xmm0-%xmm15
 * without saving them.  On exit cx+256 holds the last ciphertext block
 * consumed (or the unchanged IV when inputLen is 0).
 *
 * Input is handled in two phases: groups of eight blocks while at
 * least 128 bytes remain, then one block at a time.  The single-block
 * loop leaves only when the byte offset equals inputLen, so inputLen
 * is expected to be a multiple of 16 (presumably enforced by the
 * caller).
 *
 * Every ciphertext block is re-read from the input before the matching
 * plaintext is stored, so the routine keeps working when the input and
 * output buffers are the same.
 */

/* One AES decryption round, key in %xmm1, applied to %xmm3..%xmm10. */
	.macro	aes256_cbc_dec8_xmm1
	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
	.endm

/*
 * Two consecutive decryption rounds on %xmm3..%xmm10: first with the
 * round key at \hi(%rdi) (via %xmm1), then with the one at \lo(%rdi)
 * (via %xmm11).
 */
	.macro	aes256_cbc_dec8_pair hi, lo
	movdqu	\hi(%rdi), %xmm1
	movdqu	\lo(%rdi), %xmm11
	aes256_cbc_dec8_xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
	.endm

	.type intel_aes_decrypt_cbc_256,@function
	.globl intel_aes_decrypt_cbc_256
	.align	16
intel_aes_decrypt_cbc_256:
	/* %rdx = address of the IV; IV_OFFSET is spelled out as a literal. */
	leaq	256(%rdi), %rdx

	movdqu	(%rdx), %xmm0			/* chaining value */
	movdqu	224(%rdi), %xmm15		/* key for the initial whitening */
	movdqu	(%rdi), %xmm2			/* key for the final round */
	xorl	%eax, %eax			/* %rax = byte offset into the buffers */
	cmpl	$128, %r9d			/* fewer than 8 blocks in total? */
	jb	.Ldec_cbc256_tail
	leal	-128(%r9), %r11d		/* last offset with 8 blocks left */

	/* ---- eight blocks per iteration, data in %xmm3..%xmm10 ---- */
.Ldec_cbc256_x8:
	movdqu	(%r8, %rax), %xmm3
	pxor	%xmm15, %xmm3
	movdqu	16(%r8, %rax), %xmm4
	pxor	%xmm15, %xmm4
	movdqu	32(%r8, %rax), %xmm5
	pxor	%xmm15, %xmm5
	movdqu	48(%r8, %rax), %xmm6
	pxor	%xmm15, %xmm6
	movdqu	64(%r8, %rax), %xmm7
	pxor	%xmm15, %xmm7
	movdqu	80(%r8, %rax), %xmm8
	pxor	%xmm15, %xmm8
	movdqu	96(%r8, %rax), %xmm9
	pxor	%xmm15, %xmm9
	movdqu	112(%r8, %rax), %xmm10
	pxor	%xmm15, %xmm10

	/* Thirteen full rounds, keys at offsets 208 down to 16. */
	aes256_cbc_dec8_pair 208, 192
	aes256_cbc_dec8_pair 176, 160
	aes256_cbc_dec8_pair 144, 128
	aes256_cbc_dec8_pair 112, 96
	aes256_cbc_dec8_pair 80, 64
	aes256_cbc_dec8_pair 48, 32
	movdqu	16(%rdi), %xmm1
	aes256_cbc_dec8_xmm1

	/* Final round with the key at offset 0 (%xmm2). */
	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast	%xmm2, %xmm3 */
	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast	%xmm2, %xmm4 */
	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast	%xmm2, %xmm5 */
	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast	%xmm2, %xmm6 */
	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast	%xmm2, %xmm7 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast	%xmm2, %xmm8 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast	%xmm2, %xmm9 */
	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast	%xmm2, %xmm10 */

	/*
	 * CBC un-chaining: block i is XORed with ciphertext block i-1
	 * (the IV for the first one).  %xmm0 walks through the ciphertext
	 * and ends up holding the eighth block, i.e. the next IV.  All of
	 * these reads happen before the first store below.
	 */
	pxor	%xmm0, %xmm3
	movdqu	(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm4
	movdqu	16(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm5
	movdqu	32(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm6
	movdqu	48(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm7
	movdqu	64(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm8
	movdqu	80(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm9
	movdqu	96(%r8, %rax), %xmm0
	pxor	%xmm0, %xmm10
	movdqu	112(%r8, %rax), %xmm0

	movdqu	%xmm3, (%rsi, %rax)
	movdqu	%xmm4, 16(%rsi, %rax)
	movdqu	%xmm5, 32(%rsi, %rax)
	movdqu	%xmm6, 48(%rsi, %rax)
	movdqu	%xmm7, 64(%rsi, %rax)
	movdqu	%xmm8, 80(%rsi, %rax)
	movdqu	%xmm9, 96(%rsi, %rax)
	movdqu	%xmm10, 112(%rsi, %rax)

	addl	$128, %eax
	cmpl	%eax, %r11d			/* unsigned: loop while %eax <= %r11d */
	jae	.Ldec_cbc256_x8

	/* ---- remaining blocks, one at a time, data in %xmm1 ---- */
.Ldec_cbc256_tail:
	cmpl	%r9d, %eax
	je	.Ldec_cbc256_done

	/*
	 * Keys at offsets 16..208 live in %xmm2..%xmm14.  That leaves no
	 * register for the key at offset 0, so %xmm8 alternates between
	 * the key at offset 112 and the key at offset 0 inside the loop.
	 */
	movdqu	208(%rdi), %xmm14
	movdqu	192(%rdi), %xmm13
	movdqu	176(%rdi), %xmm12
	movdqu	160(%rdi), %xmm11
	movdqu	144(%rdi), %xmm10
	movdqu	128(%rdi), %xmm9
	movdqu	112(%rdi), %xmm8
	movdqu	96(%rdi), %xmm7
	movdqu	80(%rdi), %xmm6
	movdqu	64(%rdi), %xmm5
	movdqu	48(%rdi), %xmm4
	movdqu	32(%rdi), %xmm3
	movdqu	16(%rdi), %xmm2

.Ldec_cbc256_x1:
	movdqu	(%r8, %rax), %xmm1
	pxor	%xmm15, %xmm1
	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
	movdqu	(%rdi), %xmm8			/* %xmm8 becomes the final-round key */
	.byte 0x66,0x0f,0x38,0xde,0xcf		/* aesdec	%xmm7, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xce		/* aesdec	%xmm6, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcd		/* aesdec	%xmm5, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcc		/* aesdec	%xmm4, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xcb		/* aesdec	%xmm3, %xmm1 */
	.byte 0x66,0x0f,0x38,0xde,0xca		/* aesdec	%xmm2, %xmm1 */
	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast	%xmm8, %xmm1 */
	movdqu	112(%rdi), %xmm8		/* restore the key at offset 112 */
	pxor	%xmm0, %xmm1			/* undo the CBC chaining */
	movdqu	(%r8, %rax), %xmm0		/* next IV, read before the store ... */
	movdqu	%xmm1, (%rsi, %rax)		/* ... in case input buf = output buf */
	addl	$16, %eax
	cmpl	%r9d, %eax
	jne	.Ldec_cbc256_x1

.Ldec_cbc256_done:
	movdqu	%xmm0, (%rdx)			/* persist the chaining value in cx */

	xorl	%eax, %eax
	ret
	.size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256
2486