# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched implementation
# had been observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell, and
# 0.63 on Skylake... [The mentioned results are raw profiled
# measurements for a favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease
# later upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

.extern gcm_avx_can_use_movbe

.text

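/*
 * _aesni_ctr32_ghash_6x (and the no-movbe variant further below) is the
 * stitched inner loop: it encrypts six counter blocks with AES-NI while
 * folding six previous ciphertext blocks into the GHASH accumulator.
 * Reader's note on the register conventions inherited from the OpenSSL
 * original (a summary, not an interface contract): %rdi/%rsi are the
 * input/output pointers, %rdx the remaining length in 16-byte blocks,
 * %rcx the expanded key biased by +128, %r8 the counter block, %r9 the
 * GHASH state with the hash-key powers following it, %r11 the constant
 * table at .Lbswap_mask, and %ebp the number of AES rounds.
 */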
#ifdef HAVE_MOVBE
.type	_aesni_ctr32_ghash_6x,@function
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.align	32
.Loop6x:
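	// $100663296 is 6<<24. %ebx holds the last four counter bytes in
	// reversed order, so this add carries out only when the low byte of
	// the big-endian counter would wrap within the next six increments;
	// in that case .Lhandle_ctr32 performs full 32-bit arithmetic.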
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
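	// movbe loads the next ciphertext qwords already byte-reversed, so
	// they can be staged on the stack as ready-made GHASH input.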
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

.align	32
.Lhandle_ctr32:
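	// Slow path: the counter's low byte is about to wrap. Byte-reverse
	// the counter, do full 32-bit increments with vpaddd, then shuffle
	// each of the six results back to big-endian form.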
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.align	32
.Lenc_tail:
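	// Final AES round for the batch: %xmm1 holds the last round key, so
	// pre-XORing it with the six input blocks lets vaesenclast finish the
	// encryption and apply the CTR XOR in one step.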
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */

.type	_aesni_ctr32_ghash_no_movbe_6x,@function
.align	32
_aesni_ctr32_ghash_no_movbe_6x:
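	// Identical to _aesni_ctr32_ghash_6x above, except that every movbe
	// load is replaced with a movq/bswapq pair for CPUs without MOVBE.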
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.align	32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.align	32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.align	32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x

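/*
 * aesni_gcm_decrypt: bulk CTR decryption with GHASH stitched over the
 * ciphertext. The argument order follows the System V AMD64 ABI as in the
 * OpenSSL original (input, output, byte length, expanded key, counter
 * block, GHASH state); the exact C prototype lives with the ICP callers
 * and is not restated here. Inputs shorter than 0x60 (96) bytes are left
 * entirely to the caller and the routine returns 0; otherwise it returns
 * the number of bytes processed, always a multiple of 96.
 */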
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,@function
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	leaq	32+32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	vmovdqu	%xmm8,-64(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
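/*
 * _aesni_ctr32_6x encrypts exactly six counter blocks (plain CTR, no
 * GHASH). The encrypt path calls it twice to prime the pipeline, since
 * the stitched loop hashes ciphertext that must already exist.
 */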
.type	_aesni_ctr32_6x,@function
.align	32
_aesni_ctr32_6x:
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	.byte	0xf3,0xc3
.align	32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

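/*
 * aesni_gcm_encrypt: same argument conventions as aesni_gcm_decrypt
 * above. Because hashing lags encryption, the minimum length is 0x120
 * (288) bytes: two priming calls to _aesni_ctr32_6x (192 bytes) plus at
 * least one 96-byte pass of the stitched loop; shorter inputs return 0.
 */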
.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,@function
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	leaq	32+32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	vmovdqu	%xmm8,-64(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
.globl	clear_fpu_regs_avx
.type	clear_fpu_regs_avx,@function
.align	32
clear_fpu_regs_avx:
	vzeroall
	ret
.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
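/*
 * Hedged usage sketch (illustrative only, not part of this file): the
 * helper name below is hypothetical; it only stresses that the caller
 * must own the FPU state before scrubbing the vector registers.
 *
 *	extern void clear_fpu_regs_avx(void);
 *
 *	static void
 *	scrub_vector_state(void)
 *	{
 *		// Clear ymm0-ymm15 so no key material lingers in them.
 *		clear_fpu_regs_avx();
 *	}
 */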

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
.globl  gcm_xor_avx
.type	gcm_xor_avx,@function
.align	32
gcm_xor_avx:
	movdqu  (%rdi), %xmm0
	movdqu  (%rsi), %xmm1
	pxor    %xmm1, %xmm0
	movdqu  %xmm0, (%rsi)
	ret
.size	gcm_xor_avx,.-gcm_xor_avx
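/*
 * Hedged usage sketch (illustrative only): XOR a 16-byte keystream block
 * into a data block in place; both pointers may be unaligned. The helper
 * name is hypothetical.
 *
 *	#include <stdint.h>
 *
 *	extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 *	static void
 *	xor_block(const uint8_t keystream[16], uint8_t data[16])
 *	{
 *		// data ^= keystream, one 128-bit load/store pair.
 *		gcm_xor_avx(keystream, data);
 *	}
 */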

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
.globl	atomic_toggle_boolean_nv
.type	atomic_toggle_boolean_nv,@function
.align	32
atomic_toggle_boolean_nv:
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	ret
.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
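/*
 * Hedged usage sketch (illustrative only): flip a shared flag and act on
 * the value it just became. The flag name is hypothetical; boolean_t is
 * assumed to be a 32-bit integer type, which is what the lock xorl above
 * operates on.
 *
 *	extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 *
 *	static volatile boolean_t feature_enabled;
 *
 *	static boolean_t
 *	toggle_feature(void)
 *	{
 *		// Returns B_TRUE when the toggle just enabled the feature.
 *		return (atomic_toggle_boolean_nv(&feature_enabled));
 *	}
 */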

.align	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */