# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on
# Broadwell and 0.63 on Skylake... [The mentioned results are raw profiled
# measurements for a favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)

/* Apple assemblers prepend an underscore to C symbol names. */
#if defined (__APPLE__)
#define	gcm_avx_can_use_movbe _gcm_avx_can_use_movbe
#endif

.extern gcm_avx_can_use_movbe

.text

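/*
 * _aesni_ctr32_ghash_6x and its MOVBE-free twin below implement the
 * stitched loop: each iteration encrypts six counter blocks with AES-NI
 * while folding six previously produced ciphertext blocks into the GHASH
 * state with PCLMULQDQ. They are only called from aesni_gcm_encrypt and
 * aesni_gcm_decrypt in this file, which set up the register contract
 * consumed below: %rdi = input, %rsi = output, %rdx = block count
 * (processed six at a time), %rcx = key schedule, %r8 = counter block,
 * %r9 = precomputed GHASH key powers (Htable), %r14 = GHASH input
 * pointer, %ebx = low counter word, %ebp = number of AES rounds, %r10 =
 * bytes processed so far, and 16(%rsp)..120(%rsp) as scratch for the
 * byte-reversed data fed to GHASH.
 */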
#ifdef HAVE_MOVBE
.balign 32
FUNCTION(_aesni_ctr32_ghash_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.balign	32
.Loop6x:
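	/*
	 * %ebx holds the last (big-endian) 32-bit word of the counter
	 * block. Adding 6 << 24 adds 6 to the counter's low byte; a
	 * carry out means that byte is about to wrap, so the slow path
	 * at .Lhandle_ctr32 must redo the increments with full 32-bit
	 * adds on byte-reversed counters.
	 */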
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

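	/*
	 * Slow path for the counter update: byte-reverse the counter
	 * block so the 32-bit counter sits in a little-endian dword,
	 * increment with vpaddd, and byte-reverse back before rejoining
	 * the main loop.
	 */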
.balign	32
.Lhandle_ctr32:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.balign	32
.Lenc_tail:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_6x)
#endif /* ifdef HAVE_MOVBE */

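/*
 * Identical to _aesni_ctr32_ghash_6x above, except that the byte-swapped
 * loads of the GHASH input are done with movq+bswapq pairs instead of
 * movbeq, for CPUs (or builds) without MOVBE support.
 */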
.balign 32
FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.balign	32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.balign	32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.balign	32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)

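/*
 * aesni_gcm_decrypt -- register usage as consumed below (SysV AMD64 ABI):
 *   %rdi = ciphertext input, %rsi = plaintext output,
 *   %rdx = length in bytes (must be at least 0x60, i.e. 96),
 *   %rcx = expanded AES key schedule (number of rounds at offset 504),
 *   %r8  = counter block, %r9 = GHASH context (Xi at offset 0, pointer
 *          to the precomputed hash key table at offset 32).
 * Returns the number of bytes processed in %rax; any remainder shorter
 * than 96 bytes is left for the caller to handle.
 */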
ENTRY_ALIGN(aesni_gcm_decrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_decrypt)

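/*
 * Local helper: encrypts exactly six counter blocks (96 bytes) with no
 * GHASH work, using the same register contract as the stitched helpers.
 * aesni_gcm_encrypt calls it twice to produce the first twelve blocks of
 * ciphertext before entering the stitched loop, which then hashes the
 * previously produced ciphertext while encrypting ahead.
 */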
.balign 32
FUNCTION(_aesni_ctr32_6x)
.cfi_startproc
	ENDBR
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.balign	16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	RET
.balign	32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.cfi_endproc
SET_SIZE(_aesni_ctr32_6x)

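/*
 * aesni_gcm_encrypt -- register usage mirrors aesni_gcm_decrypt above,
 * with %rdi = plaintext input and %rsi = ciphertext output. The minimum
 * length is 288 (0x120) bytes: the two priming calls to _aesni_ctr32_6x
 * consume 192 bytes and the stitched loop needs at least one more
 * 96-byte group. Returns the number of bytes processed in %rax.
 */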
ENTRY_ALIGN(aesni_gcm_encrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_encrypt)

#endif /* !_WIN32 || _KERNEL */

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
ENTRY_ALIGN(clear_fpu_regs_avx, 32)
	vzeroall
	RET
SET_SIZE(clear_fpu_regs_avx)

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_ALIGN(gcm_xor_avx, 32)
	movdqu  (%rdi), %xmm0
	movdqu  (%rsi), %xmm1
	pxor    %xmm1, %xmm0
	movdqu  %xmm0, (%rsi)
	RET
SET_SIZE(gcm_xor_avx)

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
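/*
 * A locked xor flips the low bit in memory atomically; the zero flag
 * produced by that xor tells us whether the new value is 0 or 1, which
 * is then materialized in %eax as the return value.
 */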
ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	RET
SET_SIZE(atomic_toggle_boolean_nv)

SECTION_STATIC

.balign	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.balign	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */