# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell and
# 0.63 on Skylake... [The mentioned results are raw profiled
# measurements for a favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.

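# The stitched loops below process six 16-byte blocks per iteration:
# while the AES rounds for six counter blocks are in flight, six
# already-available ciphertext blocks are folded into the GHASH
# accumulator, so the AES-NI and PCLMULQDQ units run in parallel. A rough
# C sketch of that structure is given here for orientation only; the
# aes_encrypt_block() and ghash_mul() helpers are hypothetical and the
# sketch glosses over exactly which six ciphertext blocks each pass
# hashes and over the deferred reduction:
#
#	for (i = 0; i + 6 <= nblocks; i += 6) {
#		for (j = 0; j < 6; j++) {
#			keystream[j] = aes_encrypt_block(key, ctr++);
#			ghash = ghash_mul(ghash ^ ctext[i + j], H);
#		}
#		for (j = 0; j < 6; j++)
#			out[i + j] = in[i + j] ^ keystream[j];
#	}
#
# The real code interleaves these steps at instruction granularity and
# multiplies against a table of precomputed hash key powers loaded
# through %r9.
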
#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.balign 32
FUNCTION(_aesni_ctr32_ghash_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.balign	32
.Loop6x:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

.balign	32
.Lhandle_ctr32:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.balign	32
.Lenc_tail:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_6x)
#endif /* ifdef HAVE_MOVBE */

.balign 32
FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
.cfi_startproc
	ENDBR
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.balign	32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.balign	32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.balign	32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)

ENTRY_ALIGN(aesni_gcm_decrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_decrypt)

.balign 32
FUNCTION(_aesni_ctr32_6x)
.cfi_startproc
	ENDBR
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.balign	16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	RET
.balign	32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.cfi_endproc
SET_SIZE(_aesni_ctr32_6x)

ENTRY_ALIGN(aesni_gcm_encrypt, 32)
.cfi_startproc
	ENDBR
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	RET
.cfi_endproc
SET_SIZE(aesni_gcm_encrypt)

#endif /* !_WIN32 || _KERNEL */

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
ENTRY_ALIGN(clear_fpu_regs_avx, 32)
	vzeroall
	RET
SET_SIZE(clear_fpu_regs_avx)
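
/*
 * Illustrative call sequence only, not part of this module: a kernel
 * caller is expected to own the SIMD context around these routines
 * (e.g. via the OpenZFS kfpu_begin()/kfpu_end() pair) and can use
 * clear_fpu_regs_avx() afterwards so no key material lingers in the
 * vector registers. A sketch, under those assumptions:
 *
 *	kfpu_begin();
 *	... AVX GCM work ...
 *	clear_fpu_regs_avx();
 *	kfpu_end();
 */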

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_ALIGN(gcm_xor_avx, 32)
	movdqu  (%rdi), %xmm0
	movdqu  (%rsi), %xmm1
	pxor    %xmm1, %xmm0
	movdqu  %xmm0, (%rsi)
	RET
SET_SIZE(gcm_xor_avx)
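
/*
 * For reference, a roughly equivalent (hypothetical) C version of the
 * routine above, shown only to document the intended semantics; callers
 * use the assembly so the XOR stays in SIMD registers:
 *
 *	void
 *	gcm_xor_c(const uint8_t *src, uint8_t *dst)
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] ^= src[i];
 *	}
 */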

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	RET
SET_SIZE(atomic_toggle_boolean_nv)
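
/*
 * A minimal C sketch of the same operation using GCC/Clang builtins,
 * assuming boolean_t is a 32-bit integer holding 0 or 1 (illustration
 * only, not the authoritative definition used by the ICP):
 *
 *	boolean_t
 *	atomic_toggle_boolean_nv_c(volatile boolean_t *b)
 *	{
 *		return (__atomic_xor_fetch((volatile uint32_t *)b, 1,
 *		    __ATOMIC_SEQ_CST) != 0);
 *	}
 */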

SECTION_STATIC

.balign	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.balign	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */