# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on a Haswell processor, 0.74 on
# Broadwell, 0.63 on Skylake... [Mentioned results are raw profiled
# measurements for a favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
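#
# As a hedged, C-like sketch of the stitch (aes_ctr_block() and ghash_fold()
# are illustrative placeholder names, not functions defined in this module),
# each pass of the 6x loops below encrypts six counter blocks while folding
# the six ciphertext blocks gathered on the previous pass into the GHASH
# accumulator, so the AESENC and PCLMULQDQ chains overlap in the pipeline:
#
#	for (i = 0; i + 6 <= nblocks; i += 6) {
#		for (j = 0; j < 6; j++) {
#			ks[j] = aes_ctr_block(key, ctr++);	/* AESENC    */
#			ghash_fold(Xi, Htable, prev[j]);	/* PCLMULQDQ */
#		}
#		for (j = 0; j < 6; j++)
#			out[i + j] = in[i + j] ^ ks[j];
#		/* prev[] = the six ciphertext blocks of this pass: out[] */
#		/* when encrypting, in[] when decrypting.                 */
#	}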

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept at a bare minimum to ease later
# upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.type	_aesni_ctr32_ghash_6x,@function
.align	32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x

.align	32
.Loop6x:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	88(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	80(%r14),%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	72(%r14),%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	64(%r14),%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	56(%r14),%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	48(%r14),%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movbeq	40(%r14),%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	32(%r14),%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movbeq	24(%r14),%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movbeq	16(%r14),%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movbeq	8(%r14),%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movbeq	0(%r14),%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail

.align	32
.Lhandle_ctr32:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32

.align	32
.Lenc_tail:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.cfi_endproc
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */

.type	_aesni_ctr32_ghash_no_movbe_6x,@function
.align	32
_aesni_ctr32_ghash_no_movbe_6x:
.cfi_startproc
	vmovdqu	32(%r11),%xmm2
	subq	$6,%rdx
	vpxor	%xmm4,%xmm4,%xmm4
	vmovdqu	0-128(%rcx),%xmm15
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpaddb	%xmm2,%xmm11,%xmm12
	vpaddb	%xmm2,%xmm12,%xmm13
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm15,%xmm1,%xmm9
	vmovdqu	%xmm4,16+8(%rsp)
	jmp	.Loop6x_nmb

.align	32
.Loop6x_nmb:
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_nmb
	vmovdqu	0-32(%r9),%xmm3
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm15,%xmm10,%xmm10
	vpxor	%xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
	vmovdqu	%xmm1,(%r8)
	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
	vpxor	%xmm15,%xmm12,%xmm12
	vmovups	16-128(%rcx),%xmm2
	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
	xorq	%r12,%r12
	cmpq	%r14,%r15

	vaesenc	%xmm2,%xmm9,%xmm9
	vmovdqu	48+8(%rsp),%xmm0
	vpxor	%xmm15,%xmm13,%xmm13
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
	vaesenc	%xmm2,%xmm10,%xmm10
	vpxor	%xmm15,%xmm14,%xmm14
	setnc	%r12b
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vaesenc	%xmm2,%xmm11,%xmm11
	vmovdqu	16-32(%r9),%xmm3
	negq	%r12
	vaesenc	%xmm2,%xmm12,%xmm12
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
	vpxor	%xmm4,%xmm8,%xmm8
	vaesenc	%xmm2,%xmm13,%xmm13
	vpxor	%xmm5,%xmm1,%xmm4
	andq	$0x60,%r12
	vmovups	32-128(%rcx),%xmm15
	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
	vaesenc	%xmm2,%xmm14,%xmm14

	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
	leaq	(%r14,%r12,1),%r14
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
	vmovdqu	64+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	88(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	80(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,32+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,40+8(%rsp)
	vmovdqu	48-32(%r9),%xmm5
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	48-128(%rcx),%xmm15
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm11,%xmm11
	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
	vmovdqu	80+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	%xmm1,%xmm4,%xmm4
	vmovdqu	64-32(%r9),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	64-128(%rcx),%xmm15
	vpxor	%xmm2,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	72(%r14),%r13
	bswapq	%r13
	vpxor	%xmm5,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	64(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
	vmovdqu	96+8(%rsp),%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,48+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,56+8(%rsp)
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	96-32(%r9),%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	80-128(%rcx),%xmm15
	vpxor	%xmm3,%xmm6,%xmm6
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	56(%r14),%r13
	bswapq	%r13
	vpxor	%xmm1,%xmm7,%xmm7
	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
	vpxor	112+8(%rsp),%xmm8,%xmm8
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	48(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,64+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,72+8(%rsp)
	vpxor	%xmm3,%xmm4,%xmm4
	vmovdqu	112-32(%r9),%xmm3
	vaesenc	%xmm15,%xmm14,%xmm14

	vmovups	96-128(%rcx),%xmm15
	vpxor	%xmm5,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm1,%xmm6,%xmm6
	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
	vaesenc	%xmm15,%xmm10,%xmm10
	movq	40(%r14),%r13
	bswapq	%r13
	vpxor	%xmm2,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	32(%r14),%r12
	bswapq	%r12
	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r13,80+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	movq	%r12,88+8(%rsp)
	vpxor	%xmm5,%xmm6,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	%xmm1,%xmm6,%xmm6

	vmovups	112-128(%rcx),%xmm15
	vpslldq	$8,%xmm6,%xmm5
	vpxor	%xmm2,%xmm4,%xmm4
	vmovdqu	16(%r11),%xmm3

	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	%xmm8,%xmm7,%xmm7
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	%xmm5,%xmm4,%xmm4
	movq	24(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm15,%xmm11,%xmm11
	movq	16(%r14),%r12
	bswapq	%r12
	vpalignr	$8,%xmm4,%xmm4,%xmm0
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	movq	%r13,96+8(%rsp)
	vaesenc	%xmm15,%xmm12,%xmm12
	movq	%r12,104+8(%rsp)
	vaesenc	%xmm15,%xmm13,%xmm13
	vmovups	128-128(%rcx),%xmm1
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	144-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm10,%xmm10
	vpsrldq	$8,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm11,%xmm11
	vpxor	%xmm6,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm12,%xmm12
	vpxor	%xmm0,%xmm4,%xmm4
	movq	8(%r14),%r13
	bswapq	%r13
	vaesenc	%xmm1,%xmm13,%xmm13
	movq	0(%r14),%r12
	bswapq	%r12
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	160-128(%rcx),%xmm1
	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	176-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	192-128(%rcx),%xmm1
	cmpl	$14,%ebp	// ICP does not zero key schedule.
	jb	.Lenc_tail_nmb

	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14

	vaesenc	%xmm1,%xmm9,%xmm9
	vaesenc	%xmm1,%xmm10,%xmm10
	vaesenc	%xmm1,%xmm11,%xmm11
	vaesenc	%xmm1,%xmm12,%xmm12
	vaesenc	%xmm1,%xmm13,%xmm13
	vmovups	208-128(%rcx),%xmm15
	vaesenc	%xmm1,%xmm14,%xmm14
	vmovups	224-128(%rcx),%xmm1
	jmp	.Lenc_tail_nmb

.align	32
.Lhandle_ctr32_nmb:
	vmovdqu	(%r11),%xmm0
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vmovdqu	0-32(%r9),%xmm3
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm15,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm15,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm14,%xmm14
	vpshufb	%xmm0,%xmm1,%xmm1
	jmp	.Lresume_ctr32_nmb

.align	32
.Lenc_tail_nmb:
	vaesenc	%xmm15,%xmm9,%xmm9
	vmovdqu	%xmm7,16+8(%rsp)
	vpalignr	$8,%xmm4,%xmm4,%xmm8
	vaesenc	%xmm15,%xmm10,%xmm10
	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
	vpxor	0(%rdi),%xmm1,%xmm2
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	16(%rdi),%xmm1,%xmm0
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	32(%rdi),%xmm1,%xmm5
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	48(%rdi),%xmm1,%xmm6
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	64(%rdi),%xmm1,%xmm7
	vpxor	80(%rdi),%xmm1,%xmm3
	vmovdqu	(%r8),%xmm1

	vaesenclast	%xmm2,%xmm9,%xmm9
	vmovdqu	32(%r11),%xmm2
	vaesenclast	%xmm0,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm1,%xmm0
	movq	%r13,112+8(%rsp)
	leaq	96(%rdi),%rdi
	vaesenclast	%xmm5,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm0,%xmm5
	movq	%r12,120+8(%rsp)
	leaq	96(%rsi),%rsi
	vmovdqu	0-128(%rcx),%xmm15
	vaesenclast	%xmm6,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm5,%xmm6
	vaesenclast	%xmm7,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm6,%xmm7
	vaesenclast	%xmm3,%xmm14,%xmm14
	vpaddb	%xmm2,%xmm7,%xmm3

	addq	$0x60,%r10
	subq	$0x6,%rdx
	jc	.L6x_done_nmb

	vmovups	%xmm9,-96(%rsi)
	vpxor	%xmm15,%xmm1,%xmm9
	vmovups	%xmm10,-80(%rsi)
	vmovdqa	%xmm0,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vmovdqa	%xmm5,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vmovdqa	%xmm6,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vmovdqa	%xmm7,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vmovdqa	%xmm3,%xmm14
	vmovdqu	32+8(%rsp),%xmm7
	jmp	.Loop6x_nmb

.L6x_done_nmb:
	vpxor	16+8(%rsp),%xmm8,%xmm8
	vpxor	%xmm4,%xmm8,%xmm8

	.byte	0xf3,0xc3
.cfi_endproc
.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x

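/*
 * Hedged register contract for the entry points below, as read from the
 * code (the authoritative C prototypes live in the ICP callers, so names
 * here are illustrative only).  Per the SysV AMD64 ABI: %rdi = input,
 * %rsi = output, %rdx = length in bytes, %rcx = expanded AES key schedule,
 * %r8 = counter block, %r9 = GHASH state (with a pointer to the H table
 * at offset 32).  The number of bytes actually processed is returned in
 * %rax; aesni_gcm_decrypt returns 0 for inputs shorter than 96 bytes and
 * consumes only whole 96-byte chunks, leaving any tail to the caller.
 */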
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,@function
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$0x60,%rdx
	jb	.Lgcm_dec_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	vmovdqu	(%r9),%xmm8
	andq	$-128,%rsp
	vmovdqu	(%r11),%xmm0
	leaq	128(%rcx),%rcx
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
	vpshufb	%xmm0,%xmm8,%xmm8

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Ldec_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Ldec_no_key_aliasing
	subq	%r15,%rsp
.Ldec_no_key_aliasing:

	vmovdqu	80(%rdi),%xmm7
	leaq	(%rdi),%r14
	vmovdqu	64(%rdi),%xmm4
	leaq	-192(%rdi,%rdx,1),%r15
	vmovdqu	48(%rdi),%xmm5
	shrq	$4,%rdx
	xorq	%r10,%r10
	vmovdqu	32(%rdi),%xmm6
	vpshufb	%xmm0,%xmm7,%xmm7
	vmovdqu	16(%rdi),%xmm2
	vpshufb	%xmm0,%xmm4,%xmm4
	vmovdqu	(%rdi),%xmm3
	vpshufb	%xmm0,%xmm5,%xmm5
	vmovdqu	%xmm4,48(%rsp)
	vpshufb	%xmm0,%xmm6,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm2,%xmm2
	vmovdqu	%xmm6,80(%rsp)
	vpshufb	%xmm0,%xmm3,%xmm3
	vmovdqu	%xmm2,96(%rsp)
	vmovdqu	%xmm3,112(%rsp)

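/*
 * Select the GHASH/CTR inner loop at run time: use the MOVBE variant only
 * when it was compiled in and gcm_avx_can_use_movbe is set, otherwise fall
 * back to the BSWAP-based variant.
 */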
#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovups	%xmm9,-96(%rsi)
	vmovups	%xmm10,-80(%rsi)
	vmovups	%xmm11,-64(%rsi)
	vmovups	%xmm12,-48(%rsi)
	vmovups	%xmm13,-32(%rsi)
	vmovups	%xmm14,-16(%rsi)

	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type	_aesni_ctr32_6x,@function
.align	32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu	0-128(%rcx),%xmm4
	vmovdqu	32(%r11),%xmm2
	leaq	-2(%rbp),%r13	// ICP uses 10,12,14 not 9,11,13 for rounds.
	vmovups	16-128(%rcx),%xmm15
	leaq	32-128(%rcx),%r12
	vpxor	%xmm4,%xmm1,%xmm9
	addl	$100663296,%ebx
	jc	.Lhandle_ctr32_2
	vpaddb	%xmm2,%xmm1,%xmm10
	vpaddb	%xmm2,%xmm10,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddb	%xmm2,%xmm11,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddb	%xmm2,%xmm12,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpaddb	%xmm2,%xmm13,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpaddb	%xmm2,%xmm14,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc	%xmm15,%xmm9,%xmm9
	vaesenc	%xmm15,%xmm10,%xmm10
	vaesenc	%xmm15,%xmm11,%xmm11
	vaesenc	%xmm15,%xmm12,%xmm12
	vaesenc	%xmm15,%xmm13,%xmm13
	vaesenc	%xmm15,%xmm14,%xmm14
	vmovups	(%r12),%xmm15
	leaq	16(%r12),%r12
	decl	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),%xmm3
	vaesenc	%xmm15,%xmm9,%xmm9
	vpxor	0(%rdi),%xmm3,%xmm4
	vaesenc	%xmm15,%xmm10,%xmm10
	vpxor	16(%rdi),%xmm3,%xmm5
	vaesenc	%xmm15,%xmm11,%xmm11
	vpxor	32(%rdi),%xmm3,%xmm6
	vaesenc	%xmm15,%xmm12,%xmm12
	vpxor	48(%rdi),%xmm3,%xmm8
	vaesenc	%xmm15,%xmm13,%xmm13
	vpxor	64(%rdi),%xmm3,%xmm2
	vaesenc	%xmm15,%xmm14,%xmm14
	vpxor	80(%rdi),%xmm3,%xmm3
	leaq	96(%rdi),%rdi

	vaesenclast	%xmm4,%xmm9,%xmm9
	vaesenclast	%xmm5,%xmm10,%xmm10
	vaesenclast	%xmm6,%xmm11,%xmm11
	vaesenclast	%xmm8,%xmm12,%xmm12
	vaesenclast	%xmm2,%xmm13,%xmm13
	vaesenclast	%xmm3,%xmm14,%xmm14
	vmovups	%xmm9,0(%rsi)
	vmovups	%xmm10,16(%rsi)
	vmovups	%xmm11,32(%rsi)
	vmovups	%xmm12,48(%rsi)
	vmovups	%xmm13,64(%rsi)
	vmovups	%xmm14,80(%rsi)
	leaq	96(%rsi),%rsi

	.byte	0xf3,0xc3
.align	32
.Lhandle_ctr32_2:
	vpshufb	%xmm0,%xmm1,%xmm6
	vmovdqu	48(%r11),%xmm5
	vpaddd	64(%r11),%xmm6,%xmm10
	vpaddd	%xmm5,%xmm6,%xmm11
	vpaddd	%xmm5,%xmm10,%xmm12
	vpshufb	%xmm0,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm11,%xmm13
	vpshufb	%xmm0,%xmm11,%xmm11
	vpxor	%xmm4,%xmm10,%xmm10
	vpaddd	%xmm5,%xmm12,%xmm14
	vpshufb	%xmm0,%xmm12,%xmm12
	vpxor	%xmm4,%xmm11,%xmm11
	vpaddd	%xmm5,%xmm13,%xmm1
	vpshufb	%xmm0,%xmm13,%xmm13
	vpxor	%xmm4,%xmm12,%xmm12
	vpshufb	%xmm0,%xmm14,%xmm14
	vpxor	%xmm4,%xmm13,%xmm13
	vpshufb	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm14,%xmm14
	jmp	.Loop_ctr32
.cfi_endproc
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

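/*
 * aesni_gcm_encrypt follows the same hedged register contract sketched
 * above aesni_gcm_decrypt, but requires at least 288 bytes of input: the
 * first twelve blocks are encrypted by _aesni_ctr32_6x before the
 * stitched loop takes over.  It too returns the byte count consumed
 * in %rax.
 */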
.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,@function
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	xorq	%r10,%r10
	cmpq	$288,%rdx
	jb	.Lgcm_enc_abort

	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
	pushq	%r9
.cfi_offset	%r9,-64
	vzeroupper

	vmovdqu	(%r8),%xmm1
	addq	$-128,%rsp
	movl	12(%r8),%ebx
	leaq	.Lbswap_mask(%rip),%r11
	leaq	-128(%rcx),%r14
	movq	$0xf80,%r15
	leaq	128(%rcx),%rcx
	vmovdqu	(%r11),%xmm0
	andq	$-128,%rsp
	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.

	andq	%r15,%r14
	andq	%rsp,%r15
	subq	%r14,%r15
	jc	.Lenc_no_key_aliasing
	cmpq	$768,%r15
	jnc	.Lenc_no_key_aliasing
	subq	%r15,%rsp
.Lenc_no_key_aliasing:

	leaq	(%rsi),%r14
	leaq	-192(%rsi,%rdx,1),%r15
	shrq	$4,%rdx

	call	_aesni_ctr32_6x
	vpshufb	%xmm0,%xmm9,%xmm8
	vpshufb	%xmm0,%xmm10,%xmm2
	vmovdqu	%xmm8,112(%rsp)
	vpshufb	%xmm0,%xmm11,%xmm4
	vmovdqu	%xmm2,96(%rsp)
	vpshufb	%xmm0,%xmm12,%xmm5
	vmovdqu	%xmm4,80(%rsp)
	vpshufb	%xmm0,%xmm13,%xmm6
	vmovdqu	%xmm5,64(%rsp)
	vpshufb	%xmm0,%xmm14,%xmm7
	vmovdqu	%xmm6,48(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	(%r9),%xmm8
	movq	32(%r9),%r9
	leaq	32(%r9),%r9
	subq	$12,%rdx
	movq	$192,%r10
	vpshufb	%xmm0,%xmm8,%xmm8

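/* Same run-time MOVBE/BSWAP dispatch as in aesni_gcm_decrypt above. */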
#ifdef HAVE_MOVBE
#ifdef _KERNEL
	testl	$1,gcm_avx_can_use_movbe(%rip)
#else
	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
	jz	1f
	call	_aesni_ctr32_ghash_6x
	jmp	2f
1:
#endif
	call	_aesni_ctr32_ghash_no_movbe_6x
2:
	vmovdqu	32(%rsp),%xmm7
	vmovdqu	(%r11),%xmm0
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm7,%xmm7,%xmm1
	vmovdqu	32-32(%r9),%xmm15
	vmovups	%xmm9,-96(%rsi)
	vpshufb	%xmm0,%xmm9,%xmm9
	vpxor	%xmm7,%xmm1,%xmm1
	vmovups	%xmm10,-80(%rsi)
	vpshufb	%xmm0,%xmm10,%xmm10
	vmovups	%xmm11,-64(%rsi)
	vpshufb	%xmm0,%xmm11,%xmm11
	vmovups	%xmm12,-48(%rsi)
	vpshufb	%xmm0,%xmm12,%xmm12
	vmovups	%xmm13,-32(%rsi)
	vpshufb	%xmm0,%xmm13,%xmm13
	vmovups	%xmm14,-16(%rsi)
	vpshufb	%xmm0,%xmm14,%xmm14
	vmovdqu	%xmm9,16(%rsp)
	vmovdqu	48(%rsp),%xmm6
	vmovdqu	16-32(%r9),%xmm0
	vpunpckhqdq	%xmm6,%xmm6,%xmm2
	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
	vpxor	%xmm6,%xmm2,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1

	vmovdqu	64(%rsp),%xmm9
	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm9,%xmm9,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
	vpxor	%xmm9,%xmm5,%xmm5
	vpxor	%xmm7,%xmm6,%xmm6
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vmovdqu	80(%rsp),%xmm1
	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm4,%xmm7,%xmm7
	vpunpckhqdq	%xmm1,%xmm1,%xmm4
	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpxor	%xmm6,%xmm9,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	96(%rsp),%xmm2
	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm7,%xmm6,%xmm6
	vpunpckhqdq	%xmm2,%xmm2,%xmm7
	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
	vpxor	%xmm2,%xmm7,%xmm7
	vpxor	%xmm9,%xmm1,%xmm1
	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm5,%xmm4,%xmm4

	vpxor	112(%rsp),%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
	vmovdqu	112-32(%r9),%xmm0
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpxor	%xmm6,%xmm5,%xmm5
	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm1,%xmm2,%xmm2
	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
	vpxor	%xmm4,%xmm7,%xmm4

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
	vmovdqu	0-32(%r9),%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm1
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
	vpxor	%xmm14,%xmm1,%xmm1
	vpxor	%xmm5,%xmm6,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
	vmovdqu	32-32(%r9),%xmm15
	vpxor	%xmm2,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm6

	vmovdqu	16-32(%r9),%xmm0
	vpxor	%xmm5,%xmm7,%xmm9
	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
	vpxor	%xmm9,%xmm6,%xmm6
	vpunpckhqdq	%xmm13,%xmm13,%xmm2
	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
	vpxor	%xmm13,%xmm2,%xmm2
	vpslldq	$8,%xmm6,%xmm9
	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
	vpxor	%xmm9,%xmm5,%xmm8
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm6,%xmm7,%xmm7

	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
	vmovdqu	48-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm12,%xmm12,%xmm9
	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
	vpxor	%xmm12,%xmm9,%xmm9
	vpxor	%xmm14,%xmm13,%xmm13
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
	vmovdqu	80-32(%r9),%xmm15
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
	vmovdqu	64-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm11,%xmm11,%xmm1
	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
	vpxor	%xmm11,%xmm1,%xmm1
	vpxor	%xmm13,%xmm12,%xmm12
	vxorps	16(%rsp),%xmm7,%xmm7
	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
	vpxor	%xmm2,%xmm9,%xmm9

	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
	vmovdqu	96-32(%r9),%xmm3
	vpxor	%xmm4,%xmm5,%xmm5
	vpunpckhqdq	%xmm10,%xmm10,%xmm2
	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
	vpxor	%xmm10,%xmm2,%xmm2
	vpalignr	$8,%xmm8,%xmm8,%xmm14
	vpxor	%xmm12,%xmm11,%xmm11
	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
	vmovdqu	128-32(%r9),%xmm15
	vpxor	%xmm9,%xmm1,%xmm1

	vxorps	%xmm7,%xmm14,%xmm14
	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
	vxorps	%xmm14,%xmm8,%xmm8

	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
	vmovdqu	112-32(%r9),%xmm0
	vpxor	%xmm5,%xmm4,%xmm4
	vpunpckhqdq	%xmm8,%xmm8,%xmm9
	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
	vpxor	%xmm8,%xmm9,%xmm9
	vpxor	%xmm11,%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
	vpxor	%xmm1,%xmm2,%xmm2

	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
	vpxor	%xmm4,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
	vpxor	%xmm10,%xmm7,%xmm7
	vpxor	%xmm2,%xmm6,%xmm6

	vpxor	%xmm5,%xmm7,%xmm4
	vpxor	%xmm4,%xmm6,%xmm6
	vpslldq	$8,%xmm6,%xmm1
	vmovdqu	16(%r11),%xmm3
	vpsrldq	$8,%xmm6,%xmm6
	vpxor	%xmm1,%xmm5,%xmm8
	vpxor	%xmm6,%xmm7,%xmm7

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm2,%xmm8,%xmm8

	vpalignr	$8,%xmm8,%xmm8,%xmm2
	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
	vpxor	%xmm7,%xmm2,%xmm2
	vpxor	%xmm2,%xmm8,%xmm8
	vpshufb	(%r11),%xmm8,%xmm8
	movq	-56(%rax),%r9
.cfi_restore	%r9
	vmovdqu	%xmm8,(%r9)

	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	movq	%r10,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
.globl	clear_fpu_regs_avx
.type	clear_fpu_regs_avx,@function
.align	32
clear_fpu_regs_avx:
	vzeroall
	RET
.size	clear_fpu_regs_avx,.-clear_fpu_regs_avx
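
/*
 * Hedged usage sketch for clear_fpu_regs_avx(); kfpu_begin()/kfpu_end()
 * stand in for whatever FPU save/restore bracket the caller already uses
 * and are not defined here:
 *
 *	kfpu_begin();
 *	(void) aesni_gcm_encrypt(...);
 *	clear_fpu_regs_avx();	// scrub key material from the SIMD registers
 *	kfpu_end();
 */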

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
.globl  gcm_xor_avx
.type	gcm_xor_avx,@function
.align	32
gcm_xor_avx:
	movdqu  (%rdi), %xmm0
	movdqu  (%rsi), %xmm1
	pxor    %xmm1, %xmm0
	movdqu  %xmm0, (%rsi)
	RET
.size	gcm_xor_avx,.-gcm_xor_avx
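
/*
 * Hedged usage sketch for gcm_xor_avx() (buffer names are illustrative);
 * like the routines above, it must run inside the caller's FPU bracket:
 *
 *	uint8_t tmp[16], counter_block[16];
 *	...
 *	gcm_xor_avx(tmp, counter_block);	// counter_block ^= tmp, 16 bytes
 */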

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
.globl	atomic_toggle_boolean_nv
.type	atomic_toggle_boolean_nv,@function
.align	32
atomic_toggle_boolean_nv:
	xorl	%eax, %eax
	lock
	xorl	$1, (%rdi)
	jz	1f
	movl	$1, %eax
1:
	RET
.size	atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
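
/*
 * Hedged usage sketch for atomic_toggle_boolean_nv() (the flag name is
 * illustrative):
 *
 *	static volatile boolean_t gcm_use_feature = B_FALSE;
 *	...
 *	if (atomic_toggle_boolean_nv(&gcm_use_feature))
 *		;	/* the stored value is now B_TRUE */
 */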

.align	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
