1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10#
11# AES-NI-CTR+GHASH stitch.
12#
13# February 2013
14#
15# OpenSSL GCM implementation is organized in such way that its
16# performance is rather close to the sum of its streamed components,
17# in the context parallelized AES-NI CTR and modulo-scheduled
18# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19# was observed to perform significantly better than the sum of the
20# components on contemporary CPUs, the effort was deemed impossible to
21# justify. This module is based on combination of Intel submissions,
22# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23# Locktyukhin of Intel Corp. who verified that it reduces shuffles
24# pressure with notable relative improvement, achieving 1.0 cycle per
25# byte processed with 128-bit key on Haswell processor, and 0.74 -
26# on Broadwell. [Mentioned results are raw profiled measurements for
27# favourable packet size, one divisible by 96. Applications using the
28# EVP interface will observe a few percent worse performance.]
29#
30# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
31# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
32
33$flavour = shift;
34$output  = shift;
35if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
36
37$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
41( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
42die "can't locate x86_64-xlate.pl";
43
44if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
45		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
46	$avx = ($1>=2.20) + ($1>=2.22);
47}
48
49if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
50	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
51	$avx = ($1>=2.09) + ($1>=2.10);
52}
53
54if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
55	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
56	$avx = ($1>=10) + ($1>=11);
57}
58
59if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
60	$avx = ($2>=3.0) + ($2>3.0);
61}
62
63open OUT,"| \"$^X\" $xlate $flavour $output";
64*STDOUT=*OUT;
65
66if ($avx>1) {{{
67
68($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
69
70($Ii,$T1,$T2,$Hkey,
71 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
72
73($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
74
75($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
76
77$code=<<___;
78.text
79
80.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
81.align	32
82_aesni_ctr32_ghash_6x:
83	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
84	sub		\$6,$len
85	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
86	vmovdqu		0x00-0x80($key),$rndkey
87	vpaddb		$T2,$T1,$inout1
88	vpaddb		$T2,$inout1,$inout2
89	vpaddb		$T2,$inout2,$inout3
90	vpaddb		$T2,$inout3,$inout4
91	vpaddb		$T2,$inout4,$inout5
92	vpxor		$rndkey,$T1,$inout0
93	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
94	jmp		.Loop6x
95
96.align	32
97.Loop6x:
98	add		\$`6<<24`,$counter
99	jc		.Lhandle_ctr32		# discard $inout[1-5]?
100	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
101	  vpaddb	$T2,$inout5,$T1		# next counter value
102	  vpxor		$rndkey,$inout1,$inout1
103	  vpxor		$rndkey,$inout2,$inout2
104
105.Lresume_ctr32:
106	vmovdqu		$T1,($ivp)		# save next counter value
107	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
108	  vpxor		$rndkey,$inout3,$inout3
109	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
110	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
111	xor		%r12,%r12
112	cmp		$in0,$end0
113
114	  vaesenc	$T2,$inout0,$inout0
115	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
116	  vpxor		$rndkey,$inout4,$inout4
117	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
118	  vaesenc	$T2,$inout1,$inout1
119	  vpxor		$rndkey,$inout5,$inout5
120	setnc		%r12b
121	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
122	  vaesenc	$T2,$inout2,$inout2
123	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
124	neg		%r12
125	  vaesenc	$T2,$inout3,$inout3
126	 vpxor		$Z1,$Z2,$Z2
127	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
128	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
129	  vaesenc	$T2,$inout4,$inout4
130	 vpxor		$Z1,$T1,$Z0
131	and		\$0x60,%r12
132	  vmovups	0x20-0x80($key),$rndkey
133	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
134	  vaesenc	$T2,$inout5,$inout5
135
136	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
137	lea		($in0,%r12),$in0
138	  vaesenc	$rndkey,$inout0,$inout0
139	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
140	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
141	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
142	  vaesenc	$rndkey,$inout1,$inout1
143	movbe		0x58($in0),%r13
144	  vaesenc	$rndkey,$inout2,$inout2
145	movbe		0x50($in0),%r12
146	  vaesenc	$rndkey,$inout3,$inout3
147	mov		%r13,0x20+8(%rsp)
148	  vaesenc	$rndkey,$inout4,$inout4
149	mov		%r12,0x28+8(%rsp)
150	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
151	  vaesenc	$rndkey,$inout5,$inout5
152
153	  vmovups	0x30-0x80($key),$rndkey
154	 vpxor		$T1,$Z2,$Z2
155	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
156	  vaesenc	$rndkey,$inout0,$inout0
157	 vpxor		$T2,$Z2,$Z2
158	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
159	  vaesenc	$rndkey,$inout1,$inout1
160	 vpxor		$Hkey,$Z3,$Z3
161	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
162	  vaesenc	$rndkey,$inout2,$inout2
163	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
164	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
165	  vaesenc	$rndkey,$inout3,$inout3
166	  vaesenc	$rndkey,$inout4,$inout4
167	 vpxor		$T1,$Z0,$Z0
168	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
169	  vaesenc	$rndkey,$inout5,$inout5
170
171	  vmovups	0x40-0x80($key),$rndkey
172	 vpxor		$T2,$Z2,$Z2
173	vpclmulqdq	\$0x00,$T1,$Ii,$T2
174	  vaesenc	$rndkey,$inout0,$inout0
175	 vpxor		$Hkey,$Z2,$Z2
176	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
177	  vaesenc	$rndkey,$inout1,$inout1
178	movbe		0x48($in0),%r13
179	 vpxor		$Z1,$Z3,$Z3
180	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
181	  vaesenc	$rndkey,$inout2,$inout2
182	movbe		0x40($in0),%r12
183	vpclmulqdq	\$0x11,$T1,$Ii,$T1
184	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
185	  vaesenc	$rndkey,$inout3,$inout3
186	mov		%r13,0x30+8(%rsp)
187	  vaesenc	$rndkey,$inout4,$inout4
188	mov		%r12,0x38+8(%rsp)
189	 vpxor		$T2,$Z0,$Z0
190	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
191	  vaesenc	$rndkey,$inout5,$inout5
192
193	  vmovups	0x50-0x80($key),$rndkey
194	 vpxor		$Hkey,$Z2,$Z2
195	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
196	  vaesenc	$rndkey,$inout0,$inout0
197	 vpxor		$Z1,$Z2,$Z2
198	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
199	  vaesenc	$rndkey,$inout1,$inout1
200	movbe		0x38($in0),%r13
201	 vpxor		$T1,$Z3,$Z3
202	vpclmulqdq	\$0x01,$T2,$Ii,$T1
203	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
204	  vaesenc	$rndkey,$inout2,$inout2
205	movbe		0x30($in0),%r12
206	vpclmulqdq	\$0x11,$T2,$Ii,$T2
207	  vaesenc	$rndkey,$inout3,$inout3
208	mov		%r13,0x40+8(%rsp)
209	  vaesenc	$rndkey,$inout4,$inout4
210	mov		%r12,0x48+8(%rsp)
211	 vpxor		$Hkey,$Z0,$Z0
212	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
213	  vaesenc	$rndkey,$inout5,$inout5
214
215	  vmovups	0x60-0x80($key),$rndkey
216	 vpxor		$Z1,$Z2,$Z2
217	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
218	  vaesenc	$rndkey,$inout0,$inout0
219	 vpxor		$T1,$Z2,$Z2
220	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
221	  vaesenc	$rndkey,$inout1,$inout1
222	movbe		0x28($in0),%r13
223	 vpxor		$T2,$Z3,$Z3
224	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
225	  vaesenc	$rndkey,$inout2,$inout2
226	movbe		0x20($in0),%r12
227	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
228	  vaesenc	$rndkey,$inout3,$inout3
229	mov		%r13,0x50+8(%rsp)
230	  vaesenc	$rndkey,$inout4,$inout4
231	mov		%r12,0x58+8(%rsp)
232	vpxor		$Z1,$Z2,$Z2
233	  vaesenc	$rndkey,$inout5,$inout5
234	vpxor		$T1,$Z2,$Z2
235
236	  vmovups	0x70-0x80($key),$rndkey
237	vpslldq		\$8,$Z2,$Z1
238	vpxor		$T2,$Z0,$Z0
239	vmovdqu		0x10($const),$Hkey	# .Lpoly
240
241	  vaesenc	$rndkey,$inout0,$inout0
242	vpxor		$Xi,$Z3,$Z3
243	  vaesenc	$rndkey,$inout1,$inout1
244	vpxor		$Z1,$Z0,$Z0
245	movbe		0x18($in0),%r13
246	  vaesenc	$rndkey,$inout2,$inout2
247	movbe		0x10($in0),%r12
248	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
249	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
250	mov		%r13,0x60+8(%rsp)
251	  vaesenc	$rndkey,$inout3,$inout3
252	mov		%r12,0x68+8(%rsp)
253	  vaesenc	$rndkey,$inout4,$inout4
254	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
255	  vaesenc	$rndkey,$inout5,$inout5
256
257	  vaesenc	$T1,$inout0,$inout0
258	  vmovups	0x90-0x80($key),$rndkey
259	  vaesenc	$T1,$inout1,$inout1
260	vpsrldq		\$8,$Z2,$Z2
261	  vaesenc	$T1,$inout2,$inout2
262	vpxor		$Z2,$Z3,$Z3
263	  vaesenc	$T1,$inout3,$inout3
264	vpxor		$Ii,$Z0,$Z0
265	movbe		0x08($in0),%r13
266	  vaesenc	$T1,$inout4,$inout4
267	movbe		0x00($in0),%r12
268	  vaesenc	$T1,$inout5,$inout5
269	  vmovups	0xa0-0x80($key),$T1
270	  cmp		\$11,$rounds
271	  jb		.Lenc_tail		# 128-bit key
272
273	  vaesenc	$rndkey,$inout0,$inout0
274	  vaesenc	$rndkey,$inout1,$inout1
275	  vaesenc	$rndkey,$inout2,$inout2
276	  vaesenc	$rndkey,$inout3,$inout3
277	  vaesenc	$rndkey,$inout4,$inout4
278	  vaesenc	$rndkey,$inout5,$inout5
279
280	  vaesenc	$T1,$inout0,$inout0
281	  vaesenc	$T1,$inout1,$inout1
282	  vaesenc	$T1,$inout2,$inout2
283	  vaesenc	$T1,$inout3,$inout3
284	  vaesenc	$T1,$inout4,$inout4
285	  vmovups	0xb0-0x80($key),$rndkey
286	  vaesenc	$T1,$inout5,$inout5
287	  vmovups	0xc0-0x80($key),$T1
288	  je		.Lenc_tail		# 192-bit key
289
290	  vaesenc	$rndkey,$inout0,$inout0
291	  vaesenc	$rndkey,$inout1,$inout1
292	  vaesenc	$rndkey,$inout2,$inout2
293	  vaesenc	$rndkey,$inout3,$inout3
294	  vaesenc	$rndkey,$inout4,$inout4
295	  vaesenc	$rndkey,$inout5,$inout5
296
297	  vaesenc	$T1,$inout0,$inout0
298	  vaesenc	$T1,$inout1,$inout1
299	  vaesenc	$T1,$inout2,$inout2
300	  vaesenc	$T1,$inout3,$inout3
301	  vaesenc	$T1,$inout4,$inout4
302	  vmovups	0xd0-0x80($key),$rndkey
303	  vaesenc	$T1,$inout5,$inout5
304	  vmovups	0xe0-0x80($key),$T1
305	  jmp		.Lenc_tail		# 256-bit key
306
307.align	32
308.Lhandle_ctr32:
309	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
310	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
311	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
312	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
313	  vpaddd	$Z1,$Z2,$inout2
314	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
315	  vpaddd	$Z1,$inout1,$inout3
316	  vpshufb	$Ii,$inout1,$inout1
317	  vpaddd	$Z1,$inout2,$inout4
318	  vpshufb	$Ii,$inout2,$inout2
319	  vpxor		$rndkey,$inout1,$inout1
320	  vpaddd	$Z1,$inout3,$inout5
321	  vpshufb	$Ii,$inout3,$inout3
322	  vpxor		$rndkey,$inout2,$inout2
323	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
324	  vpshufb	$Ii,$inout4,$inout4
325	  vpshufb	$Ii,$inout5,$inout5
326	  vpshufb	$Ii,$T1,$T1		# next counter value
327	jmp		.Lresume_ctr32
328
329.align	32
330.Lenc_tail:
331	  vaesenc	$rndkey,$inout0,$inout0
332	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
333	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
334	  vaesenc	$rndkey,$inout1,$inout1
335	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
336	  vpxor		0x00($inp),$T1,$T2
337	  vaesenc	$rndkey,$inout2,$inout2
338	  vpxor		0x10($inp),$T1,$Ii
339	  vaesenc	$rndkey,$inout3,$inout3
340	  vpxor		0x20($inp),$T1,$Z1
341	  vaesenc	$rndkey,$inout4,$inout4
342	  vpxor		0x30($inp),$T1,$Z2
343	  vaesenc	$rndkey,$inout5,$inout5
344	  vpxor		0x40($inp),$T1,$Z3
345	  vpxor		0x50($inp),$T1,$Hkey
346	  vmovdqu	($ivp),$T1		# load next counter value
347
348	  vaesenclast	$T2,$inout0,$inout0
349	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
350	  vaesenclast	$Ii,$inout1,$inout1
351	 vpaddb		$T2,$T1,$Ii
352	mov		%r13,0x70+8(%rsp)
353	lea		0x60($inp),$inp
354	  vaesenclast	$Z1,$inout2,$inout2
355	 vpaddb		$T2,$Ii,$Z1
356	mov		%r12,0x78+8(%rsp)
357	lea		0x60($out),$out
358	  vmovdqu	0x00-0x80($key),$rndkey
359	  vaesenclast	$Z2,$inout3,$inout3
360	 vpaddb		$T2,$Z1,$Z2
361	  vaesenclast	$Z3, $inout4,$inout4
362	 vpaddb		$T2,$Z2,$Z3
363	  vaesenclast	$Hkey,$inout5,$inout5
364	 vpaddb		$T2,$Z3,$Hkey
365
366	add		\$0x60,$ret
367	sub		\$0x6,$len
368	jc		.L6x_done
369
370	  vmovups	$inout0,-0x60($out)	# save output
371	 vpxor		$rndkey,$T1,$inout0
372	  vmovups	$inout1,-0x50($out)
373	 vmovdqa	$Ii,$inout1		# 0 latency
374	  vmovups	$inout2,-0x40($out)
375	 vmovdqa	$Z1,$inout2		# 0 latency
376	  vmovups	$inout3,-0x30($out)
377	 vmovdqa	$Z2,$inout3		# 0 latency
378	  vmovups	$inout4,-0x20($out)
379	 vmovdqa	$Z3,$inout4		# 0 latency
380	  vmovups	$inout5,-0x10($out)
381	 vmovdqa	$Hkey,$inout5		# 0 latency
382	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
383	jmp		.Loop6x
384
385.L6x_done:
386	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
387	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
388
389	ret
390.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
391___
392######################################################################
393#
394# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
395#		const AES_KEY *key, unsigned char iv[16],
396#		struct { u128 Xi,H,Htbl[9]; } *Xip);
397$code.=<<___;
398.globl	aesni_gcm_decrypt
399.type	aesni_gcm_decrypt,\@function,6
400.align	32
401aesni_gcm_decrypt:
402	xor	$ret,$ret
403	cmp	\$0x60,$len			# minimal accepted length
404	jb	.Lgcm_dec_abort
405
406	lea	(%rsp),%rax			# save stack pointer
407	push	%rbx
408	push	%rbp
409	push	%r12
410	push	%r13
411	push	%r14
412	push	%r15
413___
414$code.=<<___ if ($win64);
415	lea	-0xa8(%rsp),%rsp
416	movaps	%xmm6,-0xd8(%rax)
417	movaps	%xmm7,-0xc8(%rax)
418	movaps	%xmm8,-0xb8(%rax)
419	movaps	%xmm9,-0xa8(%rax)
420	movaps	%xmm10,-0x98(%rax)
421	movaps	%xmm11,-0x88(%rax)
422	movaps	%xmm12,-0x78(%rax)
423	movaps	%xmm13,-0x68(%rax)
424	movaps	%xmm14,-0x58(%rax)
425	movaps	%xmm15,-0x48(%rax)
426.Lgcm_dec_body:
427___
428$code.=<<___;
429	vzeroupper
430
431	vmovdqu		($ivp),$T1		# input counter value
432	add		\$-128,%rsp
433	mov		12($ivp),$counter
434	lea		.Lbswap_mask(%rip),$const
435	lea		-0x80($key),$in0	# borrow $in0
436	mov		\$0xf80,$end0		# borrow $end0
437	vmovdqu		($Xip),$Xi		# load Xi
438	and		\$-128,%rsp		# ensure stack alignment
439	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
440	lea		0x80($key),$key		# size optimization
441	lea		0x20+0x20($Xip),$Xip	# size optimization
442	mov		0xf0-0x80($key),$rounds
443	vpshufb		$Ii,$Xi,$Xi
444
445	and		$end0,$in0
446	and		%rsp,$end0
447	sub		$in0,$end0
448	jc		.Ldec_no_key_aliasing
449	cmp		\$768,$end0
450	jnc		.Ldec_no_key_aliasing
451	sub		$end0,%rsp		# avoid aliasing with key
452.Ldec_no_key_aliasing:
453
454	vmovdqu		0x50($inp),$Z3		# I[5]
455	lea		($inp),$in0
456	vmovdqu		0x40($inp),$Z0
457	lea		-0xc0($inp,$len),$end0
458	vmovdqu		0x30($inp),$Z1
459	shr		\$4,$len
460	xor		$ret,$ret
461	vmovdqu		0x20($inp),$Z2
462	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
463	vmovdqu		0x10($inp),$T2
464	 vpshufb	$Ii,$Z0,$Z0
465	vmovdqu		($inp),$Hkey
466	 vpshufb	$Ii,$Z1,$Z1
467	vmovdqu		$Z0,0x30(%rsp)
468	 vpshufb	$Ii,$Z2,$Z2
469	vmovdqu		$Z1,0x40(%rsp)
470	 vpshufb	$Ii,$T2,$T2
471	vmovdqu		$Z2,0x50(%rsp)
472	 vpshufb	$Ii,$Hkey,$Hkey
473	vmovdqu		$T2,0x60(%rsp)
474	vmovdqu		$Hkey,0x70(%rsp)
475
476	call		_aesni_ctr32_ghash_6x
477
478	vmovups		$inout0,-0x60($out)	# save output
479	vmovups		$inout1,-0x50($out)
480	vmovups		$inout2,-0x40($out)
481	vmovups		$inout3,-0x30($out)
482	vmovups		$inout4,-0x20($out)
483	vmovups		$inout5,-0x10($out)
484
485	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
486	vmovdqu		$Xi,-0x40($Xip)		# output Xi
487
488	vzeroupper
489___
490$code.=<<___ if ($win64);
491	movaps	-0xd8(%rax),%xmm6
492	movaps	-0xc8(%rax),%xmm7
493	movaps	-0xb8(%rax),%xmm8
494	movaps	-0xa8(%rax),%xmm9
495	movaps	-0x98(%rax),%xmm10
496	movaps	-0x88(%rax),%xmm11
497	movaps	-0x78(%rax),%xmm12
498	movaps	-0x68(%rax),%xmm13
499	movaps	-0x58(%rax),%xmm14
500	movaps	-0x48(%rax),%xmm15
501___
502$code.=<<___;
503	mov	-48(%rax),%r15
504	mov	-40(%rax),%r14
505	mov	-32(%rax),%r13
506	mov	-24(%rax),%r12
507	mov	-16(%rax),%rbp
508	mov	-8(%rax),%rbx
509	lea	(%rax),%rsp		# restore %rsp
510.Lgcm_dec_abort:
511	mov	$ret,%rax		# return value
512	ret
513.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
514___
515
516$code.=<<___;
517.type	_aesni_ctr32_6x,\@abi-omnipotent
518.align	32
519_aesni_ctr32_6x:
520	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
521	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
522	lea		-1($rounds),%r13
523	vmovups		0x10-0x80($key),$rndkey
524	lea		0x20-0x80($key),%r12
525	vpxor		$Z0,$T1,$inout0
526	add		\$`6<<24`,$counter
527	jc		.Lhandle_ctr32_2
528	vpaddb		$T2,$T1,$inout1
529	vpaddb		$T2,$inout1,$inout2
530	vpxor		$Z0,$inout1,$inout1
531	vpaddb		$T2,$inout2,$inout3
532	vpxor		$Z0,$inout2,$inout2
533	vpaddb		$T2,$inout3,$inout4
534	vpxor		$Z0,$inout3,$inout3
535	vpaddb		$T2,$inout4,$inout5
536	vpxor		$Z0,$inout4,$inout4
537	vpaddb		$T2,$inout5,$T1
538	vpxor		$Z0,$inout5,$inout5
539	jmp		.Loop_ctr32
540
541.align	16
542.Loop_ctr32:
543	vaesenc		$rndkey,$inout0,$inout0
544	vaesenc		$rndkey,$inout1,$inout1
545	vaesenc		$rndkey,$inout2,$inout2
546	vaesenc		$rndkey,$inout3,$inout3
547	vaesenc		$rndkey,$inout4,$inout4
548	vaesenc		$rndkey,$inout5,$inout5
549	vmovups		(%r12),$rndkey
550	lea		0x10(%r12),%r12
551	dec		%r13d
552	jnz		.Loop_ctr32
553
554	vmovdqu		(%r12),$Hkey		# last round key
555	vaesenc		$rndkey,$inout0,$inout0
556	vpxor		0x00($inp),$Hkey,$Z0
557	vaesenc		$rndkey,$inout1,$inout1
558	vpxor		0x10($inp),$Hkey,$Z1
559	vaesenc		$rndkey,$inout2,$inout2
560	vpxor		0x20($inp),$Hkey,$Z2
561	vaesenc		$rndkey,$inout3,$inout3
562	vpxor		0x30($inp),$Hkey,$Xi
563	vaesenc		$rndkey,$inout4,$inout4
564	vpxor		0x40($inp),$Hkey,$T2
565	vaesenc		$rndkey,$inout5,$inout5
566	vpxor		0x50($inp),$Hkey,$Hkey
567	lea		0x60($inp),$inp
568
569	vaesenclast	$Z0,$inout0,$inout0
570	vaesenclast	$Z1,$inout1,$inout1
571	vaesenclast	$Z2,$inout2,$inout2
572	vaesenclast	$Xi,$inout3,$inout3
573	vaesenclast	$T2,$inout4,$inout4
574	vaesenclast	$Hkey,$inout5,$inout5
575	vmovups		$inout0,0x00($out)
576	vmovups		$inout1,0x10($out)
577	vmovups		$inout2,0x20($out)
578	vmovups		$inout3,0x30($out)
579	vmovups		$inout4,0x40($out)
580	vmovups		$inout5,0x50($out)
581	lea		0x60($out),$out
582
583	ret
584.align	32
585.Lhandle_ctr32_2:
586	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
587	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
588	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
589	vpaddd		$Z1,$Z2,$inout2
590	vpaddd		$Z1,$inout1,$inout3
591	vpshufb		$Ii,$inout1,$inout1
592	vpaddd		$Z1,$inout2,$inout4
593	vpshufb		$Ii,$inout2,$inout2
594	vpxor		$Z0,$inout1,$inout1
595	vpaddd		$Z1,$inout3,$inout5
596	vpshufb		$Ii,$inout3,$inout3
597	vpxor		$Z0,$inout2,$inout2
598	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
599	vpshufb		$Ii,$inout4,$inout4
600	vpxor		$Z0,$inout3,$inout3
601	vpshufb		$Ii,$inout5,$inout5
602	vpxor		$Z0,$inout4,$inout4
603	vpshufb		$Ii,$T1,$T1		# next counter value
604	vpxor		$Z0,$inout5,$inout5
605	jmp	.Loop_ctr32
606.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
607
608.globl	aesni_gcm_encrypt
609.type	aesni_gcm_encrypt,\@function,6
610.align	32
611aesni_gcm_encrypt:
612	xor	$ret,$ret
613	cmp	\$0x60*3,$len			# minimal accepted length
614	jb	.Lgcm_enc_abort
615
616	lea	(%rsp),%rax			# save stack pointer
617	push	%rbx
618	push	%rbp
619	push	%r12
620	push	%r13
621	push	%r14
622	push	%r15
623___
624$code.=<<___ if ($win64);
625	lea	-0xa8(%rsp),%rsp
626	movaps	%xmm6,-0xd8(%rax)
627	movaps	%xmm7,-0xc8(%rax)
628	movaps	%xmm8,-0xb8(%rax)
629	movaps	%xmm9,-0xa8(%rax)
630	movaps	%xmm10,-0x98(%rax)
631	movaps	%xmm11,-0x88(%rax)
632	movaps	%xmm12,-0x78(%rax)
633	movaps	%xmm13,-0x68(%rax)
634	movaps	%xmm14,-0x58(%rax)
635	movaps	%xmm15,-0x48(%rax)
636.Lgcm_enc_body:
637___
638$code.=<<___;
639	vzeroupper
640
641	vmovdqu		($ivp),$T1		# input counter value
642	add		\$-128,%rsp
643	mov		12($ivp),$counter
644	lea		.Lbswap_mask(%rip),$const
645	lea		-0x80($key),$in0	# borrow $in0
646	mov		\$0xf80,$end0		# borrow $end0
647	lea		0x80($key),$key		# size optimization
648	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
649	and		\$-128,%rsp		# ensure stack alignment
650	mov		0xf0-0x80($key),$rounds
651
652	and		$end0,$in0
653	and		%rsp,$end0
654	sub		$in0,$end0
655	jc		.Lenc_no_key_aliasing
656	cmp		\$768,$end0
657	jnc		.Lenc_no_key_aliasing
658	sub		$end0,%rsp		# avoid aliasing with key
659.Lenc_no_key_aliasing:
660
661	lea		($out),$in0
662	lea		-0xc0($out,$len),$end0
663	shr		\$4,$len
664
665	call		_aesni_ctr32_6x
666	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
667	vpshufb		$Ii,$inout1,$T2
668	vmovdqu		$Xi,0x70(%rsp)
669	vpshufb		$Ii,$inout2,$Z0
670	vmovdqu		$T2,0x60(%rsp)
671	vpshufb		$Ii,$inout3,$Z1
672	vmovdqu		$Z0,0x50(%rsp)
673	vpshufb		$Ii,$inout4,$Z2
674	vmovdqu		$Z1,0x40(%rsp)
675	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
676	vmovdqu		$Z2,0x30(%rsp)
677
678	call		_aesni_ctr32_6x
679
680	vmovdqu		($Xip),$Xi		# load Xi
681	lea		0x20+0x20($Xip),$Xip	# size optimization
682	sub		\$12,$len
683	mov		\$0x60*2,$ret
684	vpshufb		$Ii,$Xi,$Xi
685
686	call		_aesni_ctr32_ghash_6x
687	vmovdqu		0x20(%rsp),$Z3		# I[5]
688	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
689	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
690	vpunpckhqdq	$Z3,$Z3,$T1
691	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
692	 vmovups	$inout0,-0x60($out)	# save output
693	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
694	vpxor		$Z3,$T1,$T1
695	 vmovups	$inout1,-0x50($out)
696	 vpshufb	$Ii,$inout1,$inout1
697	 vmovups	$inout2,-0x40($out)
698	 vpshufb	$Ii,$inout2,$inout2
699	 vmovups	$inout3,-0x30($out)
700	 vpshufb	$Ii,$inout3,$inout3
701	 vmovups	$inout4,-0x20($out)
702	 vpshufb	$Ii,$inout4,$inout4
703	 vmovups	$inout5,-0x10($out)
704	 vpshufb	$Ii,$inout5,$inout5
705	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
706___
707{ my ($HK,$T3)=($rndkey,$inout0);
708
709$code.=<<___;
710	 vmovdqu	0x30(%rsp),$Z2		# I[4]
711	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
712	 vpunpckhqdq	$Z2,$Z2,$T2
713	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
714	 vpxor		$Z2,$T2,$T2
715	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
716	vpclmulqdq	\$0x00,$HK,$T1,$T1
717
718	 vmovdqu	0x40(%rsp),$T3		# I[3]
719	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
720	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
721	vpxor		$Z1,$Z0,$Z0
722	 vpunpckhqdq	$T3,$T3,$Z1
723	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
724	 vpxor		$T3,$Z1,$Z1
725	vpxor		$Z3,$Z2,$Z2
726	vpclmulqdq	\$0x10,$HK,$T2,$T2
727	 vmovdqu	0x50-0x20($Xip),$HK
728	vpxor		$T1,$T2,$T2
729
730	 vmovdqu	0x50(%rsp),$T1		# I[2]
731	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
732	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
733	vpxor		$Z0,$Z3,$Z3
734	 vpunpckhqdq	$T1,$T1,$Z0
735	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
736	 vpxor		$T1,$Z0,$Z0
737	vpxor		$Z2,$T3,$T3
738	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
739	vpxor		$T2,$Z1,$Z1
740
741	 vmovdqu	0x60(%rsp),$T2		# I[1]
742	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
743	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
744	vpxor		$Z3,$Z2,$Z2
745	 vpunpckhqdq	$T2,$T2,$Z3
746	vpclmulqdq	\$0x11,$Ii,$T1,$T1
747	 vpxor		$T2,$Z3,$Z3
748	vpxor		$T3,$T1,$T1
749	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
750	 vmovdqu	0x80-0x20($Xip),$HK
751	vpxor		$Z1,$Z0,$Z0
752
753	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
754	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
755	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
756	 vpunpckhqdq	$Xi,$Xi,$T3
757	vpxor		$Z2,$Z1,$Z1
758	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
759	 vpxor		$Xi,$T3,$T3
760	vpxor		$T1,$T2,$T2
761	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
762	vpxor		$Z0,$Z3,$Z0
763
764	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
765	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
766	 vpunpckhqdq	$inout5,$inout5,$T1
767	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
768	 vpxor		$inout5,$T1,$T1
769	vpxor		$Z1,$Z2,$Z1
770	vpclmulqdq	\$0x10,$HK,$T3,$T3
771	 vmovdqu	0x20-0x20($Xip),$HK
772	vpxor		$T2,$Xi,$Z3
773	vpxor		$Z0,$T3,$Z2
774
775	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
776	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
777	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
778	  vpxor		$T3,$Z2,$Z2
779	 vpunpckhqdq	$inout4,$inout4,$T2
780	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
781	 vpxor		$inout4,$T2,$T2
782	  vpslldq	\$8,$Z2,$T3
783	vpclmulqdq	\$0x00,$HK,$T1,$T1
784	  vpxor		$T3,$Z1,$Xi
785	  vpsrldq	\$8,$Z2,$Z2
786	  vpxor		$Z2,$Z3,$Z3
787
788	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
789	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
790	vpxor		$Z0,$Z1,$Z1
791	 vpunpckhqdq	$inout3,$inout3,$T3
792	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
793	 vpxor		$inout3,$T3,$T3
794	vpxor		$inout5,$inout4,$inout4
795	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
796	vpclmulqdq	\$0x10,$HK,$T2,$T2
797	 vmovdqu	0x50-0x20($Xip),$HK
798	vpxor		$T1,$T2,$T2
799
800	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
801	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
802	vpxor		$Z1,$Z0,$Z0
803	 vpunpckhqdq	$inout2,$inout2,$T1
804	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
805	 vpxor		$inout2,$T1,$T1
806	vpxor		$inout4,$inout3,$inout3
807	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
808	vpclmulqdq	\$0x00,$HK,$T3,$T3
809	vpxor		$T2,$T3,$T3
810
811	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
812	  vxorps	$inout5,$Xi,$Xi
813
814	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
815	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
816	vpxor		$Z0,$Z1,$Z1
817	 vpunpckhqdq	$inout1,$inout1,$T2
818	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
819	 vpxor		$inout1,$T2,$T2
820	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
821	vpxor		$inout3,$inout2,$inout2
822	vpclmulqdq	\$0x10,$HK,$T1,$T1
823	 vmovdqu	0x80-0x20($Xip),$HK
824	vpxor		$T3,$T1,$T1
825
826	  vxorps	$Z3,$inout5,$inout5
827	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
828	  vxorps	$inout5,$Xi,$Xi
829
830	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
831	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
832	vpxor		$Z1,$Z0,$Z0
833	 vpunpckhqdq	$Xi,$Xi,$T3
834	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
835	 vpxor		$Xi,$T3,$T3
836	vpxor		$inout2,$inout1,$inout1
837	vpclmulqdq	\$0x00,$HK,$T2,$T2
838	vpxor		$T1,$T2,$T2
839
840	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
841	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
842	vpxor		$Z0,$Z1,$Z1
843	vpclmulqdq	\$0x10,$HK,$T3,$Z2
844	vpxor		$inout1,$Z3,$Z3
845	vpxor		$T2,$Z2,$Z2
846
847	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
848	vpxor		$Z0,$Z2,$Z2
849	vpslldq		\$8,$Z2,$T1
850	vmovdqu		0x10($const),$Hkey	# .Lpoly
851	vpsrldq		\$8,$Z2,$Z2
852	vpxor		$T1,$Z1,$Xi
853	vpxor		$Z2,$Z3,$Z3
854
855	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
856	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
857	vpxor		$T2,$Xi,$Xi
858
859	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
860	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
861	vpxor		$Z3,$T2,$T2
862	vpxor		$T2,$Xi,$Xi
863___
864}
865$code.=<<___;
866	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
867	vmovdqu		$Xi,-0x40($Xip)		# output Xi
868
869	vzeroupper
870___
871$code.=<<___ if ($win64);
872	movaps	-0xd8(%rax),%xmm6
873	movaps	-0xc8(%rax),%xmm7
874	movaps	-0xb8(%rax),%xmm8
875	movaps	-0xa8(%rax),%xmm9
876	movaps	-0x98(%rax),%xmm10
877	movaps	-0x88(%rax),%xmm11
878	movaps	-0x78(%rax),%xmm12
879	movaps	-0x68(%rax),%xmm13
880	movaps	-0x58(%rax),%xmm14
881	movaps	-0x48(%rax),%xmm15
882___
883$code.=<<___;
884	mov	-48(%rax),%r15
885	mov	-40(%rax),%r14
886	mov	-32(%rax),%r13
887	mov	-24(%rax),%r12
888	mov	-16(%rax),%rbp
889	mov	-8(%rax),%rbx
890	lea	(%rax),%rsp		# restore %rsp
891.Lgcm_enc_abort:
892	mov	$ret,%rax		# return value
893	ret
894.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
895___
896
897$code.=<<___;
898.align	64
899.Lbswap_mask:
900	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
901.Lpoly:
902	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
903.Lone_msb:
904	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
905.Ltwo_lsb:
906	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
907.Lone_lsb:
908	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
909.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
910.align	64
911___
912if ($win64) {
913$rec="%rcx";
914$frame="%rdx";
915$context="%r8";
916$disp="%r9";
917
918$code.=<<___
919.extern	__imp_RtlVirtualUnwind
920.type	gcm_se_handler,\@abi-omnipotent
921.align	16
922gcm_se_handler:
923	push	%rsi
924	push	%rdi
925	push	%rbx
926	push	%rbp
927	push	%r12
928	push	%r13
929	push	%r14
930	push	%r15
931	pushfq
932	sub	\$64,%rsp
933
934	mov	120($context),%rax	# pull context->Rax
935	mov	248($context),%rbx	# pull context->Rip
936
937	mov	8($disp),%rsi		# disp->ImageBase
938	mov	56($disp),%r11		# disp->HandlerData
939
940	mov	0(%r11),%r10d		# HandlerData[0]
941	lea	(%rsi,%r10),%r10	# prologue label
942	cmp	%r10,%rbx		# context->Rip<prologue label
943	jb	.Lcommon_seh_tail
944
945	mov	152($context),%rax	# pull context->Rsp
946
947	mov	4(%r11),%r10d		# HandlerData[1]
948	lea	(%rsi,%r10),%r10	# epilogue label
949	cmp	%r10,%rbx		# context->Rip>=epilogue label
950	jae	.Lcommon_seh_tail
951
952	mov	120($context),%rax	# pull context->Rax
953
954	mov	-48(%rax),%r15
955	mov	-40(%rax),%r14
956	mov	-32(%rax),%r13
957	mov	-24(%rax),%r12
958	mov	-16(%rax),%rbp
959	mov	-8(%rax),%rbx
960	mov	%r15,240($context)
961	mov	%r14,232($context)
962	mov	%r13,224($context)
963	mov	%r12,216($context)
964	mov	%rbp,160($context)
965	mov	%rbx,144($context)
966
967	lea	-0xd8(%rax),%rsi	# %xmm save area
968	lea	512($context),%rdi	# & context.Xmm6
969	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
970	.long	0xa548f3fc		# cld; rep movsq
971
972.Lcommon_seh_tail:
973	mov	8(%rax),%rdi
974	mov	16(%rax),%rsi
975	mov	%rax,152($context)	# restore context->Rsp
976	mov	%rsi,168($context)	# restore context->Rsi
977	mov	%rdi,176($context)	# restore context->Rdi
978
979	mov	40($disp),%rdi		# disp->ContextRecord
980	mov	$context,%rsi		# context
981	mov	\$154,%ecx		# sizeof(CONTEXT)
982	.long	0xa548f3fc		# cld; rep movsq
983
984	mov	$disp,%rsi
985	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
986	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
987	mov	0(%rsi),%r8		# arg3, disp->ControlPc
988	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
989	mov	40(%rsi),%r10		# disp->ContextRecord
990	lea	56(%rsi),%r11		# &disp->HandlerData
991	lea	24(%rsi),%r12		# &disp->EstablisherFrame
992	mov	%r10,32(%rsp)		# arg5
993	mov	%r11,40(%rsp)		# arg6
994	mov	%r12,48(%rsp)		# arg7
995	mov	%rcx,56(%rsp)		# arg8, (NULL)
996	call	*__imp_RtlVirtualUnwind(%rip)
997
998	mov	\$1,%eax		# ExceptionContinueSearch
999	add	\$64,%rsp
1000	popfq
1001	pop	%r15
1002	pop	%r14
1003	pop	%r13
1004	pop	%r12
1005	pop	%rbp
1006	pop	%rbx
1007	pop	%rdi
1008	pop	%rsi
1009	ret
1010.size	gcm_se_handler,.-gcm_se_handler
1011
1012.section	.pdata
1013.align	4
1014	.rva	.LSEH_begin_aesni_gcm_decrypt
1015	.rva	.LSEH_end_aesni_gcm_decrypt
1016	.rva	.LSEH_gcm_dec_info
1017
1018	.rva	.LSEH_begin_aesni_gcm_encrypt
1019	.rva	.LSEH_end_aesni_gcm_encrypt
1020	.rva	.LSEH_gcm_enc_info
1021.section	.xdata
1022.align	8
1023.LSEH_gcm_dec_info:
1024	.byte	9,0,0,0
1025	.rva	gcm_se_handler
1026	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
1027.LSEH_gcm_enc_info:
1028	.byte	9,0,0,0
1029	.rva	gcm_se_handler
1030	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
1031___
1032}
1033}}} else {{{
1034$code=<<___;	# assembler is too old
1035.text
1036
1037.globl	aesni_gcm_encrypt
1038.type	aesni_gcm_encrypt,\@abi-omnipotent
1039aesni_gcm_encrypt:
1040	xor	%eax,%eax
1041	ret
1042.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
1043
1044.globl	aesni_gcm_decrypt
1045.type	aesni_gcm_decrypt,\@abi-omnipotent
1046aesni_gcm_decrypt:
1047	xor	%eax,%eax
1048	ret
1049.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
1050___
1051}}}
1052
1053$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1054
1055print $code;
1056
1057close STDOUT;
1058