1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# Version 2.1.
18#
19# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
20# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
21# [you'll notice a lot of resemblance], such as compressed S-boxes
22# in little-endian byte order, prefetch of these tables in CBC mode,
23# as well as avoiding L1 cache aliasing between stack frame and key
24# schedule and already mentioned tables, compressed Td4...
25#
26# Performance in number of cycles per processed byte for 128-bit key:
27#
28#		ECB encrypt	ECB decrypt	CBC large chunk
29# AMD64		33		43		13.0
30# EM64T		38		56		18.6(*)
31# Core 2	30		42		14.5(*)
32# Atom		65		86		32.1(*)
33#
34# (*) with hyper-threading off
35
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 mode is selected by an nasm/masm/mingw64 flavour or by an output
# file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first beside this script, then in the
# perlasm directory three levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The open is checked: if the pipe cannot be started there is nowhere for the
# generated code to go, and carrying on would silently produce no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
49
# Code-generation knobs.
$verticalspin = 1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit = 512;	# see aes-586.pl for details

# Accumulator for the generated assembly text.
$code = ".text\n";

# Symbolic register names used throughout the generated code.
# State words:
($s0, $s1, $s2, $s3) = ("%eax", "%ebx", "%ecx", "%edx");
# Index/scratch registers, and the 64-bit names of the same registers
# under their mask aliases:
($acc0,   $acc1,   $acc2)   = ("%esi", "%edi", "%ebp");
($mask80, $maskfe, $mask1b) = ("%rsi", "%rdi", "%rbp");
# Input/output pointers:
($inp, $out) = ("%r8", "%r9");
# Temporaries:
($t0, $t1, $t2) = ("%r10d", "%r11d", "%r12d");
# Round counter, table base and key-schedule pointer:
($rnds, $sbox, $key) = ("%r13d", "%r14", "%r15");
71
# hi(REG): map %eax/%rax .. %edx/%rdx to the matching high-byte register
# (%ah .. %dh).  Any other register name is returned unchanged.
# No prototype: the sub takes one argument, and the old "()" only worked
# because every call site uses the prototype-bypassing &hi(...) form.
sub hi {
	my $reg = shift;
	$reg =~ s/%[er]([a-d])x/%${1}h/;
	return $reg;
}
# lo(REG): map a 32/64-bit register name to its low-byte register name.
# Names matching none of the three patterns (e.g. %ebp) are returned as is.
# No prototype: the sub takes one argument, and the old "()" only worked
# because every call site uses the prototype-bypassing &lo(...) form.
sub lo {
	my $reg = shift;
	$reg =~ s/%[er]([a-d])x/%${1}l/;	# %eax, %rax  -> %al
	$reg =~ s/%[er]([sd]i)/%${1}l/;		# %esi, %rdi  -> %sil, %dil
	$reg =~ s/%(r[0-9]+)[d]?/%${1}b/;	# %r10, %r10d -> %r10b
	return $reg;
}
# LO(REG): map a 64-bit register name to its 32-bit name
# (%rax -> %eax, %r10 -> %r10d).  Names not starting with %r pass through.
# No prototype: the sub takes one argument, so the old "()" was wrong and
# only usable through the &LO(...) call form.
sub LO {
	my $reg = shift;
	$reg =~ s/%r([a-z]+)/%e${1}/;		# legacy registers
	$reg =~ s/%r([0-9]+)/%r${1}d/;		# numbered registers
	return $reg;
}
# _data_word(WORD, ...): append one ".long" line per argument to $code, with
# the 32-bit value written twice, so each table entry occupies 8 bytes.
# Processing stops at the first undefined argument.
# No prototype: the sub is variadic, so the old "()" was wrong and only
# usable through the &_data_word(...) call form.
sub _data_word
{
    while(defined(my $word=shift)) {
	$code.=sprintf".long\t0x%08x,0x%08x\n",$word,$word;
    }
}
# data_word(WORD, ...): append a single ".long" line to $code holding all
# arguments as comma-separated 32-bit hex values.  The final argument is
# taken off first so that it can end the line without a trailing comma.
# No prototype: the sub is variadic, so the old "()" was wrong and only
# usable through the &data_word(...) call form.
sub data_word
{
  my $last=pop(@_);
    $code.=".long\t";
    while(defined(my $word=shift)) { $code.=sprintf"0x%08x,",$word; }
    $code.=sprintf"0x%08x\n",$last;
}
89
# data_byte(BYTE, ...): append a single ".byte" line to $code holding all
# arguments as comma-separated hex bytes; each value is masked to 8 bits.
# The final argument is taken off first so that it can end the line without
# a trailing comma.
# No prototype: the sub is variadic, so the old "()" was wrong and only
# usable through the &data_byte(...) call form.
sub data_byte
{
  my $last=pop(@_);
    $code.=".byte\t";
    while(defined(my $byte=shift)) { $code.=sprintf"0x%02x,",$byte&0xff; }
    $code.=sprintf"0x%02x\n",$last&0xff;
}
97
# encvert: append the code for one middle encryption round to $code, in the
# "vertical" schedule where the lookups for all four output words are
# interleaved.
#
# Each state byte indexes the table at $sbox with an 8-byte stride; byte
# offsets 0..3 into an entry select differently aligned 32-bit reads of it.
# The four results accumulate in $t0,$t1,$t2 and %r8d.  $key is advanced by
# 16, and the new state is the accumulated words xored with the round key
# found there.
#
# %r8d is the low half of $inp, so $inp is destroyed.
#
# No prototype: every call site uses &encvert(), which bypasses prototypes,
# so the old "()" had no effect.
sub encvert
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
157
# enclastvert: append the code for the final encryption round to $code, in
# the "vertical" schedule.
#
# Each output word is assembled from four single bytes.  The lowest byte
# comes from a zero-extending byte load at offset 2 of the indexed table
# entry; the other three come from 32-bit loads at offset 0 or 2, masked
# down to the one byte lane wanted (0x0000ff00, 0x00ff0000, 0xff000000).
# $key is not advanced here: the last round key is read from 16($key) and
# xored in at the end.
#
# %r8d is the low half of $inp, so $inp is destroyed.
#
# No prototype: every call site uses &enclastvert(), which bypasses
# prototypes, so the old "()" had no effect.
sub enclastvert
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
247
# encstep(I, S0, S1, S2, S3): append to $code the instructions computing
# output word I (0..3) of a middle encryption round ("horizontal" schedule).
#
#   I       which output word is produced; also selects the round-key word
#           4*I($key) that is xored in
#   S0..S3  state registers, rotated by the caller so that S0 supplies the
#           low byte, S1 bits 8-15, S2 bits 16-23 and S3 bits 24-31
#
# For I = 0..2 the result goes to $t0/$t1/$t2 and $acc0..$acc2 are scratch.
# For I = 3 the result is built in S0 itself, S1..S3 are used as scratch
# (their old values are no longer needed) and are then reloaded from
# $t0..$t2, which completes the new state.  $key is advanced by 16 once per
# round, on I = 0.
#
# No prototype: the sub takes five arguments, and the old "()" only worked
# because every call site uses the prototype-bypassing &encstep(...) form.
sub encstep
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];	# lexical; shadows the global $out

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	# last word of the round: move the three buffered words into place
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}
283
# enclast(I, S0, S1, S2, S3): append to $code the instructions computing
# output word I (0..3) of the final encryption round ("horizontal" schedule).
#
#   I       which output word is produced
#   S0..S3  state registers, rotated by the caller so that S0 supplies the
#           low byte, S1 bits 8-15, S2 bits 16-23 and S3 bits 24-31
#
# Four 32-bit table loads (offset 2, 0, 0, 2) are masked to one byte lane
# each and xored together.  No round key is applied here; the caller emits
# that xor after the fourth step.
#
# For I = 0..2 the result goes to $t0/$t1/$t2 and $acc0..$acc2 are scratch.
# For I = 3 the result is built in S0 itself, S1..S3 serve as scratch and
# are then reloaded from $t0..$t2.
#
# No prototype: the sub takes five arguments, and the old "()" only worked
# because every call site uses the prototype-bypassing &enclast(...) form.
sub enclast
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];	# lexical; shadows the global $out

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);

	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
	$code.="	and	\$0xff000000,$tmp2\n";

	# for the last word, the buffered words are moved into place as soon
	# as the scratch register they overwrite has been consumed
	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}
324
# Emit _x86_64_AES_encrypt, the table-driven encryption core.
#
# On entry the state is in $s0..$s3, $key points at the key schedule (round
# count at offset 240) and $sbox at the lookup table.  The state is first
# xored with round key 0; .Lenc_loop then runs rounds-1 times (the counter
# is decremented once before the loop and once per iteration), and the
# final round follows the loop.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	# Loop body: either one interleaved ("vertical") round, or four
	# per-word steps with the state registers rotated one place each time.
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	# Final round.  enclastvert applies the last round key itself; the
	# per-word enclast steps do not, so it is xored in explicitly below.
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
366
367# it's possible to implement this by shifting tN by 8, filling least
368# significant byte with byte load and finally bswap-ing at the end,
369# but such partial register load kills Core 2...
# enccompactvert: append to $code the byte-substitution half of one
# "compact" encryption round.
#
# Every state byte is looked up individually in the 256-byte table at $sbox
# (scale 1) with zero-extending byte loads.  The substituted bytes are
# shifted into their lanes (8, 16, 24) and merged with xor, and the result
# is left in $s0..$s3.  No round key is applied here.
#
# Extra temporaries are %r8d, %r9d and %r13d, i.e. the registers otherwise
# named $inp, $out and $rnds, so those three are destroyed.
#
# No prototype: every call site uses &enccompactvert(), which bypasses
# prototypes, so the old "()" had no effect.
sub enccompactvert
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s2
	movzb	`&hi("$s3")`,$acc2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	($sbox,$t3,1),$t3

	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	`&hi("$s0")`,$acc0
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	`&lo("$s2")`,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3

	shl	\$8,$t4
	shr	\$16,$s3
	shl	\$8,$t5
	xor	$t4,$t0
	shr	\$16,$s0
	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s1
	xor	$t5,$t1
	shl	\$8,$acc2
	movzb	`&lo("$s0")`,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$acc2,$t2

	shl	\$8,$acc0
	movzb	`&lo("$s1")`,$acc2
	shl	\$16,$acc1
	xor	$acc0,$t3
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	`&hi("$s3")`,$acc0
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc1,$t0

	shr	\$8,$s2
	movzb	`&hi("$s0")`,$acc1
	shl	\$16,$t4
	shr	\$8,$s1
	shl	\$16,$t5
	xor	$t4,$t1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2

	shl	\$16,$acc2
	xor	$t5,$t2
	shl	\$24,$acc0
	xor	$acc2,$t3
	shl	\$24,$acc1
	xor	$acc0,$t0
	shl	\$24,$s3
	xor	$acc1,$t1
	shl	\$24,$s2
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
443
# enctransform_ref(SN): append to $code a transform of the single 32-bit
# register SN, using %r8d, %r9d and %r13d as scratch.
#
# First half: r2 = ((SN << 1) & 0xfefefefe) ^ (m - (m >> 7)) & 0x1b1b1b1b,
# where m = SN & 0x80808080.  That is a per-byte doubling with a conditional
# 0x1b correction whenever the byte's top bit was set.
# Second half: SN = rol(SN ^ r2, 24) ^ r2 ^ ror(SN, 16) ^ ror(SN, 24),
# with the rotations taken from the original value of SN.
# NOTE(review): this looks like the AES MixColumns step applied to one
# column - confirm against the specification.
#
# No prototype: the sub takes one argument, so the old "()" was wrong and
# only usable through the &enctransform_ref(...) call form.
sub enctransform_ref
{ my $sn = shift;
  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2

	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}
469
470# unlike decrypt case it does not pay off to parallelize enctransform
# enctransform: append to $code the column transform of a "compact" round
# for all four state words $s0..$s3.
#
# Words are handled in two pairs ($s0/$s1, then $s2/$s3), with the start of
# the second pair's work interleaved into the end of the first (the lines
# indented by one extra space).  Per word the arithmetic is the same as in
# enctransform_ref: a per-byte doubling built from the 0x80808080,
# 0xfefefefe and 0x1b1b1b1b masks, combined with rotations of the original
# word.
#
# Scratch registers are $t0,$t1,$t2, $acc0,$acc1,$acc2 (the last one as the
# fourth temporary), %r8d and %r9d.  The closing instructions load from
# 0, 64, 128 and 192($sbox) into registers that are dead by then; their only
# purpose is the prefetch of Te4 noted in the emitted comment.
#
# No prototype: every call site uses &enctransform(), which bypasses
# prototypes, so the old "()" had no effect.
sub enctransform
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	\$0x80808080,$t0
	mov	\$0x80808080,$t1
	and	$s0,$t0
	and	$s1,$t1
	mov	$t0,$acc0
	mov	$t1,$acc1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	\$0x80808080,$t2
	rol	\$24,$s0
	 mov	\$0x80808080,$t3
	rol	\$24,$s1
	 and	$s2,$t2
	 and	$s3,$t3
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$t2,$acc0
	ror	\$16,$t0
	 mov	$t3,$acc1
	ror	\$16,$t1
	 lea	($s2,$s2),$r20
	 shr	\$7,$t2
	xor	$t0,$s0
	 shr	\$7,$t3
	xor	$t1,$s1
	ror	\$8,$t0
	 lea	($s3,$s3),$r21
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	ror	\$16,$t2
	xor	$r20,$s2
	ror	\$16,$t3
	xor	$r21,$s3
	rol	\$24,$s2
	mov	0($sbox),$acc0			# prefetch Te4
	rol	\$24,$s3
	xor	$r20,$s2
	mov	64($sbox),$acc1
	xor	$r21,$s3
	mov	128($sbox),$r20
	xor	$t2,$s2
	ror	\$8,$t2
	xor	$t3,$s3
	ror	\$8,$t3
	xor	$t2,$s2
	mov	192($sbox),$r21
	xor	$t3,$s3
___
}
552
# Emit _x86_64_AES_encrypt_compact, the encryption core that works from the
# 256-byte table at $sbox only.
#
# On entry the state is in $s0..$s3, $key points at the key schedule and
# $sbox at the table.  The first instructions read one dword from each
# 32-byte stretch of the table into registers whose values are not used;
# the emitted comment marks this as a prefetch.  The "lea 128($sbox)" base
# keeps all eight displacements within a signed byte.
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
		# byte substitution for the whole state; destroys $inp, $out, $rnds
		&enccompactvert();
# Loop exit test: $key is compared with a pointer the caller left on its
# stack.  Seen from inside this routine that slot is 16(%rsp), because the
# call pushed a return address on top of the caller's frame.
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
		# column transform; skipped for the final round by the branch above
		&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___
592
593# void GFp_aes_nohw_encrypt (const void *inp,void *out,const AES_KEY *key);
# Emit the public entry point GFp_aes_nohw_encrypt(inp, out, key).
#
# Prologue: the incoming %rsp is kept in %rax and the six registers
# %rbx, %rbp, %r12-%r15 are pushed.  The stack pointer is then aligned down
# to 64 bytes and lowered further by an amount derived from the key address
# (masked with 0x3c0), plus 32 bytes for the local frame.
#
# Frame layout relative to the new %rsp:
#    0  key schedule pointer
#    8  end of key schedule (key + 16*rounds)
#   16  saved out pointer
#   24  original stack pointer
#
# $sbox is set to .LAES_Te+2048 plus a multiple of 256 (mask 0x300) computed
# from the frame address, i.e. one of the four Te4 copies.  After the call
# to _x86_64_AES_encrypt_compact the four state words are stored to out and
# the saved registers are reloaded from below the original stack pointer.
$code.=<<___;
.align	16
.globl	GFp_aes_nohw_encrypt
.type	GFp_aes_nohw_encrypt,\@function,3
.hidden	GFp_aes_nohw_encrypt
GFp_aes_nohw_encrypt:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	# allocate frame "above" key schedule
	lea	-63(%rdx),%rcx	# %rdx is key argument
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp
	sub	\$32,%rsp

	mov	%rsi,16(%rsp)	# save out
	mov	%rax,24(%rsp)	# save original stack pointer
.cfi_cfa_expression	%rsp+24,deref,+8
.Lenc_prologue:

	mov	%rdx,$key
	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	mov	$key,(%rsp)	# key schedule
	mov	%rbp,8(%rsp)	# end of key schedule

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Te+2048(%rip),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsi	# restore saved stack pointer
.cfi_def_cfa	%rsi,8
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lenc_epilogue:
	ret
.cfi_endproc
.size	GFp_aes_nohw_encrypt,.-GFp_aes_nohw_encrypt
___
679
680#------------------------------------------------------------------#
681
# enckey: append to $code one key-expansion step on fixed registers.
#
#   in:   %edx  previous round-key word (destroyed)
#         %eax  word that the result is xored into
#         %rbp  byte table base + 128
#         %rcx  index into the dword table at 1024-128(%rbp)
#   out:  %eax
#   scratch: %ebx, %esi
#
# The four bytes of %edx are looked up in the byte table and placed in
# lanes 24, 0, 8 and 16 respectively, i.e. the substituted word rotated by
# one byte, and the indexed constant (marked "rcon" in the emitted code) is
# xored in last.
#
# No prototype: every call site uses "&enckey ()", which bypasses
# prototypes, so the old "()" had no effect.
sub enckey
{
$code.=<<___;
	movz	%dl,%esi		# rk[i]>>0
	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>8
	shl	\$24,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shr	\$16,%edx
	movz	%dl,%esi		# rk[i]>>16
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>24
	shl	\$8,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shl	\$16,%ebx
	xor	%ebx,%eax

	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
___
}
708
709# int GFp_aes_nohw_set_encrypt_key(const unsigned char *userKey, const int bits,
710#                                 AES_KEY *key)
# Emit GFp_aes_nohw_set_encrypt_key(userKey, bits, key) and its worker
# _x86_64_AES_set_encrypt_key.
#
# The public wrapper only saves registers, calls the worker and restores
# %rbp and %rbx (the only saved registers the worker modifies) before
# dropping the whole 56-byte save area.
#
# The worker returns in %rax:
#    0  success
#   -1  userKey or key is NULL
#   -2  bits is neither 128 nor 256
# It copies the user key into the start of the schedule, expands it in
# place (each &enckey() call contributes the substitute/rotate/rcon step)
# and stores the round count, 10 or 14, at byte offset 240 of the schedule.
#
# Fix relative to the previous text: the comment on the load from 28(%rdi)
# in .L14loop said rk[4]; byte offset 28 is word 7, so it now reads rk[7].
# Only the emitted comment changes, no instruction does.
$code.=<<___;
.align	16
.globl GFp_aes_nohw_set_encrypt_key
.type  GFp_aes_nohw_set_encrypt_key,\@function,3
GFp_aes_nohw_set_encrypt_key:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12			# redundant, but allows to share
.cfi_push	%r12
	push	%r13			# exception handler...
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$8,%rsp
.cfi_adjust_cfa_offset	8
.Lenc_key_prologue:

	call	_x86_64_AES_set_encrypt_key

	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	mov	48(%rsp),%rbx
.cfi_restore	%rbx
	add	\$56,%rsp
.cfi_adjust_cfa_offset	-56
.Lenc_key_epilogue:
	ret
.cfi_endproc
.size GFp_aes_nohw_set_encrypt_key,.-GFp_aes_nohw_set_encrypt_key

.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align	16
_x86_64_AES_set_encrypt_key:
	mov	%esi,%ecx			# %ecx=bits
	mov	%rdi,%rsi			# %rsi=userKey
	mov	%rdx,%rdi			# %rdi=key

	test	\$-1,%rsi
	jz	.Lbadpointer
	test	\$-1,%rdi
	jz	.Lbadpointer

	lea	.LAES_Te(%rip),%rbp
	lea	2048+128(%rbp),%rbp

	# prefetch Te4
	mov	0-128(%rbp),%eax
	mov	32-128(%rbp),%ebx
	mov	64-128(%rbp),%r8d
	mov	96-128(%rbp),%edx
	mov	128-128(%rbp),%eax
	mov	160-128(%rbp),%ebx
	mov	192-128(%rbp),%r8d
	mov	224-128(%rbp),%edx

	cmp	\$128,%ecx
	je	.L10rounds
	cmp	\$256,%ecx
	je	.L14rounds
	mov	\$-2,%rax			# invalid number of bits
	jmp	.Lexit

.L10rounds:
	mov	0(%rsi),%rax			# copy first 4 dwords
	mov	8(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rdx,8(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L10shortcut
.align	4
.L10loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	12(%rdi),%edx			# rk[3]
.L10shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,16(%rdi)			# rk[4]
		xor	4(%rdi),%eax
		mov	%eax,20(%rdi)			# rk[5]
		xor	8(%rdi),%eax
		mov	%eax,24(%rdi)			# rk[6]
		xor	12(%rdi),%eax
		mov	%eax,28(%rdi)			# rk[7]
		add	\$1,%ecx
		lea	16(%rdi),%rdi
		cmp	\$10,%ecx
	jl	.L10loop

	movl	\$10,80(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L14rounds:
	mov	0(%rsi),%rax			# copy first 8 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rcx
	mov	24(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rcx,16(%rdi)
	mov	%rdx,24(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L14shortcut
.align	4
.L14loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	28(%rdi),%edx			# rk[7]
.L14shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,32(%rdi)			# rk[8]
		xor	4(%rdi),%eax
		mov	%eax,36(%rdi)			# rk[9]
		xor	8(%rdi),%eax
		mov	%eax,40(%rdi)			# rk[10]
		xor	12(%rdi),%eax
		mov	%eax,44(%rdi)			# rk[11]

		cmp	\$6,%ecx
		je	.L14break
		add	\$1,%ecx

		mov	%eax,%edx
		mov	16(%rdi),%eax			# rk[4]
		movz	%dl,%esi			# rk[11]>>0
		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>8
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shr	\$16,%edx
		shl	\$8,%ebx
		movz	%dl,%esi			# rk[11]>>16
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>24
		shl	\$16,%ebx
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shl	\$24,%ebx
		xor	%ebx,%eax

		mov	%eax,48(%rdi)			# rk[12]
		xor	20(%rdi),%eax
		mov	%eax,52(%rdi)			# rk[13]
		xor	24(%rdi),%eax
		mov	%eax,56(%rdi)			# rk[14]
		xor	28(%rdi),%eax
		mov	%eax,60(%rdi)			# rk[15]

		lea	32(%rdi),%rdi
	jmp	.L14loop
.L14break:
	movl	\$14,48(%rdi)		# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.Lbadpointer:
	mov	\$-1,%rax
.Lexit:
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
___
887
# .LAES_Te: the 256-entry 32-bit lookup table, aligned to 64 bytes.
# _data_word writes every word twice, so one entry takes 8 bytes and the
# whole table 2048 bytes; the byte table Te4 follows at .LAES_Te+2048.
$code .= ".align\t64\n" . ".LAES_Te:\n";

# A single call with all 256 words; _data_word emits one line per word.
&_data_word(
	0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
	0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
	0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56,
	0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec,
	0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa,
	0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb,
	0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45,
	0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b,
	0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c,
	0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83,
	0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9,
	0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a,
	0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d,
	0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f,
	0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df,
	0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea,
	0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34,
	0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b,
	0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d,
	0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413,
	0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1,
	0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6,
	0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972,
	0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85,
	0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed,
	0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511,
	0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe,
	0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b,
	0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05,
	0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1,
	0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142,
	0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf,
	0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3,
	0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e,
	0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a,
	0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6,
	0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3,
	0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b,
	0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428,
	0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad,
	0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14,
	0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8,
	0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4,
	0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2,
	0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda,
	0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949,
	0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf,
	0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810,
	0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c,
	0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697,
	0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e,
	0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f,
	0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc,
	0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c,
	0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969,
	0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27,
	0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122,
	0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433,
	0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9,
	0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5,
	0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a,
	0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0,
	0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e,
	0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c,
);
956
#Te4	# four copies of Te4 to choose from to avoid L1 aliasing
#
# The 256-byte table is written down once and emitted four times back to
# back, eight bytes per ".byte" line.  That gives the same 4*256 = 1024
# bytes the four hand-copied literals used to produce, with no chance of
# the copies drifting apart.  The generated code selects a copy by adding
# a multiple of 256 (mask 0x300) to .LAES_Te+2048, and finds the table
# that follows this one at .LAES_Te+2048+1024.
{
    my @Te4 = (
	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
    );

    for my $copy (1 .. 4) {
	for (my $i = 0; $i < @Te4; $i += 8) {
	    &data_byte(@Te4[$i .. $i+7]);
	}
    }
}
#rcon:
# Sixteen 32-bit words appended right after the byte tables: the values
# 0x01,0x02,...,0x80,0x1b,0x36 (the AES round constants, one per word),
# then the pairs 0x80808080, 0xfefefefe and 0x1b1b1b1b.
# NOTE(review): the three repeated-byte pairs look like masks for packed
# GF(2^8) doubling -- confirm against the code that loads them.
# The identification string and the 64-byte alignment directive follow in
# the same append.
$code .= <<___;
	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
1101
# Win64-only structured exception handling support, emitted when $win64 is
# set.  Both handlers generated below have the signature
#
#	EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#			CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# and repair the CONTEXT record so that the unwinder sees the caller's
# non-volatile registers, then hand over to RtlVirtualUnwind.
if ($win64) {
# Registers carrying the four handler arguments, in declaration order.
# Only $context and $disp are interpolated into the assembly text below.
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

# block_se_handler.  HandlerData[0] and HandlerData[1] are RVAs of a prologue
# and an epilogue label (added to disp->ImageBase before comparing):
#   - Rip below the prologue label: the frame base is taken from context->Rax;
#   - Rip at or past the epilogue label: the frame base is context->Rsp;
#   - otherwise: the frame base is the pointer stored at 24(context->Rsp), and
#     %rbx,%rbp,%r12-%r15 are reloaded from the six slots just below it into
#     the CONTEXT record.
# In every case %rdi/%rsi are read from 8/16 above the frame base and written
# to the CONTEXT together with the new Rsp, then control joins
# .Lcommon_seh_exit (emitted with key_se_handler).
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	block_se_handler,\@abi-omnipotent
.align	16
block_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_block_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_block_prologue

	mov	24(%rax),%rax		# pull saved real stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_block_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	block_se_handler,.-block_se_handler

___

# key_se_handler.  Same three-way Rip test as block_se_handler; the only
# difference in the "inside the body" case is that the frame base is
# context->Rsp+56 rather than a pointer loaded from the stack.  It falls
# through into .Lcommon_seh_exit, which is shared by both handlers: copy the
# patched CONTEXT (1232/8 quadwords) into disp->ContextRecord, call
# RtlVirtualUnwind with the DISPATCHER_CONTEXT fields as arguments, undo this
# handler's own pushes and return 1 (ExceptionContinueSearch).
$code.=<<___;
.type	key_se_handler,\@abi-omnipotent
.align	16
key_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_key_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_key_prologue

	lea	56(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_key_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.Lcommon_seh_exit:
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	key_se_handler,.-key_se_handler

___

# Unwind tables.  .pdata holds one begin/end/info RVA triple per covered
# function; each .xdata record names the handler for that function followed
# by the two label RVAs the handler reads back as HandlerData[0] and
# HandlerData[1].
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_GFp_aes_nohw_encrypt
	.rva	.LSEH_end_GFp_aes_nohw_encrypt
	.rva	.LSEH_info_GFp_aes_nohw_encrypt

	.rva	.LSEH_begin_GFp_aes_nohw_set_encrypt_key
	.rva	.LSEH_end_GFp_aes_nohw_set_encrypt_key
	.rva	.LSEH_info_GFp_aes_nohw_set_encrypt_key

.section	.xdata
.align	8
.LSEH_info_GFp_aes_nohw_encrypt:
	.byte	9,0,0,0
	.rva	block_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_GFp_aes_nohw_set_encrypt_key:
	.byte	9,0,0,0
	.rva	key_se_handler
	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
___
}
1279
# Expand every `expr` placeholder in the generated text: the Perl expression
# between the backticks is evaluated and replaced by its value (for example
# the `1232/8` quadword count in the SEH exit path).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT was re-pointed at the pipe into x86_64-xlate.pl near the top of this
# file, so this close is what flushes the buffered output and waits for the
# translator.  close() returns false if the flush fails or if the translator
# exits with a non-zero status (in the latter case $! is 0 and the status is
# in $?).  Die on failure so that a truncated or rejected translation fails
# the build instead of passing silently.
close STDOUT or die "error closing STDOUT: $! (child status $?)";
1285