#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves higher performance. SHA1 instruction
# sequences(*) are taken from sha1-x86_64.pl and the AESNI code is woven
# into them. Below are performance numbers in cycles per processed byte,
# less is better, for standalone AESNI-CBC encrypt, the sum of the latter
# and standalone SHA1, and the "stitched" subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54        +74%
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Skylake	2.63[+3.5(4.1)]	6.17(6.69)	4.23(4.44)  +46%(+51%)
# Bulldozer	5.77[+6.0]	11.72		6.37        +84%
# Ryzen(**)	2.71[+1.93]	4.64		2.74        +69%
# Goldmont(**)	3.82[+1.70]	5.52		4.20        +31%
#
#		AES-192-CBC
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07        +75%
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96        +84%
#
#		AES-256-CBC
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12        +64%
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Skylake	3.62		7.16(7.68)	4.56(4.76)  +57%(+61%)
# Bulldozer	8.00		13.95		8.25        +69%
# Ryzen(**)	3.71		5.64		3.72        +52%
# Goldmont(**)	5.35		7.05		5.76        +22%
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. The numbers in parentheses above are SSSE3
#	results collected on an AVX-capable CPU, i.e. they apply to OSes
#	that don't support AVX.
# (**)	SHAEXT results.
#
# Needless to say, it makes no sense to implement a "stitched" *decrypt*
# subroutine: *both* AESNI-CBC decrypt and SHA1 already fully utilize the
# available parallelism, so stitching would give hardly any gain. Well,
# there might be some, e.g. because of better cache locality... For
# reference, here are performance results for standalone AESNI-CBC
# decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.25		1.50		1.75
# Sandy Bridge	0.74		0.91		1.09
# Ivy Bridge	0.74		0.90		1.11
# Haswell	0.63		0.76		0.88
# Bulldozer	0.70		0.85		0.99

# And indeed:
#
#		AES-256-CBC	+SHA1		stitch      gain
# Westmere	1.75		7.20		6.68        +7.8%
# Sandy Bridge	1.09		6.09(7.22)	5.82(6.95)  +4.6%(+3.9%)
# Ivy Bridge	1.11		5.70		5.45        +4.6%
# Haswell	0.88		4.45(5.00)	4.39(4.69)  +1.4%(*)(+6.6%)
# Bulldozer	0.99		6.95		5.95        +17%(**)
#
# (*)	The improvement on Haswell is tiny because the AVX1 stitch is
#	compared against a sum that uses AVX2 SHA1.
# (**)	Execution is fully dominated by the integer code sequence and
#	SIMD still hardly shows [in a single-process benchmark;-]

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
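
# Note that the probes above do not test the CPU for AVX; they test the
# *toolchain*. AVX code is emitted only if the assembler that will consume
# the generated output is known to be able to encode it: gas>=2.19,
# nasm>=2.09, ml64>=10, or an LLVM-based compiler driver.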

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);
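#
# A minimal, illustrative sketch of a call from C (variable names are
# hypothetical; the real consumer is OpenSSL's stitched AES-CBC-HMAC-SHA1
# EVP cipher). Note that `length` is measured in 64-byte blocks, as the
# code below multiplies it by 64:
#
#	/* encrypt n 64-byte blocks from inp while hashing in0 */
#	aesni_cbc_sha1_enc(inp, out, n, &aes_key, iv, &sha_ctx, in0);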

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));			# for enc
my @rndkey=("%xmm14","%xmm15");					# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize the number of instructions with more than
    # 3 prefix bytes, or in more practical terms to keep AES-NI *and*
    # SSSE3 instructions in the upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
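
# For example, &pshufd(@X[0],@X[-4&7],0xee) finds no sub of that name and
# lands in AUTOLOAD above, which appends "pshufd\t\$238,%xmm?,%xmm?" to
# $code: the last Perl argument is detected as numeric and becomes the
# immediate (238 == 0xee), and the remaining operands are reversed into
# AT&T source,destination order.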

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
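
# Each invocation of the closure above emits one step of the CBC-encrypt
# schedule: $n=$r/10 selects the current 16-byte block and $k=$r%10 the
# position within it. Step 0 loads and whitens the input, steps 0..8 each
# issue one aesenc, and step 9 branches on $rounds so the same sequence
# serves 10-, 12- and 14-round (AES-128/192/256) key schedules before
# aesenclast.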

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
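
# The eval(shift(@insns)) calls scattered through the routine above are
# what does the actual stitching: @insns holds four rounds' worth of the
# scalar SHA-1 body (supplied by $body), and shifting one instruction out
# between SIMD operations interleaves the integer and SSE streams so they
# can issue in parallel. Comments like "# rol"/"# ror" mark which scalar
# instruction is expected at that slot.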

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	unshift(@Tx,pop(@Tx));

	&movdqa	(@Tx[2],"64($K_XX_XX)");	# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@Tx[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
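
# The fragments above implement one SHA-1 round for rounds 0..19:
#	e += rol(a,5) + Ch(b,c,d) + K + X[j];  b = b>>>2
# with Ch(b,c,d) = ((c^d)&b)^d evaluated incrementally: @T[0] carries the
# partially computed value into the round and @T[1] starts the next
# round's value, so each round pays only for the xor/and it adds.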

sub body_00_19 () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;

    return @r;
}
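
# (The integer expression for $k above is even spacing in disguise: it
# drops one '&$aesenc();' into roughly every 20/12th round, so the 12 AES
# steps this 20-round stretch owes are distributed uniformly rather than
# bunched at either end. body_20_39 below does the same with 8 aesencs,
# body_40_59 with 12, matching the inline comments.)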

my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_20_39 () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}

my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
$code.=<<___;
.align	32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.Ldone_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

my @aes256_dec = (
	'&movdqu($inout0,"0x00($in0)");',
	'&movdqu($inout1,"0x10($in0)");	&pxor	($inout0,$rndkey0);',
	'&movdqu($inout2,"0x20($in0)");	&pxor	($inout1,$rndkey0);',
	'&movdqu($inout3,"0x30($in0)");	&pxor	($inout2,$rndkey0);',

	'&pxor	($inout3,$rndkey0);	&movups	($rndkey0,"16-112($key)");',
	'&movaps("64(%rsp)",@X[2]);',	# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&aesdec	($inout0,$rndkey0);',
	'&aesdec	($inout1,$rndkey0);',
	'&aesdec	($inout2,$rndkey0);',
	'&aesdec	($inout3,$rndkey0);	&movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&aesdeclast	($inout0,$rndkey0);	&movups	(@X[0],"0x00($in0)");',
	'&aesdeclast	($inout1,$rndkey0);	&movups	(@X[1],"0x10($in0)");',
	'&aesdeclast	($inout2,$rndkey0);	&movups	(@X[2],"0x20($in0)");',
	'&aesdeclast	($inout3,$rndkey0);	&movups	(@X[3],"0x30($in0)");',

	'&xorps		($inout0,"64(%rsp)");	&movdqu	($rndkey0,"-112($key)");',
	'&xorps		($inout1,@X[0]);	&movups	("0x00($out,$in0)",$inout0);',
	'&xorps		($inout2,@X[1]);	&movups	("0x10($out,$in0)",$inout1);',
	'&xorps		($inout3,@X[2]);	&movups	("0x20($out,$in0)",$inout2);',

	'&movups	("0x30($out,$in0)",$inout3);'
	));
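
# Unlike the encrypt side, where each block's encryption depends on the
# previous block's ciphertext, the four blocks above can be decrypted in
# parallel: CBC decryption only xors in the *previous ciphertext*, which
# is already at hand, after the block cipher. Hence @aes256_dec keeps four
# independent aesdec pipes in flight instead of the single serial pipe
# used by $aesenc.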

sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @r=@body_00_19;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @r=@body_20_39;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)

    my @r=@body_40_59;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

$code.=<<___;
.globl	aesni256_cbc_sha1_dec
.type	aesni256_cbc_sha1_dec,\@abi-omnipotent
.align	32
aesni256_cbc_sha1_dec:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni256_cbc_sha1_dec_avx
___
$code.=<<___;
	jmp	aesni256_cbc_sha1_dec_ssse3
	ret
.size	aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type	aesni256_cbc_sha1_dec_ssse3,\@function,6
.align	32
aesni256_cbc_sha1_dec_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),@X[3]			# load IV
	#mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movdqu	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_ssse3

.align	32
.Loop_dec_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3");	# can jump to "done"

				$saved_j=$j;   @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_ssse3

.Ldone_dec_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	@X[3],($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_ssse3:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
						}}}
$j=$jj=$r=$rx=0;

if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
my $Kx=@Tx[2];

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
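
# shld/shrd with both register operands equal degenerate into rotates:
# "shld \$n,%reg,%reg" rotates left by n and "shrd \$n,%reg,%reg" rotates
# right, so the $_rol/$_ror above are drop-in replacements for the rol/ror
# used by the SSSE3 path.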

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	32
aesni_cbc_sha1_enc_avx:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey[1]	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in		# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpsrld	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[1],@Tx[1],2);
	&vpxor	(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	&vmovdqa(@Tx[1],"64($K_XX_XX)");	# pbswap mask
	&vmovdqa($Kx,"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@Tx[0],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	32
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39,".Ldone_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.Ldone_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_avx:
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

@aes256_dec = (
	'&vpxor	($inout0,$rndkey0,"0x00($in0)");',
	'&vpxor	($inout1,$rndkey0,"0x10($in0)");',
	'&vpxor	($inout2,$rndkey0,"0x20($in0)");',
	'&vpxor	($inout3,$rndkey0,"0x30($in0)");',

	'&vmovups($rndkey0,"16-112($key)");',
	'&vmovups("64(%rsp)",@X[2]);',		# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&vaesdec	($inout0,$inout0,$rndkey0);',
	'&vaesdec	($inout1,$inout1,$rndkey0);',
	'&vaesdec	($inout2,$inout2,$rndkey0);',
	'&vaesdec	($inout3,$inout3,$rndkey0);	&vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&vaesdeclast	($inout0,$inout0,$rndkey0);	&vmovups(@X[0],"0x00($in0)");',
	'&vaesdeclast	($inout1,$inout1,$rndkey0);	&vmovups(@X[1],"0x10($in0)");',
	'&vaesdeclast	($inout2,$inout2,$rndkey0);	&vmovups(@X[2],"0x20($in0)");',
	'&vaesdeclast	($inout3,$inout3,$rndkey0);	&vmovups(@X[3],"0x30($in0)");',

	'&vxorps	($inout0,$inout0,"64(%rsp)");	&vmovdqu($rndkey0,"-112($key)");',
	'&vxorps	($inout1,$inout1,@X[0]);	&vmovups("0x00($out,$in0)",$inout0);',
	'&vxorps	($inout2,$inout2,@X[1]);	&vmovups("0x10($out,$in0)",$inout1);',
	'&vxorps	($inout3,$inout3,@X[2]);	&vmovups("0x20($out,$in0)",$inout2);',

	'&vmovups	("0x30($out,$in0)",$inout3);'
	));

$code.=<<___;
.type	aesni256_cbc_sha1_dec_avx,\@function,6
.align	32
aesni256_cbc_sha1_dec_avx:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),@X[3]			# load IV
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_avx

.align	32
.Loop_dec_avx:
___
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_avx

.Ldone_dec_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_avx:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
						}}}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
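
# In the SHA-extension path below, each sha1rnds4 advances the ABCD state
# by four rounds, with the immediate selecting which of the four round
# constants applies (hence \$`int($i/5)` stepping 0..3 through the
# 20-round groups), while sha1nexte folds the evolving E term into the
# next four scheduled message dwords.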

$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	($ivp),$iv			# load IV
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$E,$E_SAVE		# offload $E
	pshufb		$BSWAP,@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb		$BSWAP,@MSG[1]

	paddd		@MSG[0],$E
	movdqu		0x20($inp),@MSG[2]
	lea		0x40($inp),$inp
	pxor		$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
$code.=<<___;
	pxor		$E_SAVE,@MSG[0]		# black magic
	movdqa		$ABCD,$E_
	pshufb		$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu		-0x10($inp),@MSG[3]
	movdqa		$ABCD,$E
	pshufb		$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$E_SAVE,@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)			# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}
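
# For example, "sha1rnds4 \$3,%xmm4,%xmm0" is lowered by the sub above to
# ".byte 0x0f,0x3a,0xcc,0xc4,3" (opcode, ModR/M for xmm0,xmm4, immediate),
# so the module still builds with assemblers that predate the SHA
# extension mnemonics.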

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT;