#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use
# of processor resources and achieves better performance. The SHA1
# instruction sequences(*) are taken from sha1-x86_64.pl and the AESNI
# code is woven into them. Below are performance numbers in cycles per
# processed byte, less is better, for standalone AESNI-CBC encrypt,
# the sum of the latter and standalone SHA1, and the "stitched"
# subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54        +74%
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Skylake	2.63[+3.5(4.1)]	6.17(6.69)	4.23(4.44)  +46%(+51%)
# Bulldozer	5.77[+6.0]	11.72		6.37        +84%
# Ryzen(**)	2.71[+1.93]	4.64		2.74        +69%
# Goldmont(**)	3.82[+1.70]	5.52		4.20        +31%
#
#		AES-192-CBC
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07        +75%
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96        +84%
#
#		AES-256-CBC
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12        +64%
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Skylake	3.62		7.16(7.68)	4.56(4.76)  +57%(+61%)
# Bulldozer	8.00		13.95		8.25        +69%
# Ryzen(**)	3.71		5.64		3.72        +52%
# Goldmont(**)	5.35		7.05		5.76        +22%
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. The numbers in parentheses above are
#	SSSE3 results collected on an AVX-capable CPU, i.e. they apply
#	to OSes that don't support AVX.
# (**)	SHAEXT results.
#
# Needless to say, it makes little sense to implement a "stitched"
# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
# utilize instruction-level parallelism fully, so stitching would not
# give any appreciable gain. Well, there might be some, e.g. thanks to
# better cache locality... For reference, here are performance results
# for standalone AESNI-CBC decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.25		1.50		1.75
# Sandy Bridge	0.74		0.91		1.09
# Ivy Bridge	0.74		0.90		1.11
# Haswell	0.63		0.76		0.88
# Bulldozer	0.70		0.85		0.99

# And indeed:
#
#		AES-256-CBC	+SHA1		stitch      gain
# Westmere	1.75		7.20		6.68        +7.8%
# Sandy Bridge	1.09		6.09(7.22)	5.82(6.95)  +4.6%(+3.9%)
# Ivy Bridge	1.11		5.70		5.45        +4.6%
# Haswell	0.88		4.45(5.00)	4.39(4.69)  +1.4%(*)(+6.6%)
# Bulldozer	0.99		6.95		5.95        +17%(**)
#
# (*)	The tiny improvement coefficient on Haswell is because the AVX1
#	stitch is compared against a sum that includes AVX2 SHA1.
# (**)	Execution is fully dominated by the integer code sequence, and
#	SIMD still hardly shows [in a single-process benchmark;-]

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0);

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);
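#
# Note: in the Perl code below $in0 is bound to the 1st argument and
# feeds the AES-CBC stream, while $inp is bound to the 7th and feeds
# SHA1; the prototype above names them the other way around. `length`
# is counted in complete 64-byte blocks, as the `shl \$6,$len` in the
# prologue implies, so the caller passes a block count, not bytes.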

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));			# for enc
my @rndkey=("%xmm14","%xmm15");					# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize the number of instructions with more than
    # 3 prefix bytes, or, in more practical terms, to keep AES-NI *and*
    # SSSE3 instructions in the upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
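
# Any otherwise-undefined sub called below, e.g. &pxor(@X[0],@Tx[1]),
# lands in AUTOLOAD above, which turns the missing sub's name into the
# mnemonic and prints the arguments in AT&T order: the last (Perl-side
# source) argument comes first, so &pxor("%xmm8","%xmm12") appends
# "\tpxor\t%xmm12,%xmm8\n" to $code; a purely numeric last argument is
# prefixed with '$' to form an immediate.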

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
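
# The closure above emits one AES round per invocation: $r counts
# aesenc steps, $n=$r/10 selects the 16-byte AES block within the
# current 64-byte chunk and $k=$r%10 the AES round within it. At $k==0
# the next plaintext block is loaded and folded together with round
# key 0 and the CBC chaining value, and the previous ciphertext block
# is stored; at $k==9 the extra rounds for 192-/256-bit keys are
# emitted, followed by aesenclast. Splicing '&$aesenc();' into the
# SHA1 round bodies below is what "stitches" the two algorithms.
#
# The Xupdate_ssse3_* subs vectorize the SHA1 message schedule,
# W[t]=(W[t-3]^W[t-8]^W[t-14]^W[t-16])<<<1, four elements at a time;
# the last lane's not-yet-available W[t-3] term is compensated by the
# extra "<<<2" correction annotated in the code below.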

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	unshift(@Tx,pop(@Tx));

	&movdqa	(@Tx[2],"64($K_XX_XX)");	# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@Tx[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
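
# Xuplast_ssse3_80 emits the last X[]+K store of the current block and
# compares $inp against $len. If more input remains, it starts loading
# and byte-swapping the next 64-byte block, so that the three
# Xloop_ssse3 calls covering rounds 68-79 can fold the fresh block's
# pshufb/paddd/store work into the tail of the current one.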

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_00_19 () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;

    return @r;
}
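
# The $k arithmetic above distributes the aesenc steps evenly: under
# `use integer`, $jj==$k/$n holds for exactly 12 of these 20 round
# invocations ($n==10 instruction slots per round), and $k%$n selects
# the slot that gets '&$aesenc();' appended. body_20_39 plants 8 per
# 20 rounds (and also runs the final 20) and body_40_59 another 12,
# giving 12+8+12+8 = 40 aesenc steps per 64-byte SHA1 block, i.e. four
# CBC-encrypted AES blocks at ten scheduled rounds each (the $rx guards
# avoid double-planting at the 20-round boundaries).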

my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_20_39 () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}

my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
$code.=<<___;
.align	32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.Ldone_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

my @aes256_dec = (
	'&movdqu($inout0,"0x00($in0)");',
	'&movdqu($inout1,"0x10($in0)");	&pxor	($inout0,$rndkey0);',
	'&movdqu($inout2,"0x20($in0)");	&pxor	($inout1,$rndkey0);',
	'&movdqu($inout3,"0x30($in0)");	&pxor	($inout2,$rndkey0);',

	'&pxor	($inout3,$rndkey0);	&movups	($rndkey0,"16-112($key)");',
	'&movaps("64(%rsp)",@X[2]);',	# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&aesdec	($inout0,$rndkey0);',
	'&aesdec	($inout1,$rndkey0);',
	'&aesdec	($inout2,$rndkey0);',
	'&aesdec	($inout3,$rndkey0);	&movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&aesdeclast	($inout0,$rndkey0);	&movups	(@X[0],"0x00($in0)");',
	'&aesdeclast	($inout1,$rndkey0);	&movups	(@X[1],"0x10($in0)");',
	'&aesdeclast	($inout2,$rndkey0);	&movups	(@X[2],"0x20($in0)");',
	'&aesdeclast	($inout3,$rndkey0);	&movups	(@X[3],"0x30($in0)");',

	'&xorps		($inout0,"64(%rsp)");	&movdqu	($rndkey0,"-112($key)");',
	'&xorps		($inout1,@X[0]);	&movups	("0x00($out,$in0)",$inout0);',
	'&xorps		($inout2,@X[1]);	&movups	("0x10($out,$in0)",$inout1);',
	'&xorps		($inout3,@X[2]);	&movups	("0x20($out,$in0)",$inout2);',

	'&movups	("0x30($out,$in0)",$inout3);'
	));
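
# CBC decrypt is parallelizable, so four blocks are processed at once:
# each @aes256_dec entry is one Perl statement advancing the four
# $inout lanes by one aesdec step, roughly one entry per SHA-1 round
# (undef entries are idle slots that keep the pacing aligned). The
# body_*_dec subs below prepend these entries, one at a time, to the
# plain SHA-1 round bodies. This path is only emitted when
# $stitched_decrypt is set, which it is not by default.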

sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @r=@body_00_19;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @r=@body_20_39;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)

    my @r=@body_40_59;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

$code.=<<___;
.globl	aesni256_cbc_sha1_dec
.type	aesni256_cbc_sha1_dec,\@abi-omnipotent
.align	32
aesni256_cbc_sha1_dec:
.cfi_startproc
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni256_cbc_sha1_dec_avx
___
$code.=<<___;
	jmp	aesni256_cbc_sha1_dec_ssse3
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type	aesni256_cbc_sha1_dec_ssse3,\@function,6
.align	32
aesni256_cbc_sha1_dec_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),@X[3]			# load IV
	#mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movdqu	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_ssse3

.align	32
.Loop_dec_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3");	# can jump to "done"

				$saved_j=$j;   @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_ssse3

.Ldone_dec_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	@X[3],($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_ssse3:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
						}}}
$j=$jj=$r=$rx=0;

if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
my $Kx=@Tx[2];

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
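# shld/shrd with identical destination and source operands perform a
# plain rotate; the AVX path substitutes these double-shift forms for
# rol/ror, the same trick used in sha1-x86_64.pl, presumably because
# they schedule better on the Sandy Bridge-class CPUs this path targets.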

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	32
aesni_cbc_sha1_enc_avx:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey[1]	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in		# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpsrld	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[1],@Tx[1],2);
	&vpxor	(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	&vmovdqa(@Tx[1],"64($K_XX_XX)");	# pbswap mask
	&vmovdqa($Kx,"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@Tx[0],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	32
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39,".Ldone_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.Ldone_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_avx:
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

@aes256_dec = (
	'&vpxor	($inout0,$rndkey0,"0x00($in0)");',
	'&vpxor	($inout1,$rndkey0,"0x10($in0)");',
	'&vpxor	($inout2,$rndkey0,"0x20($in0)");',
	'&vpxor	($inout3,$rndkey0,"0x30($in0)");',

	'&vmovups($rndkey0,"16-112($key)");',
	'&vmovups("64(%rsp)",@X[2]);',		# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&vaesdec	($inout0,$inout0,$rndkey0);',
	'&vaesdec	($inout1,$inout1,$rndkey0);',
	'&vaesdec	($inout2,$inout2,$rndkey0);',
	'&vaesdec	($inout3,$inout3,$rndkey0);	&vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&vaesdeclast	($inout0,$inout0,$rndkey0);	&vmovups(@X[0],"0x00($in0)");',
	'&vaesdeclast	($inout1,$inout1,$rndkey0);	&vmovups(@X[1],"0x10($in0)");',
	'&vaesdeclast	($inout2,$inout2,$rndkey0);	&vmovups(@X[2],"0x20($in0)");',
	'&vaesdeclast	($inout3,$inout3,$rndkey0);	&vmovups(@X[3],"0x30($in0)");',

	'&vxorps	($inout0,$inout0,"64(%rsp)");	&vmovdqu($rndkey0,"-112($key)");',
	'&vxorps	($inout1,$inout1,@X[0]);	&vmovups("0x00($out,$in0)",$inout0);',
	'&vxorps	($inout2,$inout2,@X[1]);	&vmovups("0x10($out,$in0)",$inout1);',
	'&vxorps	($inout3,$inout3,@X[2]);	&vmovups("0x20($out,$in0)",$inout2);',

	'&vmovups	("0x30($out,$in0)",$inout3);'
	));

$code.=<<___;
.type	aesni256_cbc_sha1_dec_avx,\@function,6
.align	32
aesni256_cbc_sha1_dec_avx:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),@X[3]			# load IV
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_avx

.align	32
.Loop_dec_avx:
___
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_avx

.Ldone_dec_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_avx:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
						}}}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
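
# SHAEXT notes: sha1rnds4 performs four SHA-1 rounds on the ABCD state,
# its immediate (0..3) selecting the round function and constant for
# rounds 0-19/20-39/40-59/60-79; sha1nexte folds the rotated E value
# into the next message quadword, while sha1msg1/sha1msg2 implement the
# two halves of the W[] recurrence. &$aesenc() is interleaved between
# the SHA instruction groups, 40 rounds' worth per 64-byte block.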

$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	mov	%rsp,%rax		# preserve original %rsp for epilogue
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	($ivp),$iv			# load IV
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$E,$E_SAVE		# offload $E
	pshufb		$BSWAP,@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb		$BSWAP,@MSG[1]

	paddd		@MSG[0],$E
	movdqu		0x20($inp),@MSG[2]
	lea		0x40($inp),$inp
	pxor		$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
$code.=<<___;
	pxor		$E_SAVE,@MSG[0]		# black magic
	movdqa		$ABCD,$E_
	pshufb		$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu		-0x10($inp),@MSG[3]
	movdqa		$ABCD,$E
	pshufb		$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$E_SAVE,@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)			# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
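# The subs below hand-assemble SHA and AES-NI instructions as .byte
# sequences so the module still builds with assemblers that predate
# the mnemonics. rex() prepends a REX prefix when either register is
# %xmm8 or higher: 0x04 is REX.R (extends the ModR/M reg field, i.e.
# the destination) and 0x01 is REX.B (extends the rm field, i.e. the
# source).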
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}
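
# For instance the text "sha1rnds4	$3,%xmm0,%xmm8" is rewritten as
# ".byte	0x44,0x0f,0x3a,0xcc,0xc0,3": REX.R (0x44) for the %xmm8
# destination, opcode 0f 3a cc, ModR/M 0xc0 (mod=11, reg=%xmm8&7=0,
# rm=%xmm0), then the immediate.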

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";