1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# January 2013
11#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte (less is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
23#
#		 AES-128/-192/-256+SHA256	this(**)	gain
25# Sandy Bridge	    5.05/6.05/7.05+11.6		13.0	+28%/36%/43%
26# Ivy Bridge	    5.05/6.05/7.05+10.3		11.6	+32%/41%/50%
27# Haswell	    4.43/5.29/6.19+7.80		8.79	+39%/49%/59%
28# Bulldozer	    5.77/6.89/8.00+13.7		13.7	+42%/50%/58%
29#
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is left out; this is because the gain was not estimated to be
#	high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
35
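# The script takes two positional arguments: the perlasm "flavour"
# (e.g. elf, macosx, mingw64, nasm, masm) and the output file name; if a
# single argument containing a dot is given, it is taken as the output
# file and the flavour is left undefined. A typical invocation might look
# like this (the .pl file name here is only an assumed example):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s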
36$flavour = shift;
37$output  = shift;
38if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45die "can't locate x86_64-xlate.pl";
46
47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49	$avx = ($1>=2.19) + ($1>=2.22);
50}
51
52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54	$avx = ($1>=2.09) + ($1>=2.10);
55}
56
57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59	$avx = ($1>=10) + ($1>=12);
60}
61
62if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
63	$avx = ($2>=3.0) + ($2>3.0);
64}
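# Net effect of the probes above: $avx ends up 0 (no suitable assembler,
# SIMD paths disabled), 1 (XOP/AVX code can be emitted) or 2 (AVX2 code
# can be emitted as well); $shaext below additionally gates the SHA
# extension path.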
65
66$shaext=$avx;	### set to zero if compiling for 1.0.1
67$avx=1		if (!$shaext && $avx);
68
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
70*STDOUT=*OUT;
71
72$func="aesni_cbc_sha256_enc";
73$TABLE="K256";
74$SZ=4;
75@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
76				"%r8d","%r9d","%r10d","%r11d");
77($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
78@Sigma0=( 2,13,22);
79@Sigma1=( 6,11,25);
80@sigma0=( 7,18, 3);
81@sigma1=(17,19,10);
82$rounds=64;
83
84########################################################################
85# void aesni_cbc_sha256_enc(const void *inp,
86#			void *out,
87#			size_t length,
88#			const AES_KEY *key,
89#			unsigned char *iv,
90#			SHA256_CTX *ctx,
91#			const void *in0);
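# Additional notes on the interface, as implemented below:
# - length is measured in 64-byte blocks (the prologues multiply it by 64);
# - inp/out are the AES-CBC input/output buffers, while in0 points to the
#   data being hashed;
# - calling with inp==NULL turns the routine into a capability probe: it
#   returns 1 if a stitched code path was compiled in, 0 otherwise (see
#   .Lprobe below).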
92($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
93("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
94
95$Tbl="%rbp";
96
97$_inp="16*$SZ+0*8(%rsp)";
98$_out="16*$SZ+1*8(%rsp)";
99$_end="16*$SZ+2*8(%rsp)";
100$_key="16*$SZ+3*8(%rsp)";
101$_ivp="16*$SZ+4*8(%rsp)";
102$_ctx="16*$SZ+5*8(%rsp)";
103$_in0="16*$SZ+6*8(%rsp)";
104$_rsp="16*$SZ+7*8(%rsp)";
105$framesz=16*$SZ+8*8;
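# Resulting stack frame for the XOP/AVX code paths ($SZ==4):
#
#	(%rsp)			16*$SZ bytes	X[0..15] message schedule
#	(%rsp)+16*$SZ		8*8 bytes	_inp,_out,_end,_key,_ivp,_ctx,_in0,_rsp
#	(%rsp)+$framesz		10*16 bytes	%xmm6-%xmm15 save area (Win64 only)
#
# The AVX2 path builds a larger, differently laid out frame, see below.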
106
107$code=<<___;
108.text
109
110.extern	OPENSSL_ia32cap_P
111.globl	$func
112.type	$func,\@abi-omnipotent
113.align	16
114$func:
115___
116						if ($avx) {
117$code.=<<___;
118	lea	OPENSSL_ia32cap_P(%rip),%r11
119	mov	\$1,%eax
120	cmp	\$0,`$win64?"%rcx":"%rdi"`
121	je	.Lprobe
122	mov	0(%r11),%eax
123	mov	4(%r11),%r10
124___
125$code.=<<___ if ($shaext);
126	bt	\$61,%r10			# check for SHA
127	jc	${func}_shaext
128___
129$code.=<<___;
130	mov	%r10,%r11
131	shr	\$32,%r11
132
133	test	\$`1<<11`,%r10d			# check for XOP
134	jnz	${func}_xop
135___
136$code.=<<___ if ($avx>1);
137	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
138	cmp	\$`1<<8|1<<5|1<<3`,%r11d
139	je	${func}_avx2
140___
141$code.=<<___;
142	and	\$`1<<28`,%r10d			# check for AVX
143	jnz	${func}_avx
144	ud2
145___
146						}
147$code.=<<___;
148	xor	%eax,%eax
149	cmp	\$0,`$win64?"%rcx":"%rdi"`
150	je	.Lprobe
151	ud2
152.Lprobe:
153	ret
154.size	$func,.-$func
155
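# Each row of K256 is stored twice so that 256-bit loads in the AVX2 code
# path see the same constants in both 128-bit lanes; the rows past the last
# constant hold the byte-swap mask and the vpand masks that are indexed by
# the AES round count (mask14/mask12/mask10).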
156.align	64
157.type	$TABLE,\@object
158$TABLE:
159	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
160	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
161	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
162	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
163	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
164	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
167	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
168	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
169	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
170	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
171	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
172	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
173	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
174	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
175	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
176	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
177	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
178	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
179	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
180	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
181	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
182	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
183	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
184	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
187	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
188	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
189	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
190	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
191
192	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
193	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
194	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
195	.long	0,0,0,0,   0,0,0,0
196	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
197.align	64
198___
199
200######################################################################
201# SIMD code paths
202#
203{{{
204($iv,$inout,$roundkey,$temp,
205 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
206
207$aesni_cbc_idx=0;
208@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
210##	&vmovdqu	($inout,($inp));
211##	&mov		($_inp,$inp);
212
213	'&vpxor		($inout,$inout,$roundkey);'.
214	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
215
216	'&vpxor		($inout,$inout,$iv);',
217
218	'&vaesenc	($inout,$inout,$roundkey);'.
219	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
220
221	'&vaesenc	($inout,$inout,$roundkey);'.
222	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
223
224	'&vaesenc	($inout,$inout,$roundkey);'.
225	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
226
227	'&vaesenc	($inout,$inout,$roundkey);'.
228	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
229
230	'&vaesenc	($inout,$inout,$roundkey);'.
231	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
232
233	'&vaesenc	($inout,$inout,$roundkey);'.
234	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
235
236	'&vaesenc	($inout,$inout,$roundkey);'.
237	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
238
239	'&vaesenc	($inout,$inout,$roundkey);'.
240	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
241
242	'&vaesenc	($inout,$inout,$roundkey);'.
243	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
244
245	'&vaesenclast	($temp,$inout,$roundkey);'.
246	' &vaesenc	($inout,$inout,$roundkey);'.
247	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
248
249	'&vpand		($iv,$temp,$mask10);'.
250	' &vaesenc	($inout,$inout,$roundkey);'.
251	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
252
253	'&vaesenclast	($temp,$inout,$roundkey);'.
254	' &vaesenc	($inout,$inout,$roundkey);'.
255	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
256
257	'&vpand		($temp,$temp,$mask12);'.
258	' &vaesenc	($inout,$inout,$roundkey);'.
259	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
260
261	'&vpor		($iv,$iv,$temp);'.
262	' &vaesenclast	($temp,$inout,$roundkey);'.
263	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
264
265##	&mov		($inp,$_inp);
266##	&mov		($out,$_out);
267##	&vpand		($temp,$temp,$mask14);
268##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
271);
272
273my $a4=$T1;
274my ($a,$b,$c,$d,$e,$f,$g,$h);
275
276sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
277{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
278  my $arg = pop;
279    $arg = "\$$arg" if ($arg*1 eq $arg);
280    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
281}
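# For instance, with $a0 being "%r13d", '&ror($a0,14)' appends the line
# "ror	$14,%r13d" to $code: numeric arguments become immediates and
# operands are emitted in AT&T order.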
282
283sub body_00_15 () {
284	(
285	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
286
287	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
288	'&mov	($a,$a1)',
289	'&mov	($a4,$f)',
290
291	'&xor	($a0,$e)',
292	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
293	'&xor	($a4,$g)',			# f^g
294
295	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
296	'&xor	($a1,$a)',
297	'&and	($a4,$e)',			# (f^g)&e
298
299	@aesni_cbc_block[$aesni_cbc_idx++].
300	'&xor	($a0,$e)',
301	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
302	'&mov	($a2,$a)',
303
304	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
305	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
306	'&xor	($a2,$b)',			# a^b, b^c in next round
307
308	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
309	'&add	($h,$a4)',			# h+=Ch(e,f,g)
310	'&and	($a3,$a2)',			# (b^c)&(a^b)
311
312	'&xor	($a1,$a)',
313	'&add	($h,$a0)',			# h+=Sigma1(e)
314	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
315
316	'&add	($d,$h)',			# d+=h
317	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
318	'&add	($h,$a3)',			# h+=Maj(a,b,c)
319
320	'&mov	($a0,$d)',
321	'&add	($a1,$h);'.			# h+=Sigma0(a)
322	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
323	);
324}
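# Each body_00_15() call returns the fragments of one SHA256 round;
# @ROT is rotated at the end so the same template serves every round, and
# one element of @aesni_cbc_block is spliced in per round, so sixteen
# SHA256 rounds carry one 16-byte AES-CBC block and a full 64-byte SHA256
# block covers exactly four AES blocks (hence the 1:1 byte ratio).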
325
326if ($avx) {{
327######################################################################
328# XOP code path
329#
330$code.=<<___;
331.type	${func}_xop,\@function,6
332.align	64
333${func}_xop:
334.Lxop_shortcut:
335	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
336	push	%rbx
337	push	%rbp
338	push	%r12
339	push	%r13
340	push	%r14
341	push	%r15
342	mov	%rsp,%r11		# copy %rsp
343	sub	\$`$framesz+$win64*16*10`,%rsp
344	and	\$-64,%rsp		# align stack frame
345
346	shl	\$6,$len
347	sub	$inp,$out		# re-bias
348	sub	$inp,$in0
349	add	$inp,$len		# end of input
350
351	#mov	$inp,$_inp		# saved later
352	mov	$out,$_out
353	mov	$len,$_end
354	#mov	$key,$_key		# remains resident in $inp register
355	mov	$ivp,$_ivp
356	mov	$ctx,$_ctx
357	mov	$in0,$_in0
358	mov	%r11,$_rsp
359___
360$code.=<<___ if ($win64);
361	movaps	%xmm6,`$framesz+16*0`(%rsp)
362	movaps	%xmm7,`$framesz+16*1`(%rsp)
363	movaps	%xmm8,`$framesz+16*2`(%rsp)
364	movaps	%xmm9,`$framesz+16*3`(%rsp)
365	movaps	%xmm10,`$framesz+16*4`(%rsp)
366	movaps	%xmm11,`$framesz+16*5`(%rsp)
367	movaps	%xmm12,`$framesz+16*6`(%rsp)
368	movaps	%xmm13,`$framesz+16*7`(%rsp)
369	movaps	%xmm14,`$framesz+16*8`(%rsp)
370	movaps	%xmm15,`$framesz+16*9`(%rsp)
371___
372$code.=<<___;
373.Lprologue_xop:
374	vzeroall
375
376	mov	$inp,%r12		# borrow $a4
377	lea	0x80($key),$inp		# size optimization, reassign
378	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
379	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
380	mov	$ctx,%r15		# borrow $a2
381	mov	$in0,%rsi		# borrow $a3
382	vmovdqu	($ivp),$iv		# load IV
383	sub	\$9,%r14
384
385	mov	$SZ*0(%r15),$A
386	mov	$SZ*1(%r15),$B
387	mov	$SZ*2(%r15),$C
388	mov	$SZ*3(%r15),$D
389	mov	$SZ*4(%r15),$E
390	mov	$SZ*5(%r15),$F
391	mov	$SZ*6(%r15),$G
392	mov	$SZ*7(%r15),$H
393
394	vmovdqa	0x00(%r13,%r14,8),$mask14
395	vmovdqa	0x10(%r13,%r14,8),$mask12
396	vmovdqa	0x20(%r13,%r14,8),$mask10
397	vmovdqu	0x00-0x80($inp),$roundkey
398	jmp	.Lloop_xop
399___
400					if ($SZ==4) {	# SHA256
401    my @X = map("%xmm$_",(0..3));
402    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
403
404$code.=<<___;
405.align	16
406.Lloop_xop:
407	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
408	vmovdqu	0x00(%rsi,%r12),@X[0]
409	vmovdqu	0x10(%rsi,%r12),@X[1]
410	vmovdqu	0x20(%rsi,%r12),@X[2]
411	vmovdqu	0x30(%rsi,%r12),@X[3]
412	vpshufb	$t3,@X[0],@X[0]
413	lea	$TABLE(%rip),$Tbl
414	vpshufb	$t3,@X[1],@X[1]
415	vpshufb	$t3,@X[2],@X[2]
416	vpaddd	0x00($Tbl),@X[0],$t0
417	vpshufb	$t3,@X[3],@X[3]
418	vpaddd	0x20($Tbl),@X[1],$t1
419	vpaddd	0x40($Tbl),@X[2],$t2
420	vpaddd	0x60($Tbl),@X[3],$t3
421	vmovdqa	$t0,0x00(%rsp)
422	mov	$A,$a1
423	vmovdqa	$t1,0x10(%rsp)
424	mov	$B,$a3
425	vmovdqa	$t2,0x20(%rsp)
426	xor	$C,$a3			# magic
427	vmovdqa	$t3,0x30(%rsp)
428	mov	$E,$a0
429	jmp	.Lxop_00_47
430
431.align	16
432.Lxop_00_47:
433	sub	\$-16*2*$SZ,$Tbl	# size optimization
434	vmovdqu	(%r12),$inout		# $a4
435	mov	%r12,$_inp		# $a4
436___
437sub XOP_256_00_47 () {
438my $j = shift;
439my $body = shift;
440my @X = @_;
441my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
442
443	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
444	  eval(shift(@insns));
445	  eval(shift(@insns));
446	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
447	  eval(shift(@insns));
448	  eval(shift(@insns));
449	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
450	  eval(shift(@insns));
451	  eval(shift(@insns));
452	&vpsrld		($t0,$t0,$sigma0[2]);
453	  eval(shift(@insns));
454	  eval(shift(@insns));
455	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
456	  eval(shift(@insns));
457	  eval(shift(@insns));
458	  eval(shift(@insns));
459	  eval(shift(@insns));
460	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
461	  eval(shift(@insns));
462	  eval(shift(@insns));
463	&vpxor		($t0,$t0,$t1);
464	  eval(shift(@insns));
465	  eval(shift(@insns));
466	  eval(shift(@insns));
467	  eval(shift(@insns));
468	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
469	  eval(shift(@insns));
470	  eval(shift(@insns));
471	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
472	  eval(shift(@insns));
473	  eval(shift(@insns));
474	 &vpsrld	($t2,@X[3],$sigma1[2]);
475	  eval(shift(@insns));
476	  eval(shift(@insns));
477	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
478	  eval(shift(@insns));
479	  eval(shift(@insns));
480	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
481	  eval(shift(@insns));
482	  eval(shift(@insns));
483	 &vpxor		($t3,$t3,$t2);
484	  eval(shift(@insns));
485	  eval(shift(@insns));
486	  eval(shift(@insns));
487	  eval(shift(@insns));
488	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
489	  eval(shift(@insns));
490	  eval(shift(@insns));
491	  eval(shift(@insns));
492	  eval(shift(@insns));
493	&vpsrldq	($t3,$t3,8);
494	  eval(shift(@insns));
495	  eval(shift(@insns));
496	  eval(shift(@insns));
497	  eval(shift(@insns));
498	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
499	  eval(shift(@insns));
500	  eval(shift(@insns));
501	  eval(shift(@insns));
502	  eval(shift(@insns));
503	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
504	  eval(shift(@insns));
505	  eval(shift(@insns));
506	 &vpsrld	($t2,@X[0],$sigma1[2]);
507	  eval(shift(@insns));
508	  eval(shift(@insns));
509	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
510	  eval(shift(@insns));
511	  eval(shift(@insns));
512	 &vpxor		($t3,$t3,$t2);
513	  eval(shift(@insns));
514	  eval(shift(@insns));
515	  eval(shift(@insns));
516	  eval(shift(@insns));
517	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
518	  eval(shift(@insns));
519	  eval(shift(@insns));
520	  eval(shift(@insns));
521	  eval(shift(@insns));
522	&vpslldq	($t3,$t3,8);		# 22 instructions
523	  eval(shift(@insns));
524	  eval(shift(@insns));
525	  eval(shift(@insns));
526	  eval(shift(@insns));
527	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
528	  eval(shift(@insns));
529	  eval(shift(@insns));
530	  eval(shift(@insns));
531	  eval(shift(@insns));
532	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
533	  foreach (@insns) { eval; }		# remaining instructions
534	&vmovdqa	(16*$j."(%rsp)",$t2);
535}
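# The stitching itself: the four body_00_15() calls above supply ~104
# scalar SHA256 round instructions, and the vector message-schedule update
# is interleaved into that stream a few instructions at a time via
# eval(shift(@insns)), so both execution streams issue in parallel.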
536
537    $aesni_cbc_idx=0;
538    for ($i=0,$j=0; $j<4; $j++) {
539	&XOP_256_00_47($j,\&body_00_15,@X);
540	push(@X,shift(@X));			# rotate(@X)
541    }
542    	&mov		("%r12",$_inp);		# borrow $a4
543	&vpand		($temp,$temp,$mask14);
544	&mov		("%r15",$_out);		# borrow $a2
545	&vpor		($iv,$iv,$temp);
546	&vmovdqu	("(%r15,%r12)",$iv);	# write output
547	&lea		("%r12","16(%r12)");	# inp++
548
549	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
550	&jne	(".Lxop_00_47");
551
552	&vmovdqu	($inout,"(%r12)");
553	&mov		($_inp,"%r12");
554
555    $aesni_cbc_idx=0;
556    for ($i=0; $i<16; ) {
557	foreach(body_00_15()) { eval; }
558    }
559					}
560$code.=<<___;
561	mov	$_inp,%r12		# borrow $a4
562	mov	$_out,%r13		# borrow $a0
563	mov	$_ctx,%r15		# borrow $a2
564	mov	$_in0,%rsi		# borrow $a3
565
566	vpand	$mask14,$temp,$temp
567	mov	$a1,$A
568	vpor	$temp,$iv,$iv
569	vmovdqu	$iv,(%r13,%r12)		# write output
570	lea	16(%r12),%r12		# inp++
571
572	add	$SZ*0(%r15),$A
573	add	$SZ*1(%r15),$B
574	add	$SZ*2(%r15),$C
575	add	$SZ*3(%r15),$D
576	add	$SZ*4(%r15),$E
577	add	$SZ*5(%r15),$F
578	add	$SZ*6(%r15),$G
579	add	$SZ*7(%r15),$H
580
581	cmp	$_end,%r12
582
583	mov	$A,$SZ*0(%r15)
584	mov	$B,$SZ*1(%r15)
585	mov	$C,$SZ*2(%r15)
586	mov	$D,$SZ*3(%r15)
587	mov	$E,$SZ*4(%r15)
588	mov	$F,$SZ*5(%r15)
589	mov	$G,$SZ*6(%r15)
590	mov	$H,$SZ*7(%r15)
591
592	jb	.Lloop_xop
593
594	mov	$_ivp,$ivp
595	mov	$_rsp,%rsi
596	vmovdqu	$iv,($ivp)		# output IV
597	vzeroall
598___
599$code.=<<___ if ($win64);
600	movaps	`$framesz+16*0`(%rsp),%xmm6
601	movaps	`$framesz+16*1`(%rsp),%xmm7
602	movaps	`$framesz+16*2`(%rsp),%xmm8
603	movaps	`$framesz+16*3`(%rsp),%xmm9
604	movaps	`$framesz+16*4`(%rsp),%xmm10
605	movaps	`$framesz+16*5`(%rsp),%xmm11
606	movaps	`$framesz+16*6`(%rsp),%xmm12
607	movaps	`$framesz+16*7`(%rsp),%xmm13
608	movaps	`$framesz+16*8`(%rsp),%xmm14
609	movaps	`$framesz+16*9`(%rsp),%xmm15
610___
611$code.=<<___;
612	mov	(%rsi),%r15
613	mov	8(%rsi),%r14
614	mov	16(%rsi),%r13
615	mov	24(%rsi),%r12
616	mov	32(%rsi),%rbp
617	mov	40(%rsi),%rbx
618	lea	48(%rsi),%rsp
619.Lepilogue_xop:
620	ret
621.size	${func}_xop,.-${func}_xop
622___
623######################################################################
624# AVX+shrd code path
625#
626local *ror = sub { &shrd(@_[0],@_) };
627
628$code.=<<___;
629.type	${func}_avx,\@function,6
630.align	64
631${func}_avx:
632.Lavx_shortcut:
633	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
634	push	%rbx
635	push	%rbp
636	push	%r12
637	push	%r13
638	push	%r14
639	push	%r15
640	mov	%rsp,%r11		# copy %rsp
641	sub	\$`$framesz+$win64*16*10`,%rsp
642	and	\$-64,%rsp		# align stack frame
643
644	shl	\$6,$len
645	sub	$inp,$out		# re-bias
646	sub	$inp,$in0
647	add	$inp,$len		# end of input
648
649	#mov	$inp,$_inp		# saved later
650	mov	$out,$_out
651	mov	$len,$_end
652	#mov	$key,$_key		# remains resident in $inp register
653	mov	$ivp,$_ivp
654	mov	$ctx,$_ctx
655	mov	$in0,$_in0
656	mov	%r11,$_rsp
657___
658$code.=<<___ if ($win64);
659	movaps	%xmm6,`$framesz+16*0`(%rsp)
660	movaps	%xmm7,`$framesz+16*1`(%rsp)
661	movaps	%xmm8,`$framesz+16*2`(%rsp)
662	movaps	%xmm9,`$framesz+16*3`(%rsp)
663	movaps	%xmm10,`$framesz+16*4`(%rsp)
664	movaps	%xmm11,`$framesz+16*5`(%rsp)
665	movaps	%xmm12,`$framesz+16*6`(%rsp)
666	movaps	%xmm13,`$framesz+16*7`(%rsp)
667	movaps	%xmm14,`$framesz+16*8`(%rsp)
668	movaps	%xmm15,`$framesz+16*9`(%rsp)
669___
670$code.=<<___;
671.Lprologue_avx:
672	vzeroall
673
674	mov	$inp,%r12		# borrow $a4
675	lea	0x80($key),$inp		# size optimization, reassign
676	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
677	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
678	mov	$ctx,%r15		# borrow $a2
679	mov	$in0,%rsi		# borrow $a3
680	vmovdqu	($ivp),$iv		# load IV
681	sub	\$9,%r14
682
683	mov	$SZ*0(%r15),$A
684	mov	$SZ*1(%r15),$B
685	mov	$SZ*2(%r15),$C
686	mov	$SZ*3(%r15),$D
687	mov	$SZ*4(%r15),$E
688	mov	$SZ*5(%r15),$F
689	mov	$SZ*6(%r15),$G
690	mov	$SZ*7(%r15),$H
691
692	vmovdqa	0x00(%r13,%r14,8),$mask14
693	vmovdqa	0x10(%r13,%r14,8),$mask12
694	vmovdqa	0x20(%r13,%r14,8),$mask10
695	vmovdqu	0x00-0x80($inp),$roundkey
696___
697					if ($SZ==4) {	# SHA256
698    my @X = map("%xmm$_",(0..3));
699    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
700
701$code.=<<___;
702	jmp	.Lloop_avx
703.align	16
704.Lloop_avx:
705	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
706	vmovdqu	0x00(%rsi,%r12),@X[0]
707	vmovdqu	0x10(%rsi,%r12),@X[1]
708	vmovdqu	0x20(%rsi,%r12),@X[2]
709	vmovdqu	0x30(%rsi,%r12),@X[3]
710	vpshufb	$t3,@X[0],@X[0]
711	lea	$TABLE(%rip),$Tbl
712	vpshufb	$t3,@X[1],@X[1]
713	vpshufb	$t3,@X[2],@X[2]
714	vpaddd	0x00($Tbl),@X[0],$t0
715	vpshufb	$t3,@X[3],@X[3]
716	vpaddd	0x20($Tbl),@X[1],$t1
717	vpaddd	0x40($Tbl),@X[2],$t2
718	vpaddd	0x60($Tbl),@X[3],$t3
719	vmovdqa	$t0,0x00(%rsp)
720	mov	$A,$a1
721	vmovdqa	$t1,0x10(%rsp)
722	mov	$B,$a3
723	vmovdqa	$t2,0x20(%rsp)
724	xor	$C,$a3			# magic
725	vmovdqa	$t3,0x30(%rsp)
726	mov	$E,$a0
727	jmp	.Lavx_00_47
728
729.align	16
730.Lavx_00_47:
731	sub	\$-16*2*$SZ,$Tbl	# size optimization
732	vmovdqu	(%r12),$inout		# $a4
733	mov	%r12,$_inp		# $a4
734___
735sub Xupdate_256_AVX () {
736	(
737	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
738	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
739	'&vpsrld	($t2,$t0,$sigma0[0]);',
740	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
741	'&vpsrld	($t3,$t0,$sigma0[2])',
742	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
743	'&vpxor		($t0,$t3,$t2)',
744	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
745	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
746	'&vpxor		($t0,$t0,$t1)',
747	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
748	'&vpxor		($t0,$t0,$t2)',
749	 '&vpsrld	($t2,$t3,$sigma1[2]);',
750	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
751	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
752	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
753	 '&vpxor	($t2,$t2,$t3);',
754	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
755	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
756	 '&vpshufd	($t2,$t2,0b10000100)',
757	 '&vpsrldq	($t2,$t2,8)',
758	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
759	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
760	 '&vpsrld	($t2,$t3,$sigma1[2])',
761	 '&vpsrlq	($t3,$t3,$sigma1[0])',
762	 '&vpxor	($t2,$t2,$t3);',
763	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
764	 '&vpxor	($t2,$t2,$t3)',
765	 '&vpshufd	($t2,$t2,0b11101000)',
766	 '&vpslldq	($t2,$t2,8)',
767	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
768	);
769}
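# Xupdate_256_AVX() produces four new message-schedule words at a time,
# i.e. X[t] = sigma1(X[t-2]) + X[t-7] + sigma0(X[t-15]) + X[t-16] for four
# consecutive t, using shift/xor sequences where the XOP path above could
# use vprotd rotates.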
770
771sub AVX_256_00_47 () {
772my $j = shift;
773my $body = shift;
774my @X = @_;
775my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
776
777	foreach (Xupdate_256_AVX()) {		# 29 instructions
778	    eval;
779	    eval(shift(@insns));
780	    eval(shift(@insns));
781	    eval(shift(@insns));
782	}
783	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
784	  foreach (@insns) { eval; }		# remaining instructions
785	&vmovdqa	(16*$j."(%rsp)",$t2);
786}
787
788    $aesni_cbc_idx=0;
789    for ($i=0,$j=0; $j<4; $j++) {
790	&AVX_256_00_47($j,\&body_00_15,@X);
791	push(@X,shift(@X));			# rotate(@X)
792    }
793    	&mov		("%r12",$_inp);		# borrow $a4
794	&vpand		($temp,$temp,$mask14);
795	&mov		("%r15",$_out);		# borrow $a2
796	&vpor		($iv,$iv,$temp);
797	&vmovdqu	("(%r15,%r12)",$iv);	# write output
798	&lea		("%r12","16(%r12)");	# inp++
799
800	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
801	&jne	(".Lavx_00_47");
802
803	&vmovdqu	($inout,"(%r12)");
804	&mov		($_inp,"%r12");
805
806    $aesni_cbc_idx=0;
807    for ($i=0; $i<16; ) {
808	foreach(body_00_15()) { eval; }
809    }
810
811					}
812$code.=<<___;
813	mov	$_inp,%r12		# borrow $a4
814	mov	$_out,%r13		# borrow $a0
815	mov	$_ctx,%r15		# borrow $a2
816	mov	$_in0,%rsi		# borrow $a3
817
818	vpand	$mask14,$temp,$temp
819	mov	$a1,$A
820	vpor	$temp,$iv,$iv
821	vmovdqu	$iv,(%r13,%r12)		# write output
822	lea	16(%r12),%r12		# inp++
823
824	add	$SZ*0(%r15),$A
825	add	$SZ*1(%r15),$B
826	add	$SZ*2(%r15),$C
827	add	$SZ*3(%r15),$D
828	add	$SZ*4(%r15),$E
829	add	$SZ*5(%r15),$F
830	add	$SZ*6(%r15),$G
831	add	$SZ*7(%r15),$H
832
833	cmp	$_end,%r12
834
835	mov	$A,$SZ*0(%r15)
836	mov	$B,$SZ*1(%r15)
837	mov	$C,$SZ*2(%r15)
838	mov	$D,$SZ*3(%r15)
839	mov	$E,$SZ*4(%r15)
840	mov	$F,$SZ*5(%r15)
841	mov	$G,$SZ*6(%r15)
842	mov	$H,$SZ*7(%r15)
843	jb	.Lloop_avx
844
845	mov	$_ivp,$ivp
846	mov	$_rsp,%rsi
847	vmovdqu	$iv,($ivp)		# output IV
848	vzeroall
849___
850$code.=<<___ if ($win64);
851	movaps	`$framesz+16*0`(%rsp),%xmm6
852	movaps	`$framesz+16*1`(%rsp),%xmm7
853	movaps	`$framesz+16*2`(%rsp),%xmm8
854	movaps	`$framesz+16*3`(%rsp),%xmm9
855	movaps	`$framesz+16*4`(%rsp),%xmm10
856	movaps	`$framesz+16*5`(%rsp),%xmm11
857	movaps	`$framesz+16*6`(%rsp),%xmm12
858	movaps	`$framesz+16*7`(%rsp),%xmm13
859	movaps	`$framesz+16*8`(%rsp),%xmm14
860	movaps	`$framesz+16*9`(%rsp),%xmm15
861___
862$code.=<<___;
863	mov	(%rsi),%r15
864	mov	8(%rsi),%r14
865	mov	16(%rsi),%r13
866	mov	24(%rsi),%r12
867	mov	32(%rsi),%rbp
868	mov	40(%rsi),%rbx
869	lea	48(%rsi),%rsp
870.Lepilogue_avx:
871	ret
872.size	${func}_avx,.-${func}_avx
873___
874
875if ($avx>1) {{
876######################################################################
877# AVX2+BMI code path
878#
879my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
880my $PUSH8=8*2*$SZ;
881use integer;
882
883sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
885	(
886	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
887
888	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
889	'&and	($a4,$e)',		# f&e
890	'&rorx	($a0,$e,$Sigma1[2])',
891	'&rorx	($a2,$e,$Sigma1[1])',
892
893	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
894	'&lea	($h,"($h,$a4)")',
895	'&andn	($a4,$e,$g)',		# ~e&g
896	'&xor	($a0,$a2)',
897
898	'&rorx	($a1,$e,$Sigma1[0])',
899	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
900	'&xor	($a0,$a1)',		# Sigma1(e)
901	'&mov	($a2,$a)',
902
903	'&rorx	($a4,$a,$Sigma0[2])',
904	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
905	'&xor	($a2,$b)',		# a^b, b^c in next round
906	'&rorx	($a1,$a,$Sigma0[1])',
907
908	'&rorx	($a0,$a,$Sigma0[0])',
909	'&lea	($d,"($d,$h)")',	# d+=h
910	'&and	($a3,$a2)',		# (b^c)&(a^b)
911	@aesni_cbc_block[$aesni_cbc_idx++].
912	'&xor	($a1,$a4)',
913
914	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
915	'&xor	($a1,$a0)',		# Sigma0(a)
916	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
917	'&mov	($a4,$e)',		# copy of f in future
918
919	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
920	);
	# and at the finish one has to do $a+=$a1 (the deferred Sigma0(a))
922}
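# bodyx_00_15() is the BMI2 flavour of body_00_15(): rorx and andn replace
# the flag-clobbering ror/and/xor sequences, and Sigma0(a) is accumulated
# in $a1 and folded into h one round late, which is why the code after the
# loops performs 'add $a1,$A'.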
923
924$code.=<<___;
925.type	${func}_avx2,\@function,6
926.align	64
927${func}_avx2:
928.Lavx2_shortcut:
929	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
930	push	%rbx
931	push	%rbp
932	push	%r12
933	push	%r13
934	push	%r14
935	push	%r15
936	mov	%rsp,%r11		# copy %rsp
937	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
938	and	\$-256*$SZ,%rsp		# align stack frame
939	add	\$`2*$SZ*($rounds-8)`,%rsp
940
941	shl	\$6,$len
942	sub	$inp,$out		# re-bias
943	sub	$inp,$in0
944	add	$inp,$len		# end of input
945
946	#mov	$inp,$_inp		# saved later
947	#mov	$out,$_out		# kept in $offload
948	mov	$len,$_end
949	#mov	$key,$_key		# remains resident in $inp register
950	mov	$ivp,$_ivp
951	mov	$ctx,$_ctx
952	mov	$in0,$_in0
953	mov	%r11,$_rsp
954___
955$code.=<<___ if ($win64);
956	movaps	%xmm6,`$framesz+16*0`(%rsp)
957	movaps	%xmm7,`$framesz+16*1`(%rsp)
958	movaps	%xmm8,`$framesz+16*2`(%rsp)
959	movaps	%xmm9,`$framesz+16*3`(%rsp)
960	movaps	%xmm10,`$framesz+16*4`(%rsp)
961	movaps	%xmm11,`$framesz+16*5`(%rsp)
962	movaps	%xmm12,`$framesz+16*6`(%rsp)
963	movaps	%xmm13,`$framesz+16*7`(%rsp)
964	movaps	%xmm14,`$framesz+16*8`(%rsp)
965	movaps	%xmm15,`$framesz+16*9`(%rsp)
966___
967$code.=<<___;
968.Lprologue_avx2:
969	vzeroall
970
971	mov	$inp,%r13		# borrow $a0
972	vpinsrq	\$1,$out,$offload,$offload
973	lea	0x80($key),$inp		# size optimization, reassign
974	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
975	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
976	mov	$ctx,%r15		# borrow $a2
977	mov	$in0,%rsi		# borrow $a3
978	vmovdqu	($ivp),$iv		# load IV
979	lea	-9(%r14),%r14
980
981	vmovdqa	0x00(%r12,%r14,8),$mask14
982	vmovdqa	0x10(%r12,%r14,8),$mask12
983	vmovdqa	0x20(%r12,%r14,8),$mask10
984
985	sub	\$-16*$SZ,%r13		# inp++, size optimization
986	mov	$SZ*0(%r15),$A
987	lea	(%rsi,%r13),%r12	# borrow $a0
988	mov	$SZ*1(%r15),$B
989	cmp	$len,%r13		# $_end
990	mov	$SZ*2(%r15),$C
991	cmove	%rsp,%r12		# next block or random data
992	mov	$SZ*3(%r15),$D
993	mov	$SZ*4(%r15),$E
994	mov	$SZ*5(%r15),$F
995	mov	$SZ*6(%r15),$G
996	mov	$SZ*7(%r15),$H
997	vmovdqu	0x00-0x80($inp),$roundkey
998___
999					if ($SZ==4) {	# SHA256
1000    my @X = map("%ymm$_",(0..3));
1001    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1002
1003$code.=<<___;
1004	jmp	.Loop_avx2
1005.align	16
1006.Loop_avx2:
1007	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1008	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1009	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1010	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1011	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1012
1013	vinserti128	\$1,(%r12),@X[0],@X[0]
1014	vinserti128	\$1,16(%r12),@X[1],@X[1]
1015	 vpshufb	$t3,@X[0],@X[0]
1016	vinserti128	\$1,32(%r12),@X[2],@X[2]
1017	 vpshufb	$t3,@X[1],@X[1]
1018	vinserti128	\$1,48(%r12),@X[3],@X[3]
1019
1020	lea	$TABLE(%rip),$Tbl
1021	vpshufb	$t3,@X[2],@X[2]
1022	lea	-16*$SZ(%r13),%r13
1023	vpaddd	0x00($Tbl),@X[0],$t0
1024	vpshufb	$t3,@X[3],@X[3]
1025	vpaddd	0x20($Tbl),@X[1],$t1
1026	vpaddd	0x40($Tbl),@X[2],$t2
1027	vpaddd	0x60($Tbl),@X[3],$t3
1028	vmovdqa	$t0,0x00(%rsp)
1029	xor	$a1,$a1
1030	vmovdqa	$t1,0x20(%rsp)
1031	lea	-$PUSH8(%rsp),%rsp
1032	mov	$B,$a3
1033	vmovdqa	$t2,0x00(%rsp)
1034	xor	$C,$a3			# magic
1035	vmovdqa	$t3,0x20(%rsp)
1036	mov	$F,$a4
1037	sub	\$-16*2*$SZ,$Tbl	# size optimization
1038	jmp	.Lavx2_00_47
1039
1040.align	16
1041.Lavx2_00_47:
1042	vmovdqu	(%r13),$inout
1043	vpinsrq	\$0,%r13,$offload,$offload
1044___
1045
1046sub AVX2_256_00_47 () {
1047my $j = shift;
1048my $body = shift;
1049my @X = @_;
1050my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1051my $base = "+2*$PUSH8(%rsp)";
1052
1053	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1054	foreach (Xupdate_256_AVX()) {		# 29 instructions
1055	    eval;
1056	    eval(shift(@insns));
1057	    eval(shift(@insns));
1058	    eval(shift(@insns));
1059	}
1060	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1061	  foreach (@insns) { eval; }		# remaining instructions
1062	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1063}
1064    $aesni_cbc_idx=0;
1065    for ($i=0,$j=0; $j<4; $j++) {
1066	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1067	push(@X,shift(@X));			# rotate(@X)
1068    }
1069	&vmovq		("%r13",$offload);	# borrow $a0
1070	&vpextrq	("%r15",$offload,1);	# borrow $a2
1071	&vpand		($temp,$temp,$mask14);
1072	&vpor		($iv,$iv,$temp);
1073	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1074	&lea		("%r13","16(%r13)");	# inp++
1075
1076	&lea	($Tbl,16*2*$SZ."($Tbl)");
1077	&cmpb	(($SZ-1)."($Tbl)",0);
1078	&jne	(".Lavx2_00_47");
1079
1080	&vmovdqu	($inout,"(%r13)");
1081	&vpinsrq	($offload,$offload,"%r13",0);
1082
1083    $aesni_cbc_idx=0;
1084    for ($i=0; $i<16; ) {
1085	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1086	foreach(bodyx_00_15()) { eval; }
1087    }
1088					}
1089$code.=<<___;
1090	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1091	vmovq	$offload,%r13			# $_inp, borrow $a0
1092	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1093	add	$a1,$A
1094	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1095
1096	vpand	$mask14,$temp,$temp
1097	vpor	$temp,$iv,$iv
1098	vmovdqu	$iv,(%r12,%r13)			# write output
1099	lea	16(%r13),%r13
1100
1101	add	$SZ*0(%r15),$A
1102	add	$SZ*1(%r15),$B
1103	add	$SZ*2(%r15),$C
1104	add	$SZ*3(%r15),$D
1105	add	$SZ*4(%r15),$E
1106	add	$SZ*5(%r15),$F
1107	add	$SZ*6(%r15),$G
1108	add	$SZ*7(%r15),$H
1109
1110	mov	$A,$SZ*0(%r15)
1111	mov	$B,$SZ*1(%r15)
1112	mov	$C,$SZ*2(%r15)
1113	mov	$D,$SZ*3(%r15)
1114	mov	$E,$SZ*4(%r15)
1115	mov	$F,$SZ*5(%r15)
1116	mov	$G,$SZ*6(%r15)
1117	mov	$H,$SZ*7(%r15)
1118
1119	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1120	je	.Ldone_avx2
1121
1122	xor	$a1,$a1
1123	mov	$B,$a3
1124	mov	$F,$a4
1125	xor	$C,$a3			# magic
1126	jmp	.Lower_avx2
1127.align	16
1128.Lower_avx2:
1129	vmovdqu	(%r13),$inout
1130	vpinsrq	\$0,%r13,$offload,$offload
1131___
1132    $aesni_cbc_idx=0;
1133    for ($i=0; $i<16; ) {
1134	my $base="+16($Tbl)";
1135	foreach(bodyx_00_15()) { eval; }
1136	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1137    }
1138$code.=<<___;
1139	vmovq	$offload,%r13			# borrow $a0
1140	vpextrq	\$1,$offload,%r15		# borrow $a2
1141	vpand	$mask14,$temp,$temp
1142	vpor	$temp,$iv,$iv
1143	lea	-$PUSH8($Tbl),$Tbl
1144	vmovdqu	$iv,(%r15,%r13)			# write output
1145	lea	16(%r13),%r13			# inp++
1146	cmp	%rsp,$Tbl
1147	jae	.Lower_avx2
1148
1149	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1150	lea	16*$SZ(%r13),%r13
1151	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1152	add	$a1,$A
1153	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1154
1155	add	$SZ*0(%r15),$A
1156	add	$SZ*1(%r15),$B
1157	add	$SZ*2(%r15),$C
1158	add	$SZ*3(%r15),$D
1159	add	$SZ*4(%r15),$E
1160	add	$SZ*5(%r15),$F
1161	add	$SZ*6(%r15),$G
1162	lea	(%rsi,%r13),%r12
1163	add	$SZ*7(%r15),$H
1164
1165	cmp	$_end,%r13
1166
1167	mov	$A,$SZ*0(%r15)
1168	cmove	%rsp,%r12		# next block or stale data
1169	mov	$B,$SZ*1(%r15)
1170	mov	$C,$SZ*2(%r15)
1171	mov	$D,$SZ*3(%r15)
1172	mov	$E,$SZ*4(%r15)
1173	mov	$F,$SZ*5(%r15)
1174	mov	$G,$SZ*6(%r15)
1175	mov	$H,$SZ*7(%r15)
1176
1177	jbe	.Loop_avx2
1178	lea	(%rsp),$Tbl
1179
1180.Ldone_avx2:
1181	lea	($Tbl),%rsp
1182	mov	$_ivp,$ivp
1183	mov	$_rsp,%rsi
1184	vmovdqu	$iv,($ivp)		# output IV
1185	vzeroall
1186___
1187$code.=<<___ if ($win64);
1188	movaps	`$framesz+16*0`(%rsp),%xmm6
1189	movaps	`$framesz+16*1`(%rsp),%xmm7
1190	movaps	`$framesz+16*2`(%rsp),%xmm8
1191	movaps	`$framesz+16*3`(%rsp),%xmm9
1192	movaps	`$framesz+16*4`(%rsp),%xmm10
1193	movaps	`$framesz+16*5`(%rsp),%xmm11
1194	movaps	`$framesz+16*6`(%rsp),%xmm12
1195	movaps	`$framesz+16*7`(%rsp),%xmm13
1196	movaps	`$framesz+16*8`(%rsp),%xmm14
1197	movaps	`$framesz+16*9`(%rsp),%xmm15
1198___
1199$code.=<<___;
1200	mov	(%rsi),%r15
1201	mov	8(%rsi),%r14
1202	mov	16(%rsi),%r13
1203	mov	24(%rsi),%r12
1204	mov	32(%rsi),%rbp
1205	mov	40(%rsi),%rbx
1206	lea	48(%rsi),%rsp
1207.Lepilogue_avx2:
1208	ret
1209.size	${func}_avx2,.-${func}_avx2
1210___
1211}}
1212}}
1213{{
1214my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1215
1216my ($rounds,$Tbl)=("%r11d","%rbx");
1217
1218my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1219my @rndkey=("%xmm4","%xmm5");
1220my $r=0;
1221my $sn=0;
1222
1223my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1224my @MSG=map("%xmm$_",(10..13));
1225
1226my $aesenc=sub {
1227  use integer;
1228  my ($n,$k)=($r/10,$r%10);
1229    if ($k==0) {
1230      $code.=<<___;
1231	movups		`16*$n`($in0),$in		# load input
1232	xorps		$rndkey0,$in
1233___
1234      $code.=<<___ if ($n);
1235	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1236___
1237      $code.=<<___;
1238	xorps		$in,$iv
1239	movups		`32+16*$k-112`($key),$rndkey[1]
1240	aesenc		$rndkey[0],$iv
1241___
1242    } elsif ($k==9) {
1243      $sn++;
1244      $code.=<<___;
1245	cmp		\$11,$rounds
1246	jb		.Laesenclast$sn
1247	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1248	aesenc		$rndkey[0],$iv
1249	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1250	aesenc		$rndkey[1],$iv
1251	je		.Laesenclast$sn
1252	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1253	aesenc		$rndkey[0],$iv
1254	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1255	aesenc		$rndkey[1],$iv
1256.Laesenclast$sn:
1257	aesenclast	$rndkey[0],$iv
1258	movups		16-112($key),$rndkey[1]		# forward reference
1259	nop
1260___
1261    } else {
1262      $code.=<<___;
1263	movups		`32+16*$k-112`($key),$rndkey[1]
1264	aesenc		$rndkey[0],$iv
1265___
1266    }
1267    $r++;	unshift(@rndkey,pop(@rndkey));
1268};
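# $aesenc() emits one AES-CBC round per invocation: $r counts emitted
# rounds, with ten scheduling slots per 16-byte block ($r%10); the $k==0
# slot loads and xors the next input block (and stores the previous
# ciphertext block), while the $k==9 slot finishes the block, issuing two
# or four extra rounds for 192/256-bit keys before aesenclast.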
1269
1270if ($shaext) {
1271my $Tbl="%rax";
1272
1273$code.=<<___;
1274.type	${func}_shaext,\@function,6
1275.align	32
1276${func}_shaext:
1277	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1278___
1279$code.=<<___ if ($win64);
1280	lea	`-8-10*16`(%rsp),%rsp
1281	movaps	%xmm6,-8-10*16(%rax)
1282	movaps	%xmm7,-8-9*16(%rax)
1283	movaps	%xmm8,-8-8*16(%rax)
1284	movaps	%xmm9,-8-7*16(%rax)
1285	movaps	%xmm10,-8-6*16(%rax)
1286	movaps	%xmm11,-8-5*16(%rax)
1287	movaps	%xmm12,-8-4*16(%rax)
1288	movaps	%xmm13,-8-3*16(%rax)
1289	movaps	%xmm14,-8-2*16(%rax)
1290	movaps	%xmm15,-8-1*16(%rax)
1291.Lprologue_shaext:
1292___
1293$code.=<<___;
1294	lea		K256+0x80(%rip),$Tbl
1295	movdqu		($ctx),$ABEF		# DCBA
1296	movdqu		16($ctx),$CDGH		# HGFE
1297	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1298
1299	mov		240($key),$rounds
1300	sub		$in0,$out
1301	movups		($key),$rndkey0		# $key[0]
1302	movups		16($key),$rndkey[0]	# forward reference
1303	lea		112($key),$key		# size optimization
1304
1305	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1306	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1307	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1308	movdqa		$TMP,$BSWAP		# offload
1309	palignr		\$8,$CDGH,$ABEF		# ABEF
1310	punpcklqdq	$Wi,$CDGH		# CDGH
1311
1312	jmp	.Loop_shaext
1313
1314.align	16
1315.Loop_shaext:
1316	movdqu		($inp),@MSG[0]
1317	movdqu		0x10($inp),@MSG[1]
1318	movdqu		0x20($inp),@MSG[2]
1319	pshufb		$TMP,@MSG[0]
1320	movdqu		0x30($inp),@MSG[3]
1321
1322	movdqa		0*32-0x80($Tbl),$Wi
1323	paddd		@MSG[0],$Wi
1324	pshufb		$TMP,@MSG[1]
1325	movdqa		$CDGH,$CDGH_SAVE	# offload
1326	movdqa		$ABEF,$ABEF_SAVE	# offload
1327___
1328	&$aesenc();
1329$code.=<<___;
1330	sha256rnds2	$ABEF,$CDGH		# 0-3
1331	pshufd		\$0x0e,$Wi,$Wi
1332___
1333	&$aesenc();
1334$code.=<<___;
1335	sha256rnds2	$CDGH,$ABEF
1336
1337	movdqa		1*32-0x80($Tbl),$Wi
1338	paddd		@MSG[1],$Wi
1339	pshufb		$TMP,@MSG[2]
1340	lea		0x40($inp),$inp
1341___
1342	&$aesenc();
1343$code.=<<___;
1344	sha256rnds2	$ABEF,$CDGH		# 4-7
1345	pshufd		\$0x0e,$Wi,$Wi
1346___
1347	&$aesenc();
1348$code.=<<___;
1349	sha256rnds2	$CDGH,$ABEF
1350
1351	movdqa		2*32-0x80($Tbl),$Wi
1352	paddd		@MSG[2],$Wi
1353	pshufb		$TMP,@MSG[3]
1354	sha256msg1	@MSG[1],@MSG[0]
1355___
1356	&$aesenc();
1357$code.=<<___;
1358	sha256rnds2	$ABEF,$CDGH		# 8-11
1359	pshufd		\$0x0e,$Wi,$Wi
1360	movdqa		@MSG[3],$TMP
1361	palignr		\$4,@MSG[2],$TMP
1362	paddd		$TMP,@MSG[0]
1363___
1364	&$aesenc();
1365$code.=<<___;
1366	sha256rnds2	$CDGH,$ABEF
1367
1368	movdqa		3*32-0x80($Tbl),$Wi
1369	paddd		@MSG[3],$Wi
1370	sha256msg2	@MSG[3],@MSG[0]
1371	sha256msg1	@MSG[2],@MSG[1]
1372___
1373	&$aesenc();
1374$code.=<<___;
1375	sha256rnds2	$ABEF,$CDGH		# 12-15
1376	pshufd		\$0x0e,$Wi,$Wi
1377___
1378	&$aesenc();
1379$code.=<<___;
1380	movdqa		@MSG[0],$TMP
1381	palignr		\$4,@MSG[3],$TMP
1382	paddd		$TMP,@MSG[1]
1383	sha256rnds2	$CDGH,$ABEF
1384___
1385for($i=4;$i<16-3;$i++) {
1386	&$aesenc()	if (($r%10)==0);
1387$code.=<<___;
1388	movdqa		$i*32-0x80($Tbl),$Wi
1389	paddd		@MSG[0],$Wi
1390	sha256msg2	@MSG[0],@MSG[1]
1391	sha256msg1	@MSG[3],@MSG[2]
1392___
1393	&$aesenc();
1394$code.=<<___;
1395	sha256rnds2	$ABEF,$CDGH		# 16-19...
1396	pshufd		\$0x0e,$Wi,$Wi
1397	movdqa		@MSG[1],$TMP
1398	palignr		\$4,@MSG[0],$TMP
1399	paddd		$TMP,@MSG[2]
1400___
1401	&$aesenc();
1402	&$aesenc()	if ($r==19);
1403$code.=<<___;
1404	sha256rnds2	$CDGH,$ABEF
1405___
1406	push(@MSG,shift(@MSG));
1407}
1408$code.=<<___;
1409	movdqa		13*32-0x80($Tbl),$Wi
1410	paddd		@MSG[0],$Wi
1411	sha256msg2	@MSG[0],@MSG[1]
1412	sha256msg1	@MSG[3],@MSG[2]
1413___
1414	&$aesenc();
1415$code.=<<___;
1416	sha256rnds2	$ABEF,$CDGH		# 52-55
1417	pshufd		\$0x0e,$Wi,$Wi
1418	movdqa		@MSG[1],$TMP
1419	palignr		\$4,@MSG[0],$TMP
1420	paddd		$TMP,@MSG[2]
1421___
1422	&$aesenc();
1423	&$aesenc();
1424$code.=<<___;
1425	sha256rnds2	$CDGH,$ABEF
1426
1427	movdqa		14*32-0x80($Tbl),$Wi
1428	paddd		@MSG[1],$Wi
1429	sha256msg2	@MSG[1],@MSG[2]
1430	movdqa		$BSWAP,$TMP
1431___
1432	&$aesenc();
1433$code.=<<___;
1434	sha256rnds2	$ABEF,$CDGH		# 56-59
1435	pshufd		\$0x0e,$Wi,$Wi
1436___
1437	&$aesenc();
1438$code.=<<___;
1439	sha256rnds2	$CDGH,$ABEF
1440
1441	movdqa		15*32-0x80($Tbl),$Wi
1442	paddd		@MSG[2],$Wi
1443___
1444	&$aesenc();
1445	&$aesenc();
1446$code.=<<___;
1447	sha256rnds2	$ABEF,$CDGH		# 60-63
1448	pshufd		\$0x0e,$Wi,$Wi
1449___
1450	&$aesenc();
1451$code.=<<___;
1452	sha256rnds2	$CDGH,$ABEF
1453	#pxor		$CDGH,$rndkey0		# black magic
1454___
1455	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1456$code.=<<___;
1457	#xorps		$CDGH,$rndkey0		# black magic
1458	paddd		$CDGH_SAVE,$CDGH
1459	paddd		$ABEF_SAVE,$ABEF
1460
1461	dec		$len
1462	movups		$iv,48($out,$in0)	# write output
1463	lea		64($in0),$in0
1464	jnz		.Loop_shaext
1465
1466	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1467	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1468	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1469	punpckhqdq	$CDGH,$ABEF		# DCBA
1470	palignr		\$8,$TMP,$CDGH		# HGFE
1471
1472	movups		$iv,($ivp)		# write IV
1473	movdqu		$ABEF,($ctx)
1474	movdqu		$CDGH,16($ctx)
1475___
1476$code.=<<___ if ($win64);
1477	movaps	0*16(%rsp),%xmm6
1478	movaps	1*16(%rsp),%xmm7
1479	movaps	2*16(%rsp),%xmm8
1480	movaps	3*16(%rsp),%xmm9
1481	movaps	4*16(%rsp),%xmm10
1482	movaps	5*16(%rsp),%xmm11
1483	movaps	6*16(%rsp),%xmm12
1484	movaps	7*16(%rsp),%xmm13
1485	movaps	8*16(%rsp),%xmm14
1486	movaps	9*16(%rsp),%xmm15
1487	lea	8+10*16(%rsp),%rsp
1488.Lepilogue_shaext:
1489___
1490$code.=<<___;
1491	ret
1492.size	${func}_shaext,.-${func}_shaext
1493___
1494}
1495}}}}}
1496
1497# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1498#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1499if ($win64 && $avx) {
1500$rec="%rcx";
1501$frame="%rdx";
1502$context="%r8";
1503$disp="%r9";
1504
1505$code.=<<___;
1506.extern	__imp_RtlVirtualUnwind
1507.type	se_handler,\@abi-omnipotent
1508.align	16
1509se_handler:
1510	push	%rsi
1511	push	%rdi
1512	push	%rbx
1513	push	%rbp
1514	push	%r12
1515	push	%r13
1516	push	%r14
1517	push	%r15
1518	pushfq
1519	sub	\$64,%rsp
1520
1521	mov	120($context),%rax	# pull context->Rax
1522	mov	248($context),%rbx	# pull context->Rip
1523
1524	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
1526
1527	mov	0(%r11),%r10d		# HandlerData[0]
1528	lea	(%rsi,%r10),%r10	# prologue label
1529	cmp	%r10,%rbx		# context->Rip<prologue label
1530	jb	.Lin_prologue
1531
1532	mov	152($context),%rax	# pull context->Rsp
1533
1534	mov	4(%r11),%r10d		# HandlerData[1]
1535	lea	(%rsi,%r10),%r10	# epilogue label
1536	cmp	%r10,%rbx		# context->Rip>=epilogue label
1537	jae	.Lin_prologue
1538___
1539$code.=<<___ if ($shaext);
1540	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1541	cmp	%r10,%rbx
1542	jb	.Lnot_in_shaext
1543
1544	lea	(%rax),%rsi
1545	lea	512($context),%rdi	# &context.Xmm6
1546	mov	\$20,%ecx
1547	.long	0xa548f3fc		# cld; rep movsq
1548	lea	168(%rax),%rax		# adjust stack pointer
1549	jmp	.Lin_prologue
1550.Lnot_in_shaext:
1551___
1552$code.=<<___ if ($avx>1);
1553	lea	.Lavx2_shortcut(%rip),%r10
1554	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1555	jb	.Lnot_in_avx2
1556
1557	and	\$-256*$SZ,%rax
1558	add	\$`2*$SZ*($rounds-8)`,%rax
1559.Lnot_in_avx2:
1560___
1561$code.=<<___;
1562	mov	%rax,%rsi		# put aside Rsp
1563	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1564	lea	48(%rax),%rax
1565
1566	mov	-8(%rax),%rbx
1567	mov	-16(%rax),%rbp
1568	mov	-24(%rax),%r12
1569	mov	-32(%rax),%r13
1570	mov	-40(%rax),%r14
1571	mov	-48(%rax),%r15
1572	mov	%rbx,144($context)	# restore context->Rbx
1573	mov	%rbp,160($context)	# restore context->Rbp
1574	mov	%r12,216($context)	# restore context->R12
1575	mov	%r13,224($context)	# restore context->R13
1576	mov	%r14,232($context)	# restore context->R14
1577	mov	%r15,240($context)	# restore context->R15
1578
	lea	16*$SZ+8*8(%rsi),%rsi	# %xmm6-%xmm15 save area
1580	lea	512($context),%rdi	# &context.Xmm6
1581	mov	\$20,%ecx
1582	.long	0xa548f3fc		# cld; rep movsq
1583
1584.Lin_prologue:
1585	mov	8(%rax),%rdi
1586	mov	16(%rax),%rsi
1587	mov	%rax,152($context)	# restore context->Rsp
1588	mov	%rsi,168($context)	# restore context->Rsi
1589	mov	%rdi,176($context)	# restore context->Rdi
1590
1591	mov	40($disp),%rdi		# disp->ContextRecord
1592	mov	$context,%rsi		# context
1593	mov	\$154,%ecx		# sizeof(CONTEXT)
1594	.long	0xa548f3fc		# cld; rep movsq
1595
1596	mov	$disp,%rsi
1597	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1598	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1599	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1600	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1601	mov	40(%rsi),%r10		# disp->ContextRecord
1602	lea	56(%rsi),%r11		# &disp->HandlerData
1603	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1604	mov	%r10,32(%rsp)		# arg5
1605	mov	%r11,40(%rsp)		# arg6
1606	mov	%r12,48(%rsp)		# arg7
1607	mov	%rcx,56(%rsp)		# arg8, (NULL)
1608	call	*__imp_RtlVirtualUnwind(%rip)
1609
1610	mov	\$1,%eax		# ExceptionContinueSearch
1611	add	\$64,%rsp
1612	popfq
1613	pop	%r15
1614	pop	%r14
1615	pop	%r13
1616	pop	%r12
1617	pop	%rbp
1618	pop	%rbx
1619	pop	%rdi
1620	pop	%rsi
1621	ret
1622.size	se_handler,.-se_handler
1623
1624.section	.pdata
1625	.rva	.LSEH_begin_${func}_xop
1626	.rva	.LSEH_end_${func}_xop
1627	.rva	.LSEH_info_${func}_xop
1628
1629	.rva	.LSEH_begin_${func}_avx
1630	.rva	.LSEH_end_${func}_avx
1631	.rva	.LSEH_info_${func}_avx
1632___
1633$code.=<<___ if ($avx>1);
1634	.rva	.LSEH_begin_${func}_avx2
1635	.rva	.LSEH_end_${func}_avx2
1636	.rva	.LSEH_info_${func}_avx2
1637___
1638$code.=<<___ if ($shaext);
1639	.rva	.LSEH_begin_${func}_shaext
1640	.rva	.LSEH_end_${func}_shaext
1641	.rva	.LSEH_info_${func}_shaext
1642___
1643$code.=<<___;
1644.section	.xdata
1645.align	8
1646.LSEH_info_${func}_xop:
1647	.byte	9,0,0,0
1648	.rva	se_handler
1649	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1650
1651.LSEH_info_${func}_avx:
1652	.byte	9,0,0,0
1653	.rva	se_handler
1654	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1655___
1656$code.=<<___ if ($avx>1);
1657.LSEH_info_${func}_avx2:
1658	.byte	9,0,0,0
1659	.rva	se_handler
1660	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1661___
1662$code.=<<___ if ($shaext);
1663.LSEH_info_${func}_shaext:
1664	.byte	9,0,0,0
1665	.rva	se_handler
1666	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1667___
1668}
1669
1670####################################################################
1671sub rex {
1672  local *opcode=shift;
1673  my ($dst,$src)=@_;
1674  my $rex=0;
1675
1676    $rex|=0x04			if($dst>=8);
1677    $rex|=0x01			if($src>=8);
1678    unshift @opcode,$rex|0x40	if($rex);
1679}
1680
1681{
1682  my %opcodelet = (
1683		"sha256rnds2" => 0xcb,
1684  		"sha256msg1"  => 0xcc,
1685		"sha256msg2"  => 0xcd	);
1686
1687  sub sha256op38 {
1688    my $instr = shift;
1689
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1691      my @opcode=(0x0f,0x38);
1692	rex(\@opcode,$2,$1);
1693	push @opcode,$opcodelet{$instr};
1694	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1695	return ".byte\t".join(',',@opcode);
1696    } else {
	return $instr."\t".$_[0];
1698    }
1699  }
1700}
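# For example, "sha256rnds2 %xmm0,%xmm2" is rewritten by sha256op38() as
# ".byte 0x0f,0x38,0xcb,0xd0", so the module assembles even with
# toolchains that predate the SHA extension mnemonics.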
1701
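# Two final passes over the generated text: the first folds `...`
# arithmetic (frame offsets, key-schedule offsets) into literal constants,
# the second routes sha256* mnemonics through sha256op38() above.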
1702$code =~ s/\`([^\`]*)\`/eval $1/gem;
1703$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1704print $code;
close STDOUT or die "error closing STDOUT: $!";
1706