1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
19# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20# in http://download.intel.com/design/intarch/papers/323686.pdf, is
21# that since AESNI-CBC encrypt exhibit *very* low instruction-level
22# parallelism, interleaving it with another algorithm would allow to
23# utilize processor resources better and achieve better performance.
24# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25# AESNI code is weaved into it. As SHA256 dominates execution time,
26# stitch performance does not depend on AES key length. Below are
27# performance numbers in cycles per processed byte, less is better,
28# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29# subroutine:
30#
31#		 AES-128/-192/-256+SHA256   this(**)	gain
32# Sandy Bridge	    5.05/6.05/7.05+11.6	    13.0	+28%/36%/43%
33# Ivy Bridge	    5.05/6.05/7.05+10.3	    11.6	+32%/41%/50%
34# Haswell	    4.43/5.29/6.19+7.80	    8.79	+39%/49%/59%
35# Skylake	    2.62/3.14/3.62+7.70	    8.10	+27%/34%/40%
36# Bulldozer	    5.77/6.89/8.00+13.7	    13.7	+42%/50%/58%
37# Ryzen(***)	    2.71/-/3.71+2.05	    2.74/-/3.73	+74%/-/54%
38# Goldmont(***)	    3.82/-/5.35+4.16	    4.73/-/5.94	+69%/-/60%
39#
40# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
41#	Westmere is omitted from loop, this is because gain was not
42#	estimated high enough to justify the effort;
43# (**)	these are EVP-free results, results obtained with 'speed
44#	-evp aes-256-cbc-hmac-sha256' will vary by percent or two;
45# (***)	these are SHAEXT results;
46
47$flavour = shift;
48$output  = shift;
49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56die "can't locate x86_64-xlate.pl";
57
58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60	$avx = ($1>=2.19) + ($1>=2.22);
61}
62
63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65	$avx = ($1>=2.09) + ($1>=2.10);
66}
67
68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70	$avx = ($1>=10) + ($1>=12);
71}
72
73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
74	$avx = ($2>=3.0) + ($2>3.0);
75}
76
77$shaext=$avx;	### set to zero if compiling for 1.0.1
78$avx=1		if (!$shaext && $avx);
79
80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
81*STDOUT=*OUT;
82
83$func="aesni_cbc_sha256_enc";
84$TABLE="K256";
85$SZ=4;
86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
87				"%r8d","%r9d","%r10d","%r11d");
88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
89@Sigma0=( 2,13,22);
90@Sigma1=( 6,11,25);
91@sigma0=( 7,18, 3);
92@sigma1=(17,19,10);
93$rounds=64;
94
95########################################################################
96# void aesni_cbc_sha256_enc(const void *inp,
97#			void *out,
98#			size_t length,
99#			const AES_KEY *key,
100#			unsigned char *iv,
101#			SHA256_CTX *ctx,
102#			const void *in0);
103($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
105
106$Tbl="%rbp";
107
108$_inp="16*$SZ+0*8(%rsp)";
109$_out="16*$SZ+1*8(%rsp)";
110$_end="16*$SZ+2*8(%rsp)";
111$_key="16*$SZ+3*8(%rsp)";
112$_ivp="16*$SZ+4*8(%rsp)";
113$_ctx="16*$SZ+5*8(%rsp)";
114$_in0="16*$SZ+6*8(%rsp)";
115$_rsp="`16*$SZ+7*8`(%rsp)";
116$framesz=16*$SZ+8*8;
117
118$code=<<___;
119.text
120
121.extern	OPENSSL_ia32cap_P
122.globl	$func
123.type	$func,\@abi-omnipotent
124.align	16
125$func:
126.cfi_startproc
127___
128						if ($avx) {
129$code.=<<___;
130	lea	OPENSSL_ia32cap_P(%rip),%r11
131	mov	\$1,%eax
132	cmp	\$0,`$win64?"%rcx":"%rdi"`
133	je	.Lprobe
134	mov	0(%r11),%eax
135	mov	4(%r11),%r10
136___
137$code.=<<___ if ($shaext);
138	bt	\$61,%r10			# check for SHA
139	jc	${func}_shaext
140___
141$code.=<<___;
142	mov	%r10,%r11
143	shr	\$32,%r11
144
145	test	\$`1<<11`,%r10d			# check for XOP
146	jnz	${func}_xop
147___
148$code.=<<___ if ($avx>1);
149	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
150	cmp	\$`1<<8|1<<5|1<<3`,%r11d
151	je	${func}_avx2
152___
153$code.=<<___;
154	and	\$`1<<28`,%r10d			# check for AVX
155	jnz	${func}_avx
156	ud2
157___
158						}
159$code.=<<___;
160	xor	%eax,%eax
161	cmp	\$0,`$win64?"%rcx":"%rdi"`
162	je	.Lprobe
163	ud2
164.Lprobe:
165	ret
166.cfi_endproc
167.size	$func,.-$func
168
169.align	64
170.type	$TABLE,\@object
171$TABLE:
172	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
174	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
175	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
176	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
177	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
178	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
179	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
180	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
181	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
182	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
183	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
184	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
185	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
186	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
187	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
188	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
189	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
190	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
191	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
192	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
193	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
194	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
195	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
196	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
197	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
198	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
199	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
200	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
201	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
202	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
203	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
204
205	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
206	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
207	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
208	.long	0,0,0,0,   0,0,0,0
209	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
210.align	64
211___
212
213######################################################################
214# SIMD code paths
215#
216{{{
217($iv,$inout,$roundkey,$temp,
218 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
219
220$aesni_cbc_idx=0;
221@aesni_cbc_block = (
222##	&vmovdqu	($roundkey,"0x00-0x80($inp)");'
223##	&vmovdqu	($inout,($inp));
224##	&mov		($_inp,$inp);
225
226	'&vpxor		($inout,$inout,$roundkey);'.
227	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
228
229	'&vpxor		($inout,$inout,$iv);',
230
231	'&vaesenc	($inout,$inout,$roundkey);'.
232	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
233
234	'&vaesenc	($inout,$inout,$roundkey);'.
235	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
236
237	'&vaesenc	($inout,$inout,$roundkey);'.
238	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
239
240	'&vaesenc	($inout,$inout,$roundkey);'.
241	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
242
243	'&vaesenc	($inout,$inout,$roundkey);'.
244	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
245
246	'&vaesenc	($inout,$inout,$roundkey);'.
247	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
248
249	'&vaesenc	($inout,$inout,$roundkey);'.
250	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
251
252	'&vaesenc	($inout,$inout,$roundkey);'.
253	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
254
255	'&vaesenc	($inout,$inout,$roundkey);'.
256	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
257
258	'&vaesenclast	($temp,$inout,$roundkey);'.
259	' &vaesenc	($inout,$inout,$roundkey);'.
260	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
261
262	'&vpand		($iv,$temp,$mask10);'.
263	' &vaesenc	($inout,$inout,$roundkey);'.
264	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
265
266	'&vaesenclast	($temp,$inout,$roundkey);'.
267	' &vaesenc	($inout,$inout,$roundkey);'.
268	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
269
270	'&vpand		($temp,$temp,$mask12);'.
271	' &vaesenc	($inout,$inout,$roundkey);'.
272	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
273
274	'&vpor		($iv,$iv,$temp);'.
275	' &vaesenclast	($temp,$inout,$roundkey);'.
276	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
277
278##	&mov		($inp,$_inp);
279##	&mov		($out,$_out);
280##	&vpand		($temp,$temp,$mask14);
281##	&vpor		($iv,$iv,$temp);
282##	&vmovdqu	($iv,($out,$inp);
283##	&lea		(inp,16($inp));
284);
285
286my $a4=$T1;
287my ($a,$b,$c,$d,$e,$f,$g,$h);
288
289sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
290{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
291  my $arg = pop;
292    $arg = "\$$arg" if ($arg*1 eq $arg);
293    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
294}
295
296sub body_00_15 () {
297	(
298	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
299
300	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
301	'&mov	($a,$a1)',
302	'&mov	($a4,$f)',
303
304	'&xor	($a0,$e)',
305	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
306	'&xor	($a4,$g)',			# f^g
307
308	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
309	'&xor	($a1,$a)',
310	'&and	($a4,$e)',			# (f^g)&e
311
312	@aesni_cbc_block[$aesni_cbc_idx++].
313	'&xor	($a0,$e)',
314	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
315	'&mov	($a2,$a)',
316
317	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
318	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
319	'&xor	($a2,$b)',			# a^b, b^c in next round
320
321	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
322	'&add	($h,$a4)',			# h+=Ch(e,f,g)
323	'&and	($a3,$a2)',			# (b^c)&(a^b)
324
325	'&xor	($a1,$a)',
326	'&add	($h,$a0)',			# h+=Sigma1(e)
327	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
328
329	'&add	($d,$h)',			# d+=h
330	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
331	'&add	($h,$a3)',			# h+=Maj(a,b,c)
332
333	'&mov	($a0,$d)',
334	'&add	($a1,$h);'.			# h+=Sigma0(a)
335	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
336	);
337}
338
339if ($avx) {{
340######################################################################
341# XOP code path
342#
343$code.=<<___;
344.type	${func}_xop,\@function,6
345.align	64
346${func}_xop:
347.cfi_startproc
348.Lxop_shortcut:
349	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
350	mov	%rsp,%rax		# copy %rsp
351.cfi_def_cfa_register	%rax
352	push	%rbx
353.cfi_push	%rbx
354	push	%rbp
355.cfi_push	%rbp
356	push	%r12
357.cfi_push	%r12
358	push	%r13
359.cfi_push	%r13
360	push	%r14
361.cfi_push	%r14
362	push	%r15
363.cfi_push	%r15
364	sub	\$`$framesz+$win64*16*10`,%rsp
365	and	\$-64,%rsp		# align stack frame
366
367	shl	\$6,$len
368	sub	$inp,$out		# re-bias
369	sub	$inp,$in0
370	add	$inp,$len		# end of input
371
372	#mov	$inp,$_inp		# saved later
373	mov	$out,$_out
374	mov	$len,$_end
375	#mov	$key,$_key		# remains resident in $inp register
376	mov	$ivp,$_ivp
377	mov	$ctx,$_ctx
378	mov	$in0,$_in0
379	mov	%rax,$_rsp
380.cfi_cfa_expression	$_rsp,deref,+8
381___
382$code.=<<___ if ($win64);
383	movaps	%xmm6,`$framesz+16*0`(%rsp)
384	movaps	%xmm7,`$framesz+16*1`(%rsp)
385	movaps	%xmm8,`$framesz+16*2`(%rsp)
386	movaps	%xmm9,`$framesz+16*3`(%rsp)
387	movaps	%xmm10,`$framesz+16*4`(%rsp)
388	movaps	%xmm11,`$framesz+16*5`(%rsp)
389	movaps	%xmm12,`$framesz+16*6`(%rsp)
390	movaps	%xmm13,`$framesz+16*7`(%rsp)
391	movaps	%xmm14,`$framesz+16*8`(%rsp)
392	movaps	%xmm15,`$framesz+16*9`(%rsp)
393___
394$code.=<<___;
395.Lprologue_xop:
396	vzeroall
397
398	mov	$inp,%r12		# borrow $a4
399	lea	0x80($key),$inp		# size optimization, reassign
400	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
401	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
402	mov	$ctx,%r15		# borrow $a2
403	mov	$in0,%rsi		# borrow $a3
404	vmovdqu	($ivp),$iv		# load IV
405	sub	\$9,%r14
406
407	mov	$SZ*0(%r15),$A
408	mov	$SZ*1(%r15),$B
409	mov	$SZ*2(%r15),$C
410	mov	$SZ*3(%r15),$D
411	mov	$SZ*4(%r15),$E
412	mov	$SZ*5(%r15),$F
413	mov	$SZ*6(%r15),$G
414	mov	$SZ*7(%r15),$H
415
416	vmovdqa	0x00(%r13,%r14,8),$mask14
417	vmovdqa	0x10(%r13,%r14,8),$mask12
418	vmovdqa	0x20(%r13,%r14,8),$mask10
419	vmovdqu	0x00-0x80($inp),$roundkey
420	jmp	.Lloop_xop
421___
422					if ($SZ==4) {	# SHA256
423    my @X = map("%xmm$_",(0..3));
424    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
425
426$code.=<<___;
427.align	16
428.Lloop_xop:
429	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
430	vmovdqu	0x00(%rsi,%r12),@X[0]
431	vmovdqu	0x10(%rsi,%r12),@X[1]
432	vmovdqu	0x20(%rsi,%r12),@X[2]
433	vmovdqu	0x30(%rsi,%r12),@X[3]
434	vpshufb	$t3,@X[0],@X[0]
435	lea	$TABLE(%rip),$Tbl
436	vpshufb	$t3,@X[1],@X[1]
437	vpshufb	$t3,@X[2],@X[2]
438	vpaddd	0x00($Tbl),@X[0],$t0
439	vpshufb	$t3,@X[3],@X[3]
440	vpaddd	0x20($Tbl),@X[1],$t1
441	vpaddd	0x40($Tbl),@X[2],$t2
442	vpaddd	0x60($Tbl),@X[3],$t3
443	vmovdqa	$t0,0x00(%rsp)
444	mov	$A,$a1
445	vmovdqa	$t1,0x10(%rsp)
446	mov	$B,$a3
447	vmovdqa	$t2,0x20(%rsp)
448	xor	$C,$a3			# magic
449	vmovdqa	$t3,0x30(%rsp)
450	mov	$E,$a0
451	jmp	.Lxop_00_47
452
453.align	16
454.Lxop_00_47:
455	sub	\$-16*2*$SZ,$Tbl	# size optimization
456	vmovdqu	(%r12),$inout		# $a4
457	mov	%r12,$_inp		# $a4
458___
459sub XOP_256_00_47 () {
460my $j = shift;
461my $body = shift;
462my @X = @_;
463my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
464
465	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
466	  eval(shift(@insns));
467	  eval(shift(@insns));
468	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
469	  eval(shift(@insns));
470	  eval(shift(@insns));
471	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
472	  eval(shift(@insns));
473	  eval(shift(@insns));
474	&vpsrld		($t0,$t0,$sigma0[2]);
475	  eval(shift(@insns));
476	  eval(shift(@insns));
477	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
478	  eval(shift(@insns));
479	  eval(shift(@insns));
480	  eval(shift(@insns));
481	  eval(shift(@insns));
482	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
483	  eval(shift(@insns));
484	  eval(shift(@insns));
485	&vpxor		($t0,$t0,$t1);
486	  eval(shift(@insns));
487	  eval(shift(@insns));
488	  eval(shift(@insns));
489	  eval(shift(@insns));
490	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
491	  eval(shift(@insns));
492	  eval(shift(@insns));
493	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
494	  eval(shift(@insns));
495	  eval(shift(@insns));
496	 &vpsrld	($t2,@X[3],$sigma1[2]);
497	  eval(shift(@insns));
498	  eval(shift(@insns));
499	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
500	  eval(shift(@insns));
501	  eval(shift(@insns));
502	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
503	  eval(shift(@insns));
504	  eval(shift(@insns));
505	 &vpxor		($t3,$t3,$t2);
506	  eval(shift(@insns));
507	  eval(shift(@insns));
508	  eval(shift(@insns));
509	  eval(shift(@insns));
510	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
511	  eval(shift(@insns));
512	  eval(shift(@insns));
513	  eval(shift(@insns));
514	  eval(shift(@insns));
515	&vpsrldq	($t3,$t3,8);
516	  eval(shift(@insns));
517	  eval(shift(@insns));
518	  eval(shift(@insns));
519	  eval(shift(@insns));
520	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
521	  eval(shift(@insns));
522	  eval(shift(@insns));
523	  eval(shift(@insns));
524	  eval(shift(@insns));
525	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
526	  eval(shift(@insns));
527	  eval(shift(@insns));
528	 &vpsrld	($t2,@X[0],$sigma1[2]);
529	  eval(shift(@insns));
530	  eval(shift(@insns));
531	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
532	  eval(shift(@insns));
533	  eval(shift(@insns));
534	 &vpxor		($t3,$t3,$t2);
535	  eval(shift(@insns));
536	  eval(shift(@insns));
537	  eval(shift(@insns));
538	  eval(shift(@insns));
539	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
540	  eval(shift(@insns));
541	  eval(shift(@insns));
542	  eval(shift(@insns));
543	  eval(shift(@insns));
544	&vpslldq	($t3,$t3,8);		# 22 instructions
545	  eval(shift(@insns));
546	  eval(shift(@insns));
547	  eval(shift(@insns));
548	  eval(shift(@insns));
549	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
550	  eval(shift(@insns));
551	  eval(shift(@insns));
552	  eval(shift(@insns));
553	  eval(shift(@insns));
554	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
555	  foreach (@insns) { eval; }		# remaining instructions
556	&vmovdqa	(16*$j."(%rsp)",$t2);
557}
558
559    $aesni_cbc_idx=0;
560    for ($i=0,$j=0; $j<4; $j++) {
561	&XOP_256_00_47($j,\&body_00_15,@X);
562	push(@X,shift(@X));			# rotate(@X)
563    }
564    	&mov		("%r12",$_inp);		# borrow $a4
565	&vpand		($temp,$temp,$mask14);
566	&mov		("%r15",$_out);		# borrow $a2
567	&vpor		($iv,$iv,$temp);
568	&vmovdqu	("(%r15,%r12)",$iv);	# write output
569	&lea		("%r12","16(%r12)");	# inp++
570
571	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
572	&jne	(".Lxop_00_47");
573
574	&vmovdqu	($inout,"(%r12)");
575	&mov		($_inp,"%r12");
576
577    $aesni_cbc_idx=0;
578    for ($i=0; $i<16; ) {
579	foreach(body_00_15()) { eval; }
580    }
581					}
582$code.=<<___;
583	mov	$_inp,%r12		# borrow $a4
584	mov	$_out,%r13		# borrow $a0
585	mov	$_ctx,%r15		# borrow $a2
586	mov	$_in0,%rsi		# borrow $a3
587
588	vpand	$mask14,$temp,$temp
589	mov	$a1,$A
590	vpor	$temp,$iv,$iv
591	vmovdqu	$iv,(%r13,%r12)		# write output
592	lea	16(%r12),%r12		# inp++
593
594	add	$SZ*0(%r15),$A
595	add	$SZ*1(%r15),$B
596	add	$SZ*2(%r15),$C
597	add	$SZ*3(%r15),$D
598	add	$SZ*4(%r15),$E
599	add	$SZ*5(%r15),$F
600	add	$SZ*6(%r15),$G
601	add	$SZ*7(%r15),$H
602
603	cmp	$_end,%r12
604
605	mov	$A,$SZ*0(%r15)
606	mov	$B,$SZ*1(%r15)
607	mov	$C,$SZ*2(%r15)
608	mov	$D,$SZ*3(%r15)
609	mov	$E,$SZ*4(%r15)
610	mov	$F,$SZ*5(%r15)
611	mov	$G,$SZ*6(%r15)
612	mov	$H,$SZ*7(%r15)
613
614	jb	.Lloop_xop
615
616	mov	$_ivp,$ivp
617	mov	$_rsp,%rsi
618.cfi_def_cfa	%rsi,8
619	vmovdqu	$iv,($ivp)		# output IV
620	vzeroall
621___
622$code.=<<___ if ($win64);
623	movaps	`$framesz+16*0`(%rsp),%xmm6
624	movaps	`$framesz+16*1`(%rsp),%xmm7
625	movaps	`$framesz+16*2`(%rsp),%xmm8
626	movaps	`$framesz+16*3`(%rsp),%xmm9
627	movaps	`$framesz+16*4`(%rsp),%xmm10
628	movaps	`$framesz+16*5`(%rsp),%xmm11
629	movaps	`$framesz+16*6`(%rsp),%xmm12
630	movaps	`$framesz+16*7`(%rsp),%xmm13
631	movaps	`$framesz+16*8`(%rsp),%xmm14
632	movaps	`$framesz+16*9`(%rsp),%xmm15
633___
634$code.=<<___;
635	mov	-48(%rsi),%r15
636.cfi_restore	%r15
637	mov	-40(%rsi),%r14
638.cfi_restore	%r14
639	mov	-32(%rsi),%r13
640.cfi_restore	%r13
641	mov	-24(%rsi),%r12
642.cfi_restore	%r12
643	mov	-16(%rsi),%rbp
644.cfi_restore	%rbp
645	mov	-8(%rsi),%rbx
646.cfi_restore	%rbx
647	lea	(%rsi),%rsp
648.cfi_def_cfa_register	%rsp
649.Lepilogue_xop:
650	ret
651.cfi_endproc
652.size	${func}_xop,.-${func}_xop
653___
654######################################################################
655# AVX+shrd code path
656#
657local *ror = sub { &shrd(@_[0],@_) };
658
659$code.=<<___;
660.type	${func}_avx,\@function,6
661.align	64
662${func}_avx:
663.cfi_startproc
664.Lavx_shortcut:
665	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
666	mov	%rsp,%rax		# copy %rsp
667.cfi_def_cfa_register	%rax
668	push	%rbx
669.cfi_push	%rbx
670	push	%rbp
671.cfi_push	%rbp
672	push	%r12
673.cfi_push	%r12
674	push	%r13
675.cfi_push	%r13
676	push	%r14
677.cfi_push	%r14
678	push	%r15
679.cfi_push	%r15
680	sub	\$`$framesz+$win64*16*10`,%rsp
681	and	\$-64,%rsp		# align stack frame
682
683	shl	\$6,$len
684	sub	$inp,$out		# re-bias
685	sub	$inp,$in0
686	add	$inp,$len		# end of input
687
688	#mov	$inp,$_inp		# saved later
689	mov	$out,$_out
690	mov	$len,$_end
691	#mov	$key,$_key		# remains resident in $inp register
692	mov	$ivp,$_ivp
693	mov	$ctx,$_ctx
694	mov	$in0,$_in0
695	mov	%rax,$_rsp
696.cfi_cfa_expression	$_rsp,deref,+8
697___
698$code.=<<___ if ($win64);
699	movaps	%xmm6,`$framesz+16*0`(%rsp)
700	movaps	%xmm7,`$framesz+16*1`(%rsp)
701	movaps	%xmm8,`$framesz+16*2`(%rsp)
702	movaps	%xmm9,`$framesz+16*3`(%rsp)
703	movaps	%xmm10,`$framesz+16*4`(%rsp)
704	movaps	%xmm11,`$framesz+16*5`(%rsp)
705	movaps	%xmm12,`$framesz+16*6`(%rsp)
706	movaps	%xmm13,`$framesz+16*7`(%rsp)
707	movaps	%xmm14,`$framesz+16*8`(%rsp)
708	movaps	%xmm15,`$framesz+16*9`(%rsp)
709___
710$code.=<<___;
711.Lprologue_avx:
712	vzeroall
713
714	mov	$inp,%r12		# borrow $a4
715	lea	0x80($key),$inp		# size optimization, reassign
716	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
717	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
718	mov	$ctx,%r15		# borrow $a2
719	mov	$in0,%rsi		# borrow $a3
720	vmovdqu	($ivp),$iv		# load IV
721	sub	\$9,%r14
722
723	mov	$SZ*0(%r15),$A
724	mov	$SZ*1(%r15),$B
725	mov	$SZ*2(%r15),$C
726	mov	$SZ*3(%r15),$D
727	mov	$SZ*4(%r15),$E
728	mov	$SZ*5(%r15),$F
729	mov	$SZ*6(%r15),$G
730	mov	$SZ*7(%r15),$H
731
732	vmovdqa	0x00(%r13,%r14,8),$mask14
733	vmovdqa	0x10(%r13,%r14,8),$mask12
734	vmovdqa	0x20(%r13,%r14,8),$mask10
735	vmovdqu	0x00-0x80($inp),$roundkey
736___
737					if ($SZ==4) {	# SHA256
738    my @X = map("%xmm$_",(0..3));
739    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
740
741$code.=<<___;
742	jmp	.Lloop_avx
743.align	16
744.Lloop_avx:
745	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
746	vmovdqu	0x00(%rsi,%r12),@X[0]
747	vmovdqu	0x10(%rsi,%r12),@X[1]
748	vmovdqu	0x20(%rsi,%r12),@X[2]
749	vmovdqu	0x30(%rsi,%r12),@X[3]
750	vpshufb	$t3,@X[0],@X[0]
751	lea	$TABLE(%rip),$Tbl
752	vpshufb	$t3,@X[1],@X[1]
753	vpshufb	$t3,@X[2],@X[2]
754	vpaddd	0x00($Tbl),@X[0],$t0
755	vpshufb	$t3,@X[3],@X[3]
756	vpaddd	0x20($Tbl),@X[1],$t1
757	vpaddd	0x40($Tbl),@X[2],$t2
758	vpaddd	0x60($Tbl),@X[3],$t3
759	vmovdqa	$t0,0x00(%rsp)
760	mov	$A,$a1
761	vmovdqa	$t1,0x10(%rsp)
762	mov	$B,$a3
763	vmovdqa	$t2,0x20(%rsp)
764	xor	$C,$a3			# magic
765	vmovdqa	$t3,0x30(%rsp)
766	mov	$E,$a0
767	jmp	.Lavx_00_47
768
769.align	16
770.Lavx_00_47:
771	sub	\$-16*2*$SZ,$Tbl	# size optimization
772	vmovdqu	(%r12),$inout		# $a4
773	mov	%r12,$_inp		# $a4
774___
775sub Xupdate_256_AVX () {
776	(
777	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
778	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
779	'&vpsrld	($t2,$t0,$sigma0[0]);',
780	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
781	'&vpsrld	($t3,$t0,$sigma0[2])',
782	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
783	'&vpxor		($t0,$t3,$t2)',
784	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
785	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
786	'&vpxor		($t0,$t0,$t1)',
787	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
788	'&vpxor		($t0,$t0,$t2)',
789	 '&vpsrld	($t2,$t3,$sigma1[2]);',
790	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
791	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
792	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
793	 '&vpxor	($t2,$t2,$t3);',
794	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
795	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
796	 '&vpshufd	($t2,$t2,0b10000100)',
797	 '&vpsrldq	($t2,$t2,8)',
798	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
799	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
800	 '&vpsrld	($t2,$t3,$sigma1[2])',
801	 '&vpsrlq	($t3,$t3,$sigma1[0])',
802	 '&vpxor	($t2,$t2,$t3);',
803	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
804	 '&vpxor	($t2,$t2,$t3)',
805	 '&vpshufd	($t2,$t2,0b11101000)',
806	 '&vpslldq	($t2,$t2,8)',
807	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
808	);
809}
810
811sub AVX_256_00_47 () {
812my $j = shift;
813my $body = shift;
814my @X = @_;
815my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
816
817	foreach (Xupdate_256_AVX()) {		# 29 instructions
818	    eval;
819	    eval(shift(@insns));
820	    eval(shift(@insns));
821	    eval(shift(@insns));
822	}
823	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
824	  foreach (@insns) { eval; }		# remaining instructions
825	&vmovdqa	(16*$j."(%rsp)",$t2);
826}
827
828    $aesni_cbc_idx=0;
829    for ($i=0,$j=0; $j<4; $j++) {
830	&AVX_256_00_47($j,\&body_00_15,@X);
831	push(@X,shift(@X));			# rotate(@X)
832    }
833    	&mov		("%r12",$_inp);		# borrow $a4
834	&vpand		($temp,$temp,$mask14);
835	&mov		("%r15",$_out);		# borrow $a2
836	&vpor		($iv,$iv,$temp);
837	&vmovdqu	("(%r15,%r12)",$iv);	# write output
838	&lea		("%r12","16(%r12)");	# inp++
839
840	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
841	&jne	(".Lavx_00_47");
842
843	&vmovdqu	($inout,"(%r12)");
844	&mov		($_inp,"%r12");
845
846    $aesni_cbc_idx=0;
847    for ($i=0; $i<16; ) {
848	foreach(body_00_15()) { eval; }
849    }
850
851					}
852$code.=<<___;
853	mov	$_inp,%r12		# borrow $a4
854	mov	$_out,%r13		# borrow $a0
855	mov	$_ctx,%r15		# borrow $a2
856	mov	$_in0,%rsi		# borrow $a3
857
858	vpand	$mask14,$temp,$temp
859	mov	$a1,$A
860	vpor	$temp,$iv,$iv
861	vmovdqu	$iv,(%r13,%r12)		# write output
862	lea	16(%r12),%r12		# inp++
863
864	add	$SZ*0(%r15),$A
865	add	$SZ*1(%r15),$B
866	add	$SZ*2(%r15),$C
867	add	$SZ*3(%r15),$D
868	add	$SZ*4(%r15),$E
869	add	$SZ*5(%r15),$F
870	add	$SZ*6(%r15),$G
871	add	$SZ*7(%r15),$H
872
873	cmp	$_end,%r12
874
875	mov	$A,$SZ*0(%r15)
876	mov	$B,$SZ*1(%r15)
877	mov	$C,$SZ*2(%r15)
878	mov	$D,$SZ*3(%r15)
879	mov	$E,$SZ*4(%r15)
880	mov	$F,$SZ*5(%r15)
881	mov	$G,$SZ*6(%r15)
882	mov	$H,$SZ*7(%r15)
883	jb	.Lloop_avx
884
885	mov	$_ivp,$ivp
886	mov	$_rsp,%rsi
887.cfi_def_cfa	%rsi,8
888	vmovdqu	$iv,($ivp)		# output IV
889	vzeroall
890___
891$code.=<<___ if ($win64);
892	movaps	`$framesz+16*0`(%rsp),%xmm6
893	movaps	`$framesz+16*1`(%rsp),%xmm7
894	movaps	`$framesz+16*2`(%rsp),%xmm8
895	movaps	`$framesz+16*3`(%rsp),%xmm9
896	movaps	`$framesz+16*4`(%rsp),%xmm10
897	movaps	`$framesz+16*5`(%rsp),%xmm11
898	movaps	`$framesz+16*6`(%rsp),%xmm12
899	movaps	`$framesz+16*7`(%rsp),%xmm13
900	movaps	`$framesz+16*8`(%rsp),%xmm14
901	movaps	`$framesz+16*9`(%rsp),%xmm15
902___
903$code.=<<___;
904	mov	-48(%rsi),%r15
905.cfi_restore	%r15
906	mov	-40(%rsi),%r14
907.cfi_restore	%r14
908	mov	-32(%rsi),%r13
909.cfi_restore	%r13
910	mov	-24(%rsi),%r12
911.cfi_restore	%r12
912	mov	-16(%rsi),%rbp
913.cfi_restore	%rbp
914	mov	-8(%rsi),%rbx
915.cfi_restore	%rbx
916	lea	(%rsi),%rsp
917.cfi_def_cfa_register	%rsp
918.Lepilogue_avx:
919	ret
920.cfi_endproc
921.size	${func}_avx,.-${func}_avx
922___
923
924if ($avx>1) {{
925######################################################################
926# AVX2+BMI code path
927#
928my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
929my $PUSH8=8*2*$SZ;
930use integer;
931
932sub bodyx_00_15 () {
933	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
934	(
935	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
936
937	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
938	'&and	($a4,$e)',		# f&e
939	'&rorx	($a0,$e,$Sigma1[2])',
940	'&rorx	($a2,$e,$Sigma1[1])',
941
942	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
943	'&lea	($h,"($h,$a4)")',
944	'&andn	($a4,$e,$g)',		# ~e&g
945	'&xor	($a0,$a2)',
946
947	'&rorx	($a1,$e,$Sigma1[0])',
948	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
949	'&xor	($a0,$a1)',		# Sigma1(e)
950	'&mov	($a2,$a)',
951
952	'&rorx	($a4,$a,$Sigma0[2])',
953	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
954	'&xor	($a2,$b)',		# a^b, b^c in next round
955	'&rorx	($a1,$a,$Sigma0[1])',
956
957	'&rorx	($a0,$a,$Sigma0[0])',
958	'&lea	($d,"($d,$h)")',	# d+=h
959	'&and	($a3,$a2)',		# (b^c)&(a^b)
960	@aesni_cbc_block[$aesni_cbc_idx++].
961	'&xor	($a1,$a4)',
962
963	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
964	'&xor	($a1,$a0)',		# Sigma0(a)
965	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
966	'&mov	($a4,$e)',		# copy of f in future
967
968	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
969	);
970	# and at the finish one has to $a+=$a1
971}
972
973$code.=<<___;
974.type	${func}_avx2,\@function,6
975.align	64
976${func}_avx2:
977.cfi_startproc
978.Lavx2_shortcut:
979	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
980	mov	%rsp,%rax		# copy %rsp
981.cfi_def_cfa_register	%rax
982	push	%rbx
983.cfi_push	%rbx
984	push	%rbp
985.cfi_push	%rbp
986	push	%r12
987.cfi_push	%r12
988	push	%r13
989.cfi_push	%r13
990	push	%r14
991.cfi_push	%r14
992	push	%r15
993.cfi_push	%r15
994	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
995	and	\$-256*$SZ,%rsp		# align stack frame
996	add	\$`2*$SZ*($rounds-8)`,%rsp
997
998	shl	\$6,$len
999	sub	$inp,$out		# re-bias
1000	sub	$inp,$in0
1001	add	$inp,$len		# end of input
1002
1003	#mov	$inp,$_inp		# saved later
1004	#mov	$out,$_out		# kept in $offload
1005	mov	$len,$_end
1006	#mov	$key,$_key		# remains resident in $inp register
1007	mov	$ivp,$_ivp
1008	mov	$ctx,$_ctx
1009	mov	$in0,$_in0
1010	mov	%rax,$_rsp
1011.cfi_cfa_expression	$_rsp,deref,+8
1012___
1013$code.=<<___ if ($win64);
1014	movaps	%xmm6,`$framesz+16*0`(%rsp)
1015	movaps	%xmm7,`$framesz+16*1`(%rsp)
1016	movaps	%xmm8,`$framesz+16*2`(%rsp)
1017	movaps	%xmm9,`$framesz+16*3`(%rsp)
1018	movaps	%xmm10,`$framesz+16*4`(%rsp)
1019	movaps	%xmm11,`$framesz+16*5`(%rsp)
1020	movaps	%xmm12,`$framesz+16*6`(%rsp)
1021	movaps	%xmm13,`$framesz+16*7`(%rsp)
1022	movaps	%xmm14,`$framesz+16*8`(%rsp)
1023	movaps	%xmm15,`$framesz+16*9`(%rsp)
1024___
1025$code.=<<___;
1026.Lprologue_avx2:
1027	vzeroall
1028
1029	mov	$inp,%r13		# borrow $a0
1030	vpinsrq	\$1,$out,$offload,$offload
1031	lea	0x80($key),$inp		# size optimization, reassign
1032	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
1033	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
1034	mov	$ctx,%r15		# borrow $a2
1035	mov	$in0,%rsi		# borrow $a3
1036	vmovdqu	($ivp),$iv		# load IV
1037	lea	-9(%r14),%r14
1038
1039	vmovdqa	0x00(%r12,%r14,8),$mask14
1040	vmovdqa	0x10(%r12,%r14,8),$mask12
1041	vmovdqa	0x20(%r12,%r14,8),$mask10
1042
1043	sub	\$-16*$SZ,%r13		# inp++, size optimization
1044	mov	$SZ*0(%r15),$A
1045	lea	(%rsi,%r13),%r12	# borrow $a0
1046	mov	$SZ*1(%r15),$B
1047	cmp	$len,%r13		# $_end
1048	mov	$SZ*2(%r15),$C
1049	cmove	%rsp,%r12		# next block or random data
1050	mov	$SZ*3(%r15),$D
1051	mov	$SZ*4(%r15),$E
1052	mov	$SZ*5(%r15),$F
1053	mov	$SZ*6(%r15),$G
1054	mov	$SZ*7(%r15),$H
1055	vmovdqu	0x00-0x80($inp),$roundkey
1056___
1057					if ($SZ==4) {	# SHA256
1058    my @X = map("%ymm$_",(0..3));
1059    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060
1061$code.=<<___;
1062	jmp	.Loop_avx2
1063.align	16
1064.Loop_avx2:
1065	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1066	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1067	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1068	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1069	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1070
1071	vinserti128	\$1,(%r12),@X[0],@X[0]
1072	vinserti128	\$1,16(%r12),@X[1],@X[1]
1073	 vpshufb	$t3,@X[0],@X[0]
1074	vinserti128	\$1,32(%r12),@X[2],@X[2]
1075	 vpshufb	$t3,@X[1],@X[1]
1076	vinserti128	\$1,48(%r12),@X[3],@X[3]
1077
1078	lea	$TABLE(%rip),$Tbl
1079	vpshufb	$t3,@X[2],@X[2]
1080	lea	-16*$SZ(%r13),%r13
1081	vpaddd	0x00($Tbl),@X[0],$t0
1082	vpshufb	$t3,@X[3],@X[3]
1083	vpaddd	0x20($Tbl),@X[1],$t1
1084	vpaddd	0x40($Tbl),@X[2],$t2
1085	vpaddd	0x60($Tbl),@X[3],$t3
1086	vmovdqa	$t0,0x00(%rsp)
1087	xor	$a1,$a1
1088	vmovdqa	$t1,0x20(%rsp)
1089___
1090$code.=<<___ if (!$win64);
1091# temporarily use %rsi as frame pointer
1092        mov     $_rsp,%rsi
1093.cfi_def_cfa    %rsi,8
1094___
1095$code.=<<___;
1096	lea	-$PUSH8(%rsp),%rsp
1097___
1098$code.=<<___ if (!$win64);
1099# the frame info is at $_rsp, but the stack is moving...
1100# so a second frame pointer is saved at -8(%rsp)
1101# that is in the red zone
1102        mov     %rsi,-8(%rsp)
1103.cfi_cfa_expression     %rsp-8,deref,+8
1104___
1105$code.=<<___;
1106	mov	$B,$a3
1107	vmovdqa	$t2,0x00(%rsp)
1108	xor	$C,$a3			# magic
1109	vmovdqa	$t3,0x20(%rsp)
1110	mov	$F,$a4
1111	sub	\$-16*2*$SZ,$Tbl	# size optimization
1112	jmp	.Lavx2_00_47
1113
1114.align	16
1115.Lavx2_00_47:
1116	vmovdqu	(%r13),$inout
1117	vpinsrq	\$0,%r13,$offload,$offload
1118___
1119
1120sub AVX2_256_00_47 () {
1121my $j = shift;
1122my $body = shift;
1123my @X = @_;
1124my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1125my $base = "+2*$PUSH8(%rsp)";
1126
1127	if (($j%2)==0) {
1128	&lea	("%rsp","-$PUSH8(%rsp)");
1129$code.=<<___ if (!$win64);
1130.cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1131# copy secondary frame pointer to new location again at -8(%rsp)
1132        pushq   $PUSH8-8(%rsp)
1133.cfi_cfa_expression     %rsp,deref,+8
1134        lea     8(%rsp),%rsp
1135.cfi_cfa_expression     %rsp-8,deref,+8
1136___
1137	}
1138	foreach (Xupdate_256_AVX()) {		# 29 instructions
1139	    eval;
1140	    eval(shift(@insns));
1141	    eval(shift(@insns));
1142	    eval(shift(@insns));
1143	}
1144	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1145	  foreach (@insns) { eval; }		# remaining instructions
1146	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1147}
1148    $aesni_cbc_idx=0;
1149    for ($i=0,$j=0; $j<4; $j++) {
1150	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1151	push(@X,shift(@X));			# rotate(@X)
1152    }
1153	&vmovq		("%r13",$offload);	# borrow $a0
1154	&vpextrq	("%r15",$offload,1);	# borrow $a2
1155	&vpand		($temp,$temp,$mask14);
1156	&vpor		($iv,$iv,$temp);
1157	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1158	&lea		("%r13","16(%r13)");	# inp++
1159
1160	&lea	($Tbl,16*2*$SZ."($Tbl)");
1161	&cmpb	(($SZ-1)."($Tbl)",0);
1162	&jne	(".Lavx2_00_47");
1163
1164	&vmovdqu	($inout,"(%r13)");
1165	&vpinsrq	($offload,$offload,"%r13",0);
1166
1167    $aesni_cbc_idx=0;
1168    for ($i=0; $i<16; ) {
1169	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1170	foreach(bodyx_00_15()) { eval; }
1171    }
1172					}
1173$code.=<<___;
1174	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1175	vmovq	$offload,%r13			# $_inp, borrow $a0
1176	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1177	add	$a1,$A
1178	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1179
1180	vpand	$mask14,$temp,$temp
1181	vpor	$temp,$iv,$iv
1182	vmovdqu	$iv,(%r12,%r13)			# write output
1183	lea	16(%r13),%r13
1184
1185	add	$SZ*0(%r15),$A
1186	add	$SZ*1(%r15),$B
1187	add	$SZ*2(%r15),$C
1188	add	$SZ*3(%r15),$D
1189	add	$SZ*4(%r15),$E
1190	add	$SZ*5(%r15),$F
1191	add	$SZ*6(%r15),$G
1192	add	$SZ*7(%r15),$H
1193
1194	mov	$A,$SZ*0(%r15)
1195	mov	$B,$SZ*1(%r15)
1196	mov	$C,$SZ*2(%r15)
1197	mov	$D,$SZ*3(%r15)
1198	mov	$E,$SZ*4(%r15)
1199	mov	$F,$SZ*5(%r15)
1200	mov	$G,$SZ*6(%r15)
1201	mov	$H,$SZ*7(%r15)
1202
1203	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1204	je	.Ldone_avx2
1205
1206	xor	$a1,$a1
1207	mov	$B,$a3
1208	mov	$F,$a4
1209	xor	$C,$a3			# magic
1210	jmp	.Lower_avx2
1211.align	16
1212.Lower_avx2:
1213	vmovdqu	(%r13),$inout
1214	vpinsrq	\$0,%r13,$offload,$offload
1215___
1216    $aesni_cbc_idx=0;
1217    for ($i=0; $i<16; ) {
1218	my $base="+16($Tbl)";
1219	foreach(bodyx_00_15()) { eval; }
1220	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1221    }
1222$code.=<<___;
1223	vmovq	$offload,%r13			# borrow $a0
1224	vpextrq	\$1,$offload,%r15		# borrow $a2
1225	vpand	$mask14,$temp,$temp
1226	vpor	$temp,$iv,$iv
1227	lea	-$PUSH8($Tbl),$Tbl
1228	vmovdqu	$iv,(%r15,%r13)			# write output
1229	lea	16(%r13),%r13			# inp++
1230	cmp	%rsp,$Tbl
1231	jae	.Lower_avx2
1232
1233	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1234	lea	16*$SZ(%r13),%r13
1235	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1236	add	$a1,$A
1237	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1238
1239	add	$SZ*0(%r15),$A
1240	add	$SZ*1(%r15),$B
1241	add	$SZ*2(%r15),$C
1242	add	$SZ*3(%r15),$D
1243	add	$SZ*4(%r15),$E
1244	add	$SZ*5(%r15),$F
1245	add	$SZ*6(%r15),$G
1246	lea	(%rsi,%r13),%r12
1247	add	$SZ*7(%r15),$H
1248
1249	cmp	$_end,%r13
1250
1251	mov	$A,$SZ*0(%r15)
1252	cmove	%rsp,%r12		# next block or stale data
1253	mov	$B,$SZ*1(%r15)
1254	mov	$C,$SZ*2(%r15)
1255	mov	$D,$SZ*3(%r15)
1256	mov	$E,$SZ*4(%r15)
1257	mov	$F,$SZ*5(%r15)
1258	mov	$G,$SZ*6(%r15)
1259	mov	$H,$SZ*7(%r15)
1260
1261	jbe	.Loop_avx2
1262	lea	(%rsp),$Tbl
1263# temporarily use $Tbl as index to $_rsp
1264# this avoids the need to save a secondary frame pointer at -8(%rsp)
1265.cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1266
1267.Ldone_avx2:
1268	mov	16*$SZ+4*8($Tbl),$ivp
1269	mov	16*$SZ+7*8($Tbl),%rsi
1270.cfi_def_cfa	%rsi,8
1271	vmovdqu	$iv,($ivp)		# output IV
1272	vzeroall
1273___
1274$code.=<<___ if ($win64);
1275	movaps	`$framesz+16*0`($Tbl),%xmm6
1276	movaps	`$framesz+16*1`($Tbl),%xmm7
1277	movaps	`$framesz+16*2`($Tbl),%xmm8
1278	movaps	`$framesz+16*3`($Tbl),%xmm9
1279	movaps	`$framesz+16*4`($Tbl),%xmm10
1280	movaps	`$framesz+16*5`($Tbl),%xmm11
1281	movaps	`$framesz+16*6`($Tbl),%xmm12
1282	movaps	`$framesz+16*7`($Tbl),%xmm13
1283	movaps	`$framesz+16*8`($Tbl),%xmm14
1284	movaps	`$framesz+16*9`($Tbl),%xmm15
1285___
1286$code.=<<___;
1287	mov	-48(%rsi),%r15
1288.cfi_restore	%r15
1289	mov	-40(%rsi),%r14
1290.cfi_restore	%r14
1291	mov	-32(%rsi),%r13
1292.cfi_restore	%r13
1293	mov	-24(%rsi),%r12
1294.cfi_restore	%r12
1295	mov	-16(%rsi),%rbp
1296.cfi_restore	%rbp
1297	mov	-8(%rsi),%rbx
1298.cfi_restore	%rbx
1299	lea	(%rsi),%rsp
1300.cfi_def_cfa_register	%rsp
1301.Lepilogue_avx2:
1302	ret
1303.cfi_endproc
1304.size	${func}_avx2,.-${func}_avx2
1305___
1306}}
1307}}
1308{{
1309my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1310
1311my ($rounds,$Tbl)=("%r11d","%rbx");
1312
1313my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1314my @rndkey=("%xmm4","%xmm5");
1315my $r=0;
1316my $sn=0;
1317
1318my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1319my @MSG=map("%xmm$_",(10..13));
1320
1321my $aesenc=sub {
1322  use integer;
1323  my ($n,$k)=($r/10,$r%10);
1324    if ($k==0) {
1325      $code.=<<___;
1326	movups		`16*$n`($in0),$in		# load input
1327	xorps		$rndkey0,$in
1328___
1329      $code.=<<___ if ($n);
1330	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1331___
1332      $code.=<<___;
1333	xorps		$in,$iv
1334	movups		`32+16*$k-112`($key),$rndkey[1]
1335	aesenc		$rndkey[0],$iv
1336___
1337    } elsif ($k==9) {
1338      $sn++;
1339      $code.=<<___;
1340	cmp		\$11,$rounds
1341	jb		.Laesenclast$sn
1342	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1343	aesenc		$rndkey[0],$iv
1344	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1345	aesenc		$rndkey[1],$iv
1346	je		.Laesenclast$sn
1347	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1348	aesenc		$rndkey[0],$iv
1349	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1350	aesenc		$rndkey[1],$iv
1351.Laesenclast$sn:
1352	aesenclast	$rndkey[0],$iv
1353	movups		16-112($key),$rndkey[1]		# forward reference
1354	nop
1355___
1356    } else {
1357      $code.=<<___;
1358	movups		`32+16*$k-112`($key),$rndkey[1]
1359	aesenc		$rndkey[0],$iv
1360___
1361    }
1362    $r++;	unshift(@rndkey,pop(@rndkey));
1363};
1364
1365if ($shaext) {
1366my $Tbl="%rax";
1367
1368$code.=<<___;
1369.type	${func}_shaext,\@function,6
1370.align	32
1371${func}_shaext:
1372.cfi_startproc
1373	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1374___
1375$code.=<<___ if ($win64);
1376	lea	`-8-10*16`(%rsp),%rsp
1377	movaps	%xmm6,-8-10*16(%rax)
1378	movaps	%xmm7,-8-9*16(%rax)
1379	movaps	%xmm8,-8-8*16(%rax)
1380	movaps	%xmm9,-8-7*16(%rax)
1381	movaps	%xmm10,-8-6*16(%rax)
1382	movaps	%xmm11,-8-5*16(%rax)
1383	movaps	%xmm12,-8-4*16(%rax)
1384	movaps	%xmm13,-8-3*16(%rax)
1385	movaps	%xmm14,-8-2*16(%rax)
1386	movaps	%xmm15,-8-1*16(%rax)
1387.Lprologue_shaext:
1388___
1389$code.=<<___;
1390	lea		K256+0x80(%rip),$Tbl
1391	movdqu		($ctx),$ABEF		# DCBA
1392	movdqu		16($ctx),$CDGH		# HGFE
1393	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1394
1395	mov		240($key),$rounds
1396	sub		$in0,$out
1397	movups		($key),$rndkey0		# $key[0]
1398	movups		($ivp),$iv		# load IV
1399	movups		16($key),$rndkey[0]	# forward reference
1400	lea		112($key),$key		# size optimization
1401
1402	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1403	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1404	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1405	movdqa		$TMP,$BSWAP		# offload
1406	palignr		\$8,$CDGH,$ABEF		# ABEF
1407	punpcklqdq	$Wi,$CDGH		# CDGH
1408
1409	jmp	.Loop_shaext
1410
1411.align	16
1412.Loop_shaext:
1413	movdqu		($inp),@MSG[0]
1414	movdqu		0x10($inp),@MSG[1]
1415	movdqu		0x20($inp),@MSG[2]
1416	pshufb		$TMP,@MSG[0]
1417	movdqu		0x30($inp),@MSG[3]
1418
1419	movdqa		0*32-0x80($Tbl),$Wi
1420	paddd		@MSG[0],$Wi
1421	pshufb		$TMP,@MSG[1]
1422	movdqa		$CDGH,$CDGH_SAVE	# offload
1423	movdqa		$ABEF,$ABEF_SAVE	# offload
1424___
1425	&$aesenc();
1426$code.=<<___;
1427	sha256rnds2	$ABEF,$CDGH		# 0-3
1428	pshufd		\$0x0e,$Wi,$Wi
1429___
1430	&$aesenc();
1431$code.=<<___;
1432	sha256rnds2	$CDGH,$ABEF
1433
1434	movdqa		1*32-0x80($Tbl),$Wi
1435	paddd		@MSG[1],$Wi
1436	pshufb		$TMP,@MSG[2]
1437	lea		0x40($inp),$inp
1438___
1439	&$aesenc();
1440$code.=<<___;
1441	sha256rnds2	$ABEF,$CDGH		# 4-7
1442	pshufd		\$0x0e,$Wi,$Wi
1443___
1444	&$aesenc();
1445$code.=<<___;
1446	sha256rnds2	$CDGH,$ABEF
1447
1448	movdqa		2*32-0x80($Tbl),$Wi
1449	paddd		@MSG[2],$Wi
1450	pshufb		$TMP,@MSG[3]
1451	sha256msg1	@MSG[1],@MSG[0]
1452___
1453	&$aesenc();
1454$code.=<<___;
1455	sha256rnds2	$ABEF,$CDGH		# 8-11
1456	pshufd		\$0x0e,$Wi,$Wi
1457	movdqa		@MSG[3],$TMP
1458	palignr		\$4,@MSG[2],$TMP
1459	paddd		$TMP,@MSG[0]
1460___
1461	&$aesenc();
1462$code.=<<___;
1463	sha256rnds2	$CDGH,$ABEF
1464
1465	movdqa		3*32-0x80($Tbl),$Wi
1466	paddd		@MSG[3],$Wi
1467	sha256msg2	@MSG[3],@MSG[0]
1468	sha256msg1	@MSG[2],@MSG[1]
1469___
1470	&$aesenc();
1471$code.=<<___;
1472	sha256rnds2	$ABEF,$CDGH		# 12-15
1473	pshufd		\$0x0e,$Wi,$Wi
1474___
1475	&$aesenc();
1476$code.=<<___;
1477	movdqa		@MSG[0],$TMP
1478	palignr		\$4,@MSG[3],$TMP
1479	paddd		$TMP,@MSG[1]
1480	sha256rnds2	$CDGH,$ABEF
1481___
1482for($i=4;$i<16-3;$i++) {
1483	&$aesenc()	if (($r%10)==0);
1484$code.=<<___;
1485	movdqa		$i*32-0x80($Tbl),$Wi
1486	paddd		@MSG[0],$Wi
1487	sha256msg2	@MSG[0],@MSG[1]
1488	sha256msg1	@MSG[3],@MSG[2]
1489___
1490	&$aesenc();
1491$code.=<<___;
1492	sha256rnds2	$ABEF,$CDGH		# 16-19...
1493	pshufd		\$0x0e,$Wi,$Wi
1494	movdqa		@MSG[1],$TMP
1495	palignr		\$4,@MSG[0],$TMP
1496	paddd		$TMP,@MSG[2]
1497___
1498	&$aesenc();
1499	&$aesenc()	if ($r==19);
1500$code.=<<___;
1501	sha256rnds2	$CDGH,$ABEF
1502___
1503	push(@MSG,shift(@MSG));
1504}
1505$code.=<<___;
1506	movdqa		13*32-0x80($Tbl),$Wi
1507	paddd		@MSG[0],$Wi
1508	sha256msg2	@MSG[0],@MSG[1]
1509	sha256msg1	@MSG[3],@MSG[2]
1510___
1511	&$aesenc();
1512$code.=<<___;
1513	sha256rnds2	$ABEF,$CDGH		# 52-55
1514	pshufd		\$0x0e,$Wi,$Wi
1515	movdqa		@MSG[1],$TMP
1516	palignr		\$4,@MSG[0],$TMP
1517	paddd		$TMP,@MSG[2]
1518___
1519	&$aesenc();
1520	&$aesenc();
1521$code.=<<___;
1522	sha256rnds2	$CDGH,$ABEF
1523
1524	movdqa		14*32-0x80($Tbl),$Wi
1525	paddd		@MSG[1],$Wi
1526	sha256msg2	@MSG[1],@MSG[2]
1527	movdqa		$BSWAP,$TMP
1528___
1529	&$aesenc();
1530$code.=<<___;
1531	sha256rnds2	$ABEF,$CDGH		# 56-59
1532	pshufd		\$0x0e,$Wi,$Wi
1533___
1534	&$aesenc();
1535$code.=<<___;
1536	sha256rnds2	$CDGH,$ABEF
1537
1538	movdqa		15*32-0x80($Tbl),$Wi
1539	paddd		@MSG[2],$Wi
1540___
1541	&$aesenc();
1542	&$aesenc();
1543$code.=<<___;
1544	sha256rnds2	$ABEF,$CDGH		# 60-63
1545	pshufd		\$0x0e,$Wi,$Wi
1546___
1547	&$aesenc();
1548$code.=<<___;
1549	sha256rnds2	$CDGH,$ABEF
1550	#pxor		$CDGH,$rndkey0		# black magic
1551___
1552	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1553$code.=<<___;
1554	#xorps		$CDGH,$rndkey0		# black magic
1555	paddd		$CDGH_SAVE,$CDGH
1556	paddd		$ABEF_SAVE,$ABEF
1557
1558	dec		$len
1559	movups		$iv,48($out,$in0)	# write output
1560	lea		64($in0),$in0
1561	jnz		.Loop_shaext
1562
1563	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1564	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1565	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1566	punpckhqdq	$CDGH,$ABEF		# DCBA
1567	palignr		\$8,$TMP,$CDGH		# HGFE
1568
1569	movups		$iv,($ivp)		# write IV
1570	movdqu		$ABEF,($ctx)
1571	movdqu		$CDGH,16($ctx)
1572___
1573$code.=<<___ if ($win64);
1574	movaps	0*16(%rsp),%xmm6
1575	movaps	1*16(%rsp),%xmm7
1576	movaps	2*16(%rsp),%xmm8
1577	movaps	3*16(%rsp),%xmm9
1578	movaps	4*16(%rsp),%xmm10
1579	movaps	5*16(%rsp),%xmm11
1580	movaps	6*16(%rsp),%xmm12
1581	movaps	7*16(%rsp),%xmm13
1582	movaps	8*16(%rsp),%xmm14
1583	movaps	9*16(%rsp),%xmm15
1584	lea	8+10*16(%rsp),%rsp
1585.Lepilogue_shaext:
1586___
1587$code.=<<___;
1588	ret
1589.cfi_endproc
1590.size	${func}_shaext,.-${func}_shaext
1591___
1592}
1593}}}}}
1594
1595# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1596#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1597if ($win64 && $avx) {
1598$rec="%rcx";
1599$frame="%rdx";
1600$context="%r8";
1601$disp="%r9";
1602
1603$code.=<<___;
1604.extern	__imp_RtlVirtualUnwind
1605.type	se_handler,\@abi-omnipotent
1606.align	16
1607se_handler:
1608	push	%rsi
1609	push	%rdi
1610	push	%rbx
1611	push	%rbp
1612	push	%r12
1613	push	%r13
1614	push	%r14
1615	push	%r15
1616	pushfq
1617	sub	\$64,%rsp
1618
1619	mov	120($context),%rax	# pull context->Rax
1620	mov	248($context),%rbx	# pull context->Rip
1621
1622	mov	8($disp),%rsi		# disp->ImageBase
1623	mov	56($disp),%r11		# disp->HanderlData
1624
1625	mov	0(%r11),%r10d		# HandlerData[0]
1626	lea	(%rsi,%r10),%r10	# prologue label
1627	cmp	%r10,%rbx		# context->Rip<prologue label
1628	jb	.Lin_prologue
1629
1630	mov	152($context),%rax	# pull context->Rsp
1631
1632	mov	4(%r11),%r10d		# HandlerData[1]
1633	lea	(%rsi,%r10),%r10	# epilogue label
1634	cmp	%r10,%rbx		# context->Rip>=epilogue label
1635	jae	.Lin_prologue
1636___
1637$code.=<<___ if ($shaext);
1638	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1639	cmp	%r10,%rbx
1640	jb	.Lnot_in_shaext
1641
1642	lea	(%rax),%rsi
1643	lea	512($context),%rdi	# &context.Xmm6
1644	mov	\$20,%ecx
1645	.long	0xa548f3fc		# cld; rep movsq
1646	lea	168(%rax),%rax		# adjust stack pointer
1647	jmp	.Lin_prologue
1648.Lnot_in_shaext:
1649___
1650$code.=<<___ if ($avx>1);
1651	lea	.Lavx2_shortcut(%rip),%r10
1652	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1653	jb	.Lnot_in_avx2
1654
1655	and	\$-256*$SZ,%rax
1656	add	\$`2*$SZ*($rounds-8)`,%rax
1657.Lnot_in_avx2:
1658___
1659$code.=<<___;
1660	mov	%rax,%rsi		# put aside Rsp
1661	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1662
1663	mov	-8(%rax),%rbx
1664	mov	-16(%rax),%rbp
1665	mov	-24(%rax),%r12
1666	mov	-32(%rax),%r13
1667	mov	-40(%rax),%r14
1668	mov	-48(%rax),%r15
1669	mov	%rbx,144($context)	# restore context->Rbx
1670	mov	%rbp,160($context)	# restore context->Rbp
1671	mov	%r12,216($context)	# restore context->R12
1672	mov	%r13,224($context)	# restore context->R13
1673	mov	%r14,232($context)	# restore context->R14
1674	mov	%r15,240($context)	# restore context->R15
1675
1676	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
1677	lea	512($context),%rdi	# &context.Xmm6
1678	mov	\$20,%ecx
1679	.long	0xa548f3fc		# cld; rep movsq
1680
1681.Lin_prologue:
1682	mov	8(%rax),%rdi
1683	mov	16(%rax),%rsi
1684	mov	%rax,152($context)	# restore context->Rsp
1685	mov	%rsi,168($context)	# restore context->Rsi
1686	mov	%rdi,176($context)	# restore context->Rdi
1687
1688	mov	40($disp),%rdi		# disp->ContextRecord
1689	mov	$context,%rsi		# context
1690	mov	\$154,%ecx		# sizeof(CONTEXT)
1691	.long	0xa548f3fc		# cld; rep movsq
1692
1693	mov	$disp,%rsi
1694	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1695	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1696	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1697	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1698	mov	40(%rsi),%r10		# disp->ContextRecord
1699	lea	56(%rsi),%r11		# &disp->HandlerData
1700	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1701	mov	%r10,32(%rsp)		# arg5
1702	mov	%r11,40(%rsp)		# arg6
1703	mov	%r12,48(%rsp)		# arg7
1704	mov	%rcx,56(%rsp)		# arg8, (NULL)
1705	call	*__imp_RtlVirtualUnwind(%rip)
1706
1707	mov	\$1,%eax		# ExceptionContinueSearch
1708	add	\$64,%rsp
1709	popfq
1710	pop	%r15
1711	pop	%r14
1712	pop	%r13
1713	pop	%r12
1714	pop	%rbp
1715	pop	%rbx
1716	pop	%rdi
1717	pop	%rsi
1718	ret
1719.size	se_handler,.-se_handler
1720
1721.section	.pdata
1722	.rva	.LSEH_begin_${func}_xop
1723	.rva	.LSEH_end_${func}_xop
1724	.rva	.LSEH_info_${func}_xop
1725
1726	.rva	.LSEH_begin_${func}_avx
1727	.rva	.LSEH_end_${func}_avx
1728	.rva	.LSEH_info_${func}_avx
1729___
1730$code.=<<___ if ($avx>1);
1731	.rva	.LSEH_begin_${func}_avx2
1732	.rva	.LSEH_end_${func}_avx2
1733	.rva	.LSEH_info_${func}_avx2
1734___
1735$code.=<<___ if ($shaext);
1736	.rva	.LSEH_begin_${func}_shaext
1737	.rva	.LSEH_end_${func}_shaext
1738	.rva	.LSEH_info_${func}_shaext
1739___
1740$code.=<<___;
1741.section	.xdata
1742.align	8
1743.LSEH_info_${func}_xop:
1744	.byte	9,0,0,0
1745	.rva	se_handler
1746	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1747
1748.LSEH_info_${func}_avx:
1749	.byte	9,0,0,0
1750	.rva	se_handler
1751	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1752___
1753$code.=<<___ if ($avx>1);
1754.LSEH_info_${func}_avx2:
1755	.byte	9,0,0,0
1756	.rva	se_handler
1757	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1758___
1759$code.=<<___ if ($shaext);
1760.LSEH_info_${func}_shaext:
1761	.byte	9,0,0,0
1762	.rva	se_handler
1763	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1764___
1765}
1766
1767####################################################################
1768sub rex {
1769  local *opcode=shift;
1770  my ($dst,$src)=@_;
1771  my $rex=0;
1772
1773    $rex|=0x04			if($dst>=8);
1774    $rex|=0x01			if($src>=8);
1775    unshift @opcode,$rex|0x40	if($rex);
1776}
1777
1778{
1779  my %opcodelet = (
1780		"sha256rnds2" => 0xcb,
1781  		"sha256msg1"  => 0xcc,
1782		"sha256msg2"  => 0xcd	);
1783
1784  sub sha256op38 {
1785    my $instr = shift;
1786
1787    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1788      my @opcode=(0x0f,0x38);
1789	rex(\@opcode,$2,$1);
1790	push @opcode,$opcodelet{$instr};
1791	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1792	return ".byte\t".join(',',@opcode);
1793    } else {
1794	return $instr."\t".@_[0];
1795    }
1796  }
1797}
1798
1799$code =~ s/\`([^\`]*)\`/eval $1/gem;
1800$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1801print $code;
1802close STDOUT or die "error closing STDOUT: $!";
1803