1#! /usr/bin/env perl
2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
# This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, lower is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
30#
31#		 AES-128/-192/-256+SHA256   this(**)	gain
32# Sandy Bridge	    5.05/6.05/7.05+11.6	    13.0	+28%/36%/43%
33# Ivy Bridge	    5.05/6.05/7.05+10.3	    11.6	+32%/41%/50%
34# Haswell	    4.43/5.29/6.19+7.80	    8.79	+39%/49%/59%
35# Skylake	    2.62/3.14/3.62+7.70	    8.10	+27%/34%/40%
36# Bulldozer	    5.77/6.89/8.00+13.7	    13.7	+42%/50%/58%
37# Ryzen(***)	    2.71/-/3.71+2.05	    2.74/-/3.73	+74%/-/54%
38# Goldmont(***)	    3.82/-/5.35+4.16	    4.73/-/5.94	+69%/-/60%
39#
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
#	Westmere is omitted from the loop; this is because the gain
#	was not estimated to be high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
46
47# $output is the last argument if it looks like a file (it has an extension)
48# $flavour is the first argument if it doesn't look like a file
49$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
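# The script is normally driven by the OpenSSL build system; a manual
# run would look something like the following ("elf" being one of the
# flavours understood by x86_64-xlate.pl, alongside e.g. "macosx",
# "mingw64", "nasm" and "masm"):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s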
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61	$avx = ($1>=2.19) + ($1>=2.22);
62}
63
64if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66	$avx = ($1>=2.09) + ($1>=2.10);
67}
68
69if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=12);
72}
73
74if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75	$avx = ($2>=3.0) + ($2>3.0);
76}
77
78$shaext=$avx;	### set to zero if compiling for 1.0.1
79$avx=1		if (!$shaext && $avx);
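# At this point $avx encodes which SIMD paths the assembler can cope
# with (0 - no SIMD paths are generated, 1 - XOP and AVX, 2 -
# additionally AVX2), while $shaext gates the SHA extension path.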
80
81open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82    or die "can't call $xlate: $!";
83*STDOUT=*OUT;
84
85$func="aesni_cbc_sha256_enc";
86$TABLE="K256";
87$SZ=4;
88@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89				"%r8d","%r9d","%r10d","%r11d");
90($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
91@Sigma0=( 2,13,22);
92@Sigma1=( 6,11,25);
93@sigma0=( 7,18, 3);
94@sigma1=(17,19,10);
95$rounds=64;
96
97########################################################################
98# void aesni_cbc_sha256_enc(const void *inp,
99#			void *out,
100#			size_t length,
101#			const AES_KEY *key,
102#			unsigned char *iv,
103#			SHA256_CTX *ctx,
104#			const void *in0);
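# Note that |length| is given in 64-byte (SHA256 block-sized) chunks,
# not bytes: the prologues below multiply it by 64, and the SHAEXT
# path decrements it once per 64 processed bytes.  |inp| points at the
# data to be encrypted, |in0| at the data to be hashed, and the
# updated chaining value is written back through |iv|.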
105($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
106("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107
108$Tbl="%rbp";
109
110$_inp="16*$SZ+0*8(%rsp)";
111$_out="16*$SZ+1*8(%rsp)";
112$_end="16*$SZ+2*8(%rsp)";
113$_key="16*$SZ+3*8(%rsp)";
114$_ivp="16*$SZ+4*8(%rsp)";
115$_ctx="16*$SZ+5*8(%rsp)";
116$_in0="16*$SZ+6*8(%rsp)";
117$_rsp="`16*$SZ+7*8`(%rsp)";
118$framesz=16*$SZ+8*8;
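# Resulting stack frame layout (the XMM save area is allocated only
# when $win64 is set):
#
#	0 .. 16*$SZ-1		X[i]+K[i] transfer rows
#	16*$SZ+0*8 .. +6*8	slots for $inp, $out, end-of-input,
#				$key, $ivp, $ctx and $in0
#	16*$SZ+7*8		saved original %rsp
#	$framesz and up		Win64 %xmm6-%xmm15 save area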
119
120$code=<<___;
121.text
122
123.extern	OPENSSL_ia32cap_P
124.globl	$func
125.type	$func,\@abi-omnipotent
126.align	16
127$func:
128.cfi_startproc
129___
130						if ($avx) {
131$code.=<<___;
132	lea	OPENSSL_ia32cap_P(%rip),%r11
133	mov	\$1,%eax
134	cmp	\$0,`$win64?"%rcx":"%rdi"`
135	je	.Lprobe
136	mov	0(%r11),%eax
137	mov	4(%r11),%r10
138___
139$code.=<<___ if ($shaext);
140	bt	\$61,%r10			# check for SHA
141	jc	${func}_shaext
142___
143$code.=<<___;
144	mov	%r10,%r11
145	shr	\$32,%r11
146
147	test	\$`1<<11`,%r10d			# check for XOP
148	jnz	${func}_xop
149___
150$code.=<<___ if ($avx>1);
151	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
152	cmp	\$`1<<8|1<<5|1<<3`,%r11d
153	je	${func}_avx2
154___
155$code.=<<___;
156	and	\$`1<<28`,%r10d			# check for AVX
157	jnz	${func}_avx
158	ud2
159___
160						}
161$code.=<<___;
162	xor	%eax,%eax
163	cmp	\$0,`$win64?"%rcx":"%rdi"`
164	je	.Lprobe
165	ud2
166.Lprobe:
167	ret
168.cfi_endproc
169.size	$func,.-$func
170
171.align	64
172.type	$TABLE,\@object
173$TABLE:
174	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
175	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
177	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
179	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
181	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
185	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
187	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
191	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
193	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
195	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
197	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
199	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
205	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206
207	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
208	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
210	.long	0,0,0,0,   0,0,0,0
211	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
212.align	64
213___
214
215######################################################################
216# SIMD code paths
217#
218{{{
219($iv,$inout,$roundkey,$temp,
220 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
221
222$aesni_cbc_idx=0;
223@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
225##	&vmovdqu	($inout,($inp));
226##	&mov		($_inp,$inp);
227
228	'&vpxor		($inout,$inout,$roundkey);'.
229	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
230
231	'&vpxor		($inout,$inout,$iv);',
232
233	'&vaesenc	($inout,$inout,$roundkey);'.
234	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
235
236	'&vaesenc	($inout,$inout,$roundkey);'.
237	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
238
239	'&vaesenc	($inout,$inout,$roundkey);'.
240	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
241
242	'&vaesenc	($inout,$inout,$roundkey);'.
243	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
244
245	'&vaesenc	($inout,$inout,$roundkey);'.
246	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
247
248	'&vaesenc	($inout,$inout,$roundkey);'.
249	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
250
251	'&vaesenc	($inout,$inout,$roundkey);'.
252	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
253
254	'&vaesenc	($inout,$inout,$roundkey);'.
255	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
256
257	'&vaesenc	($inout,$inout,$roundkey);'.
258	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
259
260	'&vaesenclast	($temp,$inout,$roundkey);'.
261	' &vaesenc	($inout,$inout,$roundkey);'.
262	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
263
264	'&vpand		($iv,$temp,$mask10);'.
265	' &vaesenc	($inout,$inout,$roundkey);'.
266	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
267
268	'&vaesenclast	($temp,$inout,$roundkey);'.
269	' &vaesenc	($inout,$inout,$roundkey);'.
270	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
271
272	'&vpand		($temp,$temp,$mask12);'.
273	' &vaesenc	($inout,$inout,$roundkey);'.
274	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
275
276	'&vpor		($iv,$iv,$temp);'.
277	' &vaesenclast	($temp,$inout,$roundkey);'.
278	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
279
280##	&mov		($inp,$_inp);
281##	&mov		($out,$_out);
282##	&vpand		($temp,$temp,$mask14);
283##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
286);
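# body_00_15() below splices one of these entries into every SHA256
# round via @aesni_cbc_block[$aesni_cbc_idx++], so a complete 16-byte
# CBC block (round-key loads included) is spread over 16 SHA256
# rounds; as one 64-byte SHA256 block takes 64 rounds, four AES blocks
# are encrypted per hashed block.  A minimal illustrative sketch of
# the consumption ($sha_round_strings is a hypothetical stand-in for
# what body_00_15() actually returns):
#
#	$aesni_cbc_idx = 0;
#	for my $round (0..15) {
#	    # one AES step + one SHA256 round, evaluated together
#	    eval(@aesni_cbc_block[$aesni_cbc_idx++] . $sha_round_strings);
#	}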
287
288my $a4=$T1;
289my ($a,$b,$c,$d,$e,$f,$g,$h);
290
291sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
292{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
293  my $arg = pop;
294    $arg = "\$$arg" if ($arg*1 eq $arg);
295    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
296}
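# E.g. with no ror() defined, a call such as &ror($a0,14) falls
# through to AUTOLOAD above and appends
#
#	ror	$14,%r13d
#
# to $code: numeric arguments grow a '$' prefix and the remaining
# arguments are reversed so the destination ends up last, AT&T style.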
297
298sub body_00_15 () {
299	(
300	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
301
302	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
303	'&mov	($a,$a1)',
304	'&mov	($a4,$f)',
305
306	'&xor	($a0,$e)',
307	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
308	'&xor	($a4,$g)',			# f^g
309
310	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
311	'&xor	($a1,$a)',
312	'&and	($a4,$e)',			# (f^g)&e
313
314	@aesni_cbc_block[$aesni_cbc_idx++].
315	'&xor	($a0,$e)',
316	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
317	'&mov	($a2,$a)',
318
319	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
320	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
321	'&xor	($a2,$b)',			# a^b, b^c in next round
322
323	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
324	'&add	($h,$a4)',			# h+=Ch(e,f,g)
325	'&and	($a3,$a2)',			# (b^c)&(a^b)
326
327	'&xor	($a1,$a)',
328	'&add	($h,$a0)',			# h+=Sigma1(e)
329	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
330
331	'&add	($d,$h)',			# d+=h
332	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
333	'&add	($h,$a3)',			# h+=Maj(a,b,c)
334
335	'&mov	($a0,$d)',
336	'&add	($a1,$h);'.			# h+=Sigma0(a)
337	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
338	);
339}
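# For reference, one body_00_15() invocation implements the standard
# SHA256 round, with $a0/$a1 accumulating Sigma1(e)/Sigma0(a), $a4
# holding Ch(e,f,g) and $a3 carrying b^c from the previous round for
# the Maj() trick:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = Sigma0(a) + Maj(a,b,c) + T1	# rotated into a's slot
#	d += T1
#
# where Sigma0 = ror(2)^ror(13)^ror(22), Sigma1 = ror(6)^ror(11)^ror(25),
# Ch(e,f,g) = (e&f)^(~e&g) and Maj(a,b,c) = ((a^b)&(b^c))^b.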
340
341if ($avx) {{
342######################################################################
343# XOP code path
344#
345$code.=<<___;
346.type	${func}_xop,\@function,6
347.align	64
348${func}_xop:
349.cfi_startproc
350.Lxop_shortcut:
351	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
352	mov	%rsp,%rax		# copy %rsp
353.cfi_def_cfa_register	%rax
354	push	%rbx
355.cfi_push	%rbx
356	push	%rbp
357.cfi_push	%rbp
358	push	%r12
359.cfi_push	%r12
360	push	%r13
361.cfi_push	%r13
362	push	%r14
363.cfi_push	%r14
364	push	%r15
365.cfi_push	%r15
366	sub	\$`$framesz+$win64*16*10`,%rsp
367	and	\$-64,%rsp		# align stack frame
368
369	shl	\$6,$len
370	sub	$inp,$out		# re-bias
371	sub	$inp,$in0
372	add	$inp,$len		# end of input
373
374	#mov	$inp,$_inp		# saved later
375	mov	$out,$_out
376	mov	$len,$_end
377	#mov	$key,$_key		# remains resident in $inp register
378	mov	$ivp,$_ivp
379	mov	$ctx,$_ctx
380	mov	$in0,$_in0
381	mov	%rax,$_rsp
382.cfi_cfa_expression	$_rsp,deref,+8
383___
384$code.=<<___ if ($win64);
385	movaps	%xmm6,`$framesz+16*0`(%rsp)
386	movaps	%xmm7,`$framesz+16*1`(%rsp)
387	movaps	%xmm8,`$framesz+16*2`(%rsp)
388	movaps	%xmm9,`$framesz+16*3`(%rsp)
389	movaps	%xmm10,`$framesz+16*4`(%rsp)
390	movaps	%xmm11,`$framesz+16*5`(%rsp)
391	movaps	%xmm12,`$framesz+16*6`(%rsp)
392	movaps	%xmm13,`$framesz+16*7`(%rsp)
393	movaps	%xmm14,`$framesz+16*8`(%rsp)
394	movaps	%xmm15,`$framesz+16*9`(%rsp)
395___
396$code.=<<___;
397.Lprologue_xop:
398	vzeroall
399
400	mov	$inp,%r12		# borrow $a4
401	lea	0x80($key),$inp		# size optimization, reassign
402	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
403	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
404	mov	$ctx,%r15		# borrow $a2
405	mov	$in0,%rsi		# borrow $a3
406	vmovdqu	($ivp),$iv		# load IV
407	sub	\$9,%r14
408
409	mov	$SZ*0(%r15),$A
410	mov	$SZ*1(%r15),$B
411	mov	$SZ*2(%r15),$C
412	mov	$SZ*3(%r15),$D
413	mov	$SZ*4(%r15),$E
414	mov	$SZ*5(%r15),$F
415	mov	$SZ*6(%r15),$G
416	mov	$SZ*7(%r15),$H
417
418	vmovdqa	0x00(%r13,%r14,8),$mask14
419	vmovdqa	0x10(%r13,%r14,8),$mask12
420	vmovdqa	0x20(%r13,%r14,8),$mask10
421	vmovdqu	0x00-0x80($inp),$roundkey
422	jmp	.Lloop_xop
423___
424					if ($SZ==4) {	# SHA256
425    my @X = map("%xmm$_",(0..3));
426    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
427
428$code.=<<___;
429.align	16
430.Lloop_xop:
431	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
432	vmovdqu	0x00(%rsi,%r12),@X[0]
433	vmovdqu	0x10(%rsi,%r12),@X[1]
434	vmovdqu	0x20(%rsi,%r12),@X[2]
435	vmovdqu	0x30(%rsi,%r12),@X[3]
436	vpshufb	$t3,@X[0],@X[0]
437	lea	$TABLE(%rip),$Tbl
438	vpshufb	$t3,@X[1],@X[1]
439	vpshufb	$t3,@X[2],@X[2]
440	vpaddd	0x00($Tbl),@X[0],$t0
441	vpshufb	$t3,@X[3],@X[3]
442	vpaddd	0x20($Tbl),@X[1],$t1
443	vpaddd	0x40($Tbl),@X[2],$t2
444	vpaddd	0x60($Tbl),@X[3],$t3
445	vmovdqa	$t0,0x00(%rsp)
446	mov	$A,$a1
447	vmovdqa	$t1,0x10(%rsp)
448	mov	$B,$a3
449	vmovdqa	$t2,0x20(%rsp)
450	xor	$C,$a3			# magic
451	vmovdqa	$t3,0x30(%rsp)
452	mov	$E,$a0
453	jmp	.Lxop_00_47
454
455.align	16
456.Lxop_00_47:
457	sub	\$-16*2*$SZ,$Tbl	# size optimization
458	vmovdqu	(%r12),$inout		# $a4
459	mov	%r12,$_inp		# $a4
460___
461sub XOP_256_00_47 () {
462my $j = shift;
463my $body = shift;
464my @X = @_;
465my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
466
467	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
468	  eval(shift(@insns));
469	  eval(shift(@insns));
470	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
471	  eval(shift(@insns));
472	  eval(shift(@insns));
473	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
474	  eval(shift(@insns));
475	  eval(shift(@insns));
476	&vpsrld		($t0,$t0,$sigma0[2]);
477	  eval(shift(@insns));
478	  eval(shift(@insns));
479	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
480	  eval(shift(@insns));
481	  eval(shift(@insns));
482	  eval(shift(@insns));
483	  eval(shift(@insns));
484	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
485	  eval(shift(@insns));
486	  eval(shift(@insns));
487	&vpxor		($t0,$t0,$t1);
488	  eval(shift(@insns));
489	  eval(shift(@insns));
490	  eval(shift(@insns));
491	  eval(shift(@insns));
492	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
493	  eval(shift(@insns));
494	  eval(shift(@insns));
495	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
496	  eval(shift(@insns));
497	  eval(shift(@insns));
498	 &vpsrld	($t2,@X[3],$sigma1[2]);
499	  eval(shift(@insns));
500	  eval(shift(@insns));
501	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
502	  eval(shift(@insns));
503	  eval(shift(@insns));
504	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
505	  eval(shift(@insns));
506	  eval(shift(@insns));
507	 &vpxor		($t3,$t3,$t2);
508	  eval(shift(@insns));
509	  eval(shift(@insns));
510	  eval(shift(@insns));
511	  eval(shift(@insns));
512	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
513	  eval(shift(@insns));
514	  eval(shift(@insns));
515	  eval(shift(@insns));
516	  eval(shift(@insns));
517	&vpsrldq	($t3,$t3,8);
518	  eval(shift(@insns));
519	  eval(shift(@insns));
520	  eval(shift(@insns));
521	  eval(shift(@insns));
522	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
523	  eval(shift(@insns));
524	  eval(shift(@insns));
525	  eval(shift(@insns));
526	  eval(shift(@insns));
527	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
528	  eval(shift(@insns));
529	  eval(shift(@insns));
530	 &vpsrld	($t2,@X[0],$sigma1[2]);
531	  eval(shift(@insns));
532	  eval(shift(@insns));
533	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
534	  eval(shift(@insns));
535	  eval(shift(@insns));
536	 &vpxor		($t3,$t3,$t2);
537	  eval(shift(@insns));
538	  eval(shift(@insns));
539	  eval(shift(@insns));
540	  eval(shift(@insns));
541	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
542	  eval(shift(@insns));
543	  eval(shift(@insns));
544	  eval(shift(@insns));
545	  eval(shift(@insns));
546	&vpslldq	($t3,$t3,8);		# 22 instructions
547	  eval(shift(@insns));
548	  eval(shift(@insns));
549	  eval(shift(@insns));
550	  eval(shift(@insns));
551	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
552	  eval(shift(@insns));
553	  eval(shift(@insns));
554	  eval(shift(@insns));
555	  eval(shift(@insns));
556	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
557	  foreach (@insns) { eval; }		# remaining instructions
558	&vmovdqa	(16*$j."(%rsp)",$t2);
559}
560
561    $aesni_cbc_idx=0;
562    for ($i=0,$j=0; $j<4; $j++) {
563	&XOP_256_00_47($j,\&body_00_15,@X);
564	push(@X,shift(@X));			# rotate(@X)
565    }
566    	&mov		("%r12",$_inp);		# borrow $a4
567	&vpand		($temp,$temp,$mask14);
568	&mov		("%r15",$_out);		# borrow $a2
569	&vpor		($iv,$iv,$temp);
570	&vmovdqu	("(%r15,%r12)",$iv);	# write output
571	&lea		("%r12","16(%r12)");	# inp++
572
573	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
574	&jne	(".Lxop_00_47");
575
576	&vmovdqu	($inout,"(%r12)");
577	&mov		($_inp,"%r12");
578
579    $aesni_cbc_idx=0;
580    for ($i=0; $i<16; ) {
581	foreach(body_00_15()) { eval; }
582    }
583					}
584$code.=<<___;
585	mov	$_inp,%r12		# borrow $a4
586	mov	$_out,%r13		# borrow $a0
587	mov	$_ctx,%r15		# borrow $a2
588	mov	$_in0,%rsi		# borrow $a3
589
590	vpand	$mask14,$temp,$temp
591	mov	$a1,$A
592	vpor	$temp,$iv,$iv
593	vmovdqu	$iv,(%r13,%r12)		# write output
594	lea	16(%r12),%r12		# inp++
595
596	add	$SZ*0(%r15),$A
597	add	$SZ*1(%r15),$B
598	add	$SZ*2(%r15),$C
599	add	$SZ*3(%r15),$D
600	add	$SZ*4(%r15),$E
601	add	$SZ*5(%r15),$F
602	add	$SZ*6(%r15),$G
603	add	$SZ*7(%r15),$H
604
605	cmp	$_end,%r12
606
607	mov	$A,$SZ*0(%r15)
608	mov	$B,$SZ*1(%r15)
609	mov	$C,$SZ*2(%r15)
610	mov	$D,$SZ*3(%r15)
611	mov	$E,$SZ*4(%r15)
612	mov	$F,$SZ*5(%r15)
613	mov	$G,$SZ*6(%r15)
614	mov	$H,$SZ*7(%r15)
615
616	jb	.Lloop_xop
617
618	mov	$_ivp,$ivp
619	mov	$_rsp,%rsi
620.cfi_def_cfa	%rsi,8
621	vmovdqu	$iv,($ivp)		# output IV
622	vzeroall
623___
624$code.=<<___ if ($win64);
625	movaps	`$framesz+16*0`(%rsp),%xmm6
626	movaps	`$framesz+16*1`(%rsp),%xmm7
627	movaps	`$framesz+16*2`(%rsp),%xmm8
628	movaps	`$framesz+16*3`(%rsp),%xmm9
629	movaps	`$framesz+16*4`(%rsp),%xmm10
630	movaps	`$framesz+16*5`(%rsp),%xmm11
631	movaps	`$framesz+16*6`(%rsp),%xmm12
632	movaps	`$framesz+16*7`(%rsp),%xmm13
633	movaps	`$framesz+16*8`(%rsp),%xmm14
634	movaps	`$framesz+16*9`(%rsp),%xmm15
635___
636$code.=<<___;
637	mov	-48(%rsi),%r15
638.cfi_restore	%r15
639	mov	-40(%rsi),%r14
640.cfi_restore	%r14
641	mov	-32(%rsi),%r13
642.cfi_restore	%r13
643	mov	-24(%rsi),%r12
644.cfi_restore	%r12
645	mov	-16(%rsi),%rbp
646.cfi_restore	%rbp
647	mov	-8(%rsi),%rbx
648.cfi_restore	%rbx
649	lea	(%rsi),%rsp
650.cfi_def_cfa_register	%rsp
651.Lepilogue_xop:
652	ret
653.cfi_endproc
654.size	${func}_xop,.-${func}_xop
655___
656######################################################################
657# AVX+shrd code path
658#
659local *ror = sub { &shrd(@_[0],@_) };
660
661$code.=<<___;
662.type	${func}_avx,\@function,6
663.align	64
664${func}_avx:
665.cfi_startproc
666.Lavx_shortcut:
667	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
668	mov	%rsp,%rax		# copy %rsp
669.cfi_def_cfa_register	%rax
670	push	%rbx
671.cfi_push	%rbx
672	push	%rbp
673.cfi_push	%rbp
674	push	%r12
675.cfi_push	%r12
676	push	%r13
677.cfi_push	%r13
678	push	%r14
679.cfi_push	%r14
680	push	%r15
681.cfi_push	%r15
682	sub	\$`$framesz+$win64*16*10`,%rsp
683	and	\$-64,%rsp		# align stack frame
684
685	shl	\$6,$len
686	sub	$inp,$out		# re-bias
687	sub	$inp,$in0
688	add	$inp,$len		# end of input
689
690	#mov	$inp,$_inp		# saved later
691	mov	$out,$_out
692	mov	$len,$_end
693	#mov	$key,$_key		# remains resident in $inp register
694	mov	$ivp,$_ivp
695	mov	$ctx,$_ctx
696	mov	$in0,$_in0
697	mov	%rax,$_rsp
698.cfi_cfa_expression	$_rsp,deref,+8
699___
700$code.=<<___ if ($win64);
701	movaps	%xmm6,`$framesz+16*0`(%rsp)
702	movaps	%xmm7,`$framesz+16*1`(%rsp)
703	movaps	%xmm8,`$framesz+16*2`(%rsp)
704	movaps	%xmm9,`$framesz+16*3`(%rsp)
705	movaps	%xmm10,`$framesz+16*4`(%rsp)
706	movaps	%xmm11,`$framesz+16*5`(%rsp)
707	movaps	%xmm12,`$framesz+16*6`(%rsp)
708	movaps	%xmm13,`$framesz+16*7`(%rsp)
709	movaps	%xmm14,`$framesz+16*8`(%rsp)
710	movaps	%xmm15,`$framesz+16*9`(%rsp)
711___
712$code.=<<___;
713.Lprologue_avx:
714	vzeroall
715
716	mov	$inp,%r12		# borrow $a4
717	lea	0x80($key),$inp		# size optimization, reassign
718	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
719	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
720	mov	$ctx,%r15		# borrow $a2
721	mov	$in0,%rsi		# borrow $a3
722	vmovdqu	($ivp),$iv		# load IV
723	sub	\$9,%r14
724
725	mov	$SZ*0(%r15),$A
726	mov	$SZ*1(%r15),$B
727	mov	$SZ*2(%r15),$C
728	mov	$SZ*3(%r15),$D
729	mov	$SZ*4(%r15),$E
730	mov	$SZ*5(%r15),$F
731	mov	$SZ*6(%r15),$G
732	mov	$SZ*7(%r15),$H
733
734	vmovdqa	0x00(%r13,%r14,8),$mask14
735	vmovdqa	0x10(%r13,%r14,8),$mask12
736	vmovdqa	0x20(%r13,%r14,8),$mask10
737	vmovdqu	0x00-0x80($inp),$roundkey
738___
739					if ($SZ==4) {	# SHA256
740    my @X = map("%xmm$_",(0..3));
741    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
742
743$code.=<<___;
744	jmp	.Lloop_avx
745.align	16
746.Lloop_avx:
747	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
748	vmovdqu	0x00(%rsi,%r12),@X[0]
749	vmovdqu	0x10(%rsi,%r12),@X[1]
750	vmovdqu	0x20(%rsi,%r12),@X[2]
751	vmovdqu	0x30(%rsi,%r12),@X[3]
752	vpshufb	$t3,@X[0],@X[0]
753	lea	$TABLE(%rip),$Tbl
754	vpshufb	$t3,@X[1],@X[1]
755	vpshufb	$t3,@X[2],@X[2]
756	vpaddd	0x00($Tbl),@X[0],$t0
757	vpshufb	$t3,@X[3],@X[3]
758	vpaddd	0x20($Tbl),@X[1],$t1
759	vpaddd	0x40($Tbl),@X[2],$t2
760	vpaddd	0x60($Tbl),@X[3],$t3
761	vmovdqa	$t0,0x00(%rsp)
762	mov	$A,$a1
763	vmovdqa	$t1,0x10(%rsp)
764	mov	$B,$a3
765	vmovdqa	$t2,0x20(%rsp)
766	xor	$C,$a3			# magic
767	vmovdqa	$t3,0x30(%rsp)
768	mov	$E,$a0
769	jmp	.Lavx_00_47
770
771.align	16
772.Lavx_00_47:
773	sub	\$-16*2*$SZ,$Tbl	# size optimization
774	vmovdqu	(%r12),$inout		# $a4
775	mov	%r12,$_inp		# $a4
776___
777sub Xupdate_256_AVX () {
778	(
779	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
780	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
781	'&vpsrld	($t2,$t0,$sigma0[0]);',
782	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
783	'&vpsrld	($t3,$t0,$sigma0[2])',
784	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
785	'&vpxor		($t0,$t3,$t2)',
786	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
787	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
788	'&vpxor		($t0,$t0,$t1)',
789	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
790	'&vpxor		($t0,$t0,$t2)',
791	 '&vpsrld	($t2,$t3,$sigma1[2]);',
792	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
793	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
794	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
795	 '&vpxor	($t2,$t2,$t3);',
796	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
797	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
798	 '&vpshufd	($t2,$t2,0b10000100)',
799	 '&vpsrldq	($t2,$t2,8)',
800	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
801	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
802	 '&vpsrld	($t2,$t3,$sigma1[2])',
803	 '&vpsrlq	($t3,$t3,$sigma1[0])',
804	 '&vpxor	($t2,$t2,$t3);',
805	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
806	 '&vpxor	($t2,$t2,$t3)',
807	 '&vpshufd	($t2,$t2,0b11101000)',
808	 '&vpslldq	($t2,$t2,8)',
809	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
810	);
811}
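# The schedule above implements the SHA256 message expansion
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# four elements at a time, with sigma0(x) = ror(x,7)^ror(x,18)^(x>>3)
# and sigma1(x) = ror(x,17)^ror(x,19)^(x>>10), cf. @sigma0/@sigma1
# above.  Unlike XOP, plain AVX has no SIMD rotate, so each rotation
# is synthesized from shift and xor pairs.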
812
813sub AVX_256_00_47 () {
814my $j = shift;
815my $body = shift;
816my @X = @_;
817my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
818
819	foreach (Xupdate_256_AVX()) {		# 29 instructions
820	    eval;
821	    eval(shift(@insns));
822	    eval(shift(@insns));
823	    eval(shift(@insns));
824	}
825	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
826	  foreach (@insns) { eval; }		# remaining instructions
827	&vmovdqa	(16*$j."(%rsp)",$t2);
828}
829
830    $aesni_cbc_idx=0;
831    for ($i=0,$j=0; $j<4; $j++) {
832	&AVX_256_00_47($j,\&body_00_15,@X);
833	push(@X,shift(@X));			# rotate(@X)
834    }
835    	&mov		("%r12",$_inp);		# borrow $a4
836	&vpand		($temp,$temp,$mask14);
837	&mov		("%r15",$_out);		# borrow $a2
838	&vpor		($iv,$iv,$temp);
839	&vmovdqu	("(%r15,%r12)",$iv);	# write output
840	&lea		("%r12","16(%r12)");	# inp++
841
842	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
843	&jne	(".Lavx_00_47");
844
845	&vmovdqu	($inout,"(%r12)");
846	&mov		($_inp,"%r12");
847
848    $aesni_cbc_idx=0;
849    for ($i=0; $i<16; ) {
850	foreach(body_00_15()) { eval; }
851    }
852
853					}
854$code.=<<___;
855	mov	$_inp,%r12		# borrow $a4
856	mov	$_out,%r13		# borrow $a0
857	mov	$_ctx,%r15		# borrow $a2
858	mov	$_in0,%rsi		# borrow $a3
859
860	vpand	$mask14,$temp,$temp
861	mov	$a1,$A
862	vpor	$temp,$iv,$iv
863	vmovdqu	$iv,(%r13,%r12)		# write output
864	lea	16(%r12),%r12		# inp++
865
866	add	$SZ*0(%r15),$A
867	add	$SZ*1(%r15),$B
868	add	$SZ*2(%r15),$C
869	add	$SZ*3(%r15),$D
870	add	$SZ*4(%r15),$E
871	add	$SZ*5(%r15),$F
872	add	$SZ*6(%r15),$G
873	add	$SZ*7(%r15),$H
874
875	cmp	$_end,%r12
876
877	mov	$A,$SZ*0(%r15)
878	mov	$B,$SZ*1(%r15)
879	mov	$C,$SZ*2(%r15)
880	mov	$D,$SZ*3(%r15)
881	mov	$E,$SZ*4(%r15)
882	mov	$F,$SZ*5(%r15)
883	mov	$G,$SZ*6(%r15)
884	mov	$H,$SZ*7(%r15)
885	jb	.Lloop_avx
886
887	mov	$_ivp,$ivp
888	mov	$_rsp,%rsi
889.cfi_def_cfa	%rsi,8
890	vmovdqu	$iv,($ivp)		# output IV
891	vzeroall
892___
893$code.=<<___ if ($win64);
894	movaps	`$framesz+16*0`(%rsp),%xmm6
895	movaps	`$framesz+16*1`(%rsp),%xmm7
896	movaps	`$framesz+16*2`(%rsp),%xmm8
897	movaps	`$framesz+16*3`(%rsp),%xmm9
898	movaps	`$framesz+16*4`(%rsp),%xmm10
899	movaps	`$framesz+16*5`(%rsp),%xmm11
900	movaps	`$framesz+16*6`(%rsp),%xmm12
901	movaps	`$framesz+16*7`(%rsp),%xmm13
902	movaps	`$framesz+16*8`(%rsp),%xmm14
903	movaps	`$framesz+16*9`(%rsp),%xmm15
904___
905$code.=<<___;
906	mov	-48(%rsi),%r15
907.cfi_restore	%r15
908	mov	-40(%rsi),%r14
909.cfi_restore	%r14
910	mov	-32(%rsi),%r13
911.cfi_restore	%r13
912	mov	-24(%rsi),%r12
913.cfi_restore	%r12
914	mov	-16(%rsi),%rbp
915.cfi_restore	%rbp
916	mov	-8(%rsi),%rbx
917.cfi_restore	%rbx
918	lea	(%rsi),%rsp
919.cfi_def_cfa_register	%rsp
920.Lepilogue_avx:
921	ret
922.cfi_endproc
923.size	${func}_avx,.-${func}_avx
924___
925
926if ($avx>1) {{
927######################################################################
928# AVX2+BMI code path
929#
930my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
931my $PUSH8=8*2*$SZ;
932use integer;
933
934sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
936	(
937	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
938
939	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
940	'&and	($a4,$e)',		# f&e
941	'&rorx	($a0,$e,$Sigma1[2])',
942	'&rorx	($a2,$e,$Sigma1[1])',
943
944	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
945	'&lea	($h,"($h,$a4)")',
946	'&andn	($a4,$e,$g)',		# ~e&g
947	'&xor	($a0,$a2)',
948
949	'&rorx	($a1,$e,$Sigma1[0])',
950	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
951	'&xor	($a0,$a1)',		# Sigma1(e)
952	'&mov	($a2,$a)',
953
954	'&rorx	($a4,$a,$Sigma0[2])',
955	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
956	'&xor	($a2,$b)',		# a^b, b^c in next round
957	'&rorx	($a1,$a,$Sigma0[1])',
958
959	'&rorx	($a0,$a,$Sigma0[0])',
960	'&lea	($d,"($d,$h)")',	# d+=h
961	'&and	($a3,$a2)',		# (b^c)&(a^b)
962	@aesni_cbc_block[$aesni_cbc_idx++].
963	'&xor	($a1,$a4)',
964
965	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
966	'&xor	($a1,$a0)',		# Sigma0(a)
967	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
968	'&mov	($a4,$e)',		# copy of f in future
969
970	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
971	);
	# and at the finish one still has to do $a+=$a1
973}
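# The BMI2 flavour above keeps the same dataflow, but rorx rotates
# into an arbitrary destination without touching flags, andn yields
# ~e&g in one instruction (so Ch(e,f,g) = (e&f)^(~e&g) can be summed
# as (e&f)+(~e&g) with lea), and the Sigma0(a) addition is deferred by
# one round through $a1.  With the initial register assignment the
# strings expand to instructions such as
#
#	rorx	$13,%eax,%r14d		# Sigma0[1] rotation of a
#	andn	%r10d,%r8d,%r12d	# ~e&g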
974
975$code.=<<___;
976.type	${func}_avx2,\@function,6
977.align	64
978${func}_avx2:
979.cfi_startproc
980.Lavx2_shortcut:
981	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
982	mov	%rsp,%rax		# copy %rsp
983.cfi_def_cfa_register	%rax
984	push	%rbx
985.cfi_push	%rbx
986	push	%rbp
987.cfi_push	%rbp
988	push	%r12
989.cfi_push	%r12
990	push	%r13
991.cfi_push	%r13
992	push	%r14
993.cfi_push	%r14
994	push	%r15
995.cfi_push	%r15
996	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
997	and	\$-256*$SZ,%rsp		# align stack frame
998	add	\$`2*$SZ*($rounds-8)`,%rsp
999
1000	shl	\$6,$len
1001	sub	$inp,$out		# re-bias
1002	sub	$inp,$in0
1003	add	$inp,$len		# end of input
1004
1005	#mov	$inp,$_inp		# saved later
1006	#mov	$out,$_out		# kept in $offload
1007	mov	$len,$_end
1008	#mov	$key,$_key		# remains resident in $inp register
1009	mov	$ivp,$_ivp
1010	mov	$ctx,$_ctx
1011	mov	$in0,$_in0
1012	mov	%rax,$_rsp
1013.cfi_cfa_expression	$_rsp,deref,+8
1014___
1015$code.=<<___ if ($win64);
1016	movaps	%xmm6,`$framesz+16*0`(%rsp)
1017	movaps	%xmm7,`$framesz+16*1`(%rsp)
1018	movaps	%xmm8,`$framesz+16*2`(%rsp)
1019	movaps	%xmm9,`$framesz+16*3`(%rsp)
1020	movaps	%xmm10,`$framesz+16*4`(%rsp)
1021	movaps	%xmm11,`$framesz+16*5`(%rsp)
1022	movaps	%xmm12,`$framesz+16*6`(%rsp)
1023	movaps	%xmm13,`$framesz+16*7`(%rsp)
1024	movaps	%xmm14,`$framesz+16*8`(%rsp)
1025	movaps	%xmm15,`$framesz+16*9`(%rsp)
1026___
1027$code.=<<___;
1028.Lprologue_avx2:
1029	vzeroall
1030
1031	mov	$inp,%r13		# borrow $a0
1032	vpinsrq	\$1,$out,$offload,$offload
1033	lea	0x80($key),$inp		# size optimization, reassign
1034	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
1035	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
1036	mov	$ctx,%r15		# borrow $a2
1037	mov	$in0,%rsi		# borrow $a3
1038	vmovdqu	($ivp),$iv		# load IV
1039	lea	-9(%r14),%r14
1040
1041	vmovdqa	0x00(%r12,%r14,8),$mask14
1042	vmovdqa	0x10(%r12,%r14,8),$mask12
1043	vmovdqa	0x20(%r12,%r14,8),$mask10
1044
1045	sub	\$-16*$SZ,%r13		# inp++, size optimization
1046	mov	$SZ*0(%r15),$A
1047	lea	(%rsi,%r13),%r12	# borrow $a0
1048	mov	$SZ*1(%r15),$B
1049	cmp	$len,%r13		# $_end
1050	mov	$SZ*2(%r15),$C
1051	cmove	%rsp,%r12		# next block or random data
1052	mov	$SZ*3(%r15),$D
1053	mov	$SZ*4(%r15),$E
1054	mov	$SZ*5(%r15),$F
1055	mov	$SZ*6(%r15),$G
1056	mov	$SZ*7(%r15),$H
1057	vmovdqu	0x00-0x80($inp),$roundkey
1058___
1059					if ($SZ==4) {	# SHA256
1060    my @X = map("%ymm$_",(0..3));
1061    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063$code.=<<___;
1064	jmp	.Loop_avx2
1065.align	16
1066.Loop_avx2:
1067	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1068	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1069	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1070	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1071	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1072
1073	vinserti128	\$1,(%r12),@X[0],@X[0]
1074	vinserti128	\$1,16(%r12),@X[1],@X[1]
1075	 vpshufb	$t3,@X[0],@X[0]
1076	vinserti128	\$1,32(%r12),@X[2],@X[2]
1077	 vpshufb	$t3,@X[1],@X[1]
1078	vinserti128	\$1,48(%r12),@X[3],@X[3]
1079
1080	lea	$TABLE(%rip),$Tbl
1081	vpshufb	$t3,@X[2],@X[2]
1082	lea	-16*$SZ(%r13),%r13
1083	vpaddd	0x00($Tbl),@X[0],$t0
1084	vpshufb	$t3,@X[3],@X[3]
1085	vpaddd	0x20($Tbl),@X[1],$t1
1086	vpaddd	0x40($Tbl),@X[2],$t2
1087	vpaddd	0x60($Tbl),@X[3],$t3
1088	vmovdqa	$t0,0x00(%rsp)
1089	xor	$a1,$a1
1090	vmovdqa	$t1,0x20(%rsp)
1091___
1092$code.=<<___ if (!$win64);
1093# temporarily use %rsi as frame pointer
1094        mov     $_rsp,%rsi
1095.cfi_def_cfa    %rsi,8
1096___
1097$code.=<<___;
1098	lea	-$PUSH8(%rsp),%rsp
1099___
1100$code.=<<___ if (!$win64);
1101# the frame info is at $_rsp, but the stack is moving...
1102# so a second frame pointer is saved at -8(%rsp)
1103# that is in the red zone
1104        mov     %rsi,-8(%rsp)
1105.cfi_cfa_expression     %rsp-8,deref,+8
1106___
1107$code.=<<___;
1108	mov	$B,$a3
1109	vmovdqa	$t2,0x00(%rsp)
1110	xor	$C,$a3			# magic
1111	vmovdqa	$t3,0x20(%rsp)
1112	mov	$F,$a4
1113	sub	\$-16*2*$SZ,$Tbl	# size optimization
1114	jmp	.Lavx2_00_47
1115
1116.align	16
1117.Lavx2_00_47:
1118	vmovdqu	(%r13),$inout
1119	vpinsrq	\$0,%r13,$offload,$offload
1120___
1121
1122sub AVX2_256_00_47 () {
1123my $j = shift;
1124my $body = shift;
1125my @X = @_;
1126my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1127my $base = "+2*$PUSH8(%rsp)";
1128
1129	if (($j%2)==0) {
1130	&lea	("%rsp","-$PUSH8(%rsp)");
1131$code.=<<___ if (!$win64);
1132.cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1133# copy secondary frame pointer to new location again at -8(%rsp)
1134        pushq   $PUSH8-8(%rsp)
1135.cfi_cfa_expression     %rsp,deref,+8
1136        lea     8(%rsp),%rsp
1137.cfi_cfa_expression     %rsp-8,deref,+8
1138___
1139	}
1140	foreach (Xupdate_256_AVX()) {		# 29 instructions
1141	    eval;
1142	    eval(shift(@insns));
1143	    eval(shift(@insns));
1144	    eval(shift(@insns));
1145	}
1146	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1147	  foreach (@insns) { eval; }		# remaining instructions
1148	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1149}
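# Note that the AVX2 path schedules two 64-byte blocks side by side:
# the low 128-bit lanes of @X hold the current block, while the high
# lanes (filled by vinserti128 from %r12) hold the next one, whose
# pre-computed X[i]+K[i] rows are parked in the moving-%rsp "ladder"
# built by the lea/pushq sequence above and consumed afterwards by
# .Lower_avx2.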
1150    $aesni_cbc_idx=0;
1151    for ($i=0,$j=0; $j<4; $j++) {
1152	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1153	push(@X,shift(@X));			# rotate(@X)
1154    }
1155	&vmovq		("%r13",$offload);	# borrow $a0
1156	&vpextrq	("%r15",$offload,1);	# borrow $a2
1157	&vpand		($temp,$temp,$mask14);
1158	&vpor		($iv,$iv,$temp);
1159	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1160	&lea		("%r13","16(%r13)");	# inp++
1161
1162	&lea	($Tbl,16*2*$SZ."($Tbl)");
1163	&cmpb	(($SZ-1)."($Tbl)",0);
1164	&jne	(".Lavx2_00_47");
1165
1166	&vmovdqu	($inout,"(%r13)");
1167	&vpinsrq	($offload,$offload,"%r13",0);
1168
1169    $aesni_cbc_idx=0;
1170    for ($i=0; $i<16; ) {
1171	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1172	foreach(bodyx_00_15()) { eval; }
1173    }
1174					}
1175$code.=<<___;
1176	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1177	vmovq	$offload,%r13			# $_inp, borrow $a0
1178	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1179	add	$a1,$A
1180	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1181
1182	vpand	$mask14,$temp,$temp
1183	vpor	$temp,$iv,$iv
1184	vmovdqu	$iv,(%r12,%r13)			# write output
1185	lea	16(%r13),%r13
1186
1187	add	$SZ*0(%r15),$A
1188	add	$SZ*1(%r15),$B
1189	add	$SZ*2(%r15),$C
1190	add	$SZ*3(%r15),$D
1191	add	$SZ*4(%r15),$E
1192	add	$SZ*5(%r15),$F
1193	add	$SZ*6(%r15),$G
1194	add	$SZ*7(%r15),$H
1195
1196	mov	$A,$SZ*0(%r15)
1197	mov	$B,$SZ*1(%r15)
1198	mov	$C,$SZ*2(%r15)
1199	mov	$D,$SZ*3(%r15)
1200	mov	$E,$SZ*4(%r15)
1201	mov	$F,$SZ*5(%r15)
1202	mov	$G,$SZ*6(%r15)
1203	mov	$H,$SZ*7(%r15)
1204
1205	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1206	je	.Ldone_avx2
1207
1208	xor	$a1,$a1
1209	mov	$B,$a3
1210	mov	$F,$a4
1211	xor	$C,$a3			# magic
1212	jmp	.Lower_avx2
1213.align	16
1214.Lower_avx2:
1215	vmovdqu	(%r13),$inout
1216	vpinsrq	\$0,%r13,$offload,$offload
1217___
1218    $aesni_cbc_idx=0;
1219    for ($i=0; $i<16; ) {
1220	my $base="+16($Tbl)";
1221	foreach(bodyx_00_15()) { eval; }
1222	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1223    }
1224$code.=<<___;
1225	vmovq	$offload,%r13			# borrow $a0
1226	vpextrq	\$1,$offload,%r15		# borrow $a2
1227	vpand	$mask14,$temp,$temp
1228	vpor	$temp,$iv,$iv
1229	lea	-$PUSH8($Tbl),$Tbl
1230	vmovdqu	$iv,(%r15,%r13)			# write output
1231	lea	16(%r13),%r13			# inp++
1232	cmp	%rsp,$Tbl
1233	jae	.Lower_avx2
1234
1235	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1236	lea	16*$SZ(%r13),%r13
1237	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1238	add	$a1,$A
1239	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1240
1241	add	$SZ*0(%r15),$A
1242	add	$SZ*1(%r15),$B
1243	add	$SZ*2(%r15),$C
1244	add	$SZ*3(%r15),$D
1245	add	$SZ*4(%r15),$E
1246	add	$SZ*5(%r15),$F
1247	add	$SZ*6(%r15),$G
1248	lea	(%rsi,%r13),%r12
1249	add	$SZ*7(%r15),$H
1250
1251	cmp	$_end,%r13
1252
1253	mov	$A,$SZ*0(%r15)
1254	cmove	%rsp,%r12		# next block or stale data
1255	mov	$B,$SZ*1(%r15)
1256	mov	$C,$SZ*2(%r15)
1257	mov	$D,$SZ*3(%r15)
1258	mov	$E,$SZ*4(%r15)
1259	mov	$F,$SZ*5(%r15)
1260	mov	$G,$SZ*6(%r15)
1261	mov	$H,$SZ*7(%r15)
1262
1263	jbe	.Loop_avx2
1264	lea	(%rsp),$Tbl
1265# temporarily use $Tbl as index to $_rsp
1266# this avoids the need to save a secondary frame pointer at -8(%rsp)
1267.cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1268
1269.Ldone_avx2:
1270	mov	16*$SZ+4*8($Tbl),$ivp
1271	mov	16*$SZ+7*8($Tbl),%rsi
1272.cfi_def_cfa	%rsi,8
1273	vmovdqu	$iv,($ivp)		# output IV
1274	vzeroall
1275___
1276$code.=<<___ if ($win64);
1277	movaps	`$framesz+16*0`($Tbl),%xmm6
1278	movaps	`$framesz+16*1`($Tbl),%xmm7
1279	movaps	`$framesz+16*2`($Tbl),%xmm8
1280	movaps	`$framesz+16*3`($Tbl),%xmm9
1281	movaps	`$framesz+16*4`($Tbl),%xmm10
1282	movaps	`$framesz+16*5`($Tbl),%xmm11
1283	movaps	`$framesz+16*6`($Tbl),%xmm12
1284	movaps	`$framesz+16*7`($Tbl),%xmm13
1285	movaps	`$framesz+16*8`($Tbl),%xmm14
1286	movaps	`$framesz+16*9`($Tbl),%xmm15
1287___
1288$code.=<<___;
1289	mov	-48(%rsi),%r15
1290.cfi_restore	%r15
1291	mov	-40(%rsi),%r14
1292.cfi_restore	%r14
1293	mov	-32(%rsi),%r13
1294.cfi_restore	%r13
1295	mov	-24(%rsi),%r12
1296.cfi_restore	%r12
1297	mov	-16(%rsi),%rbp
1298.cfi_restore	%rbp
1299	mov	-8(%rsi),%rbx
1300.cfi_restore	%rbx
1301	lea	(%rsi),%rsp
1302.cfi_def_cfa_register	%rsp
1303.Lepilogue_avx2:
1304	ret
1305.cfi_endproc
1306.size	${func}_avx2,.-${func}_avx2
1307___
1308}}
1309}}
1310{{
1311my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1312
1313my ($rounds,$Tbl)=("%r11d","%rbx");
1314
1315my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1316my @rndkey=("%xmm4","%xmm5");
1317my $r=0;
1318my $sn=0;
1319
1320my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1321my @MSG=map("%xmm$_",(10..13));
1322
1323my $aesenc=sub {
1324  use integer;
1325  my ($n,$k)=($r/10,$r%10);
1326    if ($k==0) {
1327      $code.=<<___;
1328	movups		`16*$n`($in0),$in		# load input
1329	xorps		$rndkey0,$in
1330___
1331      $code.=<<___ if ($n);
1332	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1333___
1334      $code.=<<___;
1335	xorps		$in,$iv
1336	movups		`32+16*$k-112`($key),$rndkey[1]
1337	aesenc		$rndkey[0],$iv
1338___
1339    } elsif ($k==9) {
1340      $sn++;
1341      $code.=<<___;
1342	cmp		\$11,$rounds
1343	jb		.Laesenclast$sn
1344	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1345	aesenc		$rndkey[0],$iv
1346	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1347	aesenc		$rndkey[1],$iv
1348	je		.Laesenclast$sn
1349	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1350	aesenc		$rndkey[0],$iv
1351	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1352	aesenc		$rndkey[1],$iv
1353.Laesenclast$sn:
1354	aesenclast	$rndkey[0],$iv
1355	movups		16-112($key),$rndkey[1]		# forward reference
1356	nop
1357___
1358    } else {
1359      $code.=<<___;
1360	movups		`32+16*$k-112`($key),$rndkey[1]
1361	aesenc		$rndkey[0],$iv
1362___
1363    }
1364    $r++;	unshift(@rndkey,pop(@rndkey));
1365};
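# One &$aesenc() call emits one step of the CBC encryption: when
# $r%10==0 it loads the next plaintext block, stores the previous
# ciphertext block (if any), xors in the chaining value and issues the
# first aesenc; $r%10==1..8 are plain aesenc rounds; $r%10==9 emits
# the run-time 10/12/14-round tail selected by $rounds plus the final
# aesenclast.  Ten calls thus cover one 16-byte block, and the SHA-NI
# code below issues forty of them per 64-byte chunk, interleaved with
# the sha256rnds2 sequence.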
1366
1367if ($shaext) {
1368my $Tbl="%rax";
1369
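# A reminder of the SHA-NI pattern used below: the hash state lives in
# two halves, ABEF and CDGH (hence the pshufd/palignr/punpcklqdq
# shuffling on entry and exit); sha256rnds2 performs two rounds using
# the low two dwords of the implicit %xmm0 operand ($Wi), so each
# four-round group is sha256rnds2, pshufd by 0x0e to expose the upper
# half of $Wi, then sha256rnds2 again; sha256msg1/sha256msg2 together
# with one palignr/paddd step expand the message schedule.  AES-CBC
# steps are injected in between through the &$aesenc() closure above.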
1370$code.=<<___;
1371.type	${func}_shaext,\@function,6
1372.align	32
1373${func}_shaext:
1374.cfi_startproc
1375	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1376___
1377$code.=<<___ if ($win64);
1378	lea	`-8-10*16`(%rsp),%rsp
1379	movaps	%xmm6,-8-10*16(%rax)
1380	movaps	%xmm7,-8-9*16(%rax)
1381	movaps	%xmm8,-8-8*16(%rax)
1382	movaps	%xmm9,-8-7*16(%rax)
1383	movaps	%xmm10,-8-6*16(%rax)
1384	movaps	%xmm11,-8-5*16(%rax)
1385	movaps	%xmm12,-8-4*16(%rax)
1386	movaps	%xmm13,-8-3*16(%rax)
1387	movaps	%xmm14,-8-2*16(%rax)
1388	movaps	%xmm15,-8-1*16(%rax)
1389.Lprologue_shaext:
1390___
1391$code.=<<___;
1392	lea		K256+0x80(%rip),$Tbl
1393	movdqu		($ctx),$ABEF		# DCBA
1394	movdqu		16($ctx),$CDGH		# HGFE
1395	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1396
1397	mov		240($key),$rounds
1398	sub		$in0,$out
1399	movups		($key),$rndkey0		# $key[0]
1400	movups		($ivp),$iv		# load IV
1401	movups		16($key),$rndkey[0]	# forward reference
1402	lea		112($key),$key		# size optimization
1403
1404	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1405	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1406	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1407	movdqa		$TMP,$BSWAP		# offload
1408	palignr		\$8,$CDGH,$ABEF		# ABEF
1409	punpcklqdq	$Wi,$CDGH		# CDGH
1410
1411	jmp	.Loop_shaext
1412
1413.align	16
1414.Loop_shaext:
1415	movdqu		($inp),@MSG[0]
1416	movdqu		0x10($inp),@MSG[1]
1417	movdqu		0x20($inp),@MSG[2]
1418	pshufb		$TMP,@MSG[0]
1419	movdqu		0x30($inp),@MSG[3]
1420
1421	movdqa		0*32-0x80($Tbl),$Wi
1422	paddd		@MSG[0],$Wi
1423	pshufb		$TMP,@MSG[1]
1424	movdqa		$CDGH,$CDGH_SAVE	# offload
1425	movdqa		$ABEF,$ABEF_SAVE	# offload
1426___
1427	&$aesenc();
1428$code.=<<___;
1429	sha256rnds2	$ABEF,$CDGH		# 0-3
1430	pshufd		\$0x0e,$Wi,$Wi
1431___
1432	&$aesenc();
1433$code.=<<___;
1434	sha256rnds2	$CDGH,$ABEF
1435
1436	movdqa		1*32-0x80($Tbl),$Wi
1437	paddd		@MSG[1],$Wi
1438	pshufb		$TMP,@MSG[2]
1439	lea		0x40($inp),$inp
1440___
1441	&$aesenc();
1442$code.=<<___;
1443	sha256rnds2	$ABEF,$CDGH		# 4-7
1444	pshufd		\$0x0e,$Wi,$Wi
1445___
1446	&$aesenc();
1447$code.=<<___;
1448	sha256rnds2	$CDGH,$ABEF
1449
1450	movdqa		2*32-0x80($Tbl),$Wi
1451	paddd		@MSG[2],$Wi
1452	pshufb		$TMP,@MSG[3]
1453	sha256msg1	@MSG[1],@MSG[0]
1454___
1455	&$aesenc();
1456$code.=<<___;
1457	sha256rnds2	$ABEF,$CDGH		# 8-11
1458	pshufd		\$0x0e,$Wi,$Wi
1459	movdqa		@MSG[3],$TMP
1460	palignr		\$4,@MSG[2],$TMP
1461	paddd		$TMP,@MSG[0]
1462___
1463	&$aesenc();
1464$code.=<<___;
1465	sha256rnds2	$CDGH,$ABEF
1466
1467	movdqa		3*32-0x80($Tbl),$Wi
1468	paddd		@MSG[3],$Wi
1469	sha256msg2	@MSG[3],@MSG[0]
1470	sha256msg1	@MSG[2],@MSG[1]
1471___
1472	&$aesenc();
1473$code.=<<___;
1474	sha256rnds2	$ABEF,$CDGH		# 12-15
1475	pshufd		\$0x0e,$Wi,$Wi
1476___
1477	&$aesenc();
1478$code.=<<___;
1479	movdqa		@MSG[0],$TMP
1480	palignr		\$4,@MSG[3],$TMP
1481	paddd		$TMP,@MSG[1]
1482	sha256rnds2	$CDGH,$ABEF
1483___
1484for($i=4;$i<16-3;$i++) {
1485	&$aesenc()	if (($r%10)==0);
1486$code.=<<___;
1487	movdqa		$i*32-0x80($Tbl),$Wi
1488	paddd		@MSG[0],$Wi
1489	sha256msg2	@MSG[0],@MSG[1]
1490	sha256msg1	@MSG[3],@MSG[2]
1491___
1492	&$aesenc();
1493$code.=<<___;
1494	sha256rnds2	$ABEF,$CDGH		# 16-19...
1495	pshufd		\$0x0e,$Wi,$Wi
1496	movdqa		@MSG[1],$TMP
1497	palignr		\$4,@MSG[0],$TMP
1498	paddd		$TMP,@MSG[2]
1499___
1500	&$aesenc();
1501	&$aesenc()	if ($r==19);
1502$code.=<<___;
1503	sha256rnds2	$CDGH,$ABEF
1504___
1505	push(@MSG,shift(@MSG));
1506}
1507$code.=<<___;
1508	movdqa		13*32-0x80($Tbl),$Wi
1509	paddd		@MSG[0],$Wi
1510	sha256msg2	@MSG[0],@MSG[1]
1511	sha256msg1	@MSG[3],@MSG[2]
1512___
1513	&$aesenc();
1514$code.=<<___;
1515	sha256rnds2	$ABEF,$CDGH		# 52-55
1516	pshufd		\$0x0e,$Wi,$Wi
1517	movdqa		@MSG[1],$TMP
1518	palignr		\$4,@MSG[0],$TMP
1519	paddd		$TMP,@MSG[2]
1520___
1521	&$aesenc();
1522	&$aesenc();
1523$code.=<<___;
1524	sha256rnds2	$CDGH,$ABEF
1525
1526	movdqa		14*32-0x80($Tbl),$Wi
1527	paddd		@MSG[1],$Wi
1528	sha256msg2	@MSG[1],@MSG[2]
1529	movdqa		$BSWAP,$TMP
1530___
1531	&$aesenc();
1532$code.=<<___;
1533	sha256rnds2	$ABEF,$CDGH		# 56-59
1534	pshufd		\$0x0e,$Wi,$Wi
1535___
1536	&$aesenc();
1537$code.=<<___;
1538	sha256rnds2	$CDGH,$ABEF
1539
1540	movdqa		15*32-0x80($Tbl),$Wi
1541	paddd		@MSG[2],$Wi
1542___
1543	&$aesenc();
1544	&$aesenc();
1545$code.=<<___;
1546	sha256rnds2	$ABEF,$CDGH		# 60-63
1547	pshufd		\$0x0e,$Wi,$Wi
1548___
1549	&$aesenc();
1550$code.=<<___;
1551	sha256rnds2	$CDGH,$ABEF
1552	#pxor		$CDGH,$rndkey0		# black magic
1553___
1554	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1555$code.=<<___;
1556	#xorps		$CDGH,$rndkey0		# black magic
1557	paddd		$CDGH_SAVE,$CDGH
1558	paddd		$ABEF_SAVE,$ABEF
1559
1560	dec		$len
1561	movups		$iv,48($out,$in0)	# write output
1562	lea		64($in0),$in0
1563	jnz		.Loop_shaext
1564
1565	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1566	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1567	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1568	punpckhqdq	$CDGH,$ABEF		# DCBA
1569	palignr		\$8,$TMP,$CDGH		# HGFE
1570
1571	movups		$iv,($ivp)		# write IV
1572	movdqu		$ABEF,($ctx)
1573	movdqu		$CDGH,16($ctx)
1574___
1575$code.=<<___ if ($win64);
1576	movaps	0*16(%rsp),%xmm6
1577	movaps	1*16(%rsp),%xmm7
1578	movaps	2*16(%rsp),%xmm8
1579	movaps	3*16(%rsp),%xmm9
1580	movaps	4*16(%rsp),%xmm10
1581	movaps	5*16(%rsp),%xmm11
1582	movaps	6*16(%rsp),%xmm12
1583	movaps	7*16(%rsp),%xmm13
1584	movaps	8*16(%rsp),%xmm14
1585	movaps	9*16(%rsp),%xmm15
1586	lea	8+10*16(%rsp),%rsp
1587.Lepilogue_shaext:
1588___
1589$code.=<<___;
1590	ret
1591.cfi_endproc
1592.size	${func}_shaext,.-${func}_shaext
1593___
1594}
1595}}}}}
1596
1597# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1598#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1599if ($win64 && $avx) {
1600$rec="%rcx";
1601$frame="%rdx";
1602$context="%r8";
1603$disp="%r9";
1604
1605$code.=<<___;
1606.extern	__imp_RtlVirtualUnwind
1607.type	se_handler,\@abi-omnipotent
1608.align	16
1609se_handler:
1610	push	%rsi
1611	push	%rdi
1612	push	%rbx
1613	push	%rbp
1614	push	%r12
1615	push	%r13
1616	push	%r14
1617	push	%r15
1618	pushfq
1619	sub	\$64,%rsp
1620
1621	mov	120($context),%rax	# pull context->Rax
1622	mov	248($context),%rbx	# pull context->Rip
1623
1624	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
1626
1627	mov	0(%r11),%r10d		# HandlerData[0]
1628	lea	(%rsi,%r10),%r10	# prologue label
1629	cmp	%r10,%rbx		# context->Rip<prologue label
1630	jb	.Lin_prologue
1631
1632	mov	152($context),%rax	# pull context->Rsp
1633
1634	mov	4(%r11),%r10d		# HandlerData[1]
1635	lea	(%rsi,%r10),%r10	# epilogue label
1636	cmp	%r10,%rbx		# context->Rip>=epilogue label
1637	jae	.Lin_prologue
1638___
1639$code.=<<___ if ($shaext);
1640	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1641	cmp	%r10,%rbx
1642	jb	.Lnot_in_shaext
1643
1644	lea	(%rax),%rsi
1645	lea	512($context),%rdi	# &context.Xmm6
1646	mov	\$20,%ecx
1647	.long	0xa548f3fc		# cld; rep movsq
1648	lea	168(%rax),%rax		# adjust stack pointer
1649	jmp	.Lin_prologue
1650.Lnot_in_shaext:
1651___
1652$code.=<<___ if ($avx>1);
1653	lea	.Lavx2_shortcut(%rip),%r10
1654	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1655	jb	.Lnot_in_avx2
1656
1657	and	\$-256*$SZ,%rax
1658	add	\$`2*$SZ*($rounds-8)`,%rax
1659.Lnot_in_avx2:
1660___
1661$code.=<<___;
1662	mov	%rax,%rsi		# put aside Rsp
1663	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1664
1665	mov	-8(%rax),%rbx
1666	mov	-16(%rax),%rbp
1667	mov	-24(%rax),%r12
1668	mov	-32(%rax),%r13
1669	mov	-40(%rax),%r14
1670	mov	-48(%rax),%r15
1671	mov	%rbx,144($context)	# restore context->Rbx
1672	mov	%rbp,160($context)	# restore context->Rbp
1673	mov	%r12,216($context)	# restore context->R12
1674	mov	%r13,224($context)	# restore context->R13
1675	mov	%r14,232($context)	# restore context->R14
1676	mov	%r15,240($context)	# restore context->R15
1677
1678	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
1679	lea	512($context),%rdi	# &context.Xmm6
1680	mov	\$20,%ecx
1681	.long	0xa548f3fc		# cld; rep movsq
1682
1683.Lin_prologue:
1684	mov	8(%rax),%rdi
1685	mov	16(%rax),%rsi
1686	mov	%rax,152($context)	# restore context->Rsp
1687	mov	%rsi,168($context)	# restore context->Rsi
1688	mov	%rdi,176($context)	# restore context->Rdi
1689
1690	mov	40($disp),%rdi		# disp->ContextRecord
1691	mov	$context,%rsi		# context
1692	mov	\$154,%ecx		# sizeof(CONTEXT)
1693	.long	0xa548f3fc		# cld; rep movsq
1694
1695	mov	$disp,%rsi
1696	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1697	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1698	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1699	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1700	mov	40(%rsi),%r10		# disp->ContextRecord
1701	lea	56(%rsi),%r11		# &disp->HandlerData
1702	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1703	mov	%r10,32(%rsp)		# arg5
1704	mov	%r11,40(%rsp)		# arg6
1705	mov	%r12,48(%rsp)		# arg7
1706	mov	%rcx,56(%rsp)		# arg8, (NULL)
1707	call	*__imp_RtlVirtualUnwind(%rip)
1708
1709	mov	\$1,%eax		# ExceptionContinueSearch
1710	add	\$64,%rsp
1711	popfq
1712	pop	%r15
1713	pop	%r14
1714	pop	%r13
1715	pop	%r12
1716	pop	%rbp
1717	pop	%rbx
1718	pop	%rdi
1719	pop	%rsi
1720	ret
1721.size	se_handler,.-se_handler
1722
1723.section	.pdata
1724	.rva	.LSEH_begin_${func}_xop
1725	.rva	.LSEH_end_${func}_xop
1726	.rva	.LSEH_info_${func}_xop
1727
1728	.rva	.LSEH_begin_${func}_avx
1729	.rva	.LSEH_end_${func}_avx
1730	.rva	.LSEH_info_${func}_avx
1731___
1732$code.=<<___ if ($avx>1);
1733	.rva	.LSEH_begin_${func}_avx2
1734	.rva	.LSEH_end_${func}_avx2
1735	.rva	.LSEH_info_${func}_avx2
1736___
1737$code.=<<___ if ($shaext);
1738	.rva	.LSEH_begin_${func}_shaext
1739	.rva	.LSEH_end_${func}_shaext
1740	.rva	.LSEH_info_${func}_shaext
1741___
1742$code.=<<___;
1743.section	.xdata
1744.align	8
1745.LSEH_info_${func}_xop:
1746	.byte	9,0,0,0
1747	.rva	se_handler
1748	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1749
1750.LSEH_info_${func}_avx:
1751	.byte	9,0,0,0
1752	.rva	se_handler
1753	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1754___
1755$code.=<<___ if ($avx>1);
1756.LSEH_info_${func}_avx2:
1757	.byte	9,0,0,0
1758	.rva	se_handler
1759	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1760___
1761$code.=<<___ if ($shaext);
1762.LSEH_info_${func}_shaext:
1763	.byte	9,0,0,0
1764	.rva	se_handler
1765	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1766___
1767}
1768
1769####################################################################
1770sub rex {
1771  local *opcode=shift;
1772  my ($dst,$src)=@_;
1773  my $rex=0;
1774
1775    $rex|=0x04			if($dst>=8);
1776    $rex|=0x01			if($src>=8);
1777    unshift @opcode,$rex|0x40	if($rex);
1778}
1779
1780{
1781  my %opcodelet = (
1782		"sha256rnds2" => 0xcb,
1783  		"sha256msg1"  => 0xcc,
1784		"sha256msg2"  => 0xcd	);
1785
1786  sub sha256op38 {
1787    my $instr = shift;
1788
1789    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1790      my @opcode=(0x0f,0x38);
1791	rex(\@opcode,$2,$1);
1792	push @opcode,$opcodelet{$instr};
1793	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1794	return ".byte\t".join(',',@opcode);
1795    } else {
1796	return $instr."\t".@_[0];
1797    }
1798  }
1799}
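# For instance, with the table above "sha256rnds2 %xmm0,%xmm2" is
# rewritten by the substitution pass below into
#
#	.byte	0x0f,0x38,0xcb,0xd0
#
# (no REX prefix needed, ModR/M 0xd0 = register form with reg=xmm2,
# rm=xmm0), which keeps pre-SHA-extension assemblers happy.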
1800
1801$code =~ s/\`([^\`]*)\`/eval $1/gem;
1802$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1803print $code;
1804close STDOUT or die "error closing STDOUT: $!";
1805