1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, and thus achieves better performance.
# SHA256 instruction sequences (*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte (less is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
30#
31#		 AES-128/-192/-256+SHA256   this(**)	gain
32# Sandy Bridge	    5.05/6.05/7.05+11.6	    13.0	+28%/36%/43%
33# Ivy Bridge	    5.05/6.05/7.05+10.3	    11.6	+32%/41%/50%
34# Haswell	    4.43/5.29/6.19+7.80	    8.79	+39%/49%/59%
35# Skylake	    2.62/3.14/3.62+7.70	    8.10	+27%/34%/40%
36# Bulldozer	    5.77/6.89/8.00+13.7	    13.7	+42%/50%/58%
37# Ryzen(***)	    2.71/-/3.71+2.05	    2.74/-/3.73	+74%/-/54%
38# Goldmont(***)	    3.82/-/5.35+4.16	    4.73/-/5.94	+69%/-/60%
39#
40# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
41#	Westmere is omitted from loop, this is because gain was not
42#	estimated high enough to justify the effort;
43# (**)	these are EVP-free results, results obtained with 'speed
44#	-evp aes-256-cbc-hmac-sha256' will vary by percent or two;
45# (***)	these are SHAEXT results;
46
47$flavour = shift;
48$output  = shift;
49if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56die "can't locate x86_64-xlate.pl";
57
58if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60	$avx = ($1>=2.19) + ($1>=2.22);
61}
62
63if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65	$avx = ($1>=2.09) + ($1>=2.10);
66}
67
68if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70	$avx = ($1>=10) + ($1>=12);
71}
72
73if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
74	$avx = ($2>=3.0) + ($2>3.0);
75}
76
77$shaext=$avx;	### set to zero if compiling for 1.0.1
78$avx=1		if (!$shaext && $avx);
79
80open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
81*STDOUT=*OUT;
82
83$func="aesni_cbc_sha256_enc";
84$TABLE="K256";
85$SZ=4;
86@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
87				"%r8d","%r9d","%r10d","%r11d");
88($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
89@Sigma0=( 2,13,22);
90@Sigma1=( 6,11,25);
91@sigma0=( 7,18, 3);
92@sigma1=(17,19,10);
93$rounds=64;
94
95########################################################################
96# void aesni_cbc_sha256_enc(const void *inp,
97#			void *out,
98#			size_t length,
99#			const AES_KEY *key,
100#			unsigned char *iv,
101#			SHA256_CTX *ctx,
102#			const void *in0);
103($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
104("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
105
106$Tbl="%rbp";
107
108$_inp="16*$SZ+0*8(%rsp)";
109$_out="16*$SZ+1*8(%rsp)";
110$_end="16*$SZ+2*8(%rsp)";
111$_key="16*$SZ+3*8(%rsp)";
112$_ivp="16*$SZ+4*8(%rsp)";
113$_ctx="16*$SZ+5*8(%rsp)";
114$_in0="16*$SZ+6*8(%rsp)";
115$_rsp="`16*$SZ+7*8`(%rsp)";
116$framesz=16*$SZ+8*8;
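# The first 16*$SZ bytes of the stack frame hold the current message
# schedule (X[i]+K[i]); they are followed by the seven 8-byte argument
# slots above and the caller's %rsp, for $framesz bytes total. On WIN64
# a further 10*16 bytes past $framesz serve as the XMM6-XMM15 save area.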
117
118$code=<<___;
119.text
120
121.extern	OPENSSL_ia32cap_P
122.globl	$func
123.type	$func,\@abi-omnipotent
124.align	16
125$func:
126___
127						if ($avx) {
128$code.=<<___;
129	lea	OPENSSL_ia32cap_P(%rip),%r11
130	mov	\$1,%eax
131	cmp	\$0,`$win64?"%rcx":"%rdi"`
132	je	.Lprobe
133	mov	0(%r11),%eax
134	mov	4(%r11),%r10
135___
136$code.=<<___ if ($shaext);
137	bt	\$61,%r10			# check for SHA
138	jc	${func}_shaext
139___
140$code.=<<___;
141	mov	%r10,%r11
142	shr	\$32,%r11
143
144	test	\$`1<<11`,%r10d			# check for XOP
145	jnz	${func}_xop
146___
147$code.=<<___ if ($avx>1);
148	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
149	cmp	\$`1<<8|1<<5|1<<3`,%r11d
150	je	${func}_avx2
151___
152$code.=<<___;
153	and	\$`1<<28`,%r10d			# check for AVX
154	jnz	${func}_avx
155	ud2
156___
157						}
158$code.=<<___;
159	xor	%eax,%eax
160	cmp	\$0,`$win64?"%rcx":"%rdi"`
161	je	.Lprobe
162	ud2
163.Lprobe:
164	ret
165.size	$func,.-$func
166
167.align	64
168.type	$TABLE,\@object
169$TABLE:
170	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
171	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
172	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
173	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
174	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
175	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
176	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
177	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
178	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
179	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
180	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
181	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
182	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
183	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
184	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
185	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
186	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
187	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
188	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
189	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
190	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
191	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
192	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
193	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
194	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
195	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
196	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
197	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
198	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
199	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
200	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
201	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
202
203	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
204	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
205	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
206	.long	0,0,0,0,   0,0,0,0
207	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
208.align	64
209___
210
211######################################################################
212# SIMD code paths
213#
214{{{
215($iv,$inout,$roundkey,$temp,
216 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
217
218$aesni_cbc_idx=0;
219@aesni_cbc_block = (
220##	&vmovdqu	($roundkey,"0x00-0x80($inp)");'
221##	&vmovdqu	($inout,($inp));
222##	&mov		($_inp,$inp);
223
224	'&vpxor		($inout,$inout,$roundkey);'.
225	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
226
227	'&vpxor		($inout,$inout,$iv);',
228
229	'&vaesenc	($inout,$inout,$roundkey);'.
230	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
231
232	'&vaesenc	($inout,$inout,$roundkey);'.
233	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
234
235	'&vaesenc	($inout,$inout,$roundkey);'.
236	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
237
238	'&vaesenc	($inout,$inout,$roundkey);'.
239	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
240
241	'&vaesenc	($inout,$inout,$roundkey);'.
242	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
243
244	'&vaesenc	($inout,$inout,$roundkey);'.
245	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
246
247	'&vaesenc	($inout,$inout,$roundkey);'.
248	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
249
250	'&vaesenc	($inout,$inout,$roundkey);'.
251	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
252
253	'&vaesenc	($inout,$inout,$roundkey);'.
254	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
255
256	'&vaesenclast	($temp,$inout,$roundkey);'.
257	' &vaesenc	($inout,$inout,$roundkey);'.
258	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
259
260	'&vpand		($iv,$temp,$mask10);'.
261	' &vaesenc	($inout,$inout,$roundkey);'.
262	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
263
264	'&vaesenclast	($temp,$inout,$roundkey);'.
265	' &vaesenc	($inout,$inout,$roundkey);'.
266	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
267
268	'&vpand		($temp,$temp,$mask12);'.
269	' &vaesenc	($inout,$inout,$roundkey);'.
270	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
271
272	'&vpor		($iv,$iv,$temp);'.
273	' &vaesenclast	($temp,$inout,$roundkey);'.
274	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
275
276##	&mov		($inp,$_inp);
277##	&mov		($out,$_out);
278##	&vpand		($temp,$temp,$mask14);
279##	&vpor		($iv,$iv,$temp);
280##	&vmovdqu	($iv,($out,$inp);
281##	&lea		(inp,16($inp));
282);
283
284my $a4=$T1;
285my ($a,$b,$c,$d,$e,$f,$g,$h);
286
287sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
288{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
289  my $arg = pop;
290    $arg = "\$$arg" if ($arg*1 eq $arg);
291    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
292}
293
294sub body_00_15 () {
295	(
296	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
297
298	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
299	'&mov	($a,$a1)',
300	'&mov	($a4,$f)',
301
302	'&xor	($a0,$e)',
303	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
304	'&xor	($a4,$g)',			# f^g
305
306	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
307	'&xor	($a1,$a)',
308	'&and	($a4,$e)',			# (f^g)&e
309
310	@aesni_cbc_block[$aesni_cbc_idx++].
311	'&xor	($a0,$e)',
312	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
313	'&mov	($a2,$a)',
314
315	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
316	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
317	'&xor	($a2,$b)',			# a^b, b^c in next round
318
319	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
320	'&add	($h,$a4)',			# h+=Ch(e,f,g)
321	'&and	($a3,$a2)',			# (b^c)&(a^b)
322
323	'&xor	($a1,$a)',
324	'&add	($h,$a0)',			# h+=Sigma1(e)
325	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
326
327	'&add	($d,$h)',			# d+=h
328	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
329	'&add	($h,$a3)',			# h+=Maj(a,b,c)
330
331	'&mov	($a0,$d)',
332	'&add	($a1,$h);'.			# h+=Sigma0(a)
333	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
334	);
335}
336
337if ($avx) {{
338######################################################################
339# XOP code path
340#
341$code.=<<___;
342.type	${func}_xop,\@function,6
343.align	64
344${func}_xop:
345.cfi_startproc
346.Lxop_shortcut:
347	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
348	mov	%rsp,%rax		# copy %rsp
349.cfi_def_cfa_register	%rax
350	push	%rbx
351.cfi_push	%rbx
352	push	%rbp
353.cfi_push	%rbp
354	push	%r12
355.cfi_push	%r12
356	push	%r13
357.cfi_push	%r13
358	push	%r14
359.cfi_push	%r14
360	push	%r15
361.cfi_push	%r15
362	sub	\$`$framesz+$win64*16*10`,%rsp
363	and	\$-64,%rsp		# align stack frame
364
365	shl	\$6,$len
366	sub	$inp,$out		# re-bias
367	sub	$inp,$in0
368	add	$inp,$len		# end of input
369
370	#mov	$inp,$_inp		# saved later
371	mov	$out,$_out
372	mov	$len,$_end
373	#mov	$key,$_key		# remains resident in $inp register
374	mov	$ivp,$_ivp
375	mov	$ctx,$_ctx
376	mov	$in0,$_in0
377	mov	%rax,$_rsp
378.cfi_cfa_expression	$_rsp,deref,+8
379___
380$code.=<<___ if ($win64);
381	movaps	%xmm6,`$framesz+16*0`(%rsp)
382	movaps	%xmm7,`$framesz+16*1`(%rsp)
383	movaps	%xmm8,`$framesz+16*2`(%rsp)
384	movaps	%xmm9,`$framesz+16*3`(%rsp)
385	movaps	%xmm10,`$framesz+16*4`(%rsp)
386	movaps	%xmm11,`$framesz+16*5`(%rsp)
387	movaps	%xmm12,`$framesz+16*6`(%rsp)
388	movaps	%xmm13,`$framesz+16*7`(%rsp)
389	movaps	%xmm14,`$framesz+16*8`(%rsp)
390	movaps	%xmm15,`$framesz+16*9`(%rsp)
391___
392$code.=<<___;
393.Lprologue_xop:
394	vzeroall
395
396	mov	$inp,%r12		# borrow $a4
397	lea	0x80($key),$inp		# size optimization, reassign
398	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
399	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
400	mov	$ctx,%r15		# borrow $a2
401	mov	$in0,%rsi		# borrow $a3
402	vmovdqu	($ivp),$iv		# load IV
403	sub	\$9,%r14
404
405	mov	$SZ*0(%r15),$A
406	mov	$SZ*1(%r15),$B
407	mov	$SZ*2(%r15),$C
408	mov	$SZ*3(%r15),$D
409	mov	$SZ*4(%r15),$E
410	mov	$SZ*5(%r15),$F
411	mov	$SZ*6(%r15),$G
412	mov	$SZ*7(%r15),$H
413
414	vmovdqa	0x00(%r13,%r14,8),$mask14
415	vmovdqa	0x10(%r13,%r14,8),$mask12
416	vmovdqa	0x20(%r13,%r14,8),$mask10
417	vmovdqu	0x00-0x80($inp),$roundkey
418	jmp	.Lloop_xop
419___
420					if ($SZ==4) {	# SHA256
421    my @X = map("%xmm$_",(0..3));
422    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
423
424$code.=<<___;
425.align	16
426.Lloop_xop:
427	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
428	vmovdqu	0x00(%rsi,%r12),@X[0]
429	vmovdqu	0x10(%rsi,%r12),@X[1]
430	vmovdqu	0x20(%rsi,%r12),@X[2]
431	vmovdqu	0x30(%rsi,%r12),@X[3]
432	vpshufb	$t3,@X[0],@X[0]
433	lea	$TABLE(%rip),$Tbl
434	vpshufb	$t3,@X[1],@X[1]
435	vpshufb	$t3,@X[2],@X[2]
436	vpaddd	0x00($Tbl),@X[0],$t0
437	vpshufb	$t3,@X[3],@X[3]
438	vpaddd	0x20($Tbl),@X[1],$t1
439	vpaddd	0x40($Tbl),@X[2],$t2
440	vpaddd	0x60($Tbl),@X[3],$t3
441	vmovdqa	$t0,0x00(%rsp)
442	mov	$A,$a1
443	vmovdqa	$t1,0x10(%rsp)
444	mov	$B,$a3
445	vmovdqa	$t2,0x20(%rsp)
446	xor	$C,$a3			# magic
447	vmovdqa	$t3,0x30(%rsp)
448	mov	$E,$a0
449	jmp	.Lxop_00_47
450
451.align	16
452.Lxop_00_47:
453	sub	\$-16*2*$SZ,$Tbl	# size optimization
454	vmovdqu	(%r12),$inout		# $a4
455	mov	%r12,$_inp		# $a4
456___
457sub XOP_256_00_47 () {
458my $j = shift;
459my $body = shift;
460my @X = @_;
461my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
462
463	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
464	  eval(shift(@insns));
465	  eval(shift(@insns));
466	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
467	  eval(shift(@insns));
468	  eval(shift(@insns));
469	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
470	  eval(shift(@insns));
471	  eval(shift(@insns));
472	&vpsrld		($t0,$t0,$sigma0[2]);
473	  eval(shift(@insns));
474	  eval(shift(@insns));
475	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
476	  eval(shift(@insns));
477	  eval(shift(@insns));
478	  eval(shift(@insns));
479	  eval(shift(@insns));
480	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
481	  eval(shift(@insns));
482	  eval(shift(@insns));
483	&vpxor		($t0,$t0,$t1);
484	  eval(shift(@insns));
485	  eval(shift(@insns));
486	  eval(shift(@insns));
487	  eval(shift(@insns));
488	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
489	  eval(shift(@insns));
490	  eval(shift(@insns));
491	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
492	  eval(shift(@insns));
493	  eval(shift(@insns));
494	 &vpsrld	($t2,@X[3],$sigma1[2]);
495	  eval(shift(@insns));
496	  eval(shift(@insns));
497	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
498	  eval(shift(@insns));
499	  eval(shift(@insns));
500	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
501	  eval(shift(@insns));
502	  eval(shift(@insns));
503	 &vpxor		($t3,$t3,$t2);
504	  eval(shift(@insns));
505	  eval(shift(@insns));
506	  eval(shift(@insns));
507	  eval(shift(@insns));
508	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
509	  eval(shift(@insns));
510	  eval(shift(@insns));
511	  eval(shift(@insns));
512	  eval(shift(@insns));
513	&vpsrldq	($t3,$t3,8);
514	  eval(shift(@insns));
515	  eval(shift(@insns));
516	  eval(shift(@insns));
517	  eval(shift(@insns));
518	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
519	  eval(shift(@insns));
520	  eval(shift(@insns));
521	  eval(shift(@insns));
522	  eval(shift(@insns));
523	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
524	  eval(shift(@insns));
525	  eval(shift(@insns));
526	 &vpsrld	($t2,@X[0],$sigma1[2]);
527	  eval(shift(@insns));
528	  eval(shift(@insns));
529	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
530	  eval(shift(@insns));
531	  eval(shift(@insns));
532	 &vpxor		($t3,$t3,$t2);
533	  eval(shift(@insns));
534	  eval(shift(@insns));
535	  eval(shift(@insns));
536	  eval(shift(@insns));
537	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
538	  eval(shift(@insns));
539	  eval(shift(@insns));
540	  eval(shift(@insns));
541	  eval(shift(@insns));
542	&vpslldq	($t3,$t3,8);		# 22 instructions
543	  eval(shift(@insns));
544	  eval(shift(@insns));
545	  eval(shift(@insns));
546	  eval(shift(@insns));
547	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
548	  eval(shift(@insns));
549	  eval(shift(@insns));
550	  eval(shift(@insns));
551	  eval(shift(@insns));
552	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
553	  foreach (@insns) { eval; }		# remaining instructions
554	&vmovdqa	(16*$j."(%rsp)",$t2);
555}
556
557    $aesni_cbc_idx=0;
558    for ($i=0,$j=0; $j<4; $j++) {
559	&XOP_256_00_47($j,\&body_00_15,@X);
560	push(@X,shift(@X));			# rotate(@X)
561    }
562    	&mov		("%r12",$_inp);		# borrow $a4
563	&vpand		($temp,$temp,$mask14);
564	&mov		("%r15",$_out);		# borrow $a2
565	&vpor		($iv,$iv,$temp);
566	&vmovdqu	("(%r15,%r12)",$iv);	# write output
567	&lea		("%r12","16(%r12)");	# inp++
568
569	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
570	&jne	(".Lxop_00_47");
571
572	&vmovdqu	($inout,"(%r12)");
573	&mov		($_inp,"%r12");
574
575    $aesni_cbc_idx=0;
576    for ($i=0; $i<16; ) {
577	foreach(body_00_15()) { eval; }
578    }
579					}
580$code.=<<___;
581	mov	$_inp,%r12		# borrow $a4
582	mov	$_out,%r13		# borrow $a0
583	mov	$_ctx,%r15		# borrow $a2
584	mov	$_in0,%rsi		# borrow $a3
585
586	vpand	$mask14,$temp,$temp
587	mov	$a1,$A
588	vpor	$temp,$iv,$iv
589	vmovdqu	$iv,(%r13,%r12)		# write output
590	lea	16(%r12),%r12		# inp++
591
592	add	$SZ*0(%r15),$A
593	add	$SZ*1(%r15),$B
594	add	$SZ*2(%r15),$C
595	add	$SZ*3(%r15),$D
596	add	$SZ*4(%r15),$E
597	add	$SZ*5(%r15),$F
598	add	$SZ*6(%r15),$G
599	add	$SZ*7(%r15),$H
600
601	cmp	$_end,%r12
602
603	mov	$A,$SZ*0(%r15)
604	mov	$B,$SZ*1(%r15)
605	mov	$C,$SZ*2(%r15)
606	mov	$D,$SZ*3(%r15)
607	mov	$E,$SZ*4(%r15)
608	mov	$F,$SZ*5(%r15)
609	mov	$G,$SZ*6(%r15)
610	mov	$H,$SZ*7(%r15)
611
612	jb	.Lloop_xop
613
614	mov	$_ivp,$ivp
615	mov	$_rsp,%rsi
616.cfi_def_cfa	%rsi,8
617	vmovdqu	$iv,($ivp)		# output IV
618	vzeroall
619___
620$code.=<<___ if ($win64);
621	movaps	`$framesz+16*0`(%rsp),%xmm6
622	movaps	`$framesz+16*1`(%rsp),%xmm7
623	movaps	`$framesz+16*2`(%rsp),%xmm8
624	movaps	`$framesz+16*3`(%rsp),%xmm9
625	movaps	`$framesz+16*4`(%rsp),%xmm10
626	movaps	`$framesz+16*5`(%rsp),%xmm11
627	movaps	`$framesz+16*6`(%rsp),%xmm12
628	movaps	`$framesz+16*7`(%rsp),%xmm13
629	movaps	`$framesz+16*8`(%rsp),%xmm14
630	movaps	`$framesz+16*9`(%rsp),%xmm15
631___
632$code.=<<___;
633	mov	-48(%rsi),%r15
634.cfi_restore	%r15
635	mov	-40(%rsi),%r14
636.cfi_restore	%r14
637	mov	-32(%rsi),%r13
638.cfi_restore	%r13
639	mov	-24(%rsi),%r12
640.cfi_restore	%r12
641	mov	-16(%rsi),%rbp
642.cfi_restore	%rbp
643	mov	-8(%rsi),%rbx
644.cfi_restore	%rbx
645	lea	(%rsi),%rsp
646.cfi_def_cfa_register	%rsp
647.Lepilogue_xop:
648	ret
649.cfi_endproc
650.size	${func}_xop,.-${func}_xop
651___
652######################################################################
653# AVX+shrd code path
654#
655local *ror = sub { &shrd(@_[0],@_) };
656
657$code.=<<___;
658.type	${func}_avx,\@function,6
659.align	64
660${func}_avx:
661.cfi_startproc
662.Lavx_shortcut:
663	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
664	mov	%rsp,%rax		# copy %rsp
665.cfi_def_cfa_register	%rax
666	push	%rbx
667.cfi_push	%rbx
668	push	%rbp
669.cfi_push	%rbp
670	push	%r12
671.cfi_push	%r12
672	push	%r13
673.cfi_push	%r13
674	push	%r14
675.cfi_push	%r14
676	push	%r15
677.cfi_push	%r15
678	sub	\$`$framesz+$win64*16*10`,%rsp
679	and	\$-64,%rsp		# align stack frame
680
681	shl	\$6,$len
682	sub	$inp,$out		# re-bias
683	sub	$inp,$in0
684	add	$inp,$len		# end of input
685
686	#mov	$inp,$_inp		# saved later
687	mov	$out,$_out
688	mov	$len,$_end
689	#mov	$key,$_key		# remains resident in $inp register
690	mov	$ivp,$_ivp
691	mov	$ctx,$_ctx
692	mov	$in0,$_in0
693	mov	%rax,$_rsp
694.cfi_cfa_expression	$_rsp,deref,+8
695___
696$code.=<<___ if ($win64);
697	movaps	%xmm6,`$framesz+16*0`(%rsp)
698	movaps	%xmm7,`$framesz+16*1`(%rsp)
699	movaps	%xmm8,`$framesz+16*2`(%rsp)
700	movaps	%xmm9,`$framesz+16*3`(%rsp)
701	movaps	%xmm10,`$framesz+16*4`(%rsp)
702	movaps	%xmm11,`$framesz+16*5`(%rsp)
703	movaps	%xmm12,`$framesz+16*6`(%rsp)
704	movaps	%xmm13,`$framesz+16*7`(%rsp)
705	movaps	%xmm14,`$framesz+16*8`(%rsp)
706	movaps	%xmm15,`$framesz+16*9`(%rsp)
707___
708$code.=<<___;
709.Lprologue_avx:
710	vzeroall
711
712	mov	$inp,%r12		# borrow $a4
713	lea	0x80($key),$inp		# size optimization, reassign
714	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
715	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
716	mov	$ctx,%r15		# borrow $a2
717	mov	$in0,%rsi		# borrow $a3
718	vmovdqu	($ivp),$iv		# load IV
719	sub	\$9,%r14
720
721	mov	$SZ*0(%r15),$A
722	mov	$SZ*1(%r15),$B
723	mov	$SZ*2(%r15),$C
724	mov	$SZ*3(%r15),$D
725	mov	$SZ*4(%r15),$E
726	mov	$SZ*5(%r15),$F
727	mov	$SZ*6(%r15),$G
728	mov	$SZ*7(%r15),$H
729
730	vmovdqa	0x00(%r13,%r14,8),$mask14
731	vmovdqa	0x10(%r13,%r14,8),$mask12
732	vmovdqa	0x20(%r13,%r14,8),$mask10
733	vmovdqu	0x00-0x80($inp),$roundkey
734___
735					if ($SZ==4) {	# SHA256
736    my @X = map("%xmm$_",(0..3));
737    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
738
739$code.=<<___;
740	jmp	.Lloop_avx
741.align	16
742.Lloop_avx:
743	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
744	vmovdqu	0x00(%rsi,%r12),@X[0]
745	vmovdqu	0x10(%rsi,%r12),@X[1]
746	vmovdqu	0x20(%rsi,%r12),@X[2]
747	vmovdqu	0x30(%rsi,%r12),@X[3]
748	vpshufb	$t3,@X[0],@X[0]
749	lea	$TABLE(%rip),$Tbl
750	vpshufb	$t3,@X[1],@X[1]
751	vpshufb	$t3,@X[2],@X[2]
752	vpaddd	0x00($Tbl),@X[0],$t0
753	vpshufb	$t3,@X[3],@X[3]
754	vpaddd	0x20($Tbl),@X[1],$t1
755	vpaddd	0x40($Tbl),@X[2],$t2
756	vpaddd	0x60($Tbl),@X[3],$t3
757	vmovdqa	$t0,0x00(%rsp)
758	mov	$A,$a1
759	vmovdqa	$t1,0x10(%rsp)
760	mov	$B,$a3
761	vmovdqa	$t2,0x20(%rsp)
762	xor	$C,$a3			# magic
763	vmovdqa	$t3,0x30(%rsp)
764	mov	$E,$a0
765	jmp	.Lavx_00_47
766
767.align	16
768.Lavx_00_47:
769	sub	\$-16*2*$SZ,$Tbl	# size optimization
770	vmovdqu	(%r12),$inout		# $a4
771	mov	%r12,$_inp		# $a4
772___
773sub Xupdate_256_AVX () {
774	(
775	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
776	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
777	'&vpsrld	($t2,$t0,$sigma0[0]);',
778	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
779	'&vpsrld	($t3,$t0,$sigma0[2])',
780	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
781	'&vpxor		($t0,$t3,$t2)',
782	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
783	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
784	'&vpxor		($t0,$t0,$t1)',
785	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
786	'&vpxor		($t0,$t0,$t2)',
787	 '&vpsrld	($t2,$t3,$sigma1[2]);',
788	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
789	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
790	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
791	 '&vpxor	($t2,$t2,$t3);',
792	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
793	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
794	 '&vpshufd	($t2,$t2,0b10000100)',
795	 '&vpsrldq	($t2,$t2,8)',
796	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
797	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
798	 '&vpsrld	($t2,$t3,$sigma1[2])',
799	 '&vpsrlq	($t3,$t3,$sigma1[0])',
800	 '&vpxor	($t2,$t2,$t3);',
801	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
802	 '&vpxor	($t2,$t2,$t3)',
803	 '&vpshufd	($t2,$t2,0b11101000)',
804	 '&vpslldq	($t2,$t2,8)',
805	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
806	);
807}
808
809sub AVX_256_00_47 () {
810my $j = shift;
811my $body = shift;
812my @X = @_;
813my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
814
815	foreach (Xupdate_256_AVX()) {		# 29 instructions
816	    eval;
817	    eval(shift(@insns));
818	    eval(shift(@insns));
819	    eval(shift(@insns));
820	}
821	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
822	  foreach (@insns) { eval; }		# remaining instructions
823	&vmovdqa	(16*$j."(%rsp)",$t2);
824}
825
826    $aesni_cbc_idx=0;
827    for ($i=0,$j=0; $j<4; $j++) {
828	&AVX_256_00_47($j,\&body_00_15,@X);
829	push(@X,shift(@X));			# rotate(@X)
830    }
831    	&mov		("%r12",$_inp);		# borrow $a4
832	&vpand		($temp,$temp,$mask14);
833	&mov		("%r15",$_out);		# borrow $a2
834	&vpor		($iv,$iv,$temp);
835	&vmovdqu	("(%r15,%r12)",$iv);	# write output
836	&lea		("%r12","16(%r12)");	# inp++
837
838	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
839	&jne	(".Lavx_00_47");
840
841	&vmovdqu	($inout,"(%r12)");
842	&mov		($_inp,"%r12");
843
844    $aesni_cbc_idx=0;
845    for ($i=0; $i<16; ) {
846	foreach(body_00_15()) { eval; }
847    }
848
849					}
850$code.=<<___;
851	mov	$_inp,%r12		# borrow $a4
852	mov	$_out,%r13		# borrow $a0
853	mov	$_ctx,%r15		# borrow $a2
854	mov	$_in0,%rsi		# borrow $a3
855
856	vpand	$mask14,$temp,$temp
857	mov	$a1,$A
858	vpor	$temp,$iv,$iv
859	vmovdqu	$iv,(%r13,%r12)		# write output
860	lea	16(%r12),%r12		# inp++
861
862	add	$SZ*0(%r15),$A
863	add	$SZ*1(%r15),$B
864	add	$SZ*2(%r15),$C
865	add	$SZ*3(%r15),$D
866	add	$SZ*4(%r15),$E
867	add	$SZ*5(%r15),$F
868	add	$SZ*6(%r15),$G
869	add	$SZ*7(%r15),$H
870
871	cmp	$_end,%r12
872
873	mov	$A,$SZ*0(%r15)
874	mov	$B,$SZ*1(%r15)
875	mov	$C,$SZ*2(%r15)
876	mov	$D,$SZ*3(%r15)
877	mov	$E,$SZ*4(%r15)
878	mov	$F,$SZ*5(%r15)
879	mov	$G,$SZ*6(%r15)
880	mov	$H,$SZ*7(%r15)
881	jb	.Lloop_avx
882
883	mov	$_ivp,$ivp
884	mov	$_rsp,%rsi
885.cfi_def_cfa	%rsi,8
886	vmovdqu	$iv,($ivp)		# output IV
887	vzeroall
888___
889$code.=<<___ if ($win64);
890	movaps	`$framesz+16*0`(%rsp),%xmm6
891	movaps	`$framesz+16*1`(%rsp),%xmm7
892	movaps	`$framesz+16*2`(%rsp),%xmm8
893	movaps	`$framesz+16*3`(%rsp),%xmm9
894	movaps	`$framesz+16*4`(%rsp),%xmm10
895	movaps	`$framesz+16*5`(%rsp),%xmm11
896	movaps	`$framesz+16*6`(%rsp),%xmm12
897	movaps	`$framesz+16*7`(%rsp),%xmm13
898	movaps	`$framesz+16*8`(%rsp),%xmm14
899	movaps	`$framesz+16*9`(%rsp),%xmm15
900___
901$code.=<<___;
902	mov	-48(%rsi),%r15
903.cfi_restore	%r15
904	mov	-40(%rsi),%r14
905.cfi_restore	%r14
906	mov	-32(%rsi),%r13
907.cfi_restore	%r13
908	mov	-24(%rsi),%r12
909.cfi_restore	%r12
910	mov	-16(%rsi),%rbp
911.cfi_restore	%rbp
912	mov	-8(%rsi),%rbx
913.cfi_restore	%rbx
914	lea	(%rsi),%rsp
915.cfi_def_cfa_register	%rsp
916.Lepilogue_avx:
917	ret
918.cfi_endproc
919.size	${func}_avx,.-${func}_avx
920___
921
922if ($avx>1) {{
923######################################################################
924# AVX2+BMI code path
925#
926my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
927my $PUSH8=8*2*$SZ;
928use integer;
929
930sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
932	(
933	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
934
935	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
936	'&and	($a4,$e)',		# f&e
937	'&rorx	($a0,$e,$Sigma1[2])',
938	'&rorx	($a2,$e,$Sigma1[1])',
939
940	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
941	'&lea	($h,"($h,$a4)")',
942	'&andn	($a4,$e,$g)',		# ~e&g
943	'&xor	($a0,$a2)',
944
945	'&rorx	($a1,$e,$Sigma1[0])',
946	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
947	'&xor	($a0,$a1)',		# Sigma1(e)
948	'&mov	($a2,$a)',
949
950	'&rorx	($a4,$a,$Sigma0[2])',
951	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
952	'&xor	($a2,$b)',		# a^b, b^c in next round
953	'&rorx	($a1,$a,$Sigma0[1])',
954
955	'&rorx	($a0,$a,$Sigma0[0])',
956	'&lea	($d,"($d,$h)")',	# d+=h
957	'&and	($a3,$a2)',		# (b^c)&(a^b)
958	@aesni_cbc_block[$aesni_cbc_idx++].
959	'&xor	($a1,$a4)',
960
961	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
962	'&xor	($a1,$a0)',		# Sigma0(a)
963	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
964	'&mov	($a4,$e)',		# copy of f in future
965
966	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
967	);
	# and at the finish one has to do $a+=$a1 (the Sigma0(a) deferred from the last round)
969}
970
971$code.=<<___;
972.type	${func}_avx2,\@function,6
973.align	64
974${func}_avx2:
975.cfi_startproc
976.Lavx2_shortcut:
977	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
978	mov	%rsp,%rax		# copy %rsp
979.cfi_def_cfa_register	%rax
980	push	%rbx
981.cfi_push	%rbx
982	push	%rbp
983.cfi_push	%rbp
984	push	%r12
985.cfi_push	%r12
986	push	%r13
987.cfi_push	%r13
988	push	%r14
989.cfi_push	%r14
990	push	%r15
991.cfi_push	%r15
992	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
993	and	\$-256*$SZ,%rsp		# align stack frame
994	add	\$`2*$SZ*($rounds-8)`,%rsp
995
996	shl	\$6,$len
997	sub	$inp,$out		# re-bias
998	sub	$inp,$in0
999	add	$inp,$len		# end of input
1000
1001	#mov	$inp,$_inp		# saved later
1002	#mov	$out,$_out		# kept in $offload
1003	mov	$len,$_end
1004	#mov	$key,$_key		# remains resident in $inp register
1005	mov	$ivp,$_ivp
1006	mov	$ctx,$_ctx
1007	mov	$in0,$_in0
1008	mov	%rax,$_rsp
1009.cfi_cfa_expression	$_rsp,deref,+8
1010___
1011$code.=<<___ if ($win64);
1012	movaps	%xmm6,`$framesz+16*0`(%rsp)
1013	movaps	%xmm7,`$framesz+16*1`(%rsp)
1014	movaps	%xmm8,`$framesz+16*2`(%rsp)
1015	movaps	%xmm9,`$framesz+16*3`(%rsp)
1016	movaps	%xmm10,`$framesz+16*4`(%rsp)
1017	movaps	%xmm11,`$framesz+16*5`(%rsp)
1018	movaps	%xmm12,`$framesz+16*6`(%rsp)
1019	movaps	%xmm13,`$framesz+16*7`(%rsp)
1020	movaps	%xmm14,`$framesz+16*8`(%rsp)
1021	movaps	%xmm15,`$framesz+16*9`(%rsp)
1022___
1023$code.=<<___;
1024.Lprologue_avx2:
1025	vzeroall
1026
1027	mov	$inp,%r13		# borrow $a0
1028	vpinsrq	\$1,$out,$offload,$offload
1029	lea	0x80($key),$inp		# size optimization, reassign
1030	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
1031	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
1032	mov	$ctx,%r15		# borrow $a2
1033	mov	$in0,%rsi		# borrow $a3
1034	vmovdqu	($ivp),$iv		# load IV
1035	lea	-9(%r14),%r14
1036
1037	vmovdqa	0x00(%r12,%r14,8),$mask14
1038	vmovdqa	0x10(%r12,%r14,8),$mask12
1039	vmovdqa	0x20(%r12,%r14,8),$mask10
1040
1041	sub	\$-16*$SZ,%r13		# inp++, size optimization
1042	mov	$SZ*0(%r15),$A
1043	lea	(%rsi,%r13),%r12	# borrow $a0
1044	mov	$SZ*1(%r15),$B
1045	cmp	$len,%r13		# $_end
1046	mov	$SZ*2(%r15),$C
1047	cmove	%rsp,%r12		# next block or random data
1048	mov	$SZ*3(%r15),$D
1049	mov	$SZ*4(%r15),$E
1050	mov	$SZ*5(%r15),$F
1051	mov	$SZ*6(%r15),$G
1052	mov	$SZ*7(%r15),$H
1053	vmovdqu	0x00-0x80($inp),$roundkey
1054___
1055					if ($SZ==4) {	# SHA256
1056    my @X = map("%ymm$_",(0..3));
1057    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1058
1059$code.=<<___;
1060	jmp	.Loop_avx2
1061.align	16
1062.Loop_avx2:
1063	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1064	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1065	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1066	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1067	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1068
1069	vinserti128	\$1,(%r12),@X[0],@X[0]
1070	vinserti128	\$1,16(%r12),@X[1],@X[1]
1071	 vpshufb	$t3,@X[0],@X[0]
1072	vinserti128	\$1,32(%r12),@X[2],@X[2]
1073	 vpshufb	$t3,@X[1],@X[1]
1074	vinserti128	\$1,48(%r12),@X[3],@X[3]
1075
1076	lea	$TABLE(%rip),$Tbl
1077	vpshufb	$t3,@X[2],@X[2]
1078	lea	-16*$SZ(%r13),%r13
1079	vpaddd	0x00($Tbl),@X[0],$t0
1080	vpshufb	$t3,@X[3],@X[3]
1081	vpaddd	0x20($Tbl),@X[1],$t1
1082	vpaddd	0x40($Tbl),@X[2],$t2
1083	vpaddd	0x60($Tbl),@X[3],$t3
1084	vmovdqa	$t0,0x00(%rsp)
1085	xor	$a1,$a1
1086	vmovdqa	$t1,0x20(%rsp)
1087	lea	-$PUSH8(%rsp),%rsp
1088	mov	$B,$a3
1089	vmovdqa	$t2,0x00(%rsp)
1090	xor	$C,$a3			# magic
1091	vmovdqa	$t3,0x20(%rsp)
1092	mov	$F,$a4
1093	sub	\$-16*2*$SZ,$Tbl	# size optimization
1094	jmp	.Lavx2_00_47
1095
1096.align	16
1097.Lavx2_00_47:
1098	vmovdqu	(%r13),$inout
1099	vpinsrq	\$0,%r13,$offload,$offload
1100___
1101
1102sub AVX2_256_00_47 () {
1103my $j = shift;
1104my $body = shift;
1105my @X = @_;
1106my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1107my $base = "+2*$PUSH8(%rsp)";
1108
1109	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1110	foreach (Xupdate_256_AVX()) {		# 29 instructions
1111	    eval;
1112	    eval(shift(@insns));
1113	    eval(shift(@insns));
1114	    eval(shift(@insns));
1115	}
1116	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1117	  foreach (@insns) { eval; }		# remaining instructions
1118	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1119}
1120    $aesni_cbc_idx=0;
1121    for ($i=0,$j=0; $j<4; $j++) {
1122	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1123	push(@X,shift(@X));			# rotate(@X)
1124    }
1125	&vmovq		("%r13",$offload);	# borrow $a0
1126	&vpextrq	("%r15",$offload,1);	# borrow $a2
1127	&vpand		($temp,$temp,$mask14);
1128	&vpor		($iv,$iv,$temp);
1129	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1130	&lea		("%r13","16(%r13)");	# inp++
1131
1132	&lea	($Tbl,16*2*$SZ."($Tbl)");
1133	&cmpb	(($SZ-1)."($Tbl)",0);
1134	&jne	(".Lavx2_00_47");
1135
1136	&vmovdqu	($inout,"(%r13)");
1137	&vpinsrq	($offload,$offload,"%r13",0);
1138
1139    $aesni_cbc_idx=0;
1140    for ($i=0; $i<16; ) {
1141	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1142	foreach(bodyx_00_15()) { eval; }
1143    }
1144					}
1145$code.=<<___;
1146	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1147	vmovq	$offload,%r13			# $_inp, borrow $a0
1148	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1149	add	$a1,$A
1150	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1151
1152	vpand	$mask14,$temp,$temp
1153	vpor	$temp,$iv,$iv
1154	vmovdqu	$iv,(%r12,%r13)			# write output
1155	lea	16(%r13),%r13
1156
1157	add	$SZ*0(%r15),$A
1158	add	$SZ*1(%r15),$B
1159	add	$SZ*2(%r15),$C
1160	add	$SZ*3(%r15),$D
1161	add	$SZ*4(%r15),$E
1162	add	$SZ*5(%r15),$F
1163	add	$SZ*6(%r15),$G
1164	add	$SZ*7(%r15),$H
1165
1166	mov	$A,$SZ*0(%r15)
1167	mov	$B,$SZ*1(%r15)
1168	mov	$C,$SZ*2(%r15)
1169	mov	$D,$SZ*3(%r15)
1170	mov	$E,$SZ*4(%r15)
1171	mov	$F,$SZ*5(%r15)
1172	mov	$G,$SZ*6(%r15)
1173	mov	$H,$SZ*7(%r15)
1174
1175	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1176	je	.Ldone_avx2
1177
1178	xor	$a1,$a1
1179	mov	$B,$a3
1180	mov	$F,$a4
1181	xor	$C,$a3			# magic
1182	jmp	.Lower_avx2
1183.align	16
1184.Lower_avx2:
1185	vmovdqu	(%r13),$inout
1186	vpinsrq	\$0,%r13,$offload,$offload
1187___
1188    $aesni_cbc_idx=0;
1189    for ($i=0; $i<16; ) {
1190	my $base="+16($Tbl)";
1191	foreach(bodyx_00_15()) { eval; }
1192	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1193    }
1194$code.=<<___;
1195	vmovq	$offload,%r13			# borrow $a0
1196	vpextrq	\$1,$offload,%r15		# borrow $a2
1197	vpand	$mask14,$temp,$temp
1198	vpor	$temp,$iv,$iv
1199	lea	-$PUSH8($Tbl),$Tbl
1200	vmovdqu	$iv,(%r15,%r13)			# write output
1201	lea	16(%r13),%r13			# inp++
1202	cmp	%rsp,$Tbl
1203	jae	.Lower_avx2
1204
1205	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1206	lea	16*$SZ(%r13),%r13
1207	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1208	add	$a1,$A
1209	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1210
1211	add	$SZ*0(%r15),$A
1212	add	$SZ*1(%r15),$B
1213	add	$SZ*2(%r15),$C
1214	add	$SZ*3(%r15),$D
1215	add	$SZ*4(%r15),$E
1216	add	$SZ*5(%r15),$F
1217	add	$SZ*6(%r15),$G
1218	lea	(%rsi,%r13),%r12
1219	add	$SZ*7(%r15),$H
1220
1221	cmp	$_end,%r13
1222
1223	mov	$A,$SZ*0(%r15)
1224	cmove	%rsp,%r12		# next block or stale data
1225	mov	$B,$SZ*1(%r15)
1226	mov	$C,$SZ*2(%r15)
1227	mov	$D,$SZ*3(%r15)
1228	mov	$E,$SZ*4(%r15)
1229	mov	$F,$SZ*5(%r15)
1230	mov	$G,$SZ*6(%r15)
1231	mov	$H,$SZ*7(%r15)
1232
1233	jbe	.Loop_avx2
1234	lea	(%rsp),$Tbl
1235
1236.Ldone_avx2:
1237	lea	($Tbl),%rsp
1238	mov	$_ivp,$ivp
1239	mov	$_rsp,%rsi
1240.cfi_def_cfa	%rsi,8
1241	vmovdqu	$iv,($ivp)		# output IV
1242	vzeroall
1243___
1244$code.=<<___ if ($win64);
1245	movaps	`$framesz+16*0`(%rsp),%xmm6
1246	movaps	`$framesz+16*1`(%rsp),%xmm7
1247	movaps	`$framesz+16*2`(%rsp),%xmm8
1248	movaps	`$framesz+16*3`(%rsp),%xmm9
1249	movaps	`$framesz+16*4`(%rsp),%xmm10
1250	movaps	`$framesz+16*5`(%rsp),%xmm11
1251	movaps	`$framesz+16*6`(%rsp),%xmm12
1252	movaps	`$framesz+16*7`(%rsp),%xmm13
1253	movaps	`$framesz+16*8`(%rsp),%xmm14
1254	movaps	`$framesz+16*9`(%rsp),%xmm15
1255___
1256$code.=<<___;
1257	mov	-48(%rsi),%r15
1258.cfi_restore	%r15
1259	mov	-40(%rsi),%r14
1260.cfi_restore	%r14
1261	mov	-32(%rsi),%r13
1262.cfi_restore	%r13
1263	mov	-24(%rsi),%r12
1264.cfi_restore	%r12
1265	mov	-16(%rsi),%rbp
1266.cfi_restore	%rbp
1267	mov	-8(%rsi),%rbx
1268.cfi_restore	%rbx
1269	lea	(%rsi),%rsp
1270.cfi_def_cfa_register	%rsp
1271.Lepilogue_avx2:
1272	ret
1273.cfi_endproc
1274.size	${func}_avx2,.-${func}_avx2
1275___
1276}}
1277}}
1278{{
1279my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1280
1281my ($rounds,$Tbl)=("%r11d","%rbx");
1282
1283my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1284my @rndkey=("%xmm4","%xmm5");
1285my $r=0;
1286my $sn=0;
1287
1288my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1289my @MSG=map("%xmm$_",(10..13));
1290
1291my $aesenc=sub {
1292  use integer;
1293  my ($n,$k)=($r/10,$r%10);
1294    if ($k==0) {
1295      $code.=<<___;
1296	movups		`16*$n`($in0),$in		# load input
1297	xorps		$rndkey0,$in
1298___
1299      $code.=<<___ if ($n);
1300	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1301___
1302      $code.=<<___;
1303	xorps		$in,$iv
1304	movups		`32+16*$k-112`($key),$rndkey[1]
1305	aesenc		$rndkey[0],$iv
1306___
1307    } elsif ($k==9) {
1308      $sn++;
1309      $code.=<<___;
1310	cmp		\$11,$rounds
1311	jb		.Laesenclast$sn
1312	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1313	aesenc		$rndkey[0],$iv
1314	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1315	aesenc		$rndkey[1],$iv
1316	je		.Laesenclast$sn
1317	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1318	aesenc		$rndkey[0],$iv
1319	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1320	aesenc		$rndkey[1],$iv
1321.Laesenclast$sn:
1322	aesenclast	$rndkey[0],$iv
1323	movups		16-112($key),$rndkey[1]		# forward reference
1324	nop
1325___
1326    } else {
1327      $code.=<<___;
1328	movups		`32+16*$k-112`($key),$rndkey[1]
1329	aesenc		$rndkey[0],$iv
1330___
1331    }
1332    $r++;	unshift(@rndkey,pop(@rndkey));
1333};
1334
1335if ($shaext) {
1336my $Tbl="%rax";
1337
1338$code.=<<___;
1339.type	${func}_shaext,\@function,6
1340.align	32
1341${func}_shaext:
1342	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1343___
1344$code.=<<___ if ($win64);
1345	lea	`-8-10*16`(%rsp),%rsp
1346	movaps	%xmm6,-8-10*16(%rax)
1347	movaps	%xmm7,-8-9*16(%rax)
1348	movaps	%xmm8,-8-8*16(%rax)
1349	movaps	%xmm9,-8-7*16(%rax)
1350	movaps	%xmm10,-8-6*16(%rax)
1351	movaps	%xmm11,-8-5*16(%rax)
1352	movaps	%xmm12,-8-4*16(%rax)
1353	movaps	%xmm13,-8-3*16(%rax)
1354	movaps	%xmm14,-8-2*16(%rax)
1355	movaps	%xmm15,-8-1*16(%rax)
1356.Lprologue_shaext:
1357___
1358$code.=<<___;
1359	lea		K256+0x80(%rip),$Tbl
1360	movdqu		($ctx),$ABEF		# DCBA
1361	movdqu		16($ctx),$CDGH		# HGFE
1362	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1363
1364	mov		240($key),$rounds
1365	sub		$in0,$out
1366	movups		($key),$rndkey0		# $key[0]
1367	movups		($ivp),$iv		# load IV
1368	movups		16($key),$rndkey[0]	# forward reference
1369	lea		112($key),$key		# size optimization
1370
1371	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1372	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1373	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1374	movdqa		$TMP,$BSWAP		# offload
1375	palignr		\$8,$CDGH,$ABEF		# ABEF
1376	punpcklqdq	$Wi,$CDGH		# CDGH
1377
1378	jmp	.Loop_shaext
1379
1380.align	16
1381.Loop_shaext:
1382	movdqu		($inp),@MSG[0]
1383	movdqu		0x10($inp),@MSG[1]
1384	movdqu		0x20($inp),@MSG[2]
1385	pshufb		$TMP,@MSG[0]
1386	movdqu		0x30($inp),@MSG[3]
1387
1388	movdqa		0*32-0x80($Tbl),$Wi
1389	paddd		@MSG[0],$Wi
1390	pshufb		$TMP,@MSG[1]
1391	movdqa		$CDGH,$CDGH_SAVE	# offload
1392	movdqa		$ABEF,$ABEF_SAVE	# offload
1393___
1394	&$aesenc();
1395$code.=<<___;
1396	sha256rnds2	$ABEF,$CDGH		# 0-3
1397	pshufd		\$0x0e,$Wi,$Wi
1398___
1399	&$aesenc();
1400$code.=<<___;
1401	sha256rnds2	$CDGH,$ABEF
1402
1403	movdqa		1*32-0x80($Tbl),$Wi
1404	paddd		@MSG[1],$Wi
1405	pshufb		$TMP,@MSG[2]
1406	lea		0x40($inp),$inp
1407___
1408	&$aesenc();
1409$code.=<<___;
1410	sha256rnds2	$ABEF,$CDGH		# 4-7
1411	pshufd		\$0x0e,$Wi,$Wi
1412___
1413	&$aesenc();
1414$code.=<<___;
1415	sha256rnds2	$CDGH,$ABEF
1416
1417	movdqa		2*32-0x80($Tbl),$Wi
1418	paddd		@MSG[2],$Wi
1419	pshufb		$TMP,@MSG[3]
1420	sha256msg1	@MSG[1],@MSG[0]
1421___
1422	&$aesenc();
1423$code.=<<___;
1424	sha256rnds2	$ABEF,$CDGH		# 8-11
1425	pshufd		\$0x0e,$Wi,$Wi
1426	movdqa		@MSG[3],$TMP
1427	palignr		\$4,@MSG[2],$TMP
1428	paddd		$TMP,@MSG[0]
1429___
1430	&$aesenc();
1431$code.=<<___;
1432	sha256rnds2	$CDGH,$ABEF
1433
1434	movdqa		3*32-0x80($Tbl),$Wi
1435	paddd		@MSG[3],$Wi
1436	sha256msg2	@MSG[3],@MSG[0]
1437	sha256msg1	@MSG[2],@MSG[1]
1438___
1439	&$aesenc();
1440$code.=<<___;
1441	sha256rnds2	$ABEF,$CDGH		# 12-15
1442	pshufd		\$0x0e,$Wi,$Wi
1443___
1444	&$aesenc();
1445$code.=<<___;
1446	movdqa		@MSG[0],$TMP
1447	palignr		\$4,@MSG[3],$TMP
1448	paddd		$TMP,@MSG[1]
1449	sha256rnds2	$CDGH,$ABEF
1450___
1451for($i=4;$i<16-3;$i++) {
1452	&$aesenc()	if (($r%10)==0);
1453$code.=<<___;
1454	movdqa		$i*32-0x80($Tbl),$Wi
1455	paddd		@MSG[0],$Wi
1456	sha256msg2	@MSG[0],@MSG[1]
1457	sha256msg1	@MSG[3],@MSG[2]
1458___
1459	&$aesenc();
1460$code.=<<___;
1461	sha256rnds2	$ABEF,$CDGH		# 16-19...
1462	pshufd		\$0x0e,$Wi,$Wi
1463	movdqa		@MSG[1],$TMP
1464	palignr		\$4,@MSG[0],$TMP
1465	paddd		$TMP,@MSG[2]
1466___
1467	&$aesenc();
1468	&$aesenc()	if ($r==19);
1469$code.=<<___;
1470	sha256rnds2	$CDGH,$ABEF
1471___
1472	push(@MSG,shift(@MSG));
1473}
1474$code.=<<___;
1475	movdqa		13*32-0x80($Tbl),$Wi
1476	paddd		@MSG[0],$Wi
1477	sha256msg2	@MSG[0],@MSG[1]
1478	sha256msg1	@MSG[3],@MSG[2]
1479___
1480	&$aesenc();
1481$code.=<<___;
1482	sha256rnds2	$ABEF,$CDGH		# 52-55
1483	pshufd		\$0x0e,$Wi,$Wi
1484	movdqa		@MSG[1],$TMP
1485	palignr		\$4,@MSG[0],$TMP
1486	paddd		$TMP,@MSG[2]
1487___
1488	&$aesenc();
1489	&$aesenc();
1490$code.=<<___;
1491	sha256rnds2	$CDGH,$ABEF
1492
1493	movdqa		14*32-0x80($Tbl),$Wi
1494	paddd		@MSG[1],$Wi
1495	sha256msg2	@MSG[1],@MSG[2]
1496	movdqa		$BSWAP,$TMP
1497___
1498	&$aesenc();
1499$code.=<<___;
1500	sha256rnds2	$ABEF,$CDGH		# 56-59
1501	pshufd		\$0x0e,$Wi,$Wi
1502___
1503	&$aesenc();
1504$code.=<<___;
1505	sha256rnds2	$CDGH,$ABEF
1506
1507	movdqa		15*32-0x80($Tbl),$Wi
1508	paddd		@MSG[2],$Wi
1509___
1510	&$aesenc();
1511	&$aesenc();
1512$code.=<<___;
1513	sha256rnds2	$ABEF,$CDGH		# 60-63
1514	pshufd		\$0x0e,$Wi,$Wi
1515___
1516	&$aesenc();
1517$code.=<<___;
1518	sha256rnds2	$CDGH,$ABEF
1519	#pxor		$CDGH,$rndkey0		# black magic
1520___
1521	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1522$code.=<<___;
1523	#xorps		$CDGH,$rndkey0		# black magic
1524	paddd		$CDGH_SAVE,$CDGH
1525	paddd		$ABEF_SAVE,$ABEF
1526
1527	dec		$len
1528	movups		$iv,48($out,$in0)	# write output
1529	lea		64($in0),$in0
1530	jnz		.Loop_shaext
1531
1532	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1533	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1534	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1535	punpckhqdq	$CDGH,$ABEF		# DCBA
1536	palignr		\$8,$TMP,$CDGH		# HGFE
1537
1538	movups		$iv,($ivp)		# write IV
1539	movdqu		$ABEF,($ctx)
1540	movdqu		$CDGH,16($ctx)
1541___
1542$code.=<<___ if ($win64);
1543	movaps	0*16(%rsp),%xmm6
1544	movaps	1*16(%rsp),%xmm7
1545	movaps	2*16(%rsp),%xmm8
1546	movaps	3*16(%rsp),%xmm9
1547	movaps	4*16(%rsp),%xmm10
1548	movaps	5*16(%rsp),%xmm11
1549	movaps	6*16(%rsp),%xmm12
1550	movaps	7*16(%rsp),%xmm13
1551	movaps	8*16(%rsp),%xmm14
1552	movaps	9*16(%rsp),%xmm15
1553	lea	8+10*16(%rsp),%rsp
1554.Lepilogue_shaext:
1555___
1556$code.=<<___;
1557	ret
1558.size	${func}_shaext,.-${func}_shaext
1559___
1560}
1561}}}}}
1562
1563# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1564#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1565if ($win64 && $avx) {
1566$rec="%rcx";
1567$frame="%rdx";
1568$context="%r8";
1569$disp="%r9";
1570
1571$code.=<<___;
1572.extern	__imp_RtlVirtualUnwind
1573.type	se_handler,\@abi-omnipotent
1574.align	16
1575se_handler:
1576	push	%rsi
1577	push	%rdi
1578	push	%rbx
1579	push	%rbp
1580	push	%r12
1581	push	%r13
1582	push	%r14
1583	push	%r15
1584	pushfq
1585	sub	\$64,%rsp
1586
1587	mov	120($context),%rax	# pull context->Rax
1588	mov	248($context),%rbx	# pull context->Rip
1589
1590	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
1592
1593	mov	0(%r11),%r10d		# HandlerData[0]
1594	lea	(%rsi,%r10),%r10	# prologue label
1595	cmp	%r10,%rbx		# context->Rip<prologue label
1596	jb	.Lin_prologue
1597
1598	mov	152($context),%rax	# pull context->Rsp
1599
1600	mov	4(%r11),%r10d		# HandlerData[1]
1601	lea	(%rsi,%r10),%r10	# epilogue label
1602	cmp	%r10,%rbx		# context->Rip>=epilogue label
1603	jae	.Lin_prologue
1604___
1605$code.=<<___ if ($shaext);
1606	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1607	cmp	%r10,%rbx
1608	jb	.Lnot_in_shaext
1609
1610	lea	(%rax),%rsi
1611	lea	512($context),%rdi	# &context.Xmm6
1612	mov	\$20,%ecx
1613	.long	0xa548f3fc		# cld; rep movsq
1614	lea	168(%rax),%rax		# adjust stack pointer
1615	jmp	.Lin_prologue
1616.Lnot_in_shaext:
1617___
1618$code.=<<___ if ($avx>1);
1619	lea	.Lavx2_shortcut(%rip),%r10
1620	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1621	jb	.Lnot_in_avx2
1622
1623	and	\$-256*$SZ,%rax
1624	add	\$`2*$SZ*($rounds-8)`,%rax
1625.Lnot_in_avx2:
1626___
1627$code.=<<___;
1628	mov	%rax,%rsi		# put aside Rsp
1629	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1630
1631	mov	-8(%rax),%rbx
1632	mov	-16(%rax),%rbp
1633	mov	-24(%rax),%r12
1634	mov	-32(%rax),%r13
1635	mov	-40(%rax),%r14
1636	mov	-48(%rax),%r15
1637	mov	%rbx,144($context)	# restore context->Rbx
1638	mov	%rbp,160($context)	# restore context->Rbp
1639	mov	%r12,216($context)	# restore context->R12
1640	mov	%r13,224($context)	# restore context->R13
1641	mov	%r14,232($context)	# restore context->R14
1642	mov	%r15,240($context)	# restore context->R15
1643
	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6..Xmm15 save area
1645	lea	512($context),%rdi	# &context.Xmm6
1646	mov	\$20,%ecx
1647	.long	0xa548f3fc		# cld; rep movsq
1648
1649.Lin_prologue:
1650	mov	8(%rax),%rdi
1651	mov	16(%rax),%rsi
1652	mov	%rax,152($context)	# restore context->Rsp
1653	mov	%rsi,168($context)	# restore context->Rsi
1654	mov	%rdi,176($context)	# restore context->Rdi
1655
1656	mov	40($disp),%rdi		# disp->ContextRecord
1657	mov	$context,%rsi		# context
1658	mov	\$154,%ecx		# sizeof(CONTEXT)
1659	.long	0xa548f3fc		# cld; rep movsq
1660
1661	mov	$disp,%rsi
1662	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1663	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1664	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1665	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1666	mov	40(%rsi),%r10		# disp->ContextRecord
1667	lea	56(%rsi),%r11		# &disp->HandlerData
1668	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1669	mov	%r10,32(%rsp)		# arg5
1670	mov	%r11,40(%rsp)		# arg6
1671	mov	%r12,48(%rsp)		# arg7
1672	mov	%rcx,56(%rsp)		# arg8, (NULL)
1673	call	*__imp_RtlVirtualUnwind(%rip)
1674
1675	mov	\$1,%eax		# ExceptionContinueSearch
1676	add	\$64,%rsp
1677	popfq
1678	pop	%r15
1679	pop	%r14
1680	pop	%r13
1681	pop	%r12
1682	pop	%rbp
1683	pop	%rbx
1684	pop	%rdi
1685	pop	%rsi
1686	ret
1687.size	se_handler,.-se_handler
1688
1689.section	.pdata
1690	.rva	.LSEH_begin_${func}_xop
1691	.rva	.LSEH_end_${func}_xop
1692	.rva	.LSEH_info_${func}_xop
1693
1694	.rva	.LSEH_begin_${func}_avx
1695	.rva	.LSEH_end_${func}_avx
1696	.rva	.LSEH_info_${func}_avx
1697___
1698$code.=<<___ if ($avx>1);
1699	.rva	.LSEH_begin_${func}_avx2
1700	.rva	.LSEH_end_${func}_avx2
1701	.rva	.LSEH_info_${func}_avx2
1702___
1703$code.=<<___ if ($shaext);
1704	.rva	.LSEH_begin_${func}_shaext
1705	.rva	.LSEH_end_${func}_shaext
1706	.rva	.LSEH_info_${func}_shaext
1707___
1708$code.=<<___;
1709.section	.xdata
1710.align	8
1711.LSEH_info_${func}_xop:
1712	.byte	9,0,0,0
1713	.rva	se_handler
1714	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1715
1716.LSEH_info_${func}_avx:
1717	.byte	9,0,0,0
1718	.rva	se_handler
1719	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1720___
1721$code.=<<___ if ($avx>1);
1722.LSEH_info_${func}_avx2:
1723	.byte	9,0,0,0
1724	.rva	se_handler
1725	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1726___
1727$code.=<<___ if ($shaext);
1728.LSEH_info_${func}_shaext:
1729	.byte	9,0,0,0
1730	.rva	se_handler
1731	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1732___
1733}
1734
1735####################################################################
1736sub rex {
1737  local *opcode=shift;
1738  my ($dst,$src)=@_;
1739  my $rex=0;
1740
1741    $rex|=0x04			if($dst>=8);
1742    $rex|=0x01			if($src>=8);
1743    unshift @opcode,$rex|0x40	if($rex);
1744}
1745
1746{
1747  my %opcodelet = (
1748		"sha256rnds2" => 0xcb,
1749  		"sha256msg1"  => 0xcc,
1750		"sha256msg2"  => 0xcd	);
1751
1752  sub sha256op38 {
1753    my $instr = shift;
1754
1755    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1756      my @opcode=(0x0f,0x38);
1757	rex(\@opcode,$2,$1);
1758	push @opcode,$opcodelet{$instr};
1759	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1760	return ".byte\t".join(',',@opcode);
1761    } else {
1762	return $instr."\t".@_[0];
1763    }
1764  }
1765}
1766
1767$code =~ s/\`([^\`]*)\`/eval $1/gem;
1768$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1769print $code;
1770close STDOUT;
1771