#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# The multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data into a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	an AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life applications are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

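# $avx encodes the best code path the assembler can digest: 0 means
# SSSE3-only, 1 additionally emits the AVX path, 2 the AVX2 path. The
# probes below only gate code *generation* on assembler support;
# run-time dispatch on actual CPU capability happens inside
# sha256_multi_block itself via OPENSSL_ia32cap_P.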
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
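# A minimal caller sketch in C (hypothetical type and variable names,
# not part of this module): the context interleaves the eight hash
# states lane by lane, each inp[] element supplies a data pointer and
# a count of 64-byte blocks for its lane, and num counts groups of
# four lanes (1 or 2), matching the .Loop_grande structure below.
#
#	typedef struct { unsigned int A[8],B[8],C[8],D[8],
#			 E[8],F[8],G[8],H[8]; } SHA256_MB_CTX;
#	typedef struct { const void *ptr; int blocks; } HASH_DESC;
#
#	SHA256_MB_CTX ctx;	/* seed lane i: ctx.A[i]=0x6a09e667, ... */
#	HASH_DESC inp[4];	/* pre-padded input, whole blocks only */
#	/* ... fill inp[0..3].ptr and inp[0..3].blocks ... */
#	sha256_multi_block(&ctx, inp, 1);	/* num=1: lanes 0..3 */
#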
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";
$inp_elm_size=2*$ptr_size;

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

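# Xi_off maps a message-schedule index onto the 16-entry ring buffer
# kept on the stack; the -128 bias keeps displacements in the short
# signed-byte encoding. With $REG_SZ==32 (AVX2) the upper half of the
# ring is addressed via %rbx instead of %rax.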
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

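# Each call emits one SHA256 round for all lanes at once (standard
# FIPS 180-4 round, computed per 32-bit lane):
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1;  h = T1 + T2
#
# Maj(a,b,c) is evaluated as Ch(a^b,c,b) so that this round's a^b can
# be recycled as the next round's b^c ($axb/$bxc swap at the bottom).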
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

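# Rounds 16..63 extend the message schedule in the 16-entry ring
# (FIPS 180-4 with indices shifted to ring positions):
#
#	W[i+16] = sigma1(W[i+14]) + W[i+9] + sigma0(W[i+1]) + W[i]
#
# $Xi enters holding the ring entry being overwritten and leaves
# holding the freshly extended W, which the embedded ROUND_00_15 call
# consumes; $Xi and $Xn then swap for the next round.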
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

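	# Retire finished lanes in place: a lane on its last block gets
	# its input pointer redirected at the K256 table (a harmless
	# dummy read), and the pcmpgtd-derived mask below zeroes the
	# contribution of already-exhausted lanes before the working
	# state is folded back into the context.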
	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

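# SHA-NI flavour: two buffers are hashed per iteration, each as an
# ordinary single-stream SHA256, with the two streams' instructions
# interleaved for ILP. Each sha256rnds2 advances one stream by two
# rounds: the implicit XMM0 operand ($Wi) carries the corresponding
# W[i]+K[i] pair, and pshufd 0x0e moves the upper pair into place for
# the second issue.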
$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
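	# SHA-NI keeps the state as two packed halves per stream,
	# {A,B,E,F} and {C,D,G,H}, with A (resp. C) in the most
	# significant dword, hence the pshufd reversals above.
	# sha256rnds2 writes the updated ABEF into what was the CDGH
	# register, which is why the operand roles alternate below.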
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	 vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	 vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	 vpandn	$g,$e,$t1
	 vpand	$f,$e,$axb			# borrow $axb
	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	 vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	 vpxor	$a,$b,$axb			# a^b, b^c in next round
	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	 vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	 vpand	$axb,$bxc,$bxc
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	 vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	 vpaddd	$Xi,$d,$d			# d+=Xi
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
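# Each round constant is replicated eight times (two .long quadruples,
# 32 bytes), one copy per lane, so the round code can fetch K[round]
# for every lane with a single `32*($i%8)-128`($Tbl) load in both the
# 4-lane (xmm) and 8-lane (ymm) paths.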
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
	mov	`32*17`(%rax),%rax	# pull saved stack pointer
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
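# sha256op38() hand-assembles the SHA extension instructions for
# assemblers that predate them. A worked encoding: 0F 38 CB is the
# sha256rnds2 opcode and 0xd1 the ModR/M byte for src=%xmm1,
# dst=%xmm2 (0xc0|1|(2<<3)), so
#
#	sha256rnds2 %xmm1,%xmm2  ->  .byte 0x0f,0x38,0xcb,0xd1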

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";