#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# The multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data into a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, and there is
#	no AES-NI+SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data; the result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
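
# $avx ends up 0, 1 or 2 (no-AVX / AVX / AVX2 code paths).  It is gated
# on assembler rather than CPU capabilities because the generated file
# must assemble with the build-time toolchain regardless of which path
# the run-time CPUID dispatch in the emitted code ultimately selects.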

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
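# A hypothetical usage sketch, for illustration only.  The context is
# stored "transposed": SIMD lane i of every A..H vector holds the state
# of an independent hash.  Helper names below are made up:
#
#	SHA256_MB_CTX ctx;		/* laid out as above */
#	HASH_DESC inp[8];
#	for (int i = 0; i < 8; i++) {
#		load_sha256_iv_lane(&ctx, i);	/* H0..H7 into lane i */
#		inp[i].ptr = data[i];
#		inp[i].blocks = len[i] / 64;	/* whole blocks only */
#	}
#	sha256_multi_block(&ctx, inp, 2);	/* num groups of 4 lanes */
#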
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
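
# Xi_off addresses the 16-entry message-schedule ring buffer on the
# stack: X[i%16] lives in the first 256 bytes (based at %rax) or, once
# 16*$REG_SZ exceeds 256 bytes in the AVX2 case, in the spill area
# based at %rbx.  Both bases are biased by 128 so every slot stays
# reachable with a signed 8-bit displacement.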

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
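
# Maj() above relies on the identity Maj(a,b,c) = Ch(a^b,c,b): the
# majority function equals b wherever a==b and c wherever they differ,
# i.e. a select-by-(a^b) between c and b.  Keeping b^c alive across
# rounds (the $axb/$bxc swap right above) makes it one pand plus one
# pxor per round.  Scalar equivalent, for reference:
#
#	h = ((b ^ c) & (a ^ b)) ^ b;	/* == Maj(a,b,c) */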

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
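
# The above is the FIPS 180-4 schedule recurrence on the 16-entry ring:
#
#	X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
#
# with sigma0(x) = ror(x,7)^ror(x,18)^(x>>3) and
# sigma1(x) = ror(x,17)^ror(x,19)^(x>>10); the rotates are synthesized
# from shift pairs because SSE2 has no per-lane rotate instruction.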

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
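# Exhausted lanes are not branched around: a finished lane has its
# pointer aimed at the K256 table (any readable memory will do), so the
# gathering loads stay valid, and the per-lane block counters in (%rbx)
# are later turned into a mask that discards such a lane's contribution
# when the context is updated.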
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
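# Two independent streams are interleaved instruction by instruction,
# the space-indented column belonging to the second one; this hides
# sha256rnds2 latency behind the other stream's work.  Each unrolled
# block above and each loop iteration below covers four rounds of
# both streams.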
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
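# Rotating @MSG0/@MSG1 (the push/shift above) renames the four schedule
# registers, so one loop body serves rounds 16..51 with only the
# round-constant offset changing between unrolled copies.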
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	 vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	 vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	 vpandn	$g,$e,$t1
	 vpand	$f,$e,$axb			# borrow $axb
	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	 vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	 vpxor	$a,$b,$axb			# a^b, b^c in next round
	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	 vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	 vpand	$axb,$bxc,$bxc
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	 vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	 vpaddd	$Xi,$d,$d			# d+=Xi
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
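# Evaluate all backticked expressions accumulated so far while $REG_SZ
# is still 16; the AVX2 code below changes it to 32, which must not
# affect the SSE/AVX code already emitted.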
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
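
# Example of what sha256op38() emits, so the module assembles even with
# toolchains that predate the SHA extensions: "sha256rnds2 %xmm0,%xmm4"
# becomes ".byte 0x0f,0x38,0xcb,0xe0" (opcode 0f 38 cb, ModR/M
# 0xc0|rm=0|reg=4<<3, no REX prefix needed for registers below 8).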

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";