#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# The multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data into a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3, nor to Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	an AES-NI+SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);
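#
# A worked reading of the table (derived only from the figures above):
# on Haswell at n=8 the multi-block code plus AES costs 26.0/8 ~= 3.25
# cycles per processed byte per buffer, against 8.79 for the
# AES-NI+SHA256 stitch, i.e. 8.79/3.25 ~= 2.7x, which is the +170%
# asymptotic gain quoted in the last column.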

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
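
# $avx encodes the instruction-set level the assembler at hand is known
# to support: 0 - SSE only, 1 - AVX, 2 - AVX2. It is probed from the
# version of the available assembler (gas, nasm or ml64), or from the
# compiler banner when a clang-based driver assembles the output itself.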

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
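# A hypothetical C-side call (illustration only, names assumed):
#
#	struct { void *ptr; int blocks; } inp[8];
#	/* fill inp[i].ptr and inp[i].blocks for each buffer ... */
#	sha256_multi_block(ctx, inp, num);
#
# For this SSE path num=1 covers inp[0..3]; num=2 makes .Loop_grande
# below run a second pass over inp[4..7], stepping ctx ahead by one
# lane group. A lane whose blocks count is <=0 is "cancelled": its
# pointer is redirected at a constant area so loads stay valid while
# its result is masked away.
#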
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
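
# Xi_off maps message-schedule slot i%16 to its stack home. The window
# is split across two areas addressed off %rax and %rbx, each biased by
# 128 bytes so that disp8 addressing reaches every slot. For example,
# with $REG_SZ==16 Xi_off(3) yields "48-128(%rax)"; with $REG_SZ==32
# (the AVX2 flavour) Xi_off(9) is "288-256-128(%rbx)", i.e. the
# spill-over area.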

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
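
# A note on the round body above (FIPS 180-4 notation):
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	Sigma1(e) = (e>>>6) ^ (e>>>11) ^ (e>>>25)
#	Sigma0(a) = (a>>>2) ^ (a>>>13) ^ (a>>>22)
#
# SSE has no vector rotate, so each x>>>n is synthesized as psrld by n
# paired with pslld by 32-n; hence the left-shift counts 7, 21, 26
# (complements for Sigma1) and 10, 19, 30 (for Sigma0), applied
# incrementally so one register can serve all three rotations.
# Maj(a,b,c) is computed as Ch(a^b,c,b), with a^b carried over to the
# next round where it serves as b^c.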

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
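
# Message expansion per FIPS 180-4:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#	sigma0(x) = (x>>>7) ^ (x>>>18) ^ (x>>3)
#	sigma1(x) = (x>>>17) ^ (x>>>19) ^ (x>>10)
#
# Indexed forward from the oldest word, the word replacing slot i%16
# draws on X[i+1] (distance 15), X[i+9] (distance 7) and X[i+14]
# (distance 2) out of the 16-entry window kept by Xi_off; $Xi and $Xn
# are swapped every round so the next X[i+1] load is already in flight.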

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
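
# Idle lanes (blocks <= 0) are not branched around: the cmovle above
# points them at the K256 table, a readable constant area, so the
# gather loads in the rounds stay valid; their results are discarded
# later by the per-lane counter mask at the bottom of .Loop.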
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]
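
	# Each lane's remaining-block counter doubles as its write mask:
	# pcmpgtd against zero yields all-ones for still-active lanes,
	# and paddd of that all-ones (i.e. -1) value back into the
	# counters is the per-lane decrement. Below, the mask zeroes the
	# working state of exhausted lanes before the feed-forward, so
	# only live lanes update the context.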

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));
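
# SHA extension convention: sha256rnds2 performs two rounds on a state
# held as two halves, {A,B,E,F} and {C,D,G,H}, taking the round
# constants already added to the message words from the implicit %xmm0
# ($Wi). This path drives two buffers at once, so it maintains two such
# pairs ($ABEF0/$CDGH0 and $ABEF1/$CDGH1) and interleaves their rounds
# to hide sha256rnds2 latency; the pshufd with 0b00011011 below
# reverses dword order into the layout the instruction expects.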

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	 vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	 vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	 vpandn	$g,$e,$t1
	 vpand	$f,$e,$axb			# borrow $axb
	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	 vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	 vpxor	$a,$b,$axb			# a^b, b^c in next round
	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	 vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	 vpand	$axb,$bxc,$bxc
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	 vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	 vpaddd	$Xi,$d,$d			# d+=Xi
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
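
# The AVX flavour is the same algorithm in non-destructive three-operand
# form: nothing has to be copied out of $e or $a before shifting, so the
# movdqa shuffling of the SSE version disappears and the rotate halves
# use their full shift counts (26/21/7 and 30/19/10) directly. The same
# source doubles as the AVX2 flavour: with $REG_SZ==32 it is re-run over
# %ymm registers, processing eight lanes per instruction.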

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
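
# From here the round generators above are reused verbatim: the eval of
# all pending backticked expressions just above freezes the offsets
# already emitted for the SSE/AVX flavours, and bumping $REG_SZ to 32
# while remapping @V onto %ymm registers makes the same subs emit
# 8-lane AVX2 code (Xi_off now spills its upper half to the %rbx area).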

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
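
# A worked example of the fallback encoding above (for assemblers that
# predate the SHA extension): "sha256rnds2 %xmm1,%xmm2" becomes
# ".byte 0x0f,0x38,0xcb,0xd1" - 0f 38 cb is the opcode and 0xd1 is the
# register-direct ModR/M byte 0xc0|1|(2<<3); neither register index is
# >= 8, so rex() prepends no REX prefix.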

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;