1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that the very same
# instruction sequence is used for both SHA-256 and SHA-512. In the
# former case the instructions operate on 32-bit operands, while in the
# latter - on 64-bit ones. All I had to do was get one flavor right,
# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be to
# try to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, only
# *if* it's actually possible to noticeably improve overall ILP,
# instruction level parallelism, on a given CPU implementation.
42#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions, but are implemented in
# microcode.
49#
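# A back-of-the-envelope helper for the figures above (illustrative only,
# never called by the generator): bytes per block, times 1000, divided by
# cycles per block gives MBps per GHz of CPU clock frequency.
sub _mbps_per_ghz { my ($block_bytes,$cycles)=@_; return $block_bytes*1000/$cycles; }
# e.g. _mbps_per_ghz(64,1005) ~= 63.7 for sha256_block on Opteron and
# _mbps_per_ghz(128,1275) ~= 100 for sha512_block.
#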
50# May 2012.
51#
# An optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs: +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
56#
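# The "alternative Maj" relies on the identity (illustrative check only,
# never called by the generator):  Maj(a,b,c) = ((a^b)&(b^c))^b, which is
# the same as Ch(a^b,c,b).  Its appeal is that this round's a^b is next
# round's b^c, so one of the two XORs can be carried over between rounds,
# see the "a^b, b^c in next round" comments below.
sub _check_alt_maj {
	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
		my $maj = ($a&$b)^($a&$c)^($b&$c);
		return 0 if (((($a^$b)&($b^$c))^$b) != $maj);
	}}}
	return 1;	# identity holds for all 8 bit combinations
}
#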
57# June 2012.
58#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and should
# be used instead.] For reference, the corresponding estimated upper
# limit for the SSSE3 SHA256 improvement is 28%. The fact that higher
# coefficients are observed on VIA Nano and Bulldozer has more to do
# with the specifics of their architecture [which is a topic for
# separate discussion].
69#
70# November 2012.
71#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# The side effect is an increased stack frame, 448 additional bytes for
# SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
# (*)	whichever applies best, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;
111
112$flavour = shift;
113$output  = shift;
114if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
115
116$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
117
118$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
119( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
120( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
121die "can't locate x86_64-xlate.pl";
122
123if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
124		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
125	$avx = ($1>=2.19) + ($1>=2.22);
126}
127
128if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
129	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
130	$avx = ($1>=2.09) + ($1>=2.10);
131}
132
133if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
134	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
135	$avx = ($1>=10) + ($1>=11);
136}
137
138if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
139	$avx = ($2>=3.0) + ($2>3.0);
140}
141
142$shaext=1;	### set to zero if compiling for 1.0.1
143$avx=1		if (!$shaext && $avx);
144
145open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
146*STDOUT=*OUT;
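# Typical standalone invocation (normally the build system drives this;
# the flavour and file names below are assumptions, adjust to taste):
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s	# SHA-512 flavour
#	perl sha512-x86_64.pl elf sha256-x86_64.s	# SHA-256 flavour
#
# The same source emits either digest; the output file name decides
# which, see the /512/ test just below.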
147
148if ($output =~ /512/) {
149	$func="sha512_block_data_order";
150	$TABLE="K512";
151	$SZ=8;
152	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
153					"%r8", "%r9", "%r10","%r11");
154	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
155	@Sigma0=(28,34,39);
156	@Sigma1=(14,18,41);
157	@sigma0=(1,  8, 7);
158	@sigma1=(19,61, 6);
159	$rounds=80;
160} else {
161	$func="sha256_block_data_order";
162	$TABLE="K256";
163	$SZ=4;
164	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
165					"%r8d","%r9d","%r10d","%r11d");
166	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
167	@Sigma0=( 2,13,22);
168	@Sigma1=( 6,11,25);
169	@sigma0=( 7,18, 3);
170	@sigma1=(17,19,10);
171	$rounds=64;
172}
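
# For reference (illustrative only, never called by the generator), the
# three values in @Sigma0/@Sigma1 are rotate amounts and the first two
# values in @sigma0/@sigma1 are rotate amounts with the third being a
# plain shift, i.e. for the 32-bit flavour:
#
#	Sigma0(x) = ror(x,2) ^ ror(x,13) ^ ror(x,22)
#	sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x>>3)
#
# which is why the code below rotates by the *differences* between
# consecutive amounts and applies only the smallest amount at the end.
sub _ref_ror32	{ my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub _ref_Sigma0_256 { my $x=shift; return _ref_ror32($x,2)^_ref_ror32($x,13)^_ref_ror32($x,22); }
sub _ref_sigma0_256 { my $x=shift; return _ref_ror32($x,7)^_ref_ror32($x,18)^($x>>3); }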
173
174$ctx="%rdi";	# 1st arg, zapped by $a3
175$inp="%rsi";	# 2nd arg
176$Tbl="%rbp";
177
178$_ctx="16*$SZ+0*8(%rsp)";
179$_inp="16*$SZ+1*8(%rsp)";
180$_end="16*$SZ+2*8(%rsp)";
181$_rsp="`16*$SZ+3*8`(%rsp)";
182$framesz="16*$SZ+4*8";
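
# Stack frame layout (see the prologue below): the bottom 16*$SZ bytes
# hold the X[] ring buffer the rounds read and write, followed by the
# saved ctx and inp pointers, the precomputed end-of-input pointer and a
# copy of the caller's %rsp used by the epilogue and the unwind info.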
183
184
185sub ROUND_00_15()
186{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
187  my $STRIDE=$SZ;
188     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
189
190$code.=<<___;
191	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
192	mov	$f,$a2
193
194	xor	$e,$a0
195	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
196	xor	$g,$a2			# f^g
197
198	mov	$T1,`$SZ*($i&0xf)`(%rsp)
199	xor	$a,$a1
200	and	$e,$a2			# (f^g)&e
201
202	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
203	add	$h,$T1			# T1+=h
204	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
205
206	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
207	xor	$e,$a0
208	add	$a2,$T1			# T1+=Ch(e,f,g)
209
210	mov	$a,$a2
211	add	($Tbl),$T1		# T1+=K[round]
212	xor	$a,$a1
213
214	xor	$b,$a2			# a^b, b^c in next round
215	ror	\$$Sigma1[0],$a0	# Sigma1(e)
216	mov	$b,$h
217
218	and	$a2,$a3
219	ror	\$$Sigma0[0],$a1	# Sigma0(a)
220	add	$a0,$T1			# T1+=Sigma1(e)
221
222	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
223	add	$T1,$d			# d+=T1
224	add	$T1,$h			# h+=T1
225
226	lea	$STRIDE($Tbl),$Tbl	# round++
227___
228$code.=<<___ if ($i<15);
229	add	$a1,$h			# h+=Sigma0(a)
230___
231	($a2,$a3) = ($a3,$a2);
232}
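
# Straight-line reference for what one round does (32-bit flavour,
# illustrative only, never called by the generator).  ROUND_00_15 above
# computes the same thing, except that a^b/b^c is shared between rounds
# and h+=Sigma0(a) is postponed to the next round (modulo-scheduling).
sub _ref_round_256 {
	my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h)=@_;	# $w=X[i], $k=K[i]
	my $ror=sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; };
	my $S1 = $ror->($e,6)^$ror->($e,11)^$ror->($e,25);
	my $S0 = $ror->($a,2)^$ror->($a,13)^$ror->($a,22);
	my $ch = (($f^$g)&$e)^$g;		# == ($e&$f)^(~$e&$g)
	my $maj= (($a^$b)&($b^$c))^$b;		# == ($a&$b)^($a&$c)^($b&$c)
	my $T1 = ($h+$S1+$ch+$k+$w)&0xffffffff;
	return (($T1+$S0+$maj)&0xffffffff,	# new a
		$a,$b,$c,			# new b,c,d
		($d+$T1)&0xffffffff,		# new e
		$e,$f,$g);			# new f,g,h
}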
233
234sub ROUND_16_XX()
235{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
236
237$code.=<<___;
238	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
239	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
240
241	mov	$a0,$T1
242	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
243	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
244	mov	$a2,$a1
245	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
246
247	xor	$T1,$a0
248	shr	\$$sigma0[2],$T1
249	ror	\$$sigma0[0],$a0
250	xor	$a1,$a2
251	shr	\$$sigma1[2],$a1
252
253	ror	\$$sigma1[0],$a2
254	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
255	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
256	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
257
258	add	`$SZ*($i&0xf)`(%rsp),$T1
259	mov	$e,$a0
260	add	$a2,$T1
261	mov	$a,$a1
262___
263	&ROUND_00_15(@_);
264}
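
# Corresponding reference for the message schedule (32-bit flavour,
# illustrative only, never called by the generator):
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# ROUND_16_XX above keeps only the last 16 words in the 16*$SZ-byte ring
# buffer on the stack, hence all the ($i+N)&0xf indexing.
sub _ref_schedule_256 {
	my @X = @_;				# 16 input words
	my $ror=sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; };
	for (my $i=16;$i<64;$i++) {
		my $s0 = $ror->($X[$i-15],7)^$ror->($X[$i-15],18)^($X[$i-15]>>3);
		my $s1 = $ror->($X[$i-2],17)^$ror->($X[$i-2],19)^($X[$i-2]>>10);
		$X[$i] = ($s1+$X[$i-7]+$s0+$X[$i-16])&0xffffffff;
	}
	return @X;
}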
265
266$code=<<___;
267.text
268
269.extern	OPENSSL_ia32cap_P
270.globl	$func
271.type	$func,\@function,3
272.align	16
273$func:
274.cfi_startproc
275___
276$code.=<<___ if ($SZ==4 || $avx);
277	lea	OPENSSL_ia32cap_P(%rip),%r11
278	mov	0(%r11),%r9d
279	mov	4(%r11),%r10d
280	mov	8(%r11),%r11d
281___
282$code.=<<___ if ($SZ==4 && $shaext);
283	test	\$`1<<29`,%r11d		# check for SHA
284	jnz	_shaext_shortcut
285___
286$code.=<<___ if ($avx && $SZ==8);
287	test	\$`1<<11`,%r10d		# check for XOP
288	jnz	.Lxop_shortcut
289___
290$code.=<<___ if ($avx>1);
291	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
292	cmp	\$`1<<8|1<<5|1<<3`,%r11d
293	je	.Lavx2_shortcut
294___
295$code.=<<___ if ($avx);
296	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
297	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
298	or	%r9d,%r10d
299	cmp	\$`1<<28|1<<9|1<<30`,%r10d
300	je	.Lavx_shortcut
301___
302$code.=<<___ if ($SZ==4);
303	test	\$`1<<9`,%r10d
304	jnz	.Lssse3_shortcut
305___
306$code.=<<___;
307	mov	%rsp,%rax		# copy %rsp
308.cfi_def_cfa_register	%rax
309	push	%rbx
310.cfi_push	%rbx
311	push	%rbp
312.cfi_push	%rbp
313	push	%r12
314.cfi_push	%r12
315	push	%r13
316.cfi_push	%r13
317	push	%r14
318.cfi_push	%r14
319	push	%r15
320.cfi_push	%r15
321	shl	\$4,%rdx		# num*16
322	sub	\$$framesz,%rsp
323	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
324	and	\$-64,%rsp		# align stack frame
325	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
327	mov	%rdx,$_end		# save end pointer, "3rd" arg
328	mov	%rax,$_rsp		# save copy of %rsp
329.cfi_cfa_expression	$_rsp,deref,+8
330.Lprologue:
331
332	mov	$SZ*0($ctx),$A
333	mov	$SZ*1($ctx),$B
334	mov	$SZ*2($ctx),$C
335	mov	$SZ*3($ctx),$D
336	mov	$SZ*4($ctx),$E
337	mov	$SZ*5($ctx),$F
338	mov	$SZ*6($ctx),$G
339	mov	$SZ*7($ctx),$H
340	jmp	.Lloop
341
342.align	16
343.Lloop:
344	mov	$B,$a3
345	lea	$TABLE(%rip),$Tbl
346	xor	$C,$a3			# magic
347___
348	for($i=0;$i<16;$i++) {
349		$code.="	mov	$SZ*$i($inp),$T1\n";
350		$code.="	mov	@ROT[4],$a0\n";
351		$code.="	mov	@ROT[0],$a1\n";
352		$code.="	bswap	$T1\n";
353		&ROUND_00_15($i,@ROT);
354		unshift(@ROT,pop(@ROT));
355	}
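
# Note that "unshift(@ROT,pop(@ROT))" does not move any data around: it
# renames the registers so that the register which just received the new
# working value (written as $h above) is called $a in the next round,
# the old $a becomes $b, and so on.  One rotation turns (a,b,c,d,e,f,g,h)
# into (h,a,b,c,d,e,f,g).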
356$code.=<<___;
357	jmp	.Lrounds_16_xx
358.align	16
359.Lrounds_16_xx:
360___
361	for(;$i<32;$i++) {
362		&ROUND_16_XX($i,@ROT);
363		unshift(@ROT,pop(@ROT));
364	}
365
366$code.=<<___;
367	cmpb	\$0,`$SZ-1`($Tbl)
368	jnz	.Lrounds_16_xx
369
370	mov	$_ctx,$ctx
371	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
372	lea	16*$SZ($inp),$inp
373
374	add	$SZ*0($ctx),$A
375	add	$SZ*1($ctx),$B
376	add	$SZ*2($ctx),$C
377	add	$SZ*3($ctx),$D
378	add	$SZ*4($ctx),$E
379	add	$SZ*5($ctx),$F
380	add	$SZ*6($ctx),$G
381	add	$SZ*7($ctx),$H
382
383	cmp	$_end,$inp
384
385	mov	$A,$SZ*0($ctx)
386	mov	$B,$SZ*1($ctx)
387	mov	$C,$SZ*2($ctx)
388	mov	$D,$SZ*3($ctx)
389	mov	$E,$SZ*4($ctx)
390	mov	$F,$SZ*5($ctx)
391	mov	$G,$SZ*6($ctx)
392	mov	$H,$SZ*7($ctx)
393	jb	.Lloop
394
395	mov	$_rsp,%rsi
396.cfi_def_cfa	%rsi,8
397	mov	-48(%rsi),%r15
398.cfi_restore	%r15
399	mov	-40(%rsi),%r14
400.cfi_restore	%r14
401	mov	-32(%rsi),%r13
402.cfi_restore	%r13
403	mov	-24(%rsi),%r12
404.cfi_restore	%r12
405	mov	-16(%rsi),%rbp
406.cfi_restore	%rbp
407	mov	-8(%rsi),%rbx
408.cfi_restore	%rbx
409	lea	(%rsi),%rsp
410.cfi_def_cfa_register	%rsp
411.Lepilogue:
412	ret
413.cfi_endproc
414.size	$func,.-$func
415___
416
417if ($SZ==4) {
418$code.=<<___;
419.align	64
420.type	$TABLE,\@object
421$TABLE:
422	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
423	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
424	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
425	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
426	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
428	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
429	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
430	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
431	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
432	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
433	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
434	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
435	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
436	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
437	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
438	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
439	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
440	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
441	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
442	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
443	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
444	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
445	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
446	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
447	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
448	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
449	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
450	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
451	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
452	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
453	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
454
455	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
456	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
457	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
458	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
459	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
460	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
461	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
462___
463} else {
464$code.=<<___;
465.align	64
466.type	$TABLE,\@object
467$TABLE:
468	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
469	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
470	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
471	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
472	.quad	0x3956c25bf348b538,0x59f111f1b605d019
473	.quad	0x3956c25bf348b538,0x59f111f1b605d019
474	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
475	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
476	.quad	0xd807aa98a3030242,0x12835b0145706fbe
477	.quad	0xd807aa98a3030242,0x12835b0145706fbe
478	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
479	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
480	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
481	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
482	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
483	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
484	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
485	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
486	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
487	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
488	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
489	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
490	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
491	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
492	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
493	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
494	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
495	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
496	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
497	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
498	.quad	0x06ca6351e003826f,0x142929670a0e6e70
499	.quad	0x06ca6351e003826f,0x142929670a0e6e70
500	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
501	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
502	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
503	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
504	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
505	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
506	.quad	0x81c2c92e47edaee6,0x92722c851482353b
507	.quad	0x81c2c92e47edaee6,0x92722c851482353b
508	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
509	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
510	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
511	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
512	.quad	0xd192e819d6ef5218,0xd69906245565a910
513	.quad	0xd192e819d6ef5218,0xd69906245565a910
514	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
515	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
516	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
517	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
518	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
519	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
520	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
521	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
522	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
523	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
524	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
525	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
526	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
527	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
528	.quad	0x90befffa23631e28,0xa4506cebde82bde9
529	.quad	0x90befffa23631e28,0xa4506cebde82bde9
530	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
531	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
532	.quad	0xca273eceea26619c,0xd186b8c721c0c207
533	.quad	0xca273eceea26619c,0xd186b8c721c0c207
534	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
535	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
536	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
537	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
538	.quad	0x113f9804bef90dae,0x1b710b35131c471b
539	.quad	0x113f9804bef90dae,0x1b710b35131c471b
540	.quad	0x28db77f523047d84,0x32caab7b40c72493
541	.quad	0x28db77f523047d84,0x32caab7b40c72493
542	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
543	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
544	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
545	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
546	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
547	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
548
549	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
550	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
551	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
552___
553}
554
555######################################################################
556# SIMD code paths
557#
558if ($SZ==4 && $shaext) {{{
559######################################################################
560# Intel SHA Extensions implementation of SHA256 update function.
561#
562my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
563
564my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
565my @MSG=map("%xmm$_",(3..6));
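
# A short reminder of the sha256rnds2 convention this relies on (hedged
# summary, see the architecture manual for the authoritative wording):
# each sha256rnds2 performs two rounds, takes the two state halves
# (ABEF and CDGH) as its explicit operands and picks the two Wi+Ki
# values from the low 64 bits of the implicit %xmm0 operand - which is
# why $Wi is %xmm0 and why every group of four rounds does
# "pshufd \$0x0e,$Wi,$Wi" to move the upper two words down for the
# second sha256rnds2.  The pshufd/palignr dance below converts the
# little-endian DCBA/HGFE layout of the context into the ABEF/CDGH
# pairs the instruction expects.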
566
567$code.=<<___;
568.type	sha256_block_data_order_shaext,\@function,3
569.align	64
570sha256_block_data_order_shaext:
571_shaext_shortcut:
572___
573$code.=<<___ if ($win64);
574	lea	`-8-5*16`(%rsp),%rsp
575	movaps	%xmm6,-8-5*16(%rax)
576	movaps	%xmm7,-8-4*16(%rax)
577	movaps	%xmm8,-8-3*16(%rax)
578	movaps	%xmm9,-8-2*16(%rax)
579	movaps	%xmm10,-8-1*16(%rax)
580.Lprologue_shaext:
581___
582$code.=<<___;
583	lea		K256+0x80(%rip),$Tbl
584	movdqu		($ctx),$ABEF		# DCBA
585	movdqu		16($ctx),$CDGH		# HGFE
586	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
587
588	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
589	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
590	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
591	movdqa		$TMP,$BSWAP		# offload
592	palignr		\$8,$CDGH,$ABEF		# ABEF
593	punpcklqdq	$Wi,$CDGH		# CDGH
594	jmp		.Loop_shaext
595
596.align	16
597.Loop_shaext:
598	movdqu		($inp),@MSG[0]
599	movdqu		0x10($inp),@MSG[1]
600	movdqu		0x20($inp),@MSG[2]
601	pshufb		$TMP,@MSG[0]
602	movdqu		0x30($inp),@MSG[3]
603
604	movdqa		0*32-0x80($Tbl),$Wi
605	paddd		@MSG[0],$Wi
606	pshufb		$TMP,@MSG[1]
607	movdqa		$CDGH,$CDGH_SAVE	# offload
608	sha256rnds2	$ABEF,$CDGH		# 0-3
609	pshufd		\$0x0e,$Wi,$Wi
610	nop
611	movdqa		$ABEF,$ABEF_SAVE	# offload
612	sha256rnds2	$CDGH,$ABEF
613
614	movdqa		1*32-0x80($Tbl),$Wi
615	paddd		@MSG[1],$Wi
616	pshufb		$TMP,@MSG[2]
617	sha256rnds2	$ABEF,$CDGH		# 4-7
618	pshufd		\$0x0e,$Wi,$Wi
619	lea		0x40($inp),$inp
620	sha256msg1	@MSG[1],@MSG[0]
621	sha256rnds2	$CDGH,$ABEF
622
623	movdqa		2*32-0x80($Tbl),$Wi
624	paddd		@MSG[2],$Wi
625	pshufb		$TMP,@MSG[3]
626	sha256rnds2	$ABEF,$CDGH		# 8-11
627	pshufd		\$0x0e,$Wi,$Wi
628	movdqa		@MSG[3],$TMP
629	palignr		\$4,@MSG[2],$TMP
630	nop
631	paddd		$TMP,@MSG[0]
632	sha256msg1	@MSG[2],@MSG[1]
633	sha256rnds2	$CDGH,$ABEF
634
635	movdqa		3*32-0x80($Tbl),$Wi
636	paddd		@MSG[3],$Wi
637	sha256msg2	@MSG[3],@MSG[0]
638	sha256rnds2	$ABEF,$CDGH		# 12-15
639	pshufd		\$0x0e,$Wi,$Wi
640	movdqa		@MSG[0],$TMP
641	palignr		\$4,@MSG[3],$TMP
642	nop
643	paddd		$TMP,@MSG[1]
644	sha256msg1	@MSG[3],@MSG[2]
645	sha256rnds2	$CDGH,$ABEF
646___
647for($i=4;$i<16-3;$i++) {
648$code.=<<___;
649	movdqa		$i*32-0x80($Tbl),$Wi
650	paddd		@MSG[0],$Wi
651	sha256msg2	@MSG[0],@MSG[1]
652	sha256rnds2	$ABEF,$CDGH		# 16-19...
653	pshufd		\$0x0e,$Wi,$Wi
654	movdqa		@MSG[1],$TMP
655	palignr		\$4,@MSG[0],$TMP
656	nop
657	paddd		$TMP,@MSG[2]
658	sha256msg1	@MSG[0],@MSG[3]
659	sha256rnds2	$CDGH,$ABEF
660___
661	push(@MSG,shift(@MSG));
662}
663$code.=<<___;
664	movdqa		13*32-0x80($Tbl),$Wi
665	paddd		@MSG[0],$Wi
666	sha256msg2	@MSG[0],@MSG[1]
667	sha256rnds2	$ABEF,$CDGH		# 52-55
668	pshufd		\$0x0e,$Wi,$Wi
669	movdqa		@MSG[1],$TMP
670	palignr		\$4,@MSG[0],$TMP
671	sha256rnds2	$CDGH,$ABEF
672	paddd		$TMP,@MSG[2]
673
674	movdqa		14*32-0x80($Tbl),$Wi
675	paddd		@MSG[1],$Wi
676	sha256rnds2	$ABEF,$CDGH		# 56-59
677	pshufd		\$0x0e,$Wi,$Wi
678	sha256msg2	@MSG[1],@MSG[2]
679	movdqa		$BSWAP,$TMP
680	sha256rnds2	$CDGH,$ABEF
681
682	movdqa		15*32-0x80($Tbl),$Wi
683	paddd		@MSG[2],$Wi
684	nop
685	sha256rnds2	$ABEF,$CDGH		# 60-63
686	pshufd		\$0x0e,$Wi,$Wi
687	dec		$num
688	nop
689	sha256rnds2	$CDGH,$ABEF
690
691	paddd		$CDGH_SAVE,$CDGH
692	paddd		$ABEF_SAVE,$ABEF
693	jnz		.Loop_shaext
694
695	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
696	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
697	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
698	punpckhqdq	$CDGH,$ABEF		# DCBA
699	palignr		\$8,$TMP,$CDGH		# HGFE
700
701	movdqu	$ABEF,($ctx)
702	movdqu	$CDGH,16($ctx)
703___
704$code.=<<___ if ($win64);
705	movaps	-8-5*16(%rax),%xmm6
706	movaps	-8-4*16(%rax),%xmm7
707	movaps	-8-3*16(%rax),%xmm8
708	movaps	-8-2*16(%rax),%xmm9
709	movaps	-8-1*16(%rax),%xmm10
710	mov	%rax,%rsp
711.Lepilogue_shaext:
712___
713$code.=<<___;
714	ret
715.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
716___
717}}}
718{{{
719
720my $a4=$T1;
721my ($a,$b,$c,$d,$e,$f,$g,$h);
722
723sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
724{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
725  my $arg = pop;
726    $arg = "\$$arg" if ($arg*1 eq $arg);
727    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
728}
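
# So a call such as &ror($a0,14) appends "\tror\t\$14,$a0\n" to $code:
# the last argument becomes the (possibly immediate) first operand and
# the remaining arguments follow in reverse, i.e. AT&T operand order.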
729
730sub body_00_15 () {
731	(
732	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
733
734	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
735	'&mov	($a,$a1)',
736	'&mov	($a4,$f)',
737
738	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
739	'&xor	($a0,$e)',
740	'&xor	($a4,$g)',			# f^g
741
742	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
743	'&xor	($a1,$a)',
744	'&and	($a4,$e)',			# (f^g)&e
745
746	'&xor	($a0,$e)',
747	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
748	'&mov	($a2,$a)',
749
750	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
751	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
752	'&xor	($a2,$b)',			# a^b, b^c in next round
753
754	'&add	($h,$a4)',			# h+=Ch(e,f,g)
755	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
756	'&and	($a3,$a2)',			# (b^c)&(a^b)
757
758	'&xor	($a1,$a)',
759	'&add	($h,$a0)',			# h+=Sigma1(e)
760	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
761
762	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
763	'&add	($d,$h)',			# d+=h
764	'&add	($h,$a3)',			# h+=Maj(a,b,c)
765
766	'&mov	($a0,$d)',
767	'&add	($a1,$h);'.			# h+=Sigma0(a)
768	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
769	);
770}
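
# body_00_15() returns the round as a list of code-generating snippets,
# 26 per round, so that the SIMD Xupdate code below can interleave its
# own instructions with them via eval(shift(@insns)) and keep both the
# integer and the vector pipes busy.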
771
772######################################################################
773# SSSE3 code path
774#
775if ($SZ==4) {	# SHA256 only
776my @X = map("%xmm$_",(0..3));
777my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
778
779$code.=<<___;
780.type	${func}_ssse3,\@function,3
781.align	64
782${func}_ssse3:
783.cfi_startproc
784.Lssse3_shortcut:
785	mov	%rsp,%rax		# copy %rsp
786.cfi_def_cfa_register	%rax
787	push	%rbx
788.cfi_push	%rbx
789	push	%rbp
790.cfi_push	%rbp
791	push	%r12
792.cfi_push	%r12
793	push	%r13
794.cfi_push	%r13
795	push	%r14
796.cfi_push	%r14
797	push	%r15
798.cfi_push	%r15
799	shl	\$4,%rdx		# num*16
800	sub	\$`$framesz+$win64*16*4`,%rsp
801	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
802	and	\$-64,%rsp		# align stack frame
803	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
805	mov	%rdx,$_end		# save end pointer, "3rd" arg
806	mov	%rax,$_rsp		# save copy of %rsp
807.cfi_cfa_expression	$_rsp,deref,+8
808___
809$code.=<<___ if ($win64);
810	movaps	%xmm6,16*$SZ+32(%rsp)
811	movaps	%xmm7,16*$SZ+48(%rsp)
812	movaps	%xmm8,16*$SZ+64(%rsp)
813	movaps	%xmm9,16*$SZ+80(%rsp)
814___
815$code.=<<___;
816.Lprologue_ssse3:
817
818	mov	$SZ*0($ctx),$A
819	mov	$SZ*1($ctx),$B
820	mov	$SZ*2($ctx),$C
821	mov	$SZ*3($ctx),$D
822	mov	$SZ*4($ctx),$E
823	mov	$SZ*5($ctx),$F
824	mov	$SZ*6($ctx),$G
825	mov	$SZ*7($ctx),$H
826___
827
828$code.=<<___;
829	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
830	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
831	jmp	.Lloop_ssse3
832.align	16
833.Lloop_ssse3:
834	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
835	movdqu	0x00($inp),@X[0]
836	movdqu	0x10($inp),@X[1]
837	movdqu	0x20($inp),@X[2]
838	pshufb	$t3,@X[0]
839	movdqu	0x30($inp),@X[3]
840	lea	$TABLE(%rip),$Tbl
841	pshufb	$t3,@X[1]
842	movdqa	0x00($Tbl),$t0
843	movdqa	0x20($Tbl),$t1
844	pshufb	$t3,@X[2]
845	paddd	@X[0],$t0
846	movdqa	0x40($Tbl),$t2
847	pshufb	$t3,@X[3]
848	movdqa	0x60($Tbl),$t3
849	paddd	@X[1],$t1
850	paddd	@X[2],$t2
851	paddd	@X[3],$t3
852	movdqa	$t0,0x00(%rsp)
853	mov	$A,$a1
854	movdqa	$t1,0x10(%rsp)
855	mov	$B,$a3
856	movdqa	$t2,0x20(%rsp)
857	xor	$C,$a3			# magic
858	movdqa	$t3,0x30(%rsp)
859	mov	$E,$a0
860	jmp	.Lssse3_00_47
861
862.align	16
863.Lssse3_00_47:
864	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
865___
866sub Xupdate_256_SSSE3 () {
867	(
868	'&movdqa	($t0,@X[1]);',
869	'&movdqa	($t3,@X[3])',
870	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
871	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
872	'&movdqa	($t1,$t0)',
873	'&movdqa	($t2,$t0);',
874	'&psrld		($t0,$sigma0[2])',
875	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
876	'&psrld		($t2,$sigma0[0])',
877	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
878	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
879	'&pxor		($t0,$t2)',
880	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
881	'&pxor		($t0,$t1)',
882	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
883	'&pxor		($t0,$t2);',
884	 '&movdqa	($t2,$t3)',
885	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
886	 '&psrld	($t3,$sigma1[2])',
887	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
888	 '&psrlq	($t2,$sigma1[0])',
889	 '&pxor		($t3,$t2);',
890	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
891	 '&pxor		($t3,$t2)',
892	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
893	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
894	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
895	 '&movdqa	($t2,$t3);',
896	 '&psrld	($t3,$sigma1[2])',
897	 '&psrlq	($t2,$sigma1[0])',
898	 '&pxor		($t3,$t2);',
899	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
900	 '&pxor		($t3,$t2);',
901	'&movdqa	($t2,16*2*$j."($Tbl)")',
902	 '&pshufb	($t3,$t5)',
903	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
904	);
905}
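
# SSE has no vector rotate, so each ror in sigma0/sigma1 above is built
# from a right shift, a left shift and a pxor.  sigma1() is also applied
# to only two lanes at a time (X[14..15], then the freshly computed
# X[16..17]), because X[i] depends on X[i-2] and only two new words can
# be in flight at once.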
906
907sub SSSE3_256_00_47 () {
908my $j = shift;
909my $body = shift;
910my @X = @_;
911my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
912
913    if (0) {
914	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
915	    eval;
916	    eval(shift(@insns));
917	    eval(shift(@insns));
918	    eval(shift(@insns));
919	}
920    } else {			# squeeze extra 4% on Westmere and 19% on Atom
921	  eval(shift(@insns));	#@
922	&movdqa		($t0,@X[1]);
923	  eval(shift(@insns));
924	  eval(shift(@insns));
925	&movdqa		($t3,@X[3]);
926	  eval(shift(@insns));	#@
927	  eval(shift(@insns));
928	  eval(shift(@insns));
929	  eval(shift(@insns));	#@
930	  eval(shift(@insns));
931	&palignr	($t0,@X[0],$SZ);	# X[1..4]
932	  eval(shift(@insns));
933	  eval(shift(@insns));
934	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
935	  eval(shift(@insns));
936	  eval(shift(@insns));
937	  eval(shift(@insns));
938	  eval(shift(@insns));	#@
939	&movdqa		($t1,$t0);
940	  eval(shift(@insns));
941	  eval(shift(@insns));
942	&movdqa		($t2,$t0);
943	  eval(shift(@insns));	#@
944	  eval(shift(@insns));
945	&psrld		($t0,$sigma0[2]);
946	  eval(shift(@insns));
947	  eval(shift(@insns));
948	  eval(shift(@insns));
949	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
950	  eval(shift(@insns));	#@
951	  eval(shift(@insns));
952	&psrld		($t2,$sigma0[0]);
953	  eval(shift(@insns));
954	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
956	  eval(shift(@insns));
957	  eval(shift(@insns));	#@
958	&pslld		($t1,8*$SZ-$sigma0[1]);
959	  eval(shift(@insns));
960	  eval(shift(@insns));
961	&pxor		($t0,$t2);
962	  eval(shift(@insns));	#@
963	  eval(shift(@insns));
964	  eval(shift(@insns));
965	  eval(shift(@insns));	#@
966	&psrld		($t2,$sigma0[1]-$sigma0[0]);
967	  eval(shift(@insns));
968	&pxor		($t0,$t1);
969	  eval(shift(@insns));
970	  eval(shift(@insns));
971	&pslld		($t1,$sigma0[1]-$sigma0[0]);
972	  eval(shift(@insns));
973	  eval(shift(@insns));
974	&pxor		($t0,$t2);
975	  eval(shift(@insns));
976	  eval(shift(@insns));	#@
977	 &movdqa	($t2,$t3);
978	  eval(shift(@insns));
979	  eval(shift(@insns));
980	&pxor		($t0,$t1);		# sigma0(X[1..4])
981	  eval(shift(@insns));	#@
982	  eval(shift(@insns));
983	  eval(shift(@insns));
984	 &psrld		($t3,$sigma1[2]);
985	  eval(shift(@insns));
986	  eval(shift(@insns));
987	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
988	  eval(shift(@insns));	#@
989	  eval(shift(@insns));
990	 &psrlq		($t2,$sigma1[0]);
991	  eval(shift(@insns));
992	  eval(shift(@insns));
993	  eval(shift(@insns));
994	 &pxor		($t3,$t2);
995	  eval(shift(@insns));	#@
996	  eval(shift(@insns));
997	  eval(shift(@insns));
998	  eval(shift(@insns));	#@
999	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));
1002	 &pxor		($t3,$t2);
1003	  eval(shift(@insns));	#@
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));
1006	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
1007	 &pshufd	($t3,$t3,0b10000000);
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));
1010	  eval(shift(@insns));
1011	 &psrldq	($t3,8);
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));	#@
1014	  eval(shift(@insns));
1015	  eval(shift(@insns));
1016	  eval(shift(@insns));	#@
1017	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1018	  eval(shift(@insns));
1019	  eval(shift(@insns));
1020	  eval(shift(@insns));
1021	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));	#@
1024	  eval(shift(@insns));
1025	 &movdqa	($t2,$t3);
1026	  eval(shift(@insns));
1027	  eval(shift(@insns));
1028	 &psrld		($t3,$sigma1[2]);
1029	  eval(shift(@insns));
1030	  eval(shift(@insns));	#@
1031	 &psrlq		($t2,$sigma1[0]);
1032	  eval(shift(@insns));
1033	  eval(shift(@insns));
1034	 &pxor		($t3,$t2);
1035	  eval(shift(@insns));	#@
1036	  eval(shift(@insns));
1037	  eval(shift(@insns));
1038	  eval(shift(@insns));	#@
1039	  eval(shift(@insns));
1040	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1041	  eval(shift(@insns));
1042	  eval(shift(@insns));
1043	  eval(shift(@insns));
1044	 &pxor		($t3,$t2);
1045	  eval(shift(@insns));
1046	  eval(shift(@insns));
1047	  eval(shift(@insns));	#@
1048	 #&pshufb	($t3,$t5);
1049	 &pshufd	($t3,$t3,0b00001000);
1050	  eval(shift(@insns));
1051	  eval(shift(@insns));
1052	&movdqa		($t2,16*2*$j."($Tbl)");
1053	  eval(shift(@insns));	#@
1054	  eval(shift(@insns));
1055	 &pslldq	($t3,8);
1056	  eval(shift(@insns));
1057	  eval(shift(@insns));
1058	  eval(shift(@insns));
1059	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1060	  eval(shift(@insns));	#@
1061	  eval(shift(@insns));
1062	  eval(shift(@insns));
1063    }
1064	&paddd		($t2,@X[0]);
1065	  foreach (@insns) { eval; }		# remaining instructions
1066	&movdqa		(16*$j."(%rsp)",$t2);
1067}
1068
1069    for ($i=0,$j=0; $j<4; $j++) {
1070	&SSSE3_256_00_47($j,\&body_00_15,@X);
1071	push(@X,shift(@X));			# rotate(@X)
1072    }
1073	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1074	&jne	(".Lssse3_00_47");
1075
1076    for ($i=0; $i<16; ) {
1077	foreach(body_00_15()) { eval; }
1078    }
1079$code.=<<___;
1080	mov	$_ctx,$ctx
1081	mov	$a1,$A
1082
1083	add	$SZ*0($ctx),$A
1084	lea	16*$SZ($inp),$inp
1085	add	$SZ*1($ctx),$B
1086	add	$SZ*2($ctx),$C
1087	add	$SZ*3($ctx),$D
1088	add	$SZ*4($ctx),$E
1089	add	$SZ*5($ctx),$F
1090	add	$SZ*6($ctx),$G
1091	add	$SZ*7($ctx),$H
1092
1093	cmp	$_end,$inp
1094
1095	mov	$A,$SZ*0($ctx)
1096	mov	$B,$SZ*1($ctx)
1097	mov	$C,$SZ*2($ctx)
1098	mov	$D,$SZ*3($ctx)
1099	mov	$E,$SZ*4($ctx)
1100	mov	$F,$SZ*5($ctx)
1101	mov	$G,$SZ*6($ctx)
1102	mov	$H,$SZ*7($ctx)
1103	jb	.Lloop_ssse3
1104
1105	mov	$_rsp,%rsi
1106.cfi_def_cfa	%rsi,8
1107___
1108$code.=<<___ if ($win64);
1109	movaps	16*$SZ+32(%rsp),%xmm6
1110	movaps	16*$SZ+48(%rsp),%xmm7
1111	movaps	16*$SZ+64(%rsp),%xmm8
1112	movaps	16*$SZ+80(%rsp),%xmm9
1113___
1114$code.=<<___;
1115	mov	-48(%rsi),%r15
1116.cfi_restore	%r15
1117	mov	-40(%rsi),%r14
1118.cfi_restore	%r14
1119	mov	-32(%rsi),%r13
1120.cfi_restore	%r13
1121	mov	-24(%rsi),%r12
1122.cfi_restore	%r12
1123	mov	-16(%rsi),%rbp
1124.cfi_restore	%rbp
1125	mov	-8(%rsi),%rbx
1126.cfi_restore	%rbx
1127	lea	(%rsi),%rsp
1128.cfi_def_cfa_register	%rsp
1129.Lepilogue_ssse3:
1130	ret
1131.cfi_endproc
1132.size	${func}_ssse3,.-${func}_ssse3
1133___
1134}
1135
1136if ($avx) {{
1137######################################################################
1138# XOP code path
1139#
1140if ($SZ==8) {	# SHA512 only
1141$code.=<<___;
1142.type	${func}_xop,\@function,3
1143.align	64
1144${func}_xop:
1145.cfi_startproc
1146.Lxop_shortcut:
1147	mov	%rsp,%rax		# copy %rsp
1148.cfi_def_cfa_register	%rax
1149	push	%rbx
1150.cfi_push	%rbx
1151	push	%rbp
1152.cfi_push	%rbp
1153	push	%r12
1154.cfi_push	%r12
1155	push	%r13
1156.cfi_push	%r13
1157	push	%r14
1158.cfi_push	%r14
1159	push	%r15
1160.cfi_push	%r15
1161	shl	\$4,%rdx		# num*16
1162	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1163	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1164	and	\$-64,%rsp		# align stack frame
1165	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1167	mov	%rdx,$_end		# save end pointer, "3rd" arg
1168	mov	%rax,$_rsp		# save copy of %rsp
1169.cfi_cfa_expression	$_rsp,deref,+8
1170___
1171$code.=<<___ if ($win64);
1172	movaps	%xmm6,16*$SZ+32(%rsp)
1173	movaps	%xmm7,16*$SZ+48(%rsp)
1174	movaps	%xmm8,16*$SZ+64(%rsp)
1175	movaps	%xmm9,16*$SZ+80(%rsp)
1176___
1177$code.=<<___ if ($win64 && $SZ>4);
1178	movaps	%xmm10,16*$SZ+96(%rsp)
1179	movaps	%xmm11,16*$SZ+112(%rsp)
1180___
1181$code.=<<___;
1182.Lprologue_xop:
1183
1184	vzeroupper
1185	mov	$SZ*0($ctx),$A
1186	mov	$SZ*1($ctx),$B
1187	mov	$SZ*2($ctx),$C
1188	mov	$SZ*3($ctx),$D
1189	mov	$SZ*4($ctx),$E
1190	mov	$SZ*5($ctx),$F
1191	mov	$SZ*6($ctx),$G
1192	mov	$SZ*7($ctx),$H
1193	jmp	.Lloop_xop
1194___
1195					if ($SZ==4) {	# SHA256
1196    my @X = map("%xmm$_",(0..3));
1197    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1198
1199$code.=<<___;
1200.align	16
1201.Lloop_xop:
1202	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1203	vmovdqu	0x00($inp),@X[0]
1204	vmovdqu	0x10($inp),@X[1]
1205	vmovdqu	0x20($inp),@X[2]
1206	vmovdqu	0x30($inp),@X[3]
1207	vpshufb	$t3,@X[0],@X[0]
1208	lea	$TABLE(%rip),$Tbl
1209	vpshufb	$t3,@X[1],@X[1]
1210	vpshufb	$t3,@X[2],@X[2]
1211	vpaddd	0x00($Tbl),@X[0],$t0
1212	vpshufb	$t3,@X[3],@X[3]
1213	vpaddd	0x20($Tbl),@X[1],$t1
1214	vpaddd	0x40($Tbl),@X[2],$t2
1215	vpaddd	0x60($Tbl),@X[3],$t3
1216	vmovdqa	$t0,0x00(%rsp)
1217	mov	$A,$a1
1218	vmovdqa	$t1,0x10(%rsp)
1219	mov	$B,$a3
1220	vmovdqa	$t2,0x20(%rsp)
1221	xor	$C,$a3			# magic
1222	vmovdqa	$t3,0x30(%rsp)
1223	mov	$E,$a0
1224	jmp	.Lxop_00_47
1225
1226.align	16
1227.Lxop_00_47:
1228	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1229___
1230sub XOP_256_00_47 () {
1231my $j = shift;
1232my $body = shift;
1233my @X = @_;
1234my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1235
1236	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1237	  eval(shift(@insns));
1238	  eval(shift(@insns));
1239	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1240	  eval(shift(@insns));
1241	  eval(shift(@insns));
1242	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1243	  eval(shift(@insns));
1244	  eval(shift(@insns));
1245	&vpsrld		($t0,$t0,$sigma0[2]);
1246	  eval(shift(@insns));
1247	  eval(shift(@insns));
1248	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1249	  eval(shift(@insns));
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	&vpxor		($t0,$t0,$t1);
1257	  eval(shift(@insns));
1258	  eval(shift(@insns));
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1262	  eval(shift(@insns));
1263	  eval(shift(@insns));
1264	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1265	  eval(shift(@insns));
1266	  eval(shift(@insns));
1267	 &vpsrld	($t2,@X[3],$sigma1[2]);
1268	  eval(shift(@insns));
1269	  eval(shift(@insns));
1270	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1271	  eval(shift(@insns));
1272	  eval(shift(@insns));
1273	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1274	  eval(shift(@insns));
1275	  eval(shift(@insns));
1276	 &vpxor		($t3,$t3,$t2);
1277	  eval(shift(@insns));
1278	  eval(shift(@insns));
1279	  eval(shift(@insns));
1280	  eval(shift(@insns));
1281	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1282	  eval(shift(@insns));
1283	  eval(shift(@insns));
1284	  eval(shift(@insns));
1285	  eval(shift(@insns));
1286	&vpsrldq	($t3,$t3,8);
1287	  eval(shift(@insns));
1288	  eval(shift(@insns));
1289	  eval(shift(@insns));
1290	  eval(shift(@insns));
1291	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1292	  eval(shift(@insns));
1293	  eval(shift(@insns));
1294	  eval(shift(@insns));
1295	  eval(shift(@insns));
1296	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1297	  eval(shift(@insns));
1298	  eval(shift(@insns));
1299	 &vpsrld	($t2,@X[0],$sigma1[2]);
1300	  eval(shift(@insns));
1301	  eval(shift(@insns));
1302	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1303	  eval(shift(@insns));
1304	  eval(shift(@insns));
1305	 &vpxor		($t3,$t3,$t2);
1306	  eval(shift(@insns));
1307	  eval(shift(@insns));
1308	  eval(shift(@insns));
1309	  eval(shift(@insns));
1310	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1311	  eval(shift(@insns));
1312	  eval(shift(@insns));
1313	  eval(shift(@insns));
1314	  eval(shift(@insns));
1315	&vpslldq	($t3,$t3,8);		# 22 instructions
1316	  eval(shift(@insns));
1317	  eval(shift(@insns));
1318	  eval(shift(@insns));
1319	  eval(shift(@insns));
1320	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1321	  eval(shift(@insns));
1322	  eval(shift(@insns));
1323	  eval(shift(@insns));
1324	  eval(shift(@insns));
1325	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1326	  foreach (@insns) { eval; }		# remaining instructions
1327	&vmovdqa	(16*$j."(%rsp)",$t2);
1328}
1329
1330    for ($i=0,$j=0; $j<4; $j++) {
1331	&XOP_256_00_47($j,\&body_00_15,@X);
1332	push(@X,shift(@X));			# rotate(@X)
1333    }
1334	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1335	&jne	(".Lxop_00_47");
1336
1337    for ($i=0; $i<16; ) {
1338	foreach(body_00_15()) { eval; }
1339    }
1340
1341					} else {	# SHA512
1342    my @X = map("%xmm$_",(0..7));
1343    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1344
1345$code.=<<___;
1346.align	16
1347.Lloop_xop:
1348	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1349	vmovdqu	0x00($inp),@X[0]
1350	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1351	vmovdqu	0x10($inp),@X[1]
1352	vmovdqu	0x20($inp),@X[2]
1353	vpshufb	$t3,@X[0],@X[0]
1354	vmovdqu	0x30($inp),@X[3]
1355	vpshufb	$t3,@X[1],@X[1]
1356	vmovdqu	0x40($inp),@X[4]
1357	vpshufb	$t3,@X[2],@X[2]
1358	vmovdqu	0x50($inp),@X[5]
1359	vpshufb	$t3,@X[3],@X[3]
1360	vmovdqu	0x60($inp),@X[6]
1361	vpshufb	$t3,@X[4],@X[4]
1362	vmovdqu	0x70($inp),@X[7]
1363	vpshufb	$t3,@X[5],@X[5]
1364	vpaddq	-0x80($Tbl),@X[0],$t0
1365	vpshufb	$t3,@X[6],@X[6]
1366	vpaddq	-0x60($Tbl),@X[1],$t1
1367	vpshufb	$t3,@X[7],@X[7]
1368	vpaddq	-0x40($Tbl),@X[2],$t2
1369	vpaddq	-0x20($Tbl),@X[3],$t3
1370	vmovdqa	$t0,0x00(%rsp)
1371	vpaddq	0x00($Tbl),@X[4],$t0
1372	vmovdqa	$t1,0x10(%rsp)
1373	vpaddq	0x20($Tbl),@X[5],$t1
1374	vmovdqa	$t2,0x20(%rsp)
1375	vpaddq	0x40($Tbl),@X[6],$t2
1376	vmovdqa	$t3,0x30(%rsp)
1377	vpaddq	0x60($Tbl),@X[7],$t3
1378	vmovdqa	$t0,0x40(%rsp)
1379	mov	$A,$a1
1380	vmovdqa	$t1,0x50(%rsp)
1381	mov	$B,$a3
1382	vmovdqa	$t2,0x60(%rsp)
1383	xor	$C,$a3			# magic
1384	vmovdqa	$t3,0x70(%rsp)
1385	mov	$E,$a0
1386	jmp	.Lxop_00_47
1387
1388.align	16
1389.Lxop_00_47:
1390	add	\$`16*2*$SZ`,$Tbl
1391___
1392sub XOP_512_00_47 () {
1393my $j = shift;
1394my $body = shift;
1395my @X = @_;
1396my @insns = (&$body,&$body);			# 52 instructions
1397
1398	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1399	  eval(shift(@insns));
1400	  eval(shift(@insns));
1401	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1402	  eval(shift(@insns));
1403	  eval(shift(@insns));
1404	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1405	  eval(shift(@insns));
1406	  eval(shift(@insns));
1407	&vpsrlq		($t0,$t0,$sigma0[2]);
1408	  eval(shift(@insns));
1409	  eval(shift(@insns));
1410	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1411	  eval(shift(@insns));
1412	  eval(shift(@insns));
1413	  eval(shift(@insns));
1414	  eval(shift(@insns));
1415	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1416	  eval(shift(@insns));
1417	  eval(shift(@insns));
1418	&vpxor		($t0,$t0,$t1);
1419	  eval(shift(@insns));
1420	  eval(shift(@insns));
1421	  eval(shift(@insns));
1422	  eval(shift(@insns));
1423	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1424	  eval(shift(@insns));
1425	  eval(shift(@insns));
1426	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1427	  eval(shift(@insns));
1428	  eval(shift(@insns));
1429	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1430	  eval(shift(@insns));
1431	  eval(shift(@insns));
1432	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1433	  eval(shift(@insns));
1434	  eval(shift(@insns));
1435	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1436	  eval(shift(@insns));
1437	  eval(shift(@insns));
1438	 &vpxor		($t3,$t3,$t2);
1439	  eval(shift(@insns));
1440	  eval(shift(@insns));
1441	  eval(shift(@insns));
1442	  eval(shift(@insns));
1443	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1444	  eval(shift(@insns));
1445	  eval(shift(@insns));
1446	  eval(shift(@insns));
1447	  eval(shift(@insns));
1448	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1449	  eval(shift(@insns));
1450	  eval(shift(@insns));
1451	  eval(shift(@insns));
1452	  eval(shift(@insns));
1453	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1454	  foreach (@insns) { eval; }		# remaining instructions
1455	&vmovdqa	(16*$j."(%rsp)",$t2);
1456}
1457
1458    for ($i=0,$j=0; $j<8; $j++) {
1459	&XOP_512_00_47($j,\&body_00_15,@X);
1460	push(@X,shift(@X));			# rotate(@X)
1461    }
1462	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1463	&jne	(".Lxop_00_47");
1464
1465    for ($i=0; $i<16; ) {
1466	foreach(body_00_15()) { eval; }
1467    }
1468}
1469$code.=<<___;
1470	mov	$_ctx,$ctx
1471	mov	$a1,$A
1472
1473	add	$SZ*0($ctx),$A
1474	lea	16*$SZ($inp),$inp
1475	add	$SZ*1($ctx),$B
1476	add	$SZ*2($ctx),$C
1477	add	$SZ*3($ctx),$D
1478	add	$SZ*4($ctx),$E
1479	add	$SZ*5($ctx),$F
1480	add	$SZ*6($ctx),$G
1481	add	$SZ*7($ctx),$H
1482
1483	cmp	$_end,$inp
1484
1485	mov	$A,$SZ*0($ctx)
1486	mov	$B,$SZ*1($ctx)
1487	mov	$C,$SZ*2($ctx)
1488	mov	$D,$SZ*3($ctx)
1489	mov	$E,$SZ*4($ctx)
1490	mov	$F,$SZ*5($ctx)
1491	mov	$G,$SZ*6($ctx)
1492	mov	$H,$SZ*7($ctx)
1493	jb	.Lloop_xop
1494
1495	mov	$_rsp,%rsi
1496.cfi_def_cfa	%rsi,8
1497	vzeroupper
1498___
1499$code.=<<___ if ($win64);
1500	movaps	16*$SZ+32(%rsp),%xmm6
1501	movaps	16*$SZ+48(%rsp),%xmm7
1502	movaps	16*$SZ+64(%rsp),%xmm8
1503	movaps	16*$SZ+80(%rsp),%xmm9
1504___
1505$code.=<<___ if ($win64 && $SZ>4);
1506	movaps	16*$SZ+96(%rsp),%xmm10
1507	movaps	16*$SZ+112(%rsp),%xmm11
1508___
1509$code.=<<___;
1510	mov	-48(%rsi),%r15
1511.cfi_restore	%r15
1512	mov	-40(%rsi),%r14
1513.cfi_restore	%r14
1514	mov	-32(%rsi),%r13
1515.cfi_restore	%r13
1516	mov	-24(%rsi),%r12
1517.cfi_restore	%r12
1518	mov	-16(%rsi),%rbp
1519.cfi_restore	%rbp
1520	mov	-8(%rsi),%rbx
1521.cfi_restore	%rbx
1522	lea	(%rsi),%rsp
1523.cfi_def_cfa_register	%rsp
1524.Lepilogue_xop:
1525	ret
1526.cfi_endproc
1527.size	${func}_xop,.-${func}_xop
1528___
1529}
1530######################################################################
1531# AVX+shrd code path
1532#
1533local *ror = sub { &shrd(@_[0],@_) };
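# From here on &ror() emits "shrd \$n,reg,reg", which with identical
# source and destination performs the same rotation as ror but turned
# out to be faster on Sandy Bridge, see footnote (**) in the table above.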
1534
1535$code.=<<___;
1536.type	${func}_avx,\@function,3
1537.align	64
1538${func}_avx:
1539.cfi_startproc
1540.Lavx_shortcut:
1541	mov	%rsp,%rax		# copy %rsp
1542.cfi_def_cfa_register	%rax
1543	push	%rbx
1544.cfi_push	%rbx
1545	push	%rbp
1546.cfi_push	%rbp
1547	push	%r12
1548.cfi_push	%r12
1549	push	%r13
1550.cfi_push	%r13
1551	push	%r14
1552.cfi_push	%r14
1553	push	%r15
1554.cfi_push	%r15
1555	shl	\$4,%rdx		# num*16
1556	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1557	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1558	and	\$-64,%rsp		# align stack frame
1559	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1561	mov	%rdx,$_end		# save end pointer, "3rd" arg
1562	mov	%rax,$_rsp		# save copy of %rsp
1563.cfi_cfa_expression	$_rsp,deref,+8
1564___
1565$code.=<<___ if ($win64);
1566	movaps	%xmm6,16*$SZ+32(%rsp)
1567	movaps	%xmm7,16*$SZ+48(%rsp)
1568	movaps	%xmm8,16*$SZ+64(%rsp)
1569	movaps	%xmm9,16*$SZ+80(%rsp)
1570___
1571$code.=<<___ if ($win64 && $SZ>4);
1572	movaps	%xmm10,16*$SZ+96(%rsp)
1573	movaps	%xmm11,16*$SZ+112(%rsp)
1574___
1575$code.=<<___;
1576.Lprologue_avx:
1577
1578	vzeroupper
1579	mov	$SZ*0($ctx),$A
1580	mov	$SZ*1($ctx),$B
1581	mov	$SZ*2($ctx),$C
1582	mov	$SZ*3($ctx),$D
1583	mov	$SZ*4($ctx),$E
1584	mov	$SZ*5($ctx),$F
1585	mov	$SZ*6($ctx),$G
1586	mov	$SZ*7($ctx),$H
1587___
1588					if ($SZ==4) {	# SHA256
1589    my @X = map("%xmm$_",(0..3));
1590    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1591
1592$code.=<<___;
1593	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1594	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1595	jmp	.Lloop_avx
1596.align	16
1597.Lloop_avx:
1598	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1599	vmovdqu	0x00($inp),@X[0]
1600	vmovdqu	0x10($inp),@X[1]
1601	vmovdqu	0x20($inp),@X[2]
1602	vmovdqu	0x30($inp),@X[3]
1603	vpshufb	$t3,@X[0],@X[0]
1604	lea	$TABLE(%rip),$Tbl
1605	vpshufb	$t3,@X[1],@X[1]
1606	vpshufb	$t3,@X[2],@X[2]
1607	vpaddd	0x00($Tbl),@X[0],$t0
1608	vpshufb	$t3,@X[3],@X[3]
1609	vpaddd	0x20($Tbl),@X[1],$t1
1610	vpaddd	0x40($Tbl),@X[2],$t2
1611	vpaddd	0x60($Tbl),@X[3],$t3
1612	vmovdqa	$t0,0x00(%rsp)
1613	mov	$A,$a1
1614	vmovdqa	$t1,0x10(%rsp)
1615	mov	$B,$a3
1616	vmovdqa	$t2,0x20(%rsp)
1617	xor	$C,$a3			# magic
1618	vmovdqa	$t3,0x30(%rsp)
1619	mov	$E,$a0
1620	jmp	.Lavx_00_47
1621
1622.align	16
1623.Lavx_00_47:
1624	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1625___
1626sub Xupdate_256_AVX () {
1627	(
1628	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1629	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1630	'&vpsrld	($t2,$t0,$sigma0[0]);',
1631	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1632	'&vpsrld	($t3,$t0,$sigma0[2])',
1633	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1634	'&vpxor		($t0,$t3,$t2)',
1635	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1636	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1637	'&vpxor		($t0,$t0,$t1)',
1638	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1639	'&vpxor		($t0,$t0,$t2)',
1640	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1641	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1642	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1643	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1644	 '&vpxor	($t2,$t2,$t3);',
1645	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1646	 '&vpxor	($t2,$t2,$t3)',
1647	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1648	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1649	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1650	 '&vpsrld	($t2,$t3,$sigma1[2])',
1651	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1652	 '&vpxor	($t2,$t2,$t3);',
1653	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1654	 '&vpxor	($t2,$t2,$t3)',
1655	 '&vpshufb	($t2,$t2,$t5)',
1656	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1657	);
1658}
1659
1660sub AVX_256_00_47 () {
1661my $j = shift;
1662my $body = shift;
1663my @X = @_;
1664my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1665
1666	foreach (Xupdate_256_AVX()) {		# 29 instructions
1667	    eval;
1668	    eval(shift(@insns));
1669	    eval(shift(@insns));
1670	    eval(shift(@insns));
1671	}
1672	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1673	  foreach (@insns) { eval; }		# remaining instructions
1674	&vmovdqa	(16*$j."(%rsp)",$t2);
1675}
1676
1677    for ($i=0,$j=0; $j<4; $j++) {
1678	&AVX_256_00_47($j,\&body_00_15,@X);
1679	push(@X,shift(@X));			# rotate(@X)
1680    }
1681	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1682	&jne	(".Lavx_00_47");
1683
1684    for ($i=0; $i<16; ) {
1685	foreach(body_00_15()) { eval; }
1686    }
1687
1688					} else {	# SHA512
1689    my @X = map("%xmm$_",(0..7));
1690    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1691
1692$code.=<<___;
1693	jmp	.Lloop_avx
1694.align	16
1695.Lloop_avx:
1696	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1697	vmovdqu	0x00($inp),@X[0]
1698	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1699	vmovdqu	0x10($inp),@X[1]
1700	vmovdqu	0x20($inp),@X[2]
1701	vpshufb	$t3,@X[0],@X[0]
1702	vmovdqu	0x30($inp),@X[3]
1703	vpshufb	$t3,@X[1],@X[1]
1704	vmovdqu	0x40($inp),@X[4]
1705	vpshufb	$t3,@X[2],@X[2]
1706	vmovdqu	0x50($inp),@X[5]
1707	vpshufb	$t3,@X[3],@X[3]
1708	vmovdqu	0x60($inp),@X[6]
1709	vpshufb	$t3,@X[4],@X[4]
1710	vmovdqu	0x70($inp),@X[7]
1711	vpshufb	$t3,@X[5],@X[5]
1712	vpaddq	-0x80($Tbl),@X[0],$t0
1713	vpshufb	$t3,@X[6],@X[6]
1714	vpaddq	-0x60($Tbl),@X[1],$t1
1715	vpshufb	$t3,@X[7],@X[7]
1716	vpaddq	-0x40($Tbl),@X[2],$t2
1717	vpaddq	-0x20($Tbl),@X[3],$t3
1718	vmovdqa	$t0,0x00(%rsp)
1719	vpaddq	0x00($Tbl),@X[4],$t0
1720	vmovdqa	$t1,0x10(%rsp)
1721	vpaddq	0x20($Tbl),@X[5],$t1
1722	vmovdqa	$t2,0x20(%rsp)
1723	vpaddq	0x40($Tbl),@X[6],$t2
1724	vmovdqa	$t3,0x30(%rsp)
1725	vpaddq	0x60($Tbl),@X[7],$t3
1726	vmovdqa	$t0,0x40(%rsp)
1727	mov	$A,$a1
1728	vmovdqa	$t1,0x50(%rsp)
1729	mov	$B,$a3
1730	vmovdqa	$t2,0x60(%rsp)
1731	xor	$C,$a3			# magic
1732	vmovdqa	$t3,0x70(%rsp)
1733	mov	$E,$a0
1734	jmp	.Lavx_00_47
1735
1736.align	16
1737.Lavx_00_47:
1738	add	\$`16*2*$SZ`,$Tbl
1739___
1740sub Xupdate_512_AVX () {
1741	(
1742	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1743	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1744	'&vpsrlq	($t2,$t0,$sigma0[0])',
1745	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1746	'&vpsrlq	($t3,$t0,$sigma0[2])',
1747	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1748	 '&vpxor	($t0,$t3,$t2)',
1749	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1750	 '&vpxor	($t0,$t0,$t1)',
1751	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1752	 '&vpxor	($t0,$t0,$t2)',
1753	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1754	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1755	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1756	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1757	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1758	 '&vpxor	($t3,$t3,$t2)',
1759	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1760	 '&vpxor	($t3,$t3,$t1)',
1761	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1762	 '&vpxor	($t3,$t3,$t2)',
1763	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1764	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1765	);
1766}
1767
1768sub AVX_512_00_47 () {
1769my $j = shift;
1770my $body = shift;
1771my @X = @_;
1772my @insns = (&$body,&$body);			# 52 instructions
1773
1774	foreach (Xupdate_512_AVX()) {		# 23 instructions
1775	    eval;
1776	    eval(shift(@insns));
1777	    eval(shift(@insns));
1778	}
1779	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1780	  foreach (@insns) { eval; }		# remaining instructions
1781	&vmovdqa	(16*$j."(%rsp)",$t2);
1782}
1783
1784    for ($i=0,$j=0; $j<8; $j++) {
1785	&AVX_512_00_47($j,\&body_00_15,@X);
1786	push(@X,shift(@X));			# rotate(@X)
1787    }
1788	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1789	&jne	(".Lavx_00_47");
1790
1791    for ($i=0; $i<16; ) {
1792	foreach(body_00_15()) { eval; }
1793    }
1794}
1795$code.=<<___;
1796	mov	$_ctx,$ctx
1797	mov	$a1,$A
1798
1799	add	$SZ*0($ctx),$A
1800	lea	16*$SZ($inp),$inp
1801	add	$SZ*1($ctx),$B
1802	add	$SZ*2($ctx),$C
1803	add	$SZ*3($ctx),$D
1804	add	$SZ*4($ctx),$E
1805	add	$SZ*5($ctx),$F
1806	add	$SZ*6($ctx),$G
1807	add	$SZ*7($ctx),$H
1808
1809	cmp	$_end,$inp
1810
1811	mov	$A,$SZ*0($ctx)
1812	mov	$B,$SZ*1($ctx)
1813	mov	$C,$SZ*2($ctx)
1814	mov	$D,$SZ*3($ctx)
1815	mov	$E,$SZ*4($ctx)
1816	mov	$F,$SZ*5($ctx)
1817	mov	$G,$SZ*6($ctx)
1818	mov	$H,$SZ*7($ctx)
1819	jb	.Lloop_avx
1820
1821	mov	$_rsp,%rsi
1822.cfi_def_cfa	%rsi,8
1823	vzeroupper
1824___
1825$code.=<<___ if ($win64);
1826	movaps	16*$SZ+32(%rsp),%xmm6
1827	movaps	16*$SZ+48(%rsp),%xmm7
1828	movaps	16*$SZ+64(%rsp),%xmm8
1829	movaps	16*$SZ+80(%rsp),%xmm9
1830___
1831$code.=<<___ if ($win64 && $SZ>4);
1832	movaps	16*$SZ+96(%rsp),%xmm10
1833	movaps	16*$SZ+112(%rsp),%xmm11
1834___
1835$code.=<<___;
1836	mov	-48(%rsi),%r15
1837.cfi_restore	%r15
1838	mov	-40(%rsi),%r14
1839.cfi_restore	%r14
1840	mov	-32(%rsi),%r13
1841.cfi_restore	%r13
1842	mov	-24(%rsi),%r12
1843.cfi_restore	%r12
1844	mov	-16(%rsi),%rbp
1845.cfi_restore	%rbp
1846	mov	-8(%rsi),%rbx
1847.cfi_restore	%rbx
1848	lea	(%rsi),%rsp
1849.cfi_def_cfa_register	%rsp
1850.Lepilogue_avx:
1851	ret
1852.cfi_endproc
1853.size	${func}_avx,.-${func}_avx
1854___
1855
1856if ($avx>1) {{
1857######################################################################
1858# AVX2+BMI code path
1859#
1860my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1861my $PUSH8=8*2*$SZ;
1862use integer;
1863
1864sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
1866	(
1867	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1868
1869	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1870	'&and	($a4,$e)',		# f&e
1871	'&rorx	($a0,$e,$Sigma1[2])',
1872	'&rorx	($a2,$e,$Sigma1[1])',
1873
1874	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1875	'&lea	($h,"($h,$a4)")',
1876	'&andn	($a4,$e,$g)',		# ~e&g
1877	'&xor	($a0,$a2)',
1878
1879	'&rorx	($a1,$e,$Sigma1[0])',
1880	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1881	'&xor	($a0,$a1)',		# Sigma1(e)
1882	'&mov	($a2,$a)',
1883
1884	'&rorx	($a4,$a,$Sigma0[2])',
1885	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1886	'&xor	($a2,$b)',		# a^b, b^c in next round
1887	'&rorx	($a1,$a,$Sigma0[1])',
1888
1889	'&rorx	($a0,$a,$Sigma0[0])',
1890	'&lea	($d,"($d,$h)")',	# d+=h
1891	'&and	($a3,$a2)',		# (b^c)&(a^b)
1892	'&xor	($a1,$a4)',
1893
1894	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1895	'&xor	($a1,$a0)',		# Sigma0(a)
1896	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1897	'&mov	($a4,$e)',		# copy of f in future
1898
1899	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1900	);
	# and at the finish one has to add $a1 to $a ($a+=$a1)
1902}
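
# The BMI variant works because rorx is a three-operand rotate that
# leaves the flags alone, andn gives ~e&g directly, and (e&f) and (~e&g)
# never have overlapping bits, so Ch(e,f,g) can be accumulated with
# plain additions (the lea instructions above) instead of xors.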
1903
1904$code.=<<___;
1905.type	${func}_avx2,\@function,3
1906.align	64
1907${func}_avx2:
1908.cfi_startproc
1909.Lavx2_shortcut:
1910	mov	%rsp,%rax		# copy %rsp
1911.cfi_def_cfa_register	%rax
1912	push	%rbx
1913.cfi_push	%rbx
1914	push	%rbp
1915.cfi_push	%rbp
1916	push	%r12
1917.cfi_push	%r12
1918	push	%r13
1919.cfi_push	%r13
1920	push	%r14
1921.cfi_push	%r14
1922	push	%r15
1923.cfi_push	%r15
1924	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1925	shl	\$4,%rdx		# num*16
1926	and	\$-256*$SZ,%rsp		# align stack frame
1927	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1928	add	\$`2*$SZ*($rounds-8)`,%rsp
1929	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1931	mov	%rdx,$_end		# save end pointer, "3rd" arg
1932	mov	%rax,$_rsp		# save copy of %rsp
1933.cfi_cfa_expression	$_rsp,deref,+8
1934___
1935$code.=<<___ if ($win64);
1936	movaps	%xmm6,16*$SZ+32(%rsp)
1937	movaps	%xmm7,16*$SZ+48(%rsp)
1938	movaps	%xmm8,16*$SZ+64(%rsp)
1939	movaps	%xmm9,16*$SZ+80(%rsp)
1940___
1941$code.=<<___ if ($win64 && $SZ>4);
1942	movaps	%xmm10,16*$SZ+96(%rsp)
1943	movaps	%xmm11,16*$SZ+112(%rsp)
1944___
1945$code.=<<___;
1946.Lprologue_avx2:
1947
1948	vzeroupper
1949	sub	\$-16*$SZ,$inp		# inp++, size optimization
1950	mov	$SZ*0($ctx),$A
1951	mov	$inp,%r12		# borrow $T1
1952	mov	$SZ*1($ctx),$B
1953	cmp	%rdx,$inp		# $_end
1954	mov	$SZ*2($ctx),$C
1955	cmove	%rsp,%r12		# next block or random data
1956	mov	$SZ*3($ctx),$D
1957	mov	$SZ*4($ctx),$E
1958	mov	$SZ*5($ctx),$F
1959	mov	$SZ*6($ctx),$G
1960	mov	$SZ*7($ctx),$H
1961___
1962					if ($SZ==4) {	# SHA256
1963    my @X = map("%ymm$_",(0..3));
1964    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1965
1966$code.=<<___;
1967	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1968	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1969	jmp	.Loop_avx2
1970.align	16
1971.Loop_avx2:
1972	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1973	vmovdqu	-16*$SZ+0($inp),%xmm0
1974	vmovdqu	-16*$SZ+16($inp),%xmm1
1975	vmovdqu	-16*$SZ+32($inp),%xmm2
1976	vmovdqu	-16*$SZ+48($inp),%xmm3
1977	#mov		$inp,$_inp	# offload $inp
1978	vinserti128	\$1,(%r12),@X[0],@X[0]
1979	vinserti128	\$1,16(%r12),@X[1],@X[1]
1980	vpshufb		$t3,@X[0],@X[0]
1981	vinserti128	\$1,32(%r12),@X[2],@X[2]
1982	vpshufb		$t3,@X[1],@X[1]
1983	vinserti128	\$1,48(%r12),@X[3],@X[3]
1984
1985	lea	$TABLE(%rip),$Tbl
1986	vpshufb	$t3,@X[2],@X[2]
1987	vpaddd	0x00($Tbl),@X[0],$t0
1988	vpshufb	$t3,@X[3],@X[3]
1989	vpaddd	0x20($Tbl),@X[1],$t1
1990	vpaddd	0x40($Tbl),@X[2],$t2
1991	vpaddd	0x60($Tbl),@X[3],$t3
1992	vmovdqa	$t0,0x00(%rsp)
1993	xor	$a1,$a1
1994	vmovdqa	$t1,0x20(%rsp)
1995	lea	-$PUSH8(%rsp),%rsp
1996	mov	$B,$a3
1997	vmovdqa	$t2,0x00(%rsp)
1998	xor	$C,$a3			# magic
1999	vmovdqa	$t3,0x20(%rsp)
2000	mov	$F,$a4
2001	sub	\$-16*2*$SZ,$Tbl	# size optimization
2002	jmp	.Lavx2_00_47
2003
2004.align	16
2005.Lavx2_00_47:
2006___
2007
2008sub AVX2_256_00_47 () {
2009my $j = shift;
2010my $body = shift;
2011my @X = @_;
2012my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
2013my $base = "+2*$PUSH8(%rsp)";
2014
2015	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
2016	foreach (Xupdate_256_AVX()) {		# 29 instructions
2017	    eval;
2018	    eval(shift(@insns));
2019	    eval(shift(@insns));
2020	    eval(shift(@insns));
2021	}
2022	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
2023	  foreach (@insns) { eval; }		# remaining instructions
2024	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2025}
2026
2027    for ($i=0,$j=0; $j<4; $j++) {
2028	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
2029	push(@X,shift(@X));			# rotate(@X)
2030    }
2031	&lea	($Tbl,16*2*$SZ."($Tbl)");
2032	&cmpb	(($SZ-1)."($Tbl)",0);
2033	&jne	(".Lavx2_00_47");
2034
2035    for ($i=0; $i<16; ) {
2036	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2037	foreach(bodyx_00_15()) { eval; }
2038    }
2039					} else {	# SHA512
2040    my @X = map("%ymm$_",(0..7));
2041    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2042
2043$code.=<<___;
2044	jmp	.Loop_avx2
2045.align	16
2046.Loop_avx2:
2047	vmovdqu	-16*$SZ($inp),%xmm0
2048	vmovdqu	-16*$SZ+16($inp),%xmm1
2049	vmovdqu	-16*$SZ+32($inp),%xmm2
2050	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
2051	vmovdqu	-16*$SZ+48($inp),%xmm3
2052	vmovdqu	-16*$SZ+64($inp),%xmm4
2053	vmovdqu	-16*$SZ+80($inp),%xmm5
2054	vmovdqu	-16*$SZ+96($inp),%xmm6
2055	vmovdqu	-16*$SZ+112($inp),%xmm7
2056	#mov	$inp,$_inp	# offload $inp
2057	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
2058	vinserti128	\$1,(%r12),@X[0],@X[0]
2059	vinserti128	\$1,16(%r12),@X[1],@X[1]
2060	 vpshufb	$t2,@X[0],@X[0]
2061	vinserti128	\$1,32(%r12),@X[2],@X[2]
2062	 vpshufb	$t2,@X[1],@X[1]
2063	vinserti128	\$1,48(%r12),@X[3],@X[3]
2064	 vpshufb	$t2,@X[2],@X[2]
2065	vinserti128	\$1,64(%r12),@X[4],@X[4]
2066	 vpshufb	$t2,@X[3],@X[3]
2067	vinserti128	\$1,80(%r12),@X[5],@X[5]
2068	 vpshufb	$t2,@X[4],@X[4]
2069	vinserti128	\$1,96(%r12),@X[6],@X[6]
2070	 vpshufb	$t2,@X[5],@X[5]
2071	vinserti128	\$1,112(%r12),@X[7],@X[7]
2072
2073	vpaddq	-0x80($Tbl),@X[0],$t0
2074	vpshufb	$t2,@X[6],@X[6]
2075	vpaddq	-0x60($Tbl),@X[1],$t1
2076	vpshufb	$t2,@X[7],@X[7]
2077	vpaddq	-0x40($Tbl),@X[2],$t2
2078	vpaddq	-0x20($Tbl),@X[3],$t3
2079	vmovdqa	$t0,0x00(%rsp)
2080	vpaddq	0x00($Tbl),@X[4],$t0
2081	vmovdqa	$t1,0x20(%rsp)
2082	vpaddq	0x20($Tbl),@X[5],$t1
2083	vmovdqa	$t2,0x40(%rsp)
2084	vpaddq	0x40($Tbl),@X[6],$t2
2085	vmovdqa	$t3,0x60(%rsp)
2086	lea	-$PUSH8(%rsp),%rsp
2087	vpaddq	0x60($Tbl),@X[7],$t3
2088	vmovdqa	$t0,0x00(%rsp)
2089	xor	$a1,$a1
2090	vmovdqa	$t1,0x20(%rsp)
2091	mov	$B,$a3
2092	vmovdqa	$t2,0x40(%rsp)
2093	xor	$C,$a3			# magic
2094	vmovdqa	$t3,0x60(%rsp)
2095	mov	$F,$a4
2096	add	\$16*2*$SZ,$Tbl
2097	jmp	.Lavx2_00_47
2098
2099.align	16
2100.Lavx2_00_47:
2101___
2102
2103sub AVX2_512_00_47 () {
2104my $j = shift;
2105my $body = shift;
2106my @X = @_;
2107my @insns = (&$body,&$body);			# 48 instructions
2108my $base = "+2*$PUSH8(%rsp)";
2109
2110	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
2111	foreach (Xupdate_512_AVX()) {		# 23 instructions
2112	    eval;
2113	    if ($_ !~ /\;$/) {
2114		eval(shift(@insns));
2115		eval(shift(@insns));
2116		eval(shift(@insns));
2117	    }
2118	}
2119	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2120	  foreach (@insns) { eval; }		# remaining instructions
2121	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2122}
2123
2124    for ($i=0,$j=0; $j<8; $j++) {
2125	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2126	push(@X,shift(@X));			# rotate(@X)
2127    }
2128	&lea	($Tbl,16*2*$SZ."($Tbl)");
2129	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2130	&jne	(".Lavx2_00_47");
2131
2132    for ($i=0; $i<16; ) {
2133	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2134	foreach(bodyx_00_15()) { eval; }
2135    }
2136}
2137$code.=<<___;
2138	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2139	add	$a1,$A
2140	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2141	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2142
2143	add	$SZ*0($ctx),$A
2144	add	$SZ*1($ctx),$B
2145	add	$SZ*2($ctx),$C
2146	add	$SZ*3($ctx),$D
2147	add	$SZ*4($ctx),$E
2148	add	$SZ*5($ctx),$F
2149	add	$SZ*6($ctx),$G
2150	add	$SZ*7($ctx),$H
2151
2152	mov	$A,$SZ*0($ctx)
2153	mov	$B,$SZ*1($ctx)
2154	mov	$C,$SZ*2($ctx)
2155	mov	$D,$SZ*3($ctx)
2156	mov	$E,$SZ*4($ctx)
2157	mov	$F,$SZ*5($ctx)
2158	mov	$G,$SZ*6($ctx)
2159	mov	$H,$SZ*7($ctx)
2160
2161	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2162	je	.Ldone_avx2
2163
2164	xor	$a1,$a1
2165	mov	$B,$a3
2166	xor	$C,$a3			# magic
2167	mov	$F,$a4
2168	jmp	.Lower_avx2
2169.align	16
2170.Lower_avx2:
2171___
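# The AVX2 path consumes two 16*$SZ-byte blocks per .Loop_avx2 iteration: the
# second block is loaded into the upper 128-bit lanes with vinserti128, so
# every 32-byte stack slot holds X+K for both blocks, first block in the low
# half, second block in the high half. The rounds above consumed only the low
# halves; .Lower_avx2 below replays the rounds over the high halves still
# sitting in the frame ($base is "+16($Tbl)"), walking $Tbl down towards %rsp.
# If only one block was left, %r12 pointed at the stack instead ("next block
# or random data") and the "je .Ldone_avx2" above skips this replay, so the
# junk lanes are never consumed.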
2172    for ($i=0; $i<8; ) {
2173	my $base="+16($Tbl)";
2174	foreach(bodyx_00_15()) { eval; }
2175    }
2176$code.=<<___;
2177	lea	-$PUSH8($Tbl),$Tbl
2178	cmp	%rsp,$Tbl
2179	jae	.Lower_avx2
2180
2181	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2182	add	$a1,$A
2183	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2184	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2185
2186	add	$SZ*0($ctx),$A
2187	add	$SZ*1($ctx),$B
2188	add	$SZ*2($ctx),$C
2189	add	$SZ*3($ctx),$D
2190	add	$SZ*4($ctx),$E
2191	add	$SZ*5($ctx),$F
2192	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2193	add	$SZ*6($ctx),$G
2194	mov	$inp,%r12
2195	add	$SZ*7($ctx),$H
2196	cmp	$_end,$inp
2197
2198	mov	$A,$SZ*0($ctx)
2199	cmove	%rsp,%r12		# next block or stale data
2200	mov	$B,$SZ*1($ctx)
2201	mov	$C,$SZ*2($ctx)
2202	mov	$D,$SZ*3($ctx)
2203	mov	$E,$SZ*4($ctx)
2204	mov	$F,$SZ*5($ctx)
2205	mov	$G,$SZ*6($ctx)
2206	mov	$H,$SZ*7($ctx)
2207
2208	jbe	.Loop_avx2
2209	lea	(%rsp),$Tbl
2210
2211.Ldone_avx2:
2212	lea	($Tbl),%rsp
2213	mov	$_rsp,%rsi
2214.cfi_def_cfa	%rsi,8
2215	vzeroupper
2216___
2217$code.=<<___ if ($win64);
2218	movaps	16*$SZ+32(%rsp),%xmm6
2219	movaps	16*$SZ+48(%rsp),%xmm7
2220	movaps	16*$SZ+64(%rsp),%xmm8
2221	movaps	16*$SZ+80(%rsp),%xmm9
2222___
2223$code.=<<___ if ($win64 && $SZ>4);
2224	movaps	16*$SZ+96(%rsp),%xmm10
2225	movaps	16*$SZ+112(%rsp),%xmm11
2226___
2227$code.=<<___;
2228	mov	-48(%rsi),%r15
2229.cfi_restore	%r15
2230	mov	-40(%rsi),%r14
2231.cfi_restore	%r14
2232	mov	-32(%rsi),%r13
2233.cfi_restore	%r13
2234	mov	-24(%rsi),%r12
2235.cfi_restore	%r12
2236	mov	-16(%rsi),%rbp
2237.cfi_restore	%rbp
2238	mov	-8(%rsi),%rbx
2239.cfi_restore	%rbx
2240	lea	(%rsi),%rsp
2241.cfi_def_cfa_register	%rsp
2242.Lepilogue_avx2:
2243	ret
2244.cfi_endproc
2245.size	${func}_avx2,.-${func}_avx2
2246___
2247}}
2248}}}}}
2249
2250# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2251#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2252if ($win64) {
2253$rec="%rcx";
2254$frame="%rdx";
2255$context="%r8";
2256$disp="%r9";
2257
2258$code.=<<___;
2259.extern	__imp_RtlVirtualUnwind
2260.type	se_handler,\@abi-omnipotent
2261.align	16
2262se_handler:
2263	push	%rsi
2264	push	%rdi
2265	push	%rbx
2266	push	%rbp
2267	push	%r12
2268	push	%r13
2269	push	%r14
2270	push	%r15
2271	pushfq
2272	sub	\$64,%rsp
2273
2274	mov	120($context),%rax	# pull context->Rax
2275	mov	248($context),%rbx	# pull context->Rip
2276
2277	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
2279
2280	mov	0(%r11),%r10d		# HandlerData[0]
2281	lea	(%rsi,%r10),%r10	# prologue label
2282	cmp	%r10,%rbx		# context->Rip<prologue label
2283	jb	.Lin_prologue
2284
2285	mov	152($context),%rax	# pull context->Rsp
2286
2287	mov	4(%r11),%r10d		# HandlerData[1]
2288	lea	(%rsi,%r10),%r10	# epilogue label
2289	cmp	%r10,%rbx		# context->Rip>=epilogue label
2290	jae	.Lin_prologue
2291___
2292$code.=<<___ if ($avx>1);
2293	lea	.Lavx2_shortcut(%rip),%r10
2294	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2295	jb	.Lnot_in_avx2
2296
2297	and	\$-256*$SZ,%rax
2298	add	\$`2*$SZ*($rounds-8)`,%rax
2299.Lnot_in_avx2:
2300___
2301$code.=<<___;
2302	mov	%rax,%rsi		# put aside Rsp
2303	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2304
2305	mov	-8(%rax),%rbx
2306	mov	-16(%rax),%rbp
2307	mov	-24(%rax),%r12
2308	mov	-32(%rax),%r13
2309	mov	-40(%rax),%r14
2310	mov	-48(%rax),%r15
2311	mov	%rbx,144($context)	# restore context->Rbx
2312	mov	%rbp,160($context)	# restore context->Rbp
2313	mov	%r12,216($context)	# restore context->R12
2314	mov	%r13,224($context)	# restore context->R13
2315	mov	%r14,232($context)	# restore context->R14
2316	mov	%r15,240($context)	# restore context->R15
2317
2318	lea	.Lepilogue(%rip),%r10
2319	cmp	%r10,%rbx
2320	jb	.Lin_prologue		# non-AVX code
2321
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6.. save area
2323	lea	512($context),%rdi	# &context.Xmm6
2324	mov	\$`$SZ==4?8:12`,%ecx
2325	.long	0xa548f3fc		# cld; rep movsq
2326
2327.Lin_prologue:
2328	mov	8(%rax),%rdi
2329	mov	16(%rax),%rsi
2330	mov	%rax,152($context)	# restore context->Rsp
2331	mov	%rsi,168($context)	# restore context->Rsi
2332	mov	%rdi,176($context)	# restore context->Rdi
2333
2334	mov	40($disp),%rdi		# disp->ContextRecord
2335	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
2337	.long	0xa548f3fc		# cld; rep movsq
2338
2339	mov	$disp,%rsi
2340	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2341	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2342	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2343	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2344	mov	40(%rsi),%r10		# disp->ContextRecord
2345	lea	56(%rsi),%r11		# &disp->HandlerData
2346	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2347	mov	%r10,32(%rsp)		# arg5
2348	mov	%r11,40(%rsp)		# arg6
2349	mov	%r12,48(%rsp)		# arg7
2350	mov	%rcx,56(%rsp)		# arg8, (NULL)
2351	call	*__imp_RtlVirtualUnwind(%rip)
2352
2353	mov	\$1,%eax		# ExceptionContinueSearch
2354	add	\$64,%rsp
2355	popfq
2356	pop	%r15
2357	pop	%r14
2358	pop	%r13
2359	pop	%r12
2360	pop	%rbp
2361	pop	%rbx
2362	pop	%rdi
2363	pop	%rsi
2364	ret
2365.size	se_handler,.-se_handler
2366___
2367
2368$code.=<<___ if ($SZ==4 && $shaext);
2369.type	shaext_handler,\@abi-omnipotent
2370.align	16
2371shaext_handler:
2372	push	%rsi
2373	push	%rdi
2374	push	%rbx
2375	push	%rbp
2376	push	%r12
2377	push	%r13
2378	push	%r14
2379	push	%r15
2380	pushfq
2381	sub	\$64,%rsp
2382
2383	mov	120($context),%rax	# pull context->Rax
2384	mov	248($context),%rbx	# pull context->Rip
2385
2386	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2388	jb	.Lin_prologue
2389
2390	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2392	jae	.Lin_prologue
2393
2394	lea	-8-5*16(%rax),%rsi
2395	lea	512($context),%rdi	# &context.Xmm6
2396	mov	\$10,%ecx
2397	.long	0xa548f3fc		# cld; rep movsq
2398
2399	jmp	.Lin_prologue
2400.size	shaext_handler,.-shaext_handler
2401___
2402
2403$code.=<<___;
2404.section	.pdata
2405.align	4
2406	.rva	.LSEH_begin_$func
2407	.rva	.LSEH_end_$func
2408	.rva	.LSEH_info_$func
2409___
2410$code.=<<___ if ($SZ==4 && $shaext);
2411	.rva	.LSEH_begin_${func}_shaext
2412	.rva	.LSEH_end_${func}_shaext
2413	.rva	.LSEH_info_${func}_shaext
2414___
2415$code.=<<___ if ($SZ==4);
2416	.rva	.LSEH_begin_${func}_ssse3
2417	.rva	.LSEH_end_${func}_ssse3
2418	.rva	.LSEH_info_${func}_ssse3
2419___
2420$code.=<<___ if ($avx && $SZ==8);
2421	.rva	.LSEH_begin_${func}_xop
2422	.rva	.LSEH_end_${func}_xop
2423	.rva	.LSEH_info_${func}_xop
2424___
2425$code.=<<___ if ($avx);
2426	.rva	.LSEH_begin_${func}_avx
2427	.rva	.LSEH_end_${func}_avx
2428	.rva	.LSEH_info_${func}_avx
2429___
2430$code.=<<___ if ($avx>1);
2431	.rva	.LSEH_begin_${func}_avx2
2432	.rva	.LSEH_end_${func}_avx2
2433	.rva	.LSEH_info_${func}_avx2
2434___
2435$code.=<<___;
2436.section	.xdata
2437.align	8
2438.LSEH_info_$func:
2439	.byte	9,0,0,0
2440	.rva	se_handler
2441	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2442___
2443$code.=<<___ if ($SZ==4 && $shaext);
2444.LSEH_info_${func}_shaext:
2445	.byte	9,0,0,0
2446	.rva	shaext_handler
2447___
2448$code.=<<___ if ($SZ==4);
2449.LSEH_info_${func}_ssse3:
2450	.byte	9,0,0,0
2451	.rva	se_handler
2452	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2453___
2454$code.=<<___ if ($avx && $SZ==8);
2455.LSEH_info_${func}_xop:
2456	.byte	9,0,0,0
2457	.rva	se_handler
2458	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2459___
2460$code.=<<___ if ($avx);
2461.LSEH_info_${func}_avx:
2462	.byte	9,0,0,0
2463	.rva	se_handler
2464	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2465___
2466$code.=<<___ if ($avx>1);
2467.LSEH_info_${func}_avx2:
2468	.byte	9,0,0,0
2469	.rva	se_handler
2470	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2471___
2472}
2473
sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M: mod=11, reg=dst, r/m=src
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
2490
2491foreach (split("\n",$code)) {
2492	s/\`([^\`]*)\`/eval $1/geo;
2493
2494	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2495
2496	print $_,"\n";
2497}
close STDOUT or die "error closing STDOUT: $!";
2499