#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straightforward implementation... I really
# wonder why gcc [even armed with inline assembler] fails to generate
# code this fast. The only cool thing about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, in
# the latter on 64-bit ones. All I had to do was get one flavor right;
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, compared to the IA-64 implementation, which maintains X[16] in
# the register bank[!], approaches 4 instructions per CPU clock cycle
# and runs in 1003 cycles, 1275 is a very good result for the 3-way-
# issue Opteron pipeline with X[16] maintained in memory. So *if* there
# is a way to improve it, the only way would be to offload the X[16]
# updates to the SSE unit, but that would require a "deeper" loop
# unroll, which in turn would naturally cause size blow-up, not to
# mention increased complexity! And that is only *if* it is actually
# possible to noticeably improve the overall instruction-level
# parallelism (ILP) on a given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are far from it. On the
# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
# 32-bit sha256_block:-( This is presumably because 64-bit shifts and
# rotates are not atomic instructions on them, but are implemented in
# microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% for SHA256 and,
# unfortunately, -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths; see below for the improvement coefficients. An
# SSSE3 code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and should
# be used instead.] For reference, the corresponding estimated upper
# limit for SSSE3 SHA256 is 28%. The fact that higher coefficients are
# observed on VIA Nano and Bulldozer has more to do with the specifics
# of their architectures [which is a topic for a separate discussion].
#
# November 2012.
#
# Add an AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# The side effect is an increased stack frame, 448 additional bytes for
# SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the number of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;
112$flavour = shift;
113$output  = shift;
114if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
115
116$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
117
118$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
119( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
120( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
121die "can't locate x86_64-xlate.pl";
122
123if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
124		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
125	$avx = ($1>=2.19) + ($1>=2.22);
126}
127
128if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
129	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
130	$avx = ($1>=2.09) + ($1>=2.10);
131}
132
133if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
134	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
135	$avx = ($1>=10) + ($1>=11);
136}
137
138if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
139	$avx = ($2>=3.0) + ($2>3.0);
140}
141
142$shaext=1;	### set to zero if compiling for 1.0.1
143$avx=1		if (!$shaext && $avx);
144
145open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
146*STDOUT=*OUT;
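#
# Typical invocation (illustrative only; the exact file name and the
# set of flavours depend on how the build system drives this script):
#
#	perl sha512-x86_64.pl elf  sha512-x86_64.s	# SHA-512, ELF/gas
#	perl sha512-x86_64.pl nasm sha256-x86_64.asm	# SHA-256, Win64/NASM
#
# The first argument is the perlasm flavour (elf, macosx, mingw64,
# nasm, masm, ...) and the second the output file; the flavour may be
# omitted, in which case the single argument is taken as the output
# file.  A "512" in the output file name selects the SHA-512 code
# below, anything else selects SHA-256.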
147
148if ($output =~ /512/) {
149	$func="sha512_block_data_order";
150	$TABLE="K512";
151	$SZ=8;
152	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
153					"%r8", "%r9", "%r10","%r11");
154	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
155	@Sigma0=(28,34,39);
156	@Sigma1=(14,18,41);
157	@sigma0=(1,  8, 7);
158	@sigma1=(19,61, 6);
159	$rounds=80;
160} else {
161	$func="sha256_block_data_order";
162	$TABLE="K256";
163	$SZ=4;
164	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
165					"%r8d","%r9d","%r10d","%r11d");
166	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
167	@Sigma0=( 2,13,22);
168	@Sigma1=( 6,11,25);
169	@sigma0=( 7,18, 3);
170	@sigma1=(17,19,10);
171	$rounds=64;
172}
173
174$ctx="%rdi";	# 1st arg, zapped by $a3
175$inp="%rsi";	# 2nd arg
176$Tbl="%rbp";
177
178$_ctx="16*$SZ+0*8(%rsp)";
179$_inp="16*$SZ+1*8(%rsp)";
180$_end="16*$SZ+2*8(%rsp)";
181$_rsp="`16*$SZ+3*8`(%rsp)";
182$framesz="16*$SZ+4*8";
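#
# Scalar-path stack frame, relative to the aligned %rsp set up in the
# prologue below (a sketch derived from the offsets above):
#
#	0 .. 16*$SZ-1		X[0..15], circular message schedule
#	16*$SZ + 0*8		saved $ctx (1st arg)
#	16*$SZ + 1*8		saved $inp (2nd arg)
#	16*$SZ + 2*8		end-of-input pointer
#	16*$SZ + 3*8		caller's %rsp, restored in the epilogue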
183
184
185sub ROUND_00_15()
186{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
187  my $STRIDE=$SZ;
188     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
189
190$code.=<<___;
191	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
192	mov	$f,$a2
193
194	xor	$e,$a0
195	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
196	xor	$g,$a2			# f^g
197
198	mov	$T1,`$SZ*($i&0xf)`(%rsp)
199	xor	$a,$a1
200	and	$e,$a2			# (f^g)&e
201
202	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
203	add	$h,$T1			# T1+=h
204	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
205
206	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
207	xor	$e,$a0
208	add	$a2,$T1			# T1+=Ch(e,f,g)
209
210	mov	$a,$a2
211	add	($Tbl),$T1		# T1+=K[round]
212	xor	$a,$a1
213
214	xor	$b,$a2			# a^b, b^c in next round
215	ror	\$$Sigma1[0],$a0	# Sigma1(e)
216	mov	$b,$h
217
218	and	$a2,$a3
219	ror	\$$Sigma0[0],$a1	# Sigma0(a)
220	add	$a0,$T1			# T1+=Sigma1(e)
221
222	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
223	add	$T1,$d			# d+=T1
224	add	$T1,$h			# h+=T1
225
226	lea	$STRIDE($Tbl),$Tbl	# round++
227___
228$code.=<<___ if ($i<15);
229	add	$a1,$h			# h+=Sigma0(a)
230___
231	($a2,$a3) = ($a3,$a2);
232}
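#
# For reference, ROUND_00_15 implements the standard FIPS 180-4 round;
# a straight-line, deliberately unoptimized SHA-256 rendition is
# sketched below.  It is never called and exists only to document what
# the scheduled assembly computes.  Note that the real code defers
# "h+=Sigma0(a)" to the following round and evaluates Maj(a,b,c) as
# ((a^b)&(b^c))^b, where b^c is this round's reuse of the previous
# round's a^b.
sub _ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }

sub _sha256_round_reference {		# illustration only, never called
    my ($Wi,$Ki,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
    my $S1  = _ror32($e,6)^_ror32($e,11)^_ror32($e,25);	# Sigma1(e)
    my $Ch  = ($e&$f)^((~$e)&$g);			# Ch(e,f,g)
    my $T1  = ($h+$S1+$Ch+$Ki+$Wi) & 0xffffffff;
    my $S0  = _ror32($a,2)^_ror32($a,13)^_ror32($a,22);	# Sigma0(a)
    my $Maj = ($a&$b)^($a&$c)^($b&$c);			# Maj(a,b,c)
    return ((($T1+$S0+$Maj)&0xffffffff),$a,$b,$c,	# new a..d
	    (($d+$T1)&0xffffffff),$e,$f,$g);		# new e..h
}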
233
234sub ROUND_16_XX()
235{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
236
237$code.=<<___;
238	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
239	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
240
241	mov	$a0,$T1
242	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
243	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
244	mov	$a2,$a1
245	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
246
247	xor	$T1,$a0
248	shr	\$$sigma0[2],$T1
249	ror	\$$sigma0[0],$a0
250	xor	$a1,$a2
251	shr	\$$sigma1[2],$a1
252
253	ror	\$$sigma1[0],$a2
254	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
255	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
256	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
257
258	add	`$SZ*($i&0xf)`(%rsp),$T1
259	mov	$e,$a0
260	add	$a2,$T1
261	mov	$a,$a1
262___
263	&ROUND_00_15(@_);
264}
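#
# Likewise, ROUND_16_XX implements the FIPS 180-4 message expansion.
# An illustrative SHA-256 version, reusing _ror32() above and never
# called, is:
sub _sha256_schedule_reference {	# illustration only, never called
    my @W = @_;				# W[0..15] as 32-bit integers
    for my $i (16..63) {
	my $s0 = _ror32($W[$i-15],7) ^ _ror32($W[$i-15],18) ^ ($W[$i-15]>>3);
	my $s1 = _ror32($W[$i-2],17) ^ _ror32($W[$i-2],19)  ^ ($W[$i-2]>>10);
	$W[$i] = ($W[$i-16]+$s0+$W[$i-7]+$s1) & 0xffffffff;
    }
    return @W;
}
# The assembly keeps only a 16-entry window of W[] on the stack (hence
# the "&0xf" indices above), folding the expansion into the rounds.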
265
266$code=<<___;
267.text
268
269.extern	OPENSSL_ia32cap_P
270.globl	$func
271.type	$func,\@function,3
272.align	16
273$func:
274.cfi_startproc
275___
276$code.=<<___ if ($SZ==4 || $avx);
277	lea	OPENSSL_ia32cap_P(%rip),%r11
278	mov	0(%r11),%r9d
279	mov	4(%r11),%r10d
280	mov	8(%r11),%r11d
281___
282$code.=<<___ if ($SZ==4 && $shaext);
283	test	\$`1<<29`,%r11d		# check for SHA
284	jnz	_shaext_shortcut
285___
286$code.=<<___ if ($avx && $SZ==8);
287	test	\$`1<<11`,%r10d		# check for XOP
288	jnz	.Lxop_shortcut
289___
290$code.=<<___ if ($avx>1);
291	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
292	cmp	\$`1<<8|1<<5|1<<3`,%r11d
293	je	.Lavx2_shortcut
294___
295$code.=<<___ if ($avx);
296	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
297	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
298	or	%r9d,%r10d
299	cmp	\$`1<<28|1<<9|1<<30`,%r10d
300	je	.Lavx_shortcut
301___
302$code.=<<___ if ($SZ==4);
303	test	\$`1<<9`,%r10d
304	jnz	.Lssse3_shortcut
305___
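#
# The three words fetched above come from the OPENSSL_ia32cap_P
# capability vector as populated by OPENSSL_cpuid_setup (presumably
# word 0/1 tracking CPUID(1).EDX/ECX and word 2 CPUID(7).EBX, with a
# few synthetic bits folded in).  That is what the dispatch above
# tests: bit 29 of the third word for SHA extensions, bit 11 of the
# second for AMD XOP, bits 3/5/8 of the third for BMI1/AVX2/BMI2, and
# bits 9/28 of the second together with the synthetic "Intel CPU"
# bit 30 of the first for SSSE3/AVX.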
306$code.=<<___;
307	mov	%rsp,%rax		# copy %rsp
308.cfi_def_cfa_register	%rax
309	push	%rbx
310.cfi_push	%rbx
311	push	%rbp
312.cfi_push	%rbp
313	push	%r12
314.cfi_push	%r12
315	push	%r13
316.cfi_push	%r13
317	push	%r14
318.cfi_push	%r14
319	push	%r15
320.cfi_push	%r15
321	shl	\$4,%rdx		# num*16
322	sub	\$$framesz,%rsp
323	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
324	and	\$-64,%rsp		# align stack frame
325	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
327	mov	%rdx,$_end		# save end pointer, "3rd" arg
328	mov	%rax,$_rsp		# save copy of %rsp
329.cfi_cfa_expression	$_rsp,deref,+8
330.Lprologue:
331
332	mov	$SZ*0($ctx),$A
333	mov	$SZ*1($ctx),$B
334	mov	$SZ*2($ctx),$C
335	mov	$SZ*3($ctx),$D
336	mov	$SZ*4($ctx),$E
337	mov	$SZ*5($ctx),$F
338	mov	$SZ*6($ctx),$G
339	mov	$SZ*7($ctx),$H
340	jmp	.Lloop
341
342.align	16
343.Lloop:
344	mov	$B,$a3
345	lea	$TABLE(%rip),$Tbl
346	xor	$C,$a3			# magic
347___
348	for($i=0;$i<16;$i++) {
349		$code.="	mov	$SZ*$i($inp),$T1\n";
350		$code.="	mov	@ROT[4],$a0\n";
351		$code.="	mov	@ROT[0],$a1\n";
352		$code.="	bswap	$T1\n";
353		&ROUND_00_15($i,@ROT);
354		unshift(@ROT,pop(@ROT));
355	}
356$code.=<<___;
357	jmp	.Lrounds_16_xx
358.align	16
359.Lrounds_16_xx:
360___
361	for(;$i<32;$i++) {
362		&ROUND_16_XX($i,@ROT);
363		unshift(@ROT,pop(@ROT));
364	}
365
366$code.=<<___;
367	cmpb	\$0,`$SZ-1`($Tbl)
368	jnz	.Lrounds_16_xx
369
370	mov	$_ctx,$ctx
371	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
372	lea	16*$SZ($inp),$inp
373
374	add	$SZ*0($ctx),$A
375	add	$SZ*1($ctx),$B
376	add	$SZ*2($ctx),$C
377	add	$SZ*3($ctx),$D
378	add	$SZ*4($ctx),$E
379	add	$SZ*5($ctx),$F
380	add	$SZ*6($ctx),$G
381	add	$SZ*7($ctx),$H
382
383	cmp	$_end,$inp
384
385	mov	$A,$SZ*0($ctx)
386	mov	$B,$SZ*1($ctx)
387	mov	$C,$SZ*2($ctx)
388	mov	$D,$SZ*3($ctx)
389	mov	$E,$SZ*4($ctx)
390	mov	$F,$SZ*5($ctx)
391	mov	$G,$SZ*6($ctx)
392	mov	$H,$SZ*7($ctx)
393	jb	.Lloop
394
395	mov	$_rsp,%rsi
396.cfi_def_cfa	%rsi,8
397	mov	-48(%rsi),%r15
398.cfi_restore	%r15
399	mov	-40(%rsi),%r14
400.cfi_restore	%r14
401	mov	-32(%rsi),%r13
402.cfi_restore	%r13
403	mov	-24(%rsi),%r12
404.cfi_restore	%r12
405	mov	-16(%rsi),%rbp
406.cfi_restore	%rbp
407	mov	-8(%rsi),%rbx
408.cfi_restore	%rbx
409	lea	(%rsi),%rsp
410.cfi_def_cfa_register	%rsp
411.Lepilogue:
412	ret
413.cfi_endproc
414.size	$func,.-$func
415___
416
417if ($SZ==4) {
418$code.=<<___;
419.align	64
420.type	$TABLE,\@object
421$TABLE:
422	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
423	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
424	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
425	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
426	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
428	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
429	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
430	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
431	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
432	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
433	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
434	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
435	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
436	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
437	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
438	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
439	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
440	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
441	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
442	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
443	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
444	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
445	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
446	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
447	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
448	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
449	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
450	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
451	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
452	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
453	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
454
455	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
456	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
457	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
458	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
459	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
460	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
461	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
462___
463} else {
464$code.=<<___;
465.align	64
466.type	$TABLE,\@object
467$TABLE:
468	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
469	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
470	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
471	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
472	.quad	0x3956c25bf348b538,0x59f111f1b605d019
473	.quad	0x3956c25bf348b538,0x59f111f1b605d019
474	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
475	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
476	.quad	0xd807aa98a3030242,0x12835b0145706fbe
477	.quad	0xd807aa98a3030242,0x12835b0145706fbe
478	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
479	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
480	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
481	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
482	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
483	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
484	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
485	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
486	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
487	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
488	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
489	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
490	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
491	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
492	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
493	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
494	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
495	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
496	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
497	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
498	.quad	0x06ca6351e003826f,0x142929670a0e6e70
499	.quad	0x06ca6351e003826f,0x142929670a0e6e70
500	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
501	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
502	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
503	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
504	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
505	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
506	.quad	0x81c2c92e47edaee6,0x92722c851482353b
507	.quad	0x81c2c92e47edaee6,0x92722c851482353b
508	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
509	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
510	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
511	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
512	.quad	0xd192e819d6ef5218,0xd69906245565a910
513	.quad	0xd192e819d6ef5218,0xd69906245565a910
514	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
515	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
516	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
517	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
518	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
519	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
520	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
521	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
522	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
523	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
524	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
525	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
526	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
527	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
528	.quad	0x90befffa23631e28,0xa4506cebde82bde9
529	.quad	0x90befffa23631e28,0xa4506cebde82bde9
530	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
531	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
532	.quad	0xca273eceea26619c,0xd186b8c721c0c207
533	.quad	0xca273eceea26619c,0xd186b8c721c0c207
534	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
535	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
536	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
537	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
538	.quad	0x113f9804bef90dae,0x1b710b35131c471b
539	.quad	0x113f9804bef90dae,0x1b710b35131c471b
540	.quad	0x28db77f523047d84,0x32caab7b40c72493
541	.quad	0x28db77f523047d84,0x32caab7b40c72493
542	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
543	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
544	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
545	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
546	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
547	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
548
549	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
550	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
551	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
552___
553}
554
555######################################################################
556# SIMD code paths
557#
558if ($SZ==4 && $shaext) {{{
559######################################################################
560# Intel SHA Extensions implementation of SHA256 update function.
561#
562my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
563
564my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
565my @MSG=map("%xmm$_",(3..6));
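#
# sha256rnds2 wants the hash state split across two XMM registers as
# {ABEF} and {CDGH} in a specific word order, whereas in memory the
# context is simply the word array A..H.  The pshufd/palignr/punpcklqdq
# sequence right after loading (and its mirror image before storing the
# result) converts between the two layouts; the lane comments below,
# e.g. "DCBA" and "ABEF", track the word order at each step.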
566
567$code.=<<___;
568.type	sha256_block_data_order_shaext,\@function,3
569.align	64
570sha256_block_data_order_shaext:
571_shaext_shortcut:
572.cfi_startproc
573___
574$code.=<<___ if ($win64);
575	lea	`-8-5*16`(%rsp),%rsp
576	movaps	%xmm6,-8-5*16(%rax)
577	movaps	%xmm7,-8-4*16(%rax)
578	movaps	%xmm8,-8-3*16(%rax)
579	movaps	%xmm9,-8-2*16(%rax)
580	movaps	%xmm10,-8-1*16(%rax)
581.Lprologue_shaext:
582___
583$code.=<<___;
584	lea		K256+0x80(%rip),$Tbl
585	movdqu		($ctx),$ABEF		# DCBA
586	movdqu		16($ctx),$CDGH		# HGFE
587	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
588
589	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
590	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
591	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
592	movdqa		$TMP,$BSWAP		# offload
593	palignr		\$8,$CDGH,$ABEF		# ABEF
594	punpcklqdq	$Wi,$CDGH		# CDGH
595	jmp		.Loop_shaext
596
597.align	16
598.Loop_shaext:
599	movdqu		($inp),@MSG[0]
600	movdqu		0x10($inp),@MSG[1]
601	movdqu		0x20($inp),@MSG[2]
602	pshufb		$TMP,@MSG[0]
603	movdqu		0x30($inp),@MSG[3]
604
605	movdqa		0*32-0x80($Tbl),$Wi
606	paddd		@MSG[0],$Wi
607	pshufb		$TMP,@MSG[1]
608	movdqa		$CDGH,$CDGH_SAVE	# offload
609	sha256rnds2	$ABEF,$CDGH		# 0-3
610	pshufd		\$0x0e,$Wi,$Wi
611	nop
612	movdqa		$ABEF,$ABEF_SAVE	# offload
613	sha256rnds2	$CDGH,$ABEF
614
615	movdqa		1*32-0x80($Tbl),$Wi
616	paddd		@MSG[1],$Wi
617	pshufb		$TMP,@MSG[2]
618	sha256rnds2	$ABEF,$CDGH		# 4-7
619	pshufd		\$0x0e,$Wi,$Wi
620	lea		0x40($inp),$inp
621	sha256msg1	@MSG[1],@MSG[0]
622	sha256rnds2	$CDGH,$ABEF
623
624	movdqa		2*32-0x80($Tbl),$Wi
625	paddd		@MSG[2],$Wi
626	pshufb		$TMP,@MSG[3]
627	sha256rnds2	$ABEF,$CDGH		# 8-11
628	pshufd		\$0x0e,$Wi,$Wi
629	movdqa		@MSG[3],$TMP
630	palignr		\$4,@MSG[2],$TMP
631	nop
632	paddd		$TMP,@MSG[0]
633	sha256msg1	@MSG[2],@MSG[1]
634	sha256rnds2	$CDGH,$ABEF
635
636	movdqa		3*32-0x80($Tbl),$Wi
637	paddd		@MSG[3],$Wi
638	sha256msg2	@MSG[3],@MSG[0]
639	sha256rnds2	$ABEF,$CDGH		# 12-15
640	pshufd		\$0x0e,$Wi,$Wi
641	movdqa		@MSG[0],$TMP
642	palignr		\$4,@MSG[3],$TMP
643	nop
644	paddd		$TMP,@MSG[1]
645	sha256msg1	@MSG[3],@MSG[2]
646	sha256rnds2	$CDGH,$ABEF
647___
648for($i=4;$i<16-3;$i++) {
649$code.=<<___;
650	movdqa		$i*32-0x80($Tbl),$Wi
651	paddd		@MSG[0],$Wi
652	sha256msg2	@MSG[0],@MSG[1]
653	sha256rnds2	$ABEF,$CDGH		# 16-19...
654	pshufd		\$0x0e,$Wi,$Wi
655	movdqa		@MSG[1],$TMP
656	palignr		\$4,@MSG[0],$TMP
657	nop
658	paddd		$TMP,@MSG[2]
659	sha256msg1	@MSG[0],@MSG[3]
660	sha256rnds2	$CDGH,$ABEF
661___
662	push(@MSG,shift(@MSG));
663}
664$code.=<<___;
665	movdqa		13*32-0x80($Tbl),$Wi
666	paddd		@MSG[0],$Wi
667	sha256msg2	@MSG[0],@MSG[1]
668	sha256rnds2	$ABEF,$CDGH		# 52-55
669	pshufd		\$0x0e,$Wi,$Wi
670	movdqa		@MSG[1],$TMP
671	palignr		\$4,@MSG[0],$TMP
672	sha256rnds2	$CDGH,$ABEF
673	paddd		$TMP,@MSG[2]
674
675	movdqa		14*32-0x80($Tbl),$Wi
676	paddd		@MSG[1],$Wi
677	sha256rnds2	$ABEF,$CDGH		# 56-59
678	pshufd		\$0x0e,$Wi,$Wi
679	sha256msg2	@MSG[1],@MSG[2]
680	movdqa		$BSWAP,$TMP
681	sha256rnds2	$CDGH,$ABEF
682
683	movdqa		15*32-0x80($Tbl),$Wi
684	paddd		@MSG[2],$Wi
685	nop
686	sha256rnds2	$ABEF,$CDGH		# 60-63
687	pshufd		\$0x0e,$Wi,$Wi
688	dec		$num
689	nop
690	sha256rnds2	$CDGH,$ABEF
691
692	paddd		$CDGH_SAVE,$CDGH
693	paddd		$ABEF_SAVE,$ABEF
694	jnz		.Loop_shaext
695
696	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
697	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
698	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
699	punpckhqdq	$CDGH,$ABEF		# DCBA
700	palignr		\$8,$TMP,$CDGH		# HGFE
701
702	movdqu	$ABEF,($ctx)
703	movdqu	$CDGH,16($ctx)
704___
705$code.=<<___ if ($win64);
706	movaps	-8-5*16(%rax),%xmm6
707	movaps	-8-4*16(%rax),%xmm7
708	movaps	-8-3*16(%rax),%xmm8
709	movaps	-8-2*16(%rax),%xmm9
710	movaps	-8-1*16(%rax),%xmm10
711	mov	%rax,%rsp
712.Lepilogue_shaext:
713___
714$code.=<<___;
715	ret
716.cfi_endproc
717.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
718___
719}}}
720{{{
721
722my $a4=$T1;
723my ($a,$b,$c,$d,$e,$f,$g,$h);
724
725sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
726{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
727  my $arg = pop;
728    $arg = "\$$arg" if ($arg*1 eq $arg);
729    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
730}
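#
# The AUTOLOAD thunk above turns any &insn($dst,...,$last) call into a
# single line of AT&T-syntax output, reversing the operand order and
# prefixing bare numbers with '$'.  For example (registers chosen
# purely for illustration):
#
#	&ror("%r13d",14)	->	ror	$14,%r13d
#	&add("%r11d","%r12d")	->	add	%r12d,%r11d
#
# which is what lets body_00_15() below be written in a compact,
# destination-first style.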
731
732sub body_00_15 () {
733	(
734	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
735
736	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
737	'&mov	($a,$a1)',
738	'&mov	($a4,$f)',
739
740	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
741	'&xor	($a0,$e)',
742	'&xor	($a4,$g)',			# f^g
743
744	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
745	'&xor	($a1,$a)',
746	'&and	($a4,$e)',			# (f^g)&e
747
748	'&xor	($a0,$e)',
749	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
750	'&mov	($a2,$a)',
751
752	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
753	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
754	'&xor	($a2,$b)',			# a^b, b^c in next round
755
756	'&add	($h,$a4)',			# h+=Ch(e,f,g)
757	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
758	'&and	($a3,$a2)',			# (b^c)&(a^b)
759
760	'&xor	($a1,$a)',
761	'&add	($h,$a0)',			# h+=Sigma1(e)
762	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
763
764	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
765	'&add	($d,$h)',			# d+=h
766	'&add	($h,$a3)',			# h+=Maj(a,b,c)
767
768	'&mov	($a0,$d)',
769	'&add	($a1,$h);'.			# h+=Sigma0(a)
770	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
771	);
772}
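#
# Unlike ROUND_00_15 above, body_00_15() does not emit anything itself:
# it returns the round as a list of single-instruction strings.  The
# SIMD Xupdate routines below pull one instruction at a time off that
# list (the eval(shift(@insns)) pattern), so that the integer round
# code ends up interleaved with the vector message-schedule code in
# the generated output.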
773
774######################################################################
775# SSSE3 code path
776#
777if ($SZ==4) {	# SHA256 only
778my @X = map("%xmm$_",(0..3));
779my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
780
781$code.=<<___;
782.type	${func}_ssse3,\@function,3
783.align	64
784${func}_ssse3:
785.cfi_startproc
786.Lssse3_shortcut:
787	mov	%rsp,%rax		# copy %rsp
788.cfi_def_cfa_register	%rax
789	push	%rbx
790.cfi_push	%rbx
791	push	%rbp
792.cfi_push	%rbp
793	push	%r12
794.cfi_push	%r12
795	push	%r13
796.cfi_push	%r13
797	push	%r14
798.cfi_push	%r14
799	push	%r15
800.cfi_push	%r15
801	shl	\$4,%rdx		# num*16
802	sub	\$`$framesz+$win64*16*4`,%rsp
803	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
804	and	\$-64,%rsp		# align stack frame
805	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
807	mov	%rdx,$_end		# save end pointer, "3rd" arg
808	mov	%rax,$_rsp		# save copy of %rsp
809.cfi_cfa_expression	$_rsp,deref,+8
810___
811$code.=<<___ if ($win64);
812	movaps	%xmm6,16*$SZ+32(%rsp)
813	movaps	%xmm7,16*$SZ+48(%rsp)
814	movaps	%xmm8,16*$SZ+64(%rsp)
815	movaps	%xmm9,16*$SZ+80(%rsp)
816___
817$code.=<<___;
818.Lprologue_ssse3:
819
820	mov	$SZ*0($ctx),$A
821	mov	$SZ*1($ctx),$B
822	mov	$SZ*2($ctx),$C
823	mov	$SZ*3($ctx),$D
824	mov	$SZ*4($ctx),$E
825	mov	$SZ*5($ctx),$F
826	mov	$SZ*6($ctx),$G
827	mov	$SZ*7($ctx),$H
828___
829
830$code.=<<___;
831	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
832	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
833	jmp	.Lloop_ssse3
834.align	16
835.Lloop_ssse3:
836	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
837	movdqu	0x00($inp),@X[0]
838	movdqu	0x10($inp),@X[1]
839	movdqu	0x20($inp),@X[2]
840	pshufb	$t3,@X[0]
841	movdqu	0x30($inp),@X[3]
842	lea	$TABLE(%rip),$Tbl
843	pshufb	$t3,@X[1]
844	movdqa	0x00($Tbl),$t0
845	movdqa	0x20($Tbl),$t1
846	pshufb	$t3,@X[2]
847	paddd	@X[0],$t0
848	movdqa	0x40($Tbl),$t2
849	pshufb	$t3,@X[3]
850	movdqa	0x60($Tbl),$t3
851	paddd	@X[1],$t1
852	paddd	@X[2],$t2
853	paddd	@X[3],$t3
854	movdqa	$t0,0x00(%rsp)
855	mov	$A,$a1
856	movdqa	$t1,0x10(%rsp)
857	mov	$B,$a3
858	movdqa	$t2,0x20(%rsp)
859	xor	$C,$a3			# magic
860	movdqa	$t3,0x30(%rsp)
861	mov	$E,$a0
862	jmp	.Lssse3_00_47
863
864.align	16
865.Lssse3_00_47:
866	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
867___
868sub Xupdate_256_SSSE3 () {
869	(
870	'&movdqa	($t0,@X[1]);',
871	'&movdqa	($t3,@X[3])',
872	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
873	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
874	'&movdqa	($t1,$t0)',
875	'&movdqa	($t2,$t0);',
876	'&psrld		($t0,$sigma0[2])',
877	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
878	'&psrld		($t2,$sigma0[0])',
879	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
880	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
881	'&pxor		($t0,$t2)',
882	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
883	'&pxor		($t0,$t1)',
884	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
885	'&pxor		($t0,$t2);',
886	 '&movdqa	($t2,$t3)',
887	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
888	 '&psrld	($t3,$sigma1[2])',
889	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
890	 '&psrlq	($t2,$sigma1[0])',
891	 '&pxor		($t3,$t2);',
892	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
893	 '&pxor		($t3,$t2)',
894	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
895	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
896	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
897	 '&movdqa	($t2,$t3);',
898	 '&psrld	($t3,$sigma1[2])',
899	 '&psrlq	($t2,$sigma1[0])',
900	 '&pxor		($t3,$t2);',
901	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
902	 '&pxor		($t3,$t2);',
903	'&movdqa	($t2,16*2*$j."($Tbl)")',
904	 '&pshufb	($t3,$t5)',
905	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
906	);
907}
908
909sub SSSE3_256_00_47 () {
910my $j = shift;
911my $body = shift;
912my @X = @_;
913my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
914
915    if (0) {
916	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
917	    eval;
918	    eval(shift(@insns));
919	    eval(shift(@insns));
920	    eval(shift(@insns));
921	}
922    } else {			# squeeze extra 4% on Westmere and 19% on Atom
923	  eval(shift(@insns));	#@
924	&movdqa		($t0,@X[1]);
925	  eval(shift(@insns));
926	  eval(shift(@insns));
927	&movdqa		($t3,@X[3]);
928	  eval(shift(@insns));	#@
929	  eval(shift(@insns));
930	  eval(shift(@insns));
931	  eval(shift(@insns));	#@
932	  eval(shift(@insns));
933	&palignr	($t0,@X[0],$SZ);	# X[1..4]
934	  eval(shift(@insns));
935	  eval(shift(@insns));
936	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
937	  eval(shift(@insns));
938	  eval(shift(@insns));
939	  eval(shift(@insns));
940	  eval(shift(@insns));	#@
941	&movdqa		($t1,$t0);
942	  eval(shift(@insns));
943	  eval(shift(@insns));
944	&movdqa		($t2,$t0);
945	  eval(shift(@insns));	#@
946	  eval(shift(@insns));
947	&psrld		($t0,$sigma0[2]);
948	  eval(shift(@insns));
949	  eval(shift(@insns));
950	  eval(shift(@insns));
951	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
952	  eval(shift(@insns));	#@
953	  eval(shift(@insns));
954	&psrld		($t2,$sigma0[0]);
955	  eval(shift(@insns));
956	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
958	  eval(shift(@insns));
959	  eval(shift(@insns));	#@
960	&pslld		($t1,8*$SZ-$sigma0[1]);
961	  eval(shift(@insns));
962	  eval(shift(@insns));
963	&pxor		($t0,$t2);
964	  eval(shift(@insns));	#@
965	  eval(shift(@insns));
966	  eval(shift(@insns));
967	  eval(shift(@insns));	#@
968	&psrld		($t2,$sigma0[1]-$sigma0[0]);
969	  eval(shift(@insns));
970	&pxor		($t0,$t1);
971	  eval(shift(@insns));
972	  eval(shift(@insns));
973	&pslld		($t1,$sigma0[1]-$sigma0[0]);
974	  eval(shift(@insns));
975	  eval(shift(@insns));
976	&pxor		($t0,$t2);
977	  eval(shift(@insns));
978	  eval(shift(@insns));	#@
979	 &movdqa	($t2,$t3);
980	  eval(shift(@insns));
981	  eval(shift(@insns));
982	&pxor		($t0,$t1);		# sigma0(X[1..4])
983	  eval(shift(@insns));	#@
984	  eval(shift(@insns));
985	  eval(shift(@insns));
986	 &psrld		($t3,$sigma1[2]);
987	  eval(shift(@insns));
988	  eval(shift(@insns));
989	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
990	  eval(shift(@insns));	#@
991	  eval(shift(@insns));
992	 &psrlq		($t2,$sigma1[0]);
993	  eval(shift(@insns));
994	  eval(shift(@insns));
995	  eval(shift(@insns));
996	 &pxor		($t3,$t2);
997	  eval(shift(@insns));	#@
998	  eval(shift(@insns));
999	  eval(shift(@insns));
1000	  eval(shift(@insns));	#@
1001	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1002	  eval(shift(@insns));
1003	  eval(shift(@insns));
1004	 &pxor		($t3,$t2);
1005	  eval(shift(@insns));	#@
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));
1008	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
1009	 &pshufd	($t3,$t3,0b10000000);
1010	  eval(shift(@insns));
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));
1013	 &psrldq	($t3,8);
1014	  eval(shift(@insns));
1015	  eval(shift(@insns));	#@
1016	  eval(shift(@insns));
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));	#@
1019	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1020	  eval(shift(@insns));
1021	  eval(shift(@insns));
1022	  eval(shift(@insns));
1023	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1024	  eval(shift(@insns));
1025	  eval(shift(@insns));	#@
1026	  eval(shift(@insns));
1027	 &movdqa	($t2,$t3);
1028	  eval(shift(@insns));
1029	  eval(shift(@insns));
1030	 &psrld		($t3,$sigma1[2]);
1031	  eval(shift(@insns));
1032	  eval(shift(@insns));	#@
1033	 &psrlq		($t2,$sigma1[0]);
1034	  eval(shift(@insns));
1035	  eval(shift(@insns));
1036	 &pxor		($t3,$t2);
1037	  eval(shift(@insns));	#@
1038	  eval(shift(@insns));
1039	  eval(shift(@insns));
1040	  eval(shift(@insns));	#@
1041	  eval(shift(@insns));
1042	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1043	  eval(shift(@insns));
1044	  eval(shift(@insns));
1045	  eval(shift(@insns));
1046	 &pxor		($t3,$t2);
1047	  eval(shift(@insns));
1048	  eval(shift(@insns));
1049	  eval(shift(@insns));	#@
1050	 #&pshufb	($t3,$t5);
1051	 &pshufd	($t3,$t3,0b00001000);
1052	  eval(shift(@insns));
1053	  eval(shift(@insns));
1054	&movdqa		($t2,16*2*$j."($Tbl)");
1055	  eval(shift(@insns));	#@
1056	  eval(shift(@insns));
1057	 &pslldq	($t3,8);
1058	  eval(shift(@insns));
1059	  eval(shift(@insns));
1060	  eval(shift(@insns));
1061	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1062	  eval(shift(@insns));	#@
1063	  eval(shift(@insns));
1064	  eval(shift(@insns));
1065    }
1066	&paddd		($t2,@X[0]);
1067	  foreach (@insns) { eval; }		# remaining instructions
1068	&movdqa		(16*$j."(%rsp)",$t2);
1069}
1070
1071    for ($i=0,$j=0; $j<4; $j++) {
1072	&SSSE3_256_00_47($j,\&body_00_15,@X);
1073	push(@X,shift(@X));			# rotate(@X)
1074    }
1075	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1076	&jne	(".Lssse3_00_47");
1077
1078    for ($i=0; $i<16; ) {
1079	foreach(body_00_15()) { eval; }
1080    }
1081$code.=<<___;
1082	mov	$_ctx,$ctx
1083	mov	$a1,$A
1084
1085	add	$SZ*0($ctx),$A
1086	lea	16*$SZ($inp),$inp
1087	add	$SZ*1($ctx),$B
1088	add	$SZ*2($ctx),$C
1089	add	$SZ*3($ctx),$D
1090	add	$SZ*4($ctx),$E
1091	add	$SZ*5($ctx),$F
1092	add	$SZ*6($ctx),$G
1093	add	$SZ*7($ctx),$H
1094
1095	cmp	$_end,$inp
1096
1097	mov	$A,$SZ*0($ctx)
1098	mov	$B,$SZ*1($ctx)
1099	mov	$C,$SZ*2($ctx)
1100	mov	$D,$SZ*3($ctx)
1101	mov	$E,$SZ*4($ctx)
1102	mov	$F,$SZ*5($ctx)
1103	mov	$G,$SZ*6($ctx)
1104	mov	$H,$SZ*7($ctx)
1105	jb	.Lloop_ssse3
1106
1107	mov	$_rsp,%rsi
1108.cfi_def_cfa	%rsi,8
1109___
1110$code.=<<___ if ($win64);
1111	movaps	16*$SZ+32(%rsp),%xmm6
1112	movaps	16*$SZ+48(%rsp),%xmm7
1113	movaps	16*$SZ+64(%rsp),%xmm8
1114	movaps	16*$SZ+80(%rsp),%xmm9
1115___
1116$code.=<<___;
1117	mov	-48(%rsi),%r15
1118.cfi_restore	%r15
1119	mov	-40(%rsi),%r14
1120.cfi_restore	%r14
1121	mov	-32(%rsi),%r13
1122.cfi_restore	%r13
1123	mov	-24(%rsi),%r12
1124.cfi_restore	%r12
1125	mov	-16(%rsi),%rbp
1126.cfi_restore	%rbp
1127	mov	-8(%rsi),%rbx
1128.cfi_restore	%rbx
1129	lea	(%rsi),%rsp
1130.cfi_def_cfa_register	%rsp
1131.Lepilogue_ssse3:
1132	ret
1133.cfi_endproc
1134.size	${func}_ssse3,.-${func}_ssse3
1135___
1136}
1137
1138if ($avx) {{
1139######################################################################
1140# XOP code path
1141#
1142if ($SZ==8) {	# SHA512 only
1143$code.=<<___;
1144.type	${func}_xop,\@function,3
1145.align	64
1146${func}_xop:
1147.cfi_startproc
1148.Lxop_shortcut:
1149	mov	%rsp,%rax		# copy %rsp
1150.cfi_def_cfa_register	%rax
1151	push	%rbx
1152.cfi_push	%rbx
1153	push	%rbp
1154.cfi_push	%rbp
1155	push	%r12
1156.cfi_push	%r12
1157	push	%r13
1158.cfi_push	%r13
1159	push	%r14
1160.cfi_push	%r14
1161	push	%r15
1162.cfi_push	%r15
1163	shl	\$4,%rdx		# num*16
1164	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1165	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1166	and	\$-64,%rsp		# align stack frame
1167	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1169	mov	%rdx,$_end		# save end pointer, "3rd" arg
1170	mov	%rax,$_rsp		# save copy of %rsp
1171.cfi_cfa_expression	$_rsp,deref,+8
1172___
1173$code.=<<___ if ($win64);
1174	movaps	%xmm6,16*$SZ+32(%rsp)
1175	movaps	%xmm7,16*$SZ+48(%rsp)
1176	movaps	%xmm8,16*$SZ+64(%rsp)
1177	movaps	%xmm9,16*$SZ+80(%rsp)
1178___
1179$code.=<<___ if ($win64 && $SZ>4);
1180	movaps	%xmm10,16*$SZ+96(%rsp)
1181	movaps	%xmm11,16*$SZ+112(%rsp)
1182___
1183$code.=<<___;
1184.Lprologue_xop:
1185
1186	vzeroupper
1187	mov	$SZ*0($ctx),$A
1188	mov	$SZ*1($ctx),$B
1189	mov	$SZ*2($ctx),$C
1190	mov	$SZ*3($ctx),$D
1191	mov	$SZ*4($ctx),$E
1192	mov	$SZ*5($ctx),$F
1193	mov	$SZ*6($ctx),$G
1194	mov	$SZ*7($ctx),$H
1195	jmp	.Lloop_xop
1196___
1197					if ($SZ==4) {	# SHA256
1198    my @X = map("%xmm$_",(0..3));
1199    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1200
1201$code.=<<___;
1202.align	16
1203.Lloop_xop:
1204	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1205	vmovdqu	0x00($inp),@X[0]
1206	vmovdqu	0x10($inp),@X[1]
1207	vmovdqu	0x20($inp),@X[2]
1208	vmovdqu	0x30($inp),@X[3]
1209	vpshufb	$t3,@X[0],@X[0]
1210	lea	$TABLE(%rip),$Tbl
1211	vpshufb	$t3,@X[1],@X[1]
1212	vpshufb	$t3,@X[2],@X[2]
1213	vpaddd	0x00($Tbl),@X[0],$t0
1214	vpshufb	$t3,@X[3],@X[3]
1215	vpaddd	0x20($Tbl),@X[1],$t1
1216	vpaddd	0x40($Tbl),@X[2],$t2
1217	vpaddd	0x60($Tbl),@X[3],$t3
1218	vmovdqa	$t0,0x00(%rsp)
1219	mov	$A,$a1
1220	vmovdqa	$t1,0x10(%rsp)
1221	mov	$B,$a3
1222	vmovdqa	$t2,0x20(%rsp)
1223	xor	$C,$a3			# magic
1224	vmovdqa	$t3,0x30(%rsp)
1225	mov	$E,$a0
1226	jmp	.Lxop_00_47
1227
1228.align	16
1229.Lxop_00_47:
1230	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1231___
1232sub XOP_256_00_47 () {
1233my $j = shift;
1234my $body = shift;
1235my @X = @_;
1236my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1237
1238	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1239	  eval(shift(@insns));
1240	  eval(shift(@insns));
1241	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1242	  eval(shift(@insns));
1243	  eval(shift(@insns));
1244	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1245	  eval(shift(@insns));
1246	  eval(shift(@insns));
1247	&vpsrld		($t0,$t0,$sigma0[2]);
1248	  eval(shift(@insns));
1249	  eval(shift(@insns));
1250	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	  eval(shift(@insns));
1254	  eval(shift(@insns));
1255	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1256	  eval(shift(@insns));
1257	  eval(shift(@insns));
1258	&vpxor		($t0,$t0,$t1);
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	  eval(shift(@insns));
1262	  eval(shift(@insns));
1263	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1264	  eval(shift(@insns));
1265	  eval(shift(@insns));
1266	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1267	  eval(shift(@insns));
1268	  eval(shift(@insns));
1269	 &vpsrld	($t2,@X[3],$sigma1[2]);
1270	  eval(shift(@insns));
1271	  eval(shift(@insns));
1272	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1273	  eval(shift(@insns));
1274	  eval(shift(@insns));
1275	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1276	  eval(shift(@insns));
1277	  eval(shift(@insns));
1278	 &vpxor		($t3,$t3,$t2);
1279	  eval(shift(@insns));
1280	  eval(shift(@insns));
1281	  eval(shift(@insns));
1282	  eval(shift(@insns));
1283	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1284	  eval(shift(@insns));
1285	  eval(shift(@insns));
1286	  eval(shift(@insns));
1287	  eval(shift(@insns));
1288	&vpsrldq	($t3,$t3,8);
1289	  eval(shift(@insns));
1290	  eval(shift(@insns));
1291	  eval(shift(@insns));
1292	  eval(shift(@insns));
1293	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1294	  eval(shift(@insns));
1295	  eval(shift(@insns));
1296	  eval(shift(@insns));
1297	  eval(shift(@insns));
1298	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1299	  eval(shift(@insns));
1300	  eval(shift(@insns));
1301	 &vpsrld	($t2,@X[0],$sigma1[2]);
1302	  eval(shift(@insns));
1303	  eval(shift(@insns));
1304	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1305	  eval(shift(@insns));
1306	  eval(shift(@insns));
1307	 &vpxor		($t3,$t3,$t2);
1308	  eval(shift(@insns));
1309	  eval(shift(@insns));
1310	  eval(shift(@insns));
1311	  eval(shift(@insns));
1312	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1313	  eval(shift(@insns));
1314	  eval(shift(@insns));
1315	  eval(shift(@insns));
1316	  eval(shift(@insns));
1317	&vpslldq	($t3,$t3,8);		# 22 instructions
1318	  eval(shift(@insns));
1319	  eval(shift(@insns));
1320	  eval(shift(@insns));
1321	  eval(shift(@insns));
1322	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1323	  eval(shift(@insns));
1324	  eval(shift(@insns));
1325	  eval(shift(@insns));
1326	  eval(shift(@insns));
1327	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1328	  foreach (@insns) { eval; }		# remaining instructions
1329	&vmovdqa	(16*$j."(%rsp)",$t2);
1330}
1331
1332    for ($i=0,$j=0; $j<4; $j++) {
1333	&XOP_256_00_47($j,\&body_00_15,@X);
1334	push(@X,shift(@X));			# rotate(@X)
1335    }
1336	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1337	&jne	(".Lxop_00_47");
1338
1339    for ($i=0; $i<16; ) {
1340	foreach(body_00_15()) { eval; }
1341    }
1342
1343					} else {	# SHA512
1344    my @X = map("%xmm$_",(0..7));
1345    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1346
1347$code.=<<___;
1348.align	16
1349.Lloop_xop:
1350	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1351	vmovdqu	0x00($inp),@X[0]
1352	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1353	vmovdqu	0x10($inp),@X[1]
1354	vmovdqu	0x20($inp),@X[2]
1355	vpshufb	$t3,@X[0],@X[0]
1356	vmovdqu	0x30($inp),@X[3]
1357	vpshufb	$t3,@X[1],@X[1]
1358	vmovdqu	0x40($inp),@X[4]
1359	vpshufb	$t3,@X[2],@X[2]
1360	vmovdqu	0x50($inp),@X[5]
1361	vpshufb	$t3,@X[3],@X[3]
1362	vmovdqu	0x60($inp),@X[6]
1363	vpshufb	$t3,@X[4],@X[4]
1364	vmovdqu	0x70($inp),@X[7]
1365	vpshufb	$t3,@X[5],@X[5]
1366	vpaddq	-0x80($Tbl),@X[0],$t0
1367	vpshufb	$t3,@X[6],@X[6]
1368	vpaddq	-0x60($Tbl),@X[1],$t1
1369	vpshufb	$t3,@X[7],@X[7]
1370	vpaddq	-0x40($Tbl),@X[2],$t2
1371	vpaddq	-0x20($Tbl),@X[3],$t3
1372	vmovdqa	$t0,0x00(%rsp)
1373	vpaddq	0x00($Tbl),@X[4],$t0
1374	vmovdqa	$t1,0x10(%rsp)
1375	vpaddq	0x20($Tbl),@X[5],$t1
1376	vmovdqa	$t2,0x20(%rsp)
1377	vpaddq	0x40($Tbl),@X[6],$t2
1378	vmovdqa	$t3,0x30(%rsp)
1379	vpaddq	0x60($Tbl),@X[7],$t3
1380	vmovdqa	$t0,0x40(%rsp)
1381	mov	$A,$a1
1382	vmovdqa	$t1,0x50(%rsp)
1383	mov	$B,$a3
1384	vmovdqa	$t2,0x60(%rsp)
1385	xor	$C,$a3			# magic
1386	vmovdqa	$t3,0x70(%rsp)
1387	mov	$E,$a0
1388	jmp	.Lxop_00_47
1389
1390.align	16
1391.Lxop_00_47:
1392	add	\$`16*2*$SZ`,$Tbl
1393___
1394sub XOP_512_00_47 () {
1395my $j = shift;
1396my $body = shift;
1397my @X = @_;
1398my @insns = (&$body,&$body);			# 52 instructions
1399
1400	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1401	  eval(shift(@insns));
1402	  eval(shift(@insns));
1403	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1404	  eval(shift(@insns));
1405	  eval(shift(@insns));
1406	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1407	  eval(shift(@insns));
1408	  eval(shift(@insns));
1409	&vpsrlq		($t0,$t0,$sigma0[2]);
1410	  eval(shift(@insns));
1411	  eval(shift(@insns));
1412	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1413	  eval(shift(@insns));
1414	  eval(shift(@insns));
1415	  eval(shift(@insns));
1416	  eval(shift(@insns));
1417	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1418	  eval(shift(@insns));
1419	  eval(shift(@insns));
1420	&vpxor		($t0,$t0,$t1);
1421	  eval(shift(@insns));
1422	  eval(shift(@insns));
1423	  eval(shift(@insns));
1424	  eval(shift(@insns));
1425	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1426	  eval(shift(@insns));
1427	  eval(shift(@insns));
1428	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1429	  eval(shift(@insns));
1430	  eval(shift(@insns));
1431	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1432	  eval(shift(@insns));
1433	  eval(shift(@insns));
1434	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1435	  eval(shift(@insns));
1436	  eval(shift(@insns));
1437	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1438	  eval(shift(@insns));
1439	  eval(shift(@insns));
1440	 &vpxor		($t3,$t3,$t2);
1441	  eval(shift(@insns));
1442	  eval(shift(@insns));
1443	  eval(shift(@insns));
1444	  eval(shift(@insns));
1445	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1446	  eval(shift(@insns));
1447	  eval(shift(@insns));
1448	  eval(shift(@insns));
1449	  eval(shift(@insns));
1450	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1451	  eval(shift(@insns));
1452	  eval(shift(@insns));
1453	  eval(shift(@insns));
1454	  eval(shift(@insns));
1455	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1456	  foreach (@insns) { eval; }		# remaining instructions
1457	&vmovdqa	(16*$j."(%rsp)",$t2);
1458}
1459
1460    for ($i=0,$j=0; $j<8; $j++) {
1461	&XOP_512_00_47($j,\&body_00_15,@X);
1462	push(@X,shift(@X));			# rotate(@X)
1463    }
1464	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1465	&jne	(".Lxop_00_47");
1466
1467    for ($i=0; $i<16; ) {
1468	foreach(body_00_15()) { eval; }
1469    }
1470}
1471$code.=<<___;
1472	mov	$_ctx,$ctx
1473	mov	$a1,$A
1474
1475	add	$SZ*0($ctx),$A
1476	lea	16*$SZ($inp),$inp
1477	add	$SZ*1($ctx),$B
1478	add	$SZ*2($ctx),$C
1479	add	$SZ*3($ctx),$D
1480	add	$SZ*4($ctx),$E
1481	add	$SZ*5($ctx),$F
1482	add	$SZ*6($ctx),$G
1483	add	$SZ*7($ctx),$H
1484
1485	cmp	$_end,$inp
1486
1487	mov	$A,$SZ*0($ctx)
1488	mov	$B,$SZ*1($ctx)
1489	mov	$C,$SZ*2($ctx)
1490	mov	$D,$SZ*3($ctx)
1491	mov	$E,$SZ*4($ctx)
1492	mov	$F,$SZ*5($ctx)
1493	mov	$G,$SZ*6($ctx)
1494	mov	$H,$SZ*7($ctx)
1495	jb	.Lloop_xop
1496
1497	mov	$_rsp,%rsi
1498.cfi_def_cfa	%rsi,8
1499	vzeroupper
1500___
1501$code.=<<___ if ($win64);
1502	movaps	16*$SZ+32(%rsp),%xmm6
1503	movaps	16*$SZ+48(%rsp),%xmm7
1504	movaps	16*$SZ+64(%rsp),%xmm8
1505	movaps	16*$SZ+80(%rsp),%xmm9
1506___
1507$code.=<<___ if ($win64 && $SZ>4);
1508	movaps	16*$SZ+96(%rsp),%xmm10
1509	movaps	16*$SZ+112(%rsp),%xmm11
1510___
1511$code.=<<___;
1512	mov	-48(%rsi),%r15
1513.cfi_restore	%r15
1514	mov	-40(%rsi),%r14
1515.cfi_restore	%r14
1516	mov	-32(%rsi),%r13
1517.cfi_restore	%r13
1518	mov	-24(%rsi),%r12
1519.cfi_restore	%r12
1520	mov	-16(%rsi),%rbp
1521.cfi_restore	%rbp
1522	mov	-8(%rsi),%rbx
1523.cfi_restore	%rbx
1524	lea	(%rsi),%rsp
1525.cfi_def_cfa_register	%rsp
1526.Lepilogue_xop:
1527	ret
1528.cfi_endproc
1529.size	${func}_xop,.-${func}_xop
1530___
1531}
1532######################################################################
1533# AVX+shrd code path
1534#
1535local *ror = sub { &shrd(@_[0],@_) };
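#
# From here on &ror() is aliased to &shrd() with the source operand
# duplicated, i.e. &ror($a0,$n) now emits "shrd $n,$a0,$a0", which is
# architecturally the same rotate right; per note (**) at the top,
# this substitution accounts for a fair share of the AVX-path speed-up
# on Sandy Bridge.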
1536
1537$code.=<<___;
1538.type	${func}_avx,\@function,3
1539.align	64
1540${func}_avx:
1541.cfi_startproc
1542.Lavx_shortcut:
1543	mov	%rsp,%rax		# copy %rsp
1544.cfi_def_cfa_register	%rax
1545	push	%rbx
1546.cfi_push	%rbx
1547	push	%rbp
1548.cfi_push	%rbp
1549	push	%r12
1550.cfi_push	%r12
1551	push	%r13
1552.cfi_push	%r13
1553	push	%r14
1554.cfi_push	%r14
1555	push	%r15
1556.cfi_push	%r15
1557	shl	\$4,%rdx		# num*16
1558	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1559	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1560	and	\$-64,%rsp		# align stack frame
1561	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1563	mov	%rdx,$_end		# save end pointer, "3rd" arg
1564	mov	%rax,$_rsp		# save copy of %rsp
1565.cfi_cfa_expression	$_rsp,deref,+8
1566___
1567$code.=<<___ if ($win64);
1568	movaps	%xmm6,16*$SZ+32(%rsp)
1569	movaps	%xmm7,16*$SZ+48(%rsp)
1570	movaps	%xmm8,16*$SZ+64(%rsp)
1571	movaps	%xmm9,16*$SZ+80(%rsp)
1572___
1573$code.=<<___ if ($win64 && $SZ>4);
1574	movaps	%xmm10,16*$SZ+96(%rsp)
1575	movaps	%xmm11,16*$SZ+112(%rsp)
1576___
1577$code.=<<___;
1578.Lprologue_avx:
1579
1580	vzeroupper
1581	mov	$SZ*0($ctx),$A
1582	mov	$SZ*1($ctx),$B
1583	mov	$SZ*2($ctx),$C
1584	mov	$SZ*3($ctx),$D
1585	mov	$SZ*4($ctx),$E
1586	mov	$SZ*5($ctx),$F
1587	mov	$SZ*6($ctx),$G
1588	mov	$SZ*7($ctx),$H
1589___
1590					if ($SZ==4) {	# SHA256
1591    my @X = map("%xmm$_",(0..3));
1592    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1593
1594$code.=<<___;
1595	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1596	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1597	jmp	.Lloop_avx
1598.align	16
1599.Lloop_avx:
1600	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1601	vmovdqu	0x00($inp),@X[0]
1602	vmovdqu	0x10($inp),@X[1]
1603	vmovdqu	0x20($inp),@X[2]
1604	vmovdqu	0x30($inp),@X[3]
1605	vpshufb	$t3,@X[0],@X[0]
1606	lea	$TABLE(%rip),$Tbl
1607	vpshufb	$t3,@X[1],@X[1]
1608	vpshufb	$t3,@X[2],@X[2]
1609	vpaddd	0x00($Tbl),@X[0],$t0
1610	vpshufb	$t3,@X[3],@X[3]
1611	vpaddd	0x20($Tbl),@X[1],$t1
1612	vpaddd	0x40($Tbl),@X[2],$t2
1613	vpaddd	0x60($Tbl),@X[3],$t3
1614	vmovdqa	$t0,0x00(%rsp)
1615	mov	$A,$a1
1616	vmovdqa	$t1,0x10(%rsp)
1617	mov	$B,$a3
1618	vmovdqa	$t2,0x20(%rsp)
1619	xor	$C,$a3			# magic
1620	vmovdqa	$t3,0x30(%rsp)
1621	mov	$E,$a0
1622	jmp	.Lavx_00_47
1623
1624.align	16
1625.Lavx_00_47:
1626	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1627___
1628sub Xupdate_256_AVX () {
1629	(
1630	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1631	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1632	'&vpsrld	($t2,$t0,$sigma0[0]);',
1633	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1634	'&vpsrld	($t3,$t0,$sigma0[2])',
1635	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1636	'&vpxor		($t0,$t3,$t2)',
1637	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1638	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1639	'&vpxor		($t0,$t0,$t1)',
1640	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1641	'&vpxor		($t0,$t0,$t2)',
1642	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1643	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1644	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1645	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1646	 '&vpxor	($t2,$t2,$t3);',
1647	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1648	 '&vpxor	($t2,$t2,$t3)',
1649	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1650	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1651	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1652	 '&vpsrld	($t2,$t3,$sigma1[2])',
1653	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1654	 '&vpxor	($t2,$t2,$t3);',
1655	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1656	 '&vpxor	($t2,$t2,$t3)',
1657	 '&vpshufb	($t2,$t2,$t5)',
1658	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1659	);
1660}
1661
1662sub AVX_256_00_47 () {
1663my $j = shift;
1664my $body = shift;
1665my @X = @_;
1666my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1667
1668	foreach (Xupdate_256_AVX()) {		# 29 instructions
1669	    eval;
1670	    eval(shift(@insns));
1671	    eval(shift(@insns));
1672	    eval(shift(@insns));
1673	}
1674	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1675	  foreach (@insns) { eval; }		# remaining instructions
1676	&vmovdqa	(16*$j."(%rsp)",$t2);
1677}
1678
1679    for ($i=0,$j=0; $j<4; $j++) {
1680	&AVX_256_00_47($j,\&body_00_15,@X);
1681	push(@X,shift(@X));			# rotate(@X)
1682    }
1683	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1684	&jne	(".Lavx_00_47");
1685
1686    for ($i=0; $i<16; ) {
1687	foreach(body_00_15()) { eval; }
1688    }
1689
1690					} else {	# SHA512
1691    my @X = map("%xmm$_",(0..7));
1692    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1693
1694$code.=<<___;
1695	jmp	.Lloop_avx
1696.align	16
1697.Lloop_avx:
1698	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1699	vmovdqu	0x00($inp),@X[0]
1700	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1701	vmovdqu	0x10($inp),@X[1]
1702	vmovdqu	0x20($inp),@X[2]
1703	vpshufb	$t3,@X[0],@X[0]
1704	vmovdqu	0x30($inp),@X[3]
1705	vpshufb	$t3,@X[1],@X[1]
1706	vmovdqu	0x40($inp),@X[4]
1707	vpshufb	$t3,@X[2],@X[2]
1708	vmovdqu	0x50($inp),@X[5]
1709	vpshufb	$t3,@X[3],@X[3]
1710	vmovdqu	0x60($inp),@X[6]
1711	vpshufb	$t3,@X[4],@X[4]
1712	vmovdqu	0x70($inp),@X[7]
1713	vpshufb	$t3,@X[5],@X[5]
1714	vpaddq	-0x80($Tbl),@X[0],$t0
1715	vpshufb	$t3,@X[6],@X[6]
1716	vpaddq	-0x60($Tbl),@X[1],$t1
1717	vpshufb	$t3,@X[7],@X[7]
1718	vpaddq	-0x40($Tbl),@X[2],$t2
1719	vpaddq	-0x20($Tbl),@X[3],$t3
1720	vmovdqa	$t0,0x00(%rsp)
1721	vpaddq	0x00($Tbl),@X[4],$t0
1722	vmovdqa	$t1,0x10(%rsp)
1723	vpaddq	0x20($Tbl),@X[5],$t1
1724	vmovdqa	$t2,0x20(%rsp)
1725	vpaddq	0x40($Tbl),@X[6],$t2
1726	vmovdqa	$t3,0x30(%rsp)
1727	vpaddq	0x60($Tbl),@X[7],$t3
1728	vmovdqa	$t0,0x40(%rsp)
1729	mov	$A,$a1
1730	vmovdqa	$t1,0x50(%rsp)
1731	mov	$B,$a3
1732	vmovdqa	$t2,0x60(%rsp)
1733	xor	$C,$a3			# magic
1734	vmovdqa	$t3,0x70(%rsp)
1735	mov	$E,$a0
1736	jmp	.Lavx_00_47
1737
1738.align	16
1739.Lavx_00_47:
1740	add	\$`16*2*$SZ`,$Tbl
1741___
1742sub Xupdate_512_AVX () {
1743	(
1744	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1745	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1746	'&vpsrlq	($t2,$t0,$sigma0[0])',
1747	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1748	'&vpsrlq	($t3,$t0,$sigma0[2])',
1749	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1750	 '&vpxor	($t0,$t3,$t2)',
1751	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1752	 '&vpxor	($t0,$t0,$t1)',
1753	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1754	 '&vpxor	($t0,$t0,$t2)',
1755	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1756	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1757	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1758	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1759	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1760	 '&vpxor	($t3,$t3,$t2)',
1761	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1762	 '&vpxor	($t3,$t3,$t1)',
1763	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1764	 '&vpxor	($t3,$t3,$t2)',
1765	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1766	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1767	);
1768}
1769
1770sub AVX_512_00_47 () {
1771my $j = shift;
1772my $body = shift;
1773my @X = @_;
1774my @insns = (&$body,&$body);			# 52 instructions
1775
1776	foreach (Xupdate_512_AVX()) {		# 23 instructions
1777	    eval;
1778	    eval(shift(@insns));
1779	    eval(shift(@insns));
1780	}
1781	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1782	  foreach (@insns) { eval; }		# remaining instructions
1783	&vmovdqa	(16*$j."(%rsp)",$t2);
1784}
1785
1786    for ($i=0,$j=0; $j<8; $j++) {
1787	&AVX_512_00_47($j,\&body_00_15,@X);
1788	push(@X,shift(@X));			# rotate(@X)
1789    }
1790	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1791	&jne	(".Lavx_00_47");
1792
1793    for ($i=0; $i<16; ) {
1794	foreach(body_00_15()) { eval; }
1795    }
1796}
1797$code.=<<___;
1798	mov	$_ctx,$ctx
1799	mov	$a1,$A
1800
1801	add	$SZ*0($ctx),$A
1802	lea	16*$SZ($inp),$inp
1803	add	$SZ*1($ctx),$B
1804	add	$SZ*2($ctx),$C
1805	add	$SZ*3($ctx),$D
1806	add	$SZ*4($ctx),$E
1807	add	$SZ*5($ctx),$F
1808	add	$SZ*6($ctx),$G
1809	add	$SZ*7($ctx),$H
1810
1811	cmp	$_end,$inp
1812
1813	mov	$A,$SZ*0($ctx)
1814	mov	$B,$SZ*1($ctx)
1815	mov	$C,$SZ*2($ctx)
1816	mov	$D,$SZ*3($ctx)
1817	mov	$E,$SZ*4($ctx)
1818	mov	$F,$SZ*5($ctx)
1819	mov	$G,$SZ*6($ctx)
1820	mov	$H,$SZ*7($ctx)
1821	jb	.Lloop_avx
1822
1823	mov	$_rsp,%rsi
1824.cfi_def_cfa	%rsi,8
1825	vzeroupper
1826___
1827$code.=<<___ if ($win64);
1828	movaps	16*$SZ+32(%rsp),%xmm6
1829	movaps	16*$SZ+48(%rsp),%xmm7
1830	movaps	16*$SZ+64(%rsp),%xmm8
1831	movaps	16*$SZ+80(%rsp),%xmm9
1832___
1833$code.=<<___ if ($win64 && $SZ>4);
1834	movaps	16*$SZ+96(%rsp),%xmm10
1835	movaps	16*$SZ+112(%rsp),%xmm11
1836___
1837$code.=<<___;
1838	mov	-48(%rsi),%r15
1839.cfi_restore	%r15
1840	mov	-40(%rsi),%r14
1841.cfi_restore	%r14
1842	mov	-32(%rsi),%r13
1843.cfi_restore	%r13
1844	mov	-24(%rsi),%r12
1845.cfi_restore	%r12
1846	mov	-16(%rsi),%rbp
1847.cfi_restore	%rbp
1848	mov	-8(%rsi),%rbx
1849.cfi_restore	%rbx
1850	lea	(%rsi),%rsp
1851.cfi_def_cfa_register	%rsp
1852.Lepilogue_avx:
1853	ret
1854.cfi_endproc
1855.size	${func}_avx,.-${func}_avx
1856___
1857
1858if ($avx>1) {{
1859######################################################################
1860# AVX2+BMI code path
1861#
1862my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1863my $PUSH8=8*2*$SZ;
1864use integer;
1865
1866sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
1868	(
1869	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1870
1871	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1872	'&and	($a4,$e)',		# f&e
1873	'&rorx	($a0,$e,$Sigma1[2])',
1874	'&rorx	($a2,$e,$Sigma1[1])',
1875
1876	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1877	'&lea	($h,"($h,$a4)")',
1878	'&andn	($a4,$e,$g)',		# ~e&g
1879	'&xor	($a0,$a2)',
1880
1881	'&rorx	($a1,$e,$Sigma1[0])',
1882	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1883	'&xor	($a0,$a1)',		# Sigma1(e)
1884	'&mov	($a2,$a)',
1885
1886	'&rorx	($a4,$a,$Sigma0[2])',
1887	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1888	'&xor	($a2,$b)',		# a^b, b^c in next round
1889	'&rorx	($a1,$a,$Sigma0[1])',
1890
1891	'&rorx	($a0,$a,$Sigma0[0])',
1892	'&lea	($d,"($d,$h)")',	# d+=h
1893	'&and	($a3,$a2)',		# (b^c)&(a^b)
1894	'&xor	($a1,$a4)',
1895
1896	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1897	'&xor	($a1,$a0)',		# Sigma0(a)
1898	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1899	'&mov	($a4,$e)',		# copy of f in future
1900
1901	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1902	);
1903	# and at the finish one has to $a+=$a1
1904}
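#
# A note on the BMI flavour above: rorx produces the rotations of $e
# and $a without touching the flags, andn yields ~e&g directly, and
# Ch(e,f,g) is accumulated with lea as (e&f)+(~e&g); the two terms have
# no set bits in common, so the addition equals the XOR in the textbook
# definition.  As with the scalar path, $a1 carries Sigma0(a) into the
# following round and has to be folded into $a once the loop ends.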
1905
1906$code.=<<___;
1907.type	${func}_avx2,\@function,3
1908.align	64
1909${func}_avx2:
1910.cfi_startproc
1911.Lavx2_shortcut:
1912	mov	%rsp,%rax		# copy %rsp
1913.cfi_def_cfa_register	%rax
1914	push	%rbx
1915.cfi_push	%rbx
1916	push	%rbp
1917.cfi_push	%rbp
1918	push	%r12
1919.cfi_push	%r12
1920	push	%r13
1921.cfi_push	%r13
1922	push	%r14
1923.cfi_push	%r14
1924	push	%r15
1925.cfi_push	%r15
1926	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1927	shl	\$4,%rdx		# num*16
1928	and	\$-256*$SZ,%rsp		# align stack frame
1929	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1930	add	\$`2*$SZ*($rounds-8)`,%rsp
1931	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1933	mov	%rdx,$_end		# save end pointer, "3rd" arg
1934	mov	%rax,$_rsp		# save copy of %rsp
1935.cfi_cfa_expression	$_rsp,deref,+8
1936___
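# Note: because %rsp has been re-aligned above, the CFA can no longer be
# described as a fixed offset from %rsp; the expression says it is found by
# loading the entry %rsp saved at $_rsp and adding 8 (the return-address
# slot).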
1937$code.=<<___ if ($win64);
1938	movaps	%xmm6,16*$SZ+32(%rsp)
1939	movaps	%xmm7,16*$SZ+48(%rsp)
1940	movaps	%xmm8,16*$SZ+64(%rsp)
1941	movaps	%xmm9,16*$SZ+80(%rsp)
1942___
1943$code.=<<___ if ($win64 && $SZ>4);
1944	movaps	%xmm10,16*$SZ+96(%rsp)
1945	movaps	%xmm11,16*$SZ+112(%rsp)
1946___
1947$code.=<<___;
1948.Lprologue_avx2:
1949
1950	vzeroupper
1951	sub	\$-16*$SZ,$inp		# inp++, size optimization
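	# ("sub $-128" fits a sign-extended 8-bit immediate whereas "add $128"
	# would not, so for SHA-512 this form is shorter; for SHA-256 the
	# displacement is 64 and both forms encode the same.)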
1952	mov	$SZ*0($ctx),$A
1953	mov	$inp,%r12		# borrow $T1
1954	mov	$SZ*1($ctx),$B
1955	cmp	%rdx,$inp		# $_end
1956	mov	$SZ*2($ctx),$C
1957	cmove	%rsp,%r12		# next block or random data
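	# %r12 feeds the upper 128-bit lanes below; pointing it at the stack
	# for the final block makes that lane chew on data that never reaches
	# the hash, because the .Lower_avx2 pass is skipped (via .Ldone_avx2)
	# in that case.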
1958	mov	$SZ*3($ctx),$D
1959	mov	$SZ*4($ctx),$E
1960	mov	$SZ*5($ctx),$F
1961	mov	$SZ*6($ctx),$G
1962	mov	$SZ*7($ctx),$H
1963___
1964					if ($SZ==4) {	# SHA256
1965    my @X = map("%ymm$_",(0..3));
1966    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1967
1968$code.=<<___;
1969	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1970	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1971	jmp	.Loop_avx2
1972.align	16
1973.Loop_avx2:
1974	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1975	vmovdqu	-16*$SZ+0($inp),%xmm0
1976	vmovdqu	-16*$SZ+16($inp),%xmm1
1977	vmovdqu	-16*$SZ+32($inp),%xmm2
1978	vmovdqu	-16*$SZ+48($inp),%xmm3
1979	#mov		$inp,$_inp	# offload $inp
1980	vinserti128	\$1,(%r12),@X[0],@X[0]
1981	vinserti128	\$1,16(%r12),@X[1],@X[1]
1982	vpshufb		$t3,@X[0],@X[0]
1983	vinserti128	\$1,32(%r12),@X[2],@X[2]
1984	vpshufb		$t3,@X[1],@X[1]
1985	vinserti128	\$1,48(%r12),@X[3],@X[3]
1986
1987	lea	$TABLE(%rip),$Tbl
1988	vpshufb	$t3,@X[2],@X[2]
1989	vpaddd	0x00($Tbl),@X[0],$t0
1990	vpshufb	$t3,@X[3],@X[3]
1991	vpaddd	0x20($Tbl),@X[1],$t1
1992	vpaddd	0x40($Tbl),@X[2],$t2
1993	vpaddd	0x60($Tbl),@X[3],$t3
1994	vmovdqa	$t0,0x00(%rsp)
1995	xor	$a1,$a1
1996	vmovdqa	$t1,0x20(%rsp)
1997___
1998$code.=<<___ if (!$win64);
1999# temporarily use %rdi as frame pointer
2000	mov	$_rsp,%rdi
2001.cfi_def_cfa	%rdi,8
2002___
2003$code.=<<___;
2004	lea	-$PUSH8(%rsp),%rsp
2005___
2006$code.=<<___ if (!$win64);
2007# the frame info is at $_rsp, but the stack is moving...
2008# so a second frame pointer is saved at -8(%rsp)
2009# that is in the red zone
2010	mov	%rdi,-8(%rsp)
2011.cfi_cfa_expression	%rsp-8,deref,+8
2012___
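# (Win64 is excluded above: it has no red zone, and unwinding there is done
# by se_handler via the .pdata/.xdata tables at the end of this file rather
# than by the .cfi_* directives.)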
2013$code.=<<___;
2014	mov	$B,$a3
2015	vmovdqa	$t2,0x00(%rsp)
2016	xor	$C,$a3			# magic
2017	vmovdqa	$t3,0x20(%rsp)
2018	mov	$F,$a4
2019	sub	\$-16*2*$SZ,$Tbl	# size optimization
2020	jmp	.Lavx2_00_47
2021
2022.align	16
2023.Lavx2_00_47:
2024___
2025
2026sub AVX2_256_00_47 () {
2027my $j = shift;
2028my $body = shift;
2029my @X = @_;
2030my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
2031my $base = "+2*$PUSH8(%rsp)";
2032
2033	if (($j%2)==0) {
2034	&lea	("%rsp","-$PUSH8(%rsp)");
2035$code.=<<___ if (!$win64);
2036.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
2037# copy secondary frame pointer to new location again at -8(%rsp)
2038	pushq	$PUSH8-8(%rsp)
2039.cfi_cfa_expression	%rsp,deref,+8
2040	lea	8(%rsp),%rsp
2041.cfi_cfa_expression	%rsp-8,deref,+8
2042___
2043	}
2044
2045	foreach (Xupdate_256_AVX()) {		# 29 instructions
2046	    eval;
2047	    eval(shift(@insns));
2048	    eval(shift(@insns));
2049	    eval(shift(@insns));
2050	}
2051	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
2052	  foreach (@insns) { eval; }		# remaining instructions
2053	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2054}
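# The schedule/round interleave above works on a sliding frame: every other
# call claims another $PUSH8 bytes below %rsp and stores there the X[i]+K[i]
# rows for upcoming rounds, while the rounds currently executing read the
# rows prepared earlier at $base, 2*$PUSH8 above the current %rsp.  Only the
# low 16 bytes of each 32-byte row (the first block's lane) are consumed
# here; the upper halves are picked up later by .Lower_avx2.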
2055
2056    for ($i=0,$j=0; $j<4; $j++) {
2057	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
2058	push(@X,shift(@X));			# rotate(@X)
2059    }
2060	&lea	($Tbl,16*2*$SZ."($Tbl)");
2061	&cmpb	(($SZ-1)."($Tbl)",0);
2062	&jne	(".Lavx2_00_47");
2063
2064    for ($i=0; $i<16; ) {
2065	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2066	foreach(bodyx_00_15()) { eval; }
2067    }
2068					} else {	# SHA512
2069    my @X = map("%ymm$_",(0..7));
2070    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2071
2072$code.=<<___;
2073	jmp	.Loop_avx2
2074.align	16
2075.Loop_avx2:
2076	vmovdqu	-16*$SZ($inp),%xmm0
2077	vmovdqu	-16*$SZ+16($inp),%xmm1
2078	vmovdqu	-16*$SZ+32($inp),%xmm2
2079	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
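	# biasing $Tbl by 0x80 lets the vpaddq displacements below
	# (-0x80..0x60) fit in a single signed byte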
2080	vmovdqu	-16*$SZ+48($inp),%xmm3
2081	vmovdqu	-16*$SZ+64($inp),%xmm4
2082	vmovdqu	-16*$SZ+80($inp),%xmm5
2083	vmovdqu	-16*$SZ+96($inp),%xmm6
2084	vmovdqu	-16*$SZ+112($inp),%xmm7
2085	#mov	$inp,$_inp	# offload $inp
2086	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
2087	vinserti128	\$1,(%r12),@X[0],@X[0]
2088	vinserti128	\$1,16(%r12),@X[1],@X[1]
2089	 vpshufb	$t2,@X[0],@X[0]
2090	vinserti128	\$1,32(%r12),@X[2],@X[2]
2091	 vpshufb	$t2,@X[1],@X[1]
2092	vinserti128	\$1,48(%r12),@X[3],@X[3]
2093	 vpshufb	$t2,@X[2],@X[2]
2094	vinserti128	\$1,64(%r12),@X[4],@X[4]
2095	 vpshufb	$t2,@X[3],@X[3]
2096	vinserti128	\$1,80(%r12),@X[5],@X[5]
2097	 vpshufb	$t2,@X[4],@X[4]
2098	vinserti128	\$1,96(%r12),@X[6],@X[6]
2099	 vpshufb	$t2,@X[5],@X[5]
2100	vinserti128	\$1,112(%r12),@X[7],@X[7]
2101
2102	vpaddq	-0x80($Tbl),@X[0],$t0
2103	vpshufb	$t2,@X[6],@X[6]
2104	vpaddq	-0x60($Tbl),@X[1],$t1
2105	vpshufb	$t2,@X[7],@X[7]
2106	vpaddq	-0x40($Tbl),@X[2],$t2
2107	vpaddq	-0x20($Tbl),@X[3],$t3
2108	vmovdqa	$t0,0x00(%rsp)
2109	vpaddq	0x00($Tbl),@X[4],$t0
2110	vmovdqa	$t1,0x20(%rsp)
2111	vpaddq	0x20($Tbl),@X[5],$t1
2112	vmovdqa	$t2,0x40(%rsp)
2113	vpaddq	0x40($Tbl),@X[6],$t2
2114	vmovdqa	$t3,0x60(%rsp)
2115___
2116$code.=<<___ if (!$win64);
2117# temporarily use %rdi as frame pointer
2118	mov	$_rsp,%rdi
2119.cfi_def_cfa	%rdi,8
2120___
2121$code.=<<___;
2122	lea	-$PUSH8(%rsp),%rsp
2123___
2124$code.=<<___ if (!$win64);
2125# the frame info is at $_rsp, but the stack is moving...
2126# so a second frame pointer is saved at -8(%rsp)
2127# that is in the red zone
2128	mov	%rdi,-8(%rsp)
2129.cfi_cfa_expression	%rsp-8,deref,+8
2130___
2131$code.=<<___;
2132	vpaddq	0x60($Tbl),@X[7],$t3
2133	vmovdqa	$t0,0x00(%rsp)
2134	xor	$a1,$a1
2135	vmovdqa	$t1,0x20(%rsp)
2136	mov	$B,$a3
2137	vmovdqa	$t2,0x40(%rsp)
2138	xor	$C,$a3			# magic
2139	vmovdqa	$t3,0x60(%rsp)
2140	mov	$F,$a4
2141	add	\$16*2*$SZ,$Tbl
2142	jmp	.Lavx2_00_47
2143
2144.align	16
2145.Lavx2_00_47:
2146___
2147
2148sub AVX2_512_00_47 () {
2149my $j = shift;
2150my $body = shift;
2151my @X = @_;
2152my @insns = (&$body,&$body);			# 48 instructions
2153my $base = "+2*$PUSH8(%rsp)";
2154
2155	if (($j%4)==0) {
2156	&lea	("%rsp","-$PUSH8(%rsp)");
2157$code.=<<___ if (!$win64);
2158.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
2159# copy secondary frame pointer to new location again at -8(%rsp)
2160	pushq	$PUSH8-8(%rsp)
2161.cfi_cfa_expression	%rsp,deref,+8
2162	lea	8(%rsp),%rsp
2163.cfi_cfa_expression	%rsp-8,deref,+8
2164___
2165	}
2166
2167	foreach (Xupdate_512_AVX()) {		# 23 instructions
2168	    eval;
2169	    if ($_ !~ /\;$/) {
2170		eval(shift(@insns));
2171		eval(shift(@insns));
2172		eval(shift(@insns));
2173	    }
2174	}
2175	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2176	  foreach (@insns) { eval; }		# remaining instructions
2177	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2178}
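# Same sliding-frame idea as AVX2_256_00_47, but each call covers only two
# rounds (a 32-byte row holds two X+K qwords per block), so the frame slides
# every fourth $j rather than every other one.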
2179
2180    for ($i=0,$j=0; $j<8; $j++) {
2181	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2182	push(@X,shift(@X));			# rotate(@X)
2183    }
2184	&lea	($Tbl,16*2*$SZ."($Tbl)");
2185	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2186	&jne	(".Lavx2_00_47");
2187
2188    for ($i=0; $i<16; ) {
2189	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2190	foreach(bodyx_00_15()) { eval; }
2191    }
2192}
2193$code.=<<___;
2194	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2195	add	$a1,$A
2196	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2197	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2198
2199	add	$SZ*0($ctx),$A
2200	add	$SZ*1($ctx),$B
2201	add	$SZ*2($ctx),$C
2202	add	$SZ*3($ctx),$D
2203	add	$SZ*4($ctx),$E
2204	add	$SZ*5($ctx),$F
2205	add	$SZ*6($ctx),$G
2206	add	$SZ*7($ctx),$H
2207
2208	mov	$A,$SZ*0($ctx)
2209	mov	$B,$SZ*1($ctx)
2210	mov	$C,$SZ*2($ctx)
2211	mov	$D,$SZ*3($ctx)
2212	mov	$E,$SZ*4($ctx)
2213	mov	$F,$SZ*5($ctx)
2214	mov	$G,$SZ*6($ctx)
2215	mov	$H,$SZ*7($ctx)
2216
2217	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2218	je	.Ldone_avx2
2219
2220	xor	$a1,$a1
2221	mov	$B,$a3
2222	xor	$C,$a3			# magic
2223	mov	$F,$a4
2224	jmp	.Lower_avx2
2225.align	16
2226.Lower_avx2:
2227___
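# .Lower_avx2 runs the second block's rounds: its X[i]+K[i] words are the
# upper 128-bit lanes of the rows already on the stack, hence
# $base = "+16($Tbl)", with $Tbl serving as a cursor that steps back down by
# $PUSH8 every 8 rounds until it meets %rsp.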
2228    for ($i=0; $i<8; ) {
2229	my $base="+16($Tbl)";
2230	foreach(bodyx_00_15()) { eval; }
2231    }
2232$code.=<<___;
2233	lea	-$PUSH8($Tbl),$Tbl
2234	cmp	%rsp,$Tbl
2235	jae	.Lower_avx2
2236
2237	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2238	add	$a1,$A
2239	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2240	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2241# restore frame pointer to original location at $_rsp
2242.cfi_cfa_expression	$_rsp,deref,+8
2243
2244	add	$SZ*0($ctx),$A
2245	add	$SZ*1($ctx),$B
2246	add	$SZ*2($ctx),$C
2247	add	$SZ*3($ctx),$D
2248	add	$SZ*4($ctx),$E
2249	add	$SZ*5($ctx),$F
2250	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2251	add	$SZ*6($ctx),$G
2252	mov	$inp,%r12
2253	add	$SZ*7($ctx),$H
2254	cmp	$_end,$inp
2255
2256	mov	$A,$SZ*0($ctx)
2257	cmove	%rsp,%r12		# next block or stale data
2258	mov	$B,$SZ*1($ctx)
2259	mov	$C,$SZ*2($ctx)
2260	mov	$D,$SZ*3($ctx)
2261	mov	$E,$SZ*4($ctx)
2262	mov	$F,$SZ*5($ctx)
2263	mov	$G,$SZ*6($ctx)
2264	mov	$H,$SZ*7($ctx)
2265
2266	jbe	.Loop_avx2
2267	lea	(%rsp),$Tbl
2268# temporarily use $Tbl as index to $_rsp
2269# this avoids the need to save a secondary frame pointer at -8(%rsp)
2270.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8
2271
2272.Ldone_avx2:
2273	mov	`16*$SZ+3*8`($Tbl),%rsi
2274.cfi_def_cfa	%rsi,8
2275	vzeroupper
2276___
2277$code.=<<___ if ($win64);
2278	movaps	16*$SZ+32($Tbl),%xmm6
2279	movaps	16*$SZ+48($Tbl),%xmm7
2280	movaps	16*$SZ+64($Tbl),%xmm8
2281	movaps	16*$SZ+80($Tbl),%xmm9
2282___
2283$code.=<<___ if ($win64 && $SZ>4);
2284	movaps	16*$SZ+96($Tbl),%xmm10
2285	movaps	16*$SZ+112($Tbl),%xmm11
2286___
2287$code.=<<___;
2288	mov	-48(%rsi),%r15
2289.cfi_restore	%r15
2290	mov	-40(%rsi),%r14
2291.cfi_restore	%r14
2292	mov	-32(%rsi),%r13
2293.cfi_restore	%r13
2294	mov	-24(%rsi),%r12
2295.cfi_restore	%r12
2296	mov	-16(%rsi),%rbp
2297.cfi_restore	%rbp
2298	mov	-8(%rsi),%rbx
2299.cfi_restore	%rbx
2300	lea	(%rsi),%rsp
2301.cfi_def_cfa_register	%rsp
2302.Lepilogue_avx2:
2303	ret
2304.cfi_endproc
2305.size	${func}_avx2,.-${func}_avx2
2306___
2307}}
2308}}}}}
2309
2310# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2311#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2312if ($win64) {
2313$rec="%rcx";
2314$frame="%rdx";
2315$context="%r8";
2316$disp="%r9";
2317
2318$code.=<<___;
2319.extern	__imp_RtlVirtualUnwind
2320.type	se_handler,\@abi-omnipotent
2321.align	16
2322se_handler:
2323	push	%rsi
2324	push	%rdi
2325	push	%rbx
2326	push	%rbp
2327	push	%r12
2328	push	%r13
2329	push	%r14
2330	push	%r15
2331	pushfq
2332	sub	\$64,%rsp
2333
2334	mov	120($context),%rax	# pull context->Rax
2335	mov	248($context),%rbx	# pull context->Rip
2336
2337	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
2339
2340	mov	0(%r11),%r10d		# HandlerData[0]
2341	lea	(%rsi,%r10),%r10	# prologue label
2342	cmp	%r10,%rbx		# context->Rip<prologue label
2343	jb	.Lin_prologue
2344
2345	mov	152($context),%rax	# pull context->Rsp
2346
2347	mov	4(%r11),%r10d		# HandlerData[1]
2348	lea	(%rsi,%r10),%r10	# epilogue label
2349	cmp	%r10,%rbx		# context->Rip>=epilogue label
2350	jae	.Lin_prologue
2351___
2352$code.=<<___ if ($avx>1);
2353	lea	.Lavx2_shortcut(%rip),%r10
2354	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2355	jb	.Lnot_in_avx2
2356
2357	and	\$-256*$SZ,%rax
2358	add	\$`2*$SZ*($rounds-8)`,%rax
2359.Lnot_in_avx2:
2360___
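# The AVX2 prologue re-aligns %rsp and then adds 2*$SZ*($rounds-8), so the
# same arithmetic is replayed here to locate the frame that holds the saved
# original %rsp ($_rsp).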
2361$code.=<<___;
2362	mov	%rax,%rsi		# put aside Rsp
2363	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2364
2365	mov	-8(%rax),%rbx
2366	mov	-16(%rax),%rbp
2367	mov	-24(%rax),%r12
2368	mov	-32(%rax),%r13
2369	mov	-40(%rax),%r14
2370	mov	-48(%rax),%r15
2371	mov	%rbx,144($context)	# restore context->Rbx
2372	mov	%rbp,160($context)	# restore context->Rbp
2373	mov	%r12,216($context)	# restore context->R12
2374	mov	%r13,224($context)	# restore context->R13
2375	mov	%r14,232($context)	# restore context->R14
2376	mov	%r15,240($context)	# restore context->R15
2377
2378	lea	.Lepilogue(%rip),%r10
2379	cmp	%r10,%rbx
2380	jb	.Lin_prologue		# non-AVX code
2381
	lea	16*$SZ+4*8(%rsi),%rsi	# xmm6.. save area
2383	lea	512($context),%rdi	# &context.Xmm6
2384	mov	\$`$SZ==4?8:12`,%ecx
2385	.long	0xa548f3fc		# cld; rep movsq
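	# 8 qwords cover xmm6-xmm9 (SHA-256), 12 cover xmm6-xmm11 (SHA-512)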
2386
2387.Lin_prologue:
2388	mov	8(%rax),%rdi
2389	mov	16(%rax),%rsi
2390	mov	%rax,152($context)	# restore context->Rsp
2391	mov	%rsi,168($context)	# restore context->Rsi
2392	mov	%rdi,176($context)	# restore context->Rdi
2393
2394	mov	40($disp),%rdi		# disp->ContextRecord
2395	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2397	.long	0xa548f3fc		# cld; rep movsq
2398
2399	mov	$disp,%rsi
2400	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2401	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2402	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2403	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2404	mov	40(%rsi),%r10		# disp->ContextRecord
2405	lea	56(%rsi),%r11		# &disp->HandlerData
2406	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2407	mov	%r10,32(%rsp)		# arg5
2408	mov	%r11,40(%rsp)		# arg6
2409	mov	%r12,48(%rsp)		# arg7
2410	mov	%rcx,56(%rsp)		# arg8, (NULL)
2411	call	*__imp_RtlVirtualUnwind(%rip)
2412
2413	mov	\$1,%eax		# ExceptionContinueSearch
2414	add	\$64,%rsp
2415	popfq
2416	pop	%r15
2417	pop	%r14
2418	pop	%r13
2419	pop	%r12
2420	pop	%rbp
2421	pop	%rbx
2422	pop	%rdi
2423	pop	%rsi
2424	ret
2425.size	se_handler,.-se_handler
2426___
2427
2428$code.=<<___ if ($SZ==4 && $shaext);
2429.type	shaext_handler,\@abi-omnipotent
2430.align	16
2431shaext_handler:
2432	push	%rsi
2433	push	%rdi
2434	push	%rbx
2435	push	%rbp
2436	push	%r12
2437	push	%r13
2438	push	%r14
2439	push	%r15
2440	pushfq
2441	sub	\$64,%rsp
2442
2443	mov	120($context),%rax	# pull context->Rax
2444	mov	248($context),%rbx	# pull context->Rip
2445
2446	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2448	jb	.Lin_prologue
2449
2450	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2452	jae	.Lin_prologue
2453
2454	lea	-8-5*16(%rax),%rsi
2455	lea	512($context),%rdi	# &context.Xmm6
2456	mov	\$10,%ecx
2457	.long	0xa548f3fc		# cld; rep movsq
2458
2459	jmp	.Lin_prologue
2460.size	shaext_handler,.-shaext_handler
2461___
2462
2463$code.=<<___;
2464.section	.pdata
2465.align	4
2466	.rva	.LSEH_begin_$func
2467	.rva	.LSEH_end_$func
2468	.rva	.LSEH_info_$func
2469___
2470$code.=<<___ if ($SZ==4 && $shaext);
2471	.rva	.LSEH_begin_${func}_shaext
2472	.rva	.LSEH_end_${func}_shaext
2473	.rva	.LSEH_info_${func}_shaext
2474___
2475$code.=<<___ if ($SZ==4);
2476	.rva	.LSEH_begin_${func}_ssse3
2477	.rva	.LSEH_end_${func}_ssse3
2478	.rva	.LSEH_info_${func}_ssse3
2479___
2480$code.=<<___ if ($avx && $SZ==8);
2481	.rva	.LSEH_begin_${func}_xop
2482	.rva	.LSEH_end_${func}_xop
2483	.rva	.LSEH_info_${func}_xop
2484___
2485$code.=<<___ if ($avx);
2486	.rva	.LSEH_begin_${func}_avx
2487	.rva	.LSEH_end_${func}_avx
2488	.rva	.LSEH_info_${func}_avx
2489___
2490$code.=<<___ if ($avx>1);
2491	.rva	.LSEH_begin_${func}_avx2
2492	.rva	.LSEH_end_${func}_avx2
2493	.rva	.LSEH_info_${func}_avx2
2494___
2495$code.=<<___;
2496.section	.xdata
2497.align	8
2498.LSEH_info_$func:
2499	.byte	9,0,0,0
2500	.rva	se_handler
2501	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2502___
2503$code.=<<___ if ($SZ==4 && $shaext);
2504.LSEH_info_${func}_shaext:
2505	.byte	9,0,0,0
2506	.rva	shaext_handler
2507___
2508$code.=<<___ if ($SZ==4);
2509.LSEH_info_${func}_ssse3:
2510	.byte	9,0,0,0
2511	.rva	se_handler
2512	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2513___
2514$code.=<<___ if ($avx && $SZ==8);
2515.LSEH_info_${func}_xop:
2516	.byte	9,0,0,0
2517	.rva	se_handler
2518	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2519___
2520$code.=<<___ if ($avx);
2521.LSEH_info_${func}_avx:
2522	.byte	9,0,0,0
2523	.rva	se_handler
2524	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2525___
2526$code.=<<___ if ($avx>1);
2527.LSEH_info_${func}_avx2:
2528	.byte	9,0,0,0
2529	.rva	se_handler
2530	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2531___
2532}
2533
2534sub sha256op38 {
2535    my $instr = shift;
2536    my %opcodelet = (
2537		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
2539		"sha256msg2"  => 0xcd	);
2540
    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2542      my @opcode=(0x0f,0x38);
2543	push @opcode,$opcodelet{$instr};
2544	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2545	return ".byte\t".join(',',@opcode);
2546    } else {
	return $instr."\t".$_[0];
2548    }
2549}
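# sha256op38() emits the SHA extension instructions as raw opcode bytes so
# that the module still assembles with tools that predate them.  For the
# register-register form the ModR/M byte packs the destination register into
# bits 5:3 and the source into bits 2:0; for instance "sha256rnds2 %xmm4,%xmm1"
# (with its implicit %xmm0 operand) is emitted as ".byte 0x0f,0x38,0xcb,0xcc".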
2550
2551foreach (split("\n",$code)) {
2552	s/\`([^\`]*)\`/eval $1/geo;
2553
2554	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2555
2556	print $_,"\n";
2557}
2558close STDOUT or die "error closing STDOUT: $!";
2559