#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 code is generated for a nasm/masm/mingw64 flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain and derive $avx (0, 1 or 2) from the reported version.
# Each probe adds one point per version threshold it meets; later in this
# file a non-zero $avx enables the AVX dispatch and $avx>1 additionally
# enables the AVX2+BMI dispatch.

# GNU assembler, invoked through $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, only consulted for Win64 builds when nothing was detected above.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# ml64 (MASM), only consulted for Win64 builds when nothing was detected above.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM, as reported by "$ENV{CC} -v".
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
# With $shaext disabled, a non-zero $avx is clamped to 1.
$avx=1		if (!$shaext && $avx);

# Pipe everything written to STDOUT through the translator, which is run
# with the same perl binary and receives the flavour and output file name.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Registers in which the three arguments arrive.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers used by the BODY_* round generators.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
# Three registers for message-schedule words; every BODY_* sub rotates this
# list by one position after emitting its round.
@xi=("%edx","%ebp","%r14d");
# The five SHA-1 working variables.
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

# Working-variable list handed to the round generators; the round loops
# rotate it once per round.
@V=($A,$B,$C,$D,$E);

# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends the integer-unit code for round $i (the caller uses it for rounds
# 0..19) to $code.  $a..$e name the registers holding the five working
# variables for this round.  The round adds 0x5a827999, the current schedule
# word ($xi[0]), rol($a,5) and (($c^$d)&$b)^$d to $e, and rotates $b by 30.
#
#  - $i==0  additionally loads and byte-swaps the first input word;
#  - $i<15  loads and byte-swaps the next input word into $xi[1] and stores
#           the current word in its 4*$i stack slot;
#  - $i>=15 instead derives the next word in $xi[1] by xor-ing three words
#           from the 16-entry stack buffer and rotating left by one.
#
# The backtick expressions are left in the output as text; they are not
# evaluated here.  @xi is rotated by one position before returning.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends the integer-unit code for one "parity" round to $code; the caller
# uses it for rounds 20..39 and again for rounds 60..79.  The round constant
# is 0x6ed9eba1 for $i<40 and 0xca62c1d6 otherwise; it is interpolated into
# the lea displacement as a plain number.  The round adds the constant, the
# current schedule word ($xi[0]), rol($a,5) and $b^$d^$c to $e, and rotates
# $b by 30.
#
# For $i<79 the next schedule word is derived in $xi[1] from the stack
# buffer; the emitted text also carries a backtick expression that stores
# $xi[0] back to its stack slot only while $i<72.  Round 79 needs no further
# schedule work.  @xi is rotated by one position before returning.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends the integer-unit code for round $i (the caller uses it for rounds
# 40..59) to $code.  The round adds 0x8f1bbcdc, the current schedule word
# ($xi[0]), rol($a,5) and the two terms $d&$c (in $t0) and ($d^$c)&$b (in
# $t1) to $e, and rotates $b by 30.  The current word is stored to its stack
# slot and the next one is derived in $xi[1] from the stack buffer.
# @xi is rotated by one position before returning.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

# Public entry point.  It loads three words of OPENSSL_ia32cap_P and
# dispatches on them; when the SSSE3 bit is clear it goes straight to the
# integer-only code at .Lialu.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
# Optional shortcuts, emitted only when the corresponding code path is
# being generated ($shaext, $avx>1, $avx).
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
# Default is the SSSE3 path.  The integer-only path follows: it saves the
# callee-saved registers it uses, carves out a 64-byte aligned 16-word
# buffer on the stack, keeps the original %rsp just above that buffer and
# loads the five state words from the context.
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Unroll all 80 rounds.  @V is rotated after every round so that each
# generator call sees the working variables in their roles for that round;
# BODY_20_39 serves both rounds 20..39 and rounds 60..79.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Fold the result into the context, advance the input pointer by one
# 64-byte block and loop while the block counter is non-zero; then restore
# the saved registers and the original stack pointer.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
# This path uses the arguments in their original registers and keeps the
# whole state in %xmm registers: $ABCD/$E hold the state, $E_ is the
# alternate E accumulator, $BSWAP the shuffle mask, $ABCD_SAVE/$E_SAVE the
# state on entry to the current block and @MSG the four message vectors.
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# Win64 only: make room on the stack and save %xmm6-%xmm9.
# NOTE(review): the saves address the frame through %rax, which is not set
# in this block -- presumably the translator's Win64 function prologue
# provides it; confirm against x86_64-xlate.pl.
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the state and the first block, then open the per-block loop.  The
# loop decrements the block counter first and advances the input pointer
# only if blocks remain; the flags from that dec are consumed again by the
# jnz at the bottom of the loop.
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
# Each pass emits two sha1rnds4 instructions together with the message
# schedule updates and rotates @MSG by two; eight passes cover rounds 0..63.
# The immediate is int(index/5), so it steps up after every five sha1rnds4.
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Rounds 64..79, interleaved with loading and byte-swapping the next block,
# followed by the feed-forward of the saved state and the loop branch.
# After the last block the state is flipped back and stored to the context.
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
# Win64 only: restore %xmm6-%xmm9 and the stack pointer.
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
# Lexical state for the SIMD code paths that follow.
my $Xi=4;				# schedule step counter, starts at 4
my @X=map("%xmm$_",(4..7,0..3));	# message schedule vectors
my @Tx=map("%xmm$_",(8..10));		# SIMD temporaries
my $Kx="%xmm11";
# Note that the list assignment also overwrites the global $A..$E.
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");			# integer temporaries
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";			# register used as base for the K_XX_XX table
my $fp="%r11";				# register used as frame pointer

# Code refs wrapping the rol/ror emitters.
# NOTE(review): no rol/ror sub is visible in this part of the file, so the
# calls presumably resolve through AUTOLOAD -- confirm.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

# align32()
#
# Appends a 32-byte alignment point to $code: a jump to a fresh local
# label, the .align directive, then the label itself.  $sn is a private
# counter that makes each label unique (.Lalign32_1, .Lalign32_2, ...).
{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

# SSSE3 code path, entered through _ssse3_shortcut.  The prologue keeps the
# incoming %rsp in $fp, pushes the callee-saved registers and reserves 64
# bytes of stack, plus 6*16 bytes for %xmm6-%xmm11 on Win64.
$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
# Win64 only: save %xmm6-%xmm11 below the five pushed registers.
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
# Align the stack to 64 bytes, turn the block count into an end-of-input
# pointer, load the context, then load and byte-swap the first block.  The
# first three vectors are stored to the stack with K_00_19 already added;
# the constant is subtracted again so @X keeps the plain message words.
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# AUTOLOAD(@operands)
#
# Catch-all for calls to undefined subs: the called name (package prefix
# stripped) is used as the instruction mnemonic.  The last argument is
# taken off the list and, if it is numeric, prefixed with '$' to form an
# immediate; it is emitted first, followed by the remaining arguments in
# reverse order.  A destination-first call such as &psrld("%xmm8",31)
# therefore appends "\tpsrld\t$31,%xmm8\n" to $code.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    # numeric operand => immediate
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Xupdate_ssse3_16_31($body)
#
# Emits one SIMD message-schedule step for the 16..31 range, computing the
# next vector in @X[0], interleaved with integer round code.  $body is a
# code ref that is called four times; the strings it returns are collected
# in @insns and eval'ed one at a time between the SIMD instructions, any
# leftovers being flushed at the end.  The exact interleaving below is
# deliberate -- do not reorder.
#
# Along the way the previous X[]+K vector is stored to its stack slot and
# the round-constant vector is loaded for use by a later step.  On exit $Xi
# is incremented and both @X and @Tx are rotated by one position.
#
# NOTE(review): $a..$e are declared but never assigned here; presumably
# they exist for the eval'ed snippets -- confirm against the body subs.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Xupdate_ssse3_32_79($body)
# Emit one SSSE3 message-schedule update (one vector, four dwords) interleaved
# with four scalar rounds.
#   $body - code ref returning the list of instruction snippets (strings) for
#           one scalar round; it is invoked four times to fill @insns.
# Every snippet is string-eval-ed in this scope, so it can reach the lexicals
# $a..$e declared below.  SIMD emitters are placed between the snippets; the
# order looks hand-tuned for scheduling (assumption), so do not reorder.
# Side effects: increments $Xi and rotates both @X and @Tx by one position.
6651f13597dSJung-uk Kimsub Xupdate_ssse3_32_79()
6661f13597dSJung-uk Kim{ use integer;
6671f13597dSJung-uk Kim  my $body = shift;
6687bded2dbSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
	# $a..$e are assigned from @V by the first snippet of every round
6691f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
6701f13597dSJung-uk Kim
	# when $Xi==8 two extra snippets are consumed around the first pxor
6717bded2dbSJung-uk Kim	 eval(shift(@insns))		if ($Xi==8);
6721f13597dSJung-uk Kim	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
6737bded2dbSJung-uk Kim	 eval(shift(@insns))		if ($Xi==8);
6747bded2dbSJung-uk Kim	 eval(shift(@insns));		# body_20_39
6751f13597dSJung-uk Kim	 eval(shift(@insns));
	# consume extra snippets while an upcoming snippet text contains _ror
6767bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
6777bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
6787bded2dbSJung-uk Kim	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
6791f13597dSJung-uk Kim	 eval(shift(@insns));
6801f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
6811f13597dSJung-uk Kim
6821f13597dSJung-uk Kim	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
6831f13597dSJung-uk Kim	 eval(shift(@insns));
6847bded2dbSJung-uk Kim	 eval(shift(@insns));
	# @Tx[2] becomes @Tx[1] after the rotation at the end of this sub, so
	# it is primed here with the constant for the next call: either a copy
	# of the current one or the next table entry when $Xi is a multiple of 5
6851f13597dSJung-uk Kim	if ($Xi%5) {
6861f13597dSJung-uk Kim	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
6871f13597dSJung-uk Kim	} else {			# ... or load next one
6887bded2dbSJung-uk Kim	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
6891f13597dSJung-uk Kim	}
6901f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
	# @Tx[1] += X[-1]; the sum is stored to the stack further down
6917bded2dbSJung-uk Kim	  &paddd	(@Tx[1],@X[-1&7]);
6921f13597dSJung-uk Kim	 eval(shift(@insns));
6931f13597dSJung-uk Kim
6941f13597dSJung-uk Kim	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
6951f13597dSJung-uk Kim	 eval(shift(@insns));		# body_20_39
6961f13597dSJung-uk Kim	 eval(shift(@insns));
6971f13597dSJung-uk Kim	 eval(shift(@insns));
6981f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
6997bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
7001f13597dSJung-uk Kim
	# rotate-left-by-2 is built from a copy: (X<<2) | (X>>30)
7011f13597dSJung-uk Kim	&movdqa	(@Tx[0],@X[0]);
7027bded2dbSJung-uk Kim	 eval(shift(@insns));
7037bded2dbSJung-uk Kim	 eval(shift(@insns));
7041f13597dSJung-uk Kim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
7051f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
7061f13597dSJung-uk Kim	 eval(shift(@insns));
7077bded2dbSJung-uk Kim	 eval(shift(@insns));		# body_20_39
7081f13597dSJung-uk Kim
7091f13597dSJung-uk Kim	&pslld	(@X[0],2);
7107bded2dbSJung-uk Kim	 eval(shift(@insns));
7111f13597dSJung-uk Kim	 eval(shift(@insns));
7121f13597dSJung-uk Kim	&psrld	(@Tx[0],30);
7137bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
7141f13597dSJung-uk Kim	 eval(shift(@insns));
7151f13597dSJung-uk Kim	 eval(shift(@insns));
7161f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
7171f13597dSJung-uk Kim
7181f13597dSJung-uk Kim	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
7191f13597dSJung-uk Kim	 eval(shift(@insns));
7207bded2dbSJung-uk Kim	 eval(shift(@insns));		# body_20_39
7217bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
7227bded2dbSJung-uk Kim	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
7237bded2dbSJung-uk Kim	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
7241f13597dSJung-uk Kim	 eval(shift(@insns));
7251f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
7261f13597dSJung-uk Kim	 eval(shift(@insns));
7271f13597dSJung-uk Kim	 eval(shift(@insns));
7281f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
7291f13597dSJung-uk Kim	 eval(shift(@insns));
7301f13597dSJung-uk Kim
7311f13597dSJung-uk Kim	 foreach (@insns) { eval; }	# remaining instructions
7321f13597dSJung-uk Kim
7331f13597dSJung-uk Kim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
7341f13597dSJung-uk Kim		push(@Tx,shift(@Tx));
7351f13597dSJung-uk Kim}
7361f13597dSJung-uk Kim
# Xuplast_ssse3_80($body)
# Emit a four-round group that only has to hand the last X[]+K vector to the
# scalar rounds, followed by the end-of-input test and the reload for the
# next 64-byte block.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# Emitted after the rounds: cmp $inp,$num and je .Ldone_ssse3; the code past
# the branch reloads the byte-swap mask and K_00_19, loads 64 input bytes,
# byte-swaps the first vector and advances $inp by 64.
# Side effects: moves the last element of @Tx to the front and resets $Xi to 0.
7371f13597dSJung-uk Kimsub Xuplast_ssse3_80()
7381f13597dSJung-uk Kim{ use integer;
7391f13597dSJung-uk Kim  my $body = shift;
7401f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# $a..$e are assigned from @V by the first snippet of every round
7411f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
7421f13597dSJung-uk Kim
7431f13597dSJung-uk Kim	 eval(shift(@insns));
7447bded2dbSJung-uk Kim	 eval(shift(@insns));
7457bded2dbSJung-uk Kim	 eval(shift(@insns));
7467bded2dbSJung-uk Kim	 eval(shift(@insns));
7471f13597dSJung-uk Kim	  &paddd	(@Tx[1],@X[-1&7]);
7481f13597dSJung-uk Kim	 eval(shift(@insns));
7491f13597dSJung-uk Kim	 eval(shift(@insns));
7501f13597dSJung-uk Kim
7511f13597dSJung-uk Kim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
7521f13597dSJung-uk Kim
7531f13597dSJung-uk Kim	 foreach (@insns) { eval; }		# remaining instructions
7541f13597dSJung-uk Kim
	# leave the loop once the input pointer has reached the end pointer
7551f13597dSJung-uk Kim	&cmp	($inp,$num);
7561f13597dSJung-uk Kim	&je	(".Ldone_ssse3");
7571f13597dSJung-uk Kim
	# opposite direction to the push/shift rotation used by the update subs
7581f13597dSJung-uk Kim	unshift(@Tx,pop(@Tx));
7591f13597dSJung-uk Kim
7601f13597dSJung-uk Kim	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
7617bded2dbSJung-uk Kim	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
7621f13597dSJung-uk Kim	&movdqu	(@X[-4&7],"0($inp)");		# load input
7631f13597dSJung-uk Kim	&movdqu	(@X[-3&7],"16($inp)");
7641f13597dSJung-uk Kim	&movdqu	(@X[-2&7],"32($inp)");
7651f13597dSJung-uk Kim	&movdqu	(@X[-1&7],"48($inp)");
7661f13597dSJung-uk Kim	&pshufb	(@X[-4&7],@X[2]);		# byte swap
7671f13597dSJung-uk Kim	&add	($inp,64);
7681f13597dSJung-uk Kim
7691f13597dSJung-uk Kim  $Xi=0;
7701f13597dSJung-uk Kim}
7711f13597dSJung-uk Kim
# Xloop_ssse3($body)
# Emit four scalar rounds interleaved with the preparation of one input
# vector for the next block.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# SIMD work per call: byte-swap X[($Xi-3)&7] with the mask held in @X[2],
# add the constant held in @Tx[1] to X[($Xi-4)&7], store that sum to
# 16*$Xi(%rsp), then subtract the constant again so the register holds the
# plain message words afterwards.
# Side effects: increments $Xi; @X and @Tx are not rotated here.
7721f13597dSJung-uk Kimsub Xloop_ssse3()
7731f13597dSJung-uk Kim{ use integer;
7741f13597dSJung-uk Kim  my $body = shift;
7751f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# $a..$e are assigned from @V by the first snippet of every round
7761f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
7771f13597dSJung-uk Kim
7781f13597dSJung-uk Kim	 eval(shift(@insns));
7791f13597dSJung-uk Kim	 eval(shift(@insns));
7807bded2dbSJung-uk Kim	 eval(shift(@insns));
7811f13597dSJung-uk Kim	&pshufb	(@X[($Xi-3)&7],@X[2]);
7821f13597dSJung-uk Kim	 eval(shift(@insns));
7831f13597dSJung-uk Kim	 eval(shift(@insns));
7847bded2dbSJung-uk Kim	 eval(shift(@insns));
7857bded2dbSJung-uk Kim	 eval(shift(@insns));
7861f13597dSJung-uk Kim	&paddd	(@X[($Xi-4)&7],@Tx[1]);
7871f13597dSJung-uk Kim	 eval(shift(@insns));
7881f13597dSJung-uk Kim	 eval(shift(@insns));
7891f13597dSJung-uk Kim	 eval(shift(@insns));
7901f13597dSJung-uk Kim	 eval(shift(@insns));
7911f13597dSJung-uk Kim	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
7921f13597dSJung-uk Kim	 eval(shift(@insns));
7931f13597dSJung-uk Kim	 eval(shift(@insns));
7947bded2dbSJung-uk Kim	 eval(shift(@insns));
7957bded2dbSJung-uk Kim	 eval(shift(@insns));
	# undo the paddd above now that the sum has been stored
7961f13597dSJung-uk Kim	&psubd	(@X[($Xi-4)&7],@Tx[1]);
7971f13597dSJung-uk Kim
7981f13597dSJung-uk Kim	foreach (@insns) { eval; }
7991f13597dSJung-uk Kim  $Xi++;
8001f13597dSJung-uk Kim}
8011f13597dSJung-uk Kim
# Xtail_ssse3($body)
# Emit four scalar rounds with no SIMD instructions interleaved.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# The snippets are string-eval-ed here so they can reach $a..$e below.
8021f13597dSJung-uk Kimsub Xtail_ssse3()
8031f13597dSJung-uk Kim{ use integer;
8041f13597dSJung-uk Kim  my $body = shift;
8051f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
8061f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
8071f13597dSJung-uk Kim
8081f13597dSJung-uk Kim	foreach (@insns) { eval; }
8091f13597dSJung-uk Kim}
8101f13597dSJung-uk Kim
# body_00_19()
# Return the instruction snippets for one scalar round of the first round
# group.  The result is a list of ten strings; the caller string-evals them
# one at a time, in a scope that provides $a..$e, @V, @T, $j, $_rol and $_ror.
# $rx counts how many round templates have been handed out: once it has
# reached 19 this sub delegates to body_20_39 instead (and leaves the
# increment to that sub).
# The last snippet also advances $j and rotates @V and @T for the next round.
8117bded2dbSJung-uk Kimsub body_00_19 () {	# ((c^d)&b)^d
8127bded2dbSJung-uk Kim	# on start @T[0]=(c^d)&b
8137bded2dbSJung-uk Kim	return &body_20_39() if ($rx==19); $rx++;
8141f13597dSJung-uk Kim	(
8151f13597dSJung-uk Kim	'($a,$b,$c,$d,$e)=@V;'.
8167bded2dbSJung-uk Kim	'&$_ror	($b,$j?7:2)',	# $b>>>2
8177bded2dbSJung-uk Kim	'&xor	(@T[0],$d)',
8187bded2dbSJung-uk Kim	'&mov	(@T[1],$a)',	# $b for next round
8197bded2dbSJung-uk Kim
8207bded2dbSJung-uk Kim	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
8217bded2dbSJung-uk Kim	'&xor	($b,$c)',	# $c^$d for next round
8227bded2dbSJung-uk Kim
8237bded2dbSJung-uk Kim	'&$_rol	($a,5)',
8247bded2dbSJung-uk Kim	'&add	($e,@T[0])',
8257bded2dbSJung-uk Kim	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
8267bded2dbSJung-uk Kim
8277bded2dbSJung-uk Kim	'&xor	($b,$c)',	# restore $b
8287bded2dbSJung-uk Kim	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
8291f13597dSJung-uk Kim	);
8301f13597dSJung-uk Kim}
8311f13597dSJung-uk Kim
# body_20_39()
# Return the instruction snippets for one scalar round of the parity
# (b^c^d) round group.  The result is a list of eight strings; the caller
# string-evals them one at a time, in a scope that provides $a..$e, @V, @T,
# $j, $_rol and $_ror.
# $rx counts how many round templates have been handed out: once it has
# reached 39 this sub delegates to body_40_59 instead.
# Several snippets carry their own condition on $j, which is tested when
# the snippet is eval-ed, not when this list is built.
# The last snippet also advances $j and rotates @V and @T for the next round.
8327bded2dbSJung-uk Kimsub body_20_39 () {	# b^d^c
8337bded2dbSJung-uk Kim	# on entry @T[0]=b^d
8347bded2dbSJung-uk Kim	return &body_40_59() if ($rx==39); $rx++;
8351f13597dSJung-uk Kim	(
8361f13597dSJung-uk Kim	'($a,$b,$c,$d,$e)=@V;'.
8377bded2dbSJung-uk Kim	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
8387bded2dbSJung-uk Kim	'&xor	(@T[0],$d)	if($j==19);'.
8397bded2dbSJung-uk Kim	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
8407bded2dbSJung-uk Kim	'&mov	(@T[1],$a)',	# $b for next round
8417bded2dbSJung-uk Kim
8427bded2dbSJung-uk Kim	'&$_rol	($a,5)',
8437bded2dbSJung-uk Kim	'&add	($e,@T[0])',
8447bded2dbSJung-uk Kim	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
8457bded2dbSJung-uk Kim
8467bded2dbSJung-uk Kim	'&$_ror	($b,7)',	# $b>>>2
8477bded2dbSJung-uk Kim	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
8481f13597dSJung-uk Kim	);
8491f13597dSJung-uk Kim}
8501f13597dSJung-uk Kim
# body_40_59()
# Return the instruction snippets for one scalar round of the third round
# group.  The result is a list of eleven strings; the caller string-evals
# them one at a time, in a scope that provides $a..$e, @V, @T, $j, $_rol and
# $_ror.
# $rx is incremented unconditionally; this sub never delegates.
# Several snippets carry their own condition on $j, which is tested when
# the snippet is eval-ed, not when this list is built.
# The last snippet also advances $j and rotates @V and @T for the next round.
8517bded2dbSJung-uk Kimsub body_40_59 () {	# ((b^c)&(c^d))^c
8527bded2dbSJung-uk Kim	# on entry @T[0]=(b^c), (c^=d)
8537bded2dbSJung-uk Kim	$rx++;
8541f13597dSJung-uk Kim	(
8551f13597dSJung-uk Kim	'($a,$b,$c,$d,$e)=@V;'.
8567bded2dbSJung-uk Kim	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
8577bded2dbSJung-uk Kim	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
8587bded2dbSJung-uk Kim	'&xor	($c,$d)		if ($j>=40)',	# restore $c
8597bded2dbSJung-uk Kim
8607bded2dbSJung-uk Kim	'&$_ror	($b,7)',	# $b>>>2
8617bded2dbSJung-uk Kim	'&mov	(@T[1],$a)',	# $b for next round
8627bded2dbSJung-uk Kim	'&xor	(@T[0],$c)',
8637bded2dbSJung-uk Kim
8647bded2dbSJung-uk Kim	'&$_rol	($a,5)',
8657bded2dbSJung-uk Kim	'&add	($e,@T[0])',
8667bded2dbSJung-uk Kim	'&xor	(@T[1],$c)	if ($j==59);'.
8677bded2dbSJung-uk Kim	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
8687bded2dbSJung-uk Kim
8697bded2dbSJung-uk Kim	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
8707bded2dbSJung-uk Kim	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
8711f13597dSJung-uk Kim	);
8721f13597dSJung-uk Kim}
# SSSE3 main loop: emit the loop label, then generate the rounds of one block.
# Each Xupdate/Xuplast/Xloop call below emits four scalar rounds, so the
# 17 + 3 calls in this run cover all 80 rounds.
8731f13597dSJung-uk Kim$code.=<<___;
8741f13597dSJung-uk Kim.align	16
8751f13597dSJung-uk Kim.Loop_ssse3:
8761f13597dSJung-uk Kim___
# The code ref selects the round template that is asked for; the templates
# themselves switch over to the next group based on their own call counter.
8771f13597dSJung-uk Kim	&Xupdate_ssse3_16_31(\&body_00_19);
8781f13597dSJung-uk Kim	&Xupdate_ssse3_16_31(\&body_00_19);
8791f13597dSJung-uk Kim	&Xupdate_ssse3_16_31(\&body_00_19);
8801f13597dSJung-uk Kim	&Xupdate_ssse3_16_31(\&body_00_19);
8811f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_00_19);
8821f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8831f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8841f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8851f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8861f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8871f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_40_59);
8881f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_40_59);
8891f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_40_59);
8901f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_40_59);
8911f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_40_59);
8921f13597dSJung-uk Kim	&Xupdate_ssse3_32_79(\&body_20_39);
8931f13597dSJung-uk Kim	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
8941f13597dSJung-uk Kim
# Snapshot the generator state here: the last twelve rounds are emitted
# twice (loop path now, done path later) and both must start from this state.
8951f13597dSJung-uk Kim				$saved_j=$j; @saved_V=@V;
8961f13597dSJung-uk Kim
# Loop path: last twelve rounds interleaved with next-block preparation.
8971f13597dSJung-uk Kim	&Xloop_ssse3(\&body_20_39);
8981f13597dSJung-uk Kim	&Xloop_ssse3(\&body_20_39);
8991f13597dSJung-uk Kim	&Xloop_ssse3(\&body_20_39);
9001f13597dSJung-uk Kim
# Loop path tail: add the working variables into the context at $ctx, store
# them back, recompute the values the first round expects in $B, @T[0] and
# @T[1], and branch back to .Loop_ssse3.  The same heredoc then opens the
# .Ldone_ssse3 label that is the target of the end-of-input branch.
9011f13597dSJung-uk Kim$code.=<<___;
9021f13597dSJung-uk Kim	add	0($ctx),$A			# update context
9031f13597dSJung-uk Kim	add	4($ctx),@T[0]
9041f13597dSJung-uk Kim	add	8($ctx),$C
9051f13597dSJung-uk Kim	add	12($ctx),$D
9061f13597dSJung-uk Kim	mov	$A,0($ctx)
9071f13597dSJung-uk Kim	add	16($ctx),$E
9081f13597dSJung-uk Kim	mov	@T[0],4($ctx)
9091f13597dSJung-uk Kim	mov	@T[0],$B			# magic seed
9101f13597dSJung-uk Kim	mov	$C,8($ctx)
9117bded2dbSJung-uk Kim	mov	$C,@T[1]
9121f13597dSJung-uk Kim	mov	$D,12($ctx)
9137bded2dbSJung-uk Kim	xor	$D,@T[1]
9141f13597dSJung-uk Kim	mov	$E,16($ctx)
9157bded2dbSJung-uk Kim	and	@T[1],@T[0]
9161f13597dSJung-uk Kim	jmp	.Loop_ssse3
9171f13597dSJung-uk Kim
9181f13597dSJung-uk Kim.align	16
9191f13597dSJung-uk Kim.Ldone_ssse3:
9201f13597dSJung-uk Kim___
# Done path: rewind the generator state to the snapshot taken before the
# loop-path copy of the last twelve rounds, then emit those rounds again
# without any SIMD work.
9211f13597dSJung-uk Kim				$j=$saved_j; @V=@saved_V;
9221f13597dSJung-uk Kim
9231f13597dSJung-uk Kim	&Xtail_ssse3(\&body_20_39);
9241f13597dSJung-uk Kim	&Xtail_ssse3(\&body_20_39);
9251f13597dSJung-uk Kim	&Xtail_ssse3(\&body_20_39);
9261f13597dSJung-uk Kim
# Done path tail: final context update for the SSSE3 routine.
9271f13597dSJung-uk Kim$code.=<<___;
9281f13597dSJung-uk Kim	add	0($ctx),$A			# update context
9291f13597dSJung-uk Kim	add	4($ctx),@T[0]
9301f13597dSJung-uk Kim	add	8($ctx),$C
9311f13597dSJung-uk Kim	mov	$A,0($ctx)
9321f13597dSJung-uk Kim	add	12($ctx),$D
9331f13597dSJung-uk Kim	mov	@T[0],4($ctx)
9341f13597dSJung-uk Kim	add	16($ctx),$E
9351f13597dSJung-uk Kim	mov	$C,8($ctx)
9361f13597dSJung-uk Kim	mov	$D,12($ctx)
9371f13597dSJung-uk Kim	mov	$E,16($ctx)
9381f13597dSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm11 from their save slots below $fp.
9391f13597dSJung-uk Kim$code.=<<___ if ($win64);
940e71b7053SJung-uk Kim	movaps	-40-6*16($fp),%xmm6
941e71b7053SJung-uk Kim	movaps	-40-5*16($fp),%xmm7
942e71b7053SJung-uk Kim	movaps	-40-4*16($fp),%xmm8
943e71b7053SJung-uk Kim	movaps	-40-3*16($fp),%xmm9
944e71b7053SJung-uk Kim	movaps	-40-2*16($fp),%xmm10
945e71b7053SJung-uk Kim	movaps	-40-1*16($fp),%xmm11
9461f13597dSJung-uk Kim___
# Common epilogue: reload the saved general-purpose registers relative to
# $fp, restore %rsp from $fp and return; CFI directives track each step.
9471f13597dSJung-uk Kim$code.=<<___;
948e71b7053SJung-uk Kim	mov	-40($fp),%r14
949e71b7053SJung-uk Kim.cfi_restore	%r14
950e71b7053SJung-uk Kim	mov	-32($fp),%r13
951e71b7053SJung-uk Kim.cfi_restore	%r13
952e71b7053SJung-uk Kim	mov	-24($fp),%r12
953e71b7053SJung-uk Kim.cfi_restore	%r12
954e71b7053SJung-uk Kim	mov	-16($fp),%rbp
955e71b7053SJung-uk Kim.cfi_restore	%rbp
956e71b7053SJung-uk Kim	mov	-8($fp),%rbx
957e71b7053SJung-uk Kim.cfi_restore	%rbx
958e71b7053SJung-uk Kim	lea	($fp),%rsp
959e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
9601f13597dSJung-uk Kim.Lepilogue_ssse3:
9611f13597dSJung-uk Kim	ret
962e71b7053SJung-uk Kim.cfi_endproc
9631f13597dSJung-uk Kim.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
9641f13597dSJung-uk Kim___
9651f13597dSJung-uk Kim
# AVX code path, generated only when $avx is true.  The closing brace of this
# if-block is not part of this section of the file.
# The generator state shared with the round templates ($Xi, @X, @Tx, $j, $rx)
# is reset so that the AVX routine is generated from a clean start.
9661f13597dSJung-uk Kimif ($avx) {
9677bded2dbSJung-uk Kim$Xi=4;				# reset variables
9687bded2dbSJung-uk Kim@X=map("%xmm$_",(4..7,0..3));
9697bded2dbSJung-uk Kim@Tx=map("%xmm$_",(8..10));
9707bded2dbSJung-uk Kim$j=0;
9717bded2dbSJung-uk Kim$rx=0;
9727bded2dbSJung-uk Kim
# Label used for the end-of-input branch emitted by Xuplast_avx_80.
9737bded2dbSJung-uk Kimmy $done_avx_label=".Ldone_avx";
9741f13597dSJung-uk Kim
# Rotate helpers for this path: shld/shrd invoked with the first argument
# repeated, followed by the remaining arguments.
# NOTE(review): presumably these are what the string-eval-ed round snippets
# resolve as $_rol / $_ror inside the AVX emitter subs - confirm.
9751f13597dSJung-uk Kimmy $_rol=sub { &shld(@_[0],@_) };
9761f13597dSJung-uk Kimmy $_ror=sub { &shrd(@_[0],@_) };
9771f13597dSJung-uk Kim
# Function entry: keep the incoming %rsp in $fp, push the callee-saved
# registers and reserve stack space (64 bytes plus 6*16 on Win64).
9781f13597dSJung-uk Kim$code.=<<___;
9791f13597dSJung-uk Kim.type	sha1_block_data_order_avx,\@function,3
9801f13597dSJung-uk Kim.align	16
9811f13597dSJung-uk Kimsha1_block_data_order_avx:
9821f13597dSJung-uk Kim_avx_shortcut:
983e71b7053SJung-uk Kim.cfi_startproc
984e71b7053SJung-uk Kim	mov	%rsp,$fp
985e71b7053SJung-uk Kim.cfi_def_cfa_register	$fp
9861f13597dSJung-uk Kim	push	%rbx
987e71b7053SJung-uk Kim.cfi_push	%rbx
9881f13597dSJung-uk Kim	push	%rbp
989e71b7053SJung-uk Kim.cfi_push	%rbp
9901f13597dSJung-uk Kim	push	%r12
991e71b7053SJung-uk Kim.cfi_push	%r12
9927bded2dbSJung-uk Kim	push	%r13		# redundant, done to share Win64 SE handler
993e71b7053SJung-uk Kim.cfi_push	%r13
9947bded2dbSJung-uk Kim	push	%r14
995e71b7053SJung-uk Kim.cfi_push	%r14
9967bded2dbSJung-uk Kim	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
9977bded2dbSJung-uk Kim	vzeroupper
9981f13597dSJung-uk Kim___
# Win64 only: save %xmm6-%xmm11 into slots below $fp.
9991f13597dSJung-uk Kim$code.=<<___ if ($win64);
1000e71b7053SJung-uk Kim	vmovaps	%xmm6,-40-6*16($fp)
1001e71b7053SJung-uk Kim	vmovaps	%xmm7,-40-5*16($fp)
1002e71b7053SJung-uk Kim	vmovaps	%xmm8,-40-4*16($fp)
1003e71b7053SJung-uk Kim	vmovaps	%xmm9,-40-3*16($fp)
1004e71b7053SJung-uk Kim	vmovaps	%xmm10,-40-2*16($fp)
1005e71b7053SJung-uk Kim	vmovaps	%xmm11,-40-1*16($fp)
10061f13597dSJung-uk Kim.Lprologue_avx:
10071f13597dSJung-uk Kim___
# Align the stack to 64 bytes, move the arguments into their working
# registers, turn the block count into an end pointer, load the context,
# then load and byte-swap the first 64 input bytes and store the first
# three X[]+K vectors to the stack before entering the loop.
10081f13597dSJung-uk Kim$code.=<<___;
10097bded2dbSJung-uk Kim	and	\$-64,%rsp
10101f13597dSJung-uk Kim	mov	%rdi,$ctx	# reassigned argument
10111f13597dSJung-uk Kim	mov	%rsi,$inp	# reassigned argument
10121f13597dSJung-uk Kim	mov	%rdx,$num	# reassigned argument
10131f13597dSJung-uk Kim
10141f13597dSJung-uk Kim	shl	\$6,$num
10151f13597dSJung-uk Kim	add	$inp,$num
10167bded2dbSJung-uk Kim	lea	K_XX_XX+64(%rip),$K_XX_XX
10171f13597dSJung-uk Kim
10181f13597dSJung-uk Kim	mov	0($ctx),$A		# load context
10191f13597dSJung-uk Kim	mov	4($ctx),$B
10201f13597dSJung-uk Kim	mov	8($ctx),$C
10211f13597dSJung-uk Kim	mov	12($ctx),$D
10221f13597dSJung-uk Kim	mov	$B,@T[0]		# magic seed
10231f13597dSJung-uk Kim	mov	16($ctx),$E
10247bded2dbSJung-uk Kim	mov	$C,@T[1]
10257bded2dbSJung-uk Kim	xor	$D,@T[1]
10267bded2dbSJung-uk Kim	and	@T[1],@T[0]
10271f13597dSJung-uk Kim
10281f13597dSJung-uk Kim	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
10297bded2dbSJung-uk Kim	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
10301f13597dSJung-uk Kim	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
10311f13597dSJung-uk Kim	vmovdqu	16($inp),@X[-3&7]
10321f13597dSJung-uk Kim	vmovdqu	32($inp),@X[-2&7]
10331f13597dSJung-uk Kim	vmovdqu	48($inp),@X[-1&7]
10341f13597dSJung-uk Kim	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
10351f13597dSJung-uk Kim	add	\$64,$inp
10361f13597dSJung-uk Kim	vpshufb	@X[2],@X[-3&7],@X[-3&7]
10371f13597dSJung-uk Kim	vpshufb	@X[2],@X[-2&7],@X[-2&7]
10381f13597dSJung-uk Kim	vpshufb	@X[2],@X[-1&7],@X[-1&7]
10397bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
10407bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-3&7],@X[1]
10417bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-2&7],@X[2]
10421f13597dSJung-uk Kim	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
10431f13597dSJung-uk Kim	vmovdqa	@X[1],16(%rsp)
10441f13597dSJung-uk Kim	vmovdqa	@X[2],32(%rsp)
10451f13597dSJung-uk Kim	jmp	.Loop_avx
10461f13597dSJung-uk Kim___
10471f13597dSJung-uk Kim
# Xupdate_avx_16_31($body)
# Emit one AVX message-schedule update (one vector, four dwords) for the
# early part of the schedule, interleaved with four scalar rounds.
#   $body - code ref returning the list of instruction snippets (strings) for
#           one scalar round; it is invoked four times to fill @insns.
# Every snippet is string-eval-ed in this scope, so it can reach the lexicals
# $a..$e declared below.  The SIMD emitters are spread between the snippets;
# the order looks hand-tuned for scheduling (assumption), so do not reorder.
# Side effects: increments $Xi and rotates @X by one position (@Tx is not
# rotated on the AVX path; the round constant lives in $Kx).
1048e71b7053SJung-uk Kimsub Xupdate_avx_16_31()		# recall that $Xi starts with 4
10491f13597dSJung-uk Kim{ use integer;
10501f13597dSJung-uk Kim  my $body = shift;
10511f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
	# $a..$e are assigned from @V by the first snippet of every round
10521f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
10531f13597dSJung-uk Kim
10541f13597dSJung-uk Kim	 eval(shift(@insns));
10551f13597dSJung-uk Kim	 eval(shift(@insns));
10561f13597dSJung-uk Kim	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
10571f13597dSJung-uk Kim	 eval(shift(@insns));
10581f13597dSJung-uk Kim	 eval(shift(@insns));
10591f13597dSJung-uk Kim
	# @Tx[1] = K + X[-1]; stored to the stack further down
10607bded2dbSJung-uk Kim	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
10611f13597dSJung-uk Kim	 eval(shift(@insns));
10621f13597dSJung-uk Kim	 eval(shift(@insns));
10631f13597dSJung-uk Kim	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
10641f13597dSJung-uk Kim	 eval(shift(@insns));
10651f13597dSJung-uk Kim	 eval(shift(@insns));
10661f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
10671f13597dSJung-uk Kim	 eval(shift(@insns));
10681f13597dSJung-uk Kim	 eval(shift(@insns));
10691f13597dSJung-uk Kim
10701f13597dSJung-uk Kim	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
10711f13597dSJung-uk Kim	 eval(shift(@insns));
10721f13597dSJung-uk Kim	 eval(shift(@insns));
10731f13597dSJung-uk Kim	 eval(shift(@insns));
10741f13597dSJung-uk Kim	 eval(shift(@insns));
10751f13597dSJung-uk Kim
10761f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
10771f13597dSJung-uk Kim	 eval(shift(@insns));
10781f13597dSJung-uk Kim	 eval(shift(@insns));
10791f13597dSJung-uk Kim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
10801f13597dSJung-uk Kim	 eval(shift(@insns));
10811f13597dSJung-uk Kim	 eval(shift(@insns));
10821f13597dSJung-uk Kim
	# rotate-left-by-1 is built as (X+X) | (X>>31)
10831f13597dSJung-uk Kim	&vpsrld	(@Tx[0],@X[0],31);
10841f13597dSJung-uk Kim	 eval(shift(@insns));
10851f13597dSJung-uk Kim	 eval(shift(@insns));
10861f13597dSJung-uk Kim	 eval(shift(@insns));
10871f13597dSJung-uk Kim	 eval(shift(@insns));
10881f13597dSJung-uk Kim
10891f13597dSJung-uk Kim	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
10901f13597dSJung-uk Kim	&vpaddd	(@X[0],@X[0],@X[0]);
10911f13597dSJung-uk Kim	 eval(shift(@insns));
10921f13597dSJung-uk Kim	 eval(shift(@insns));
10931f13597dSJung-uk Kim	 eval(shift(@insns));
10941f13597dSJung-uk Kim	 eval(shift(@insns));
10951f13597dSJung-uk Kim
10961f13597dSJung-uk Kim	&vpsrld	(@Tx[1],@Tx[2],30);
10971f13597dSJung-uk Kim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
10981f13597dSJung-uk Kim	 eval(shift(@insns));
10991f13597dSJung-uk Kim	 eval(shift(@insns));
11001f13597dSJung-uk Kim	 eval(shift(@insns));
11011f13597dSJung-uk Kim	 eval(shift(@insns));
11021f13597dSJung-uk Kim
11031f13597dSJung-uk Kim	&vpslld	(@Tx[2],@Tx[2],2);
11041f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[1]);
11051f13597dSJung-uk Kim	 eval(shift(@insns));
11061f13597dSJung-uk Kim	 eval(shift(@insns));
11071f13597dSJung-uk Kim	 eval(shift(@insns));
11081f13597dSJung-uk Kim	 eval(shift(@insns));
11091f13597dSJung-uk Kim
11101f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
11111f13597dSJung-uk Kim	 eval(shift(@insns));
11121f13597dSJung-uk Kim	 eval(shift(@insns));
	# load the next round constant only when $Xi is a multiple of 5
11137bded2dbSJung-uk Kim	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
11141f13597dSJung-uk Kim	 eval(shift(@insns));
11151f13597dSJung-uk Kim	 eval(shift(@insns));
11161f13597dSJung-uk Kim
11171f13597dSJung-uk Kim
11181f13597dSJung-uk Kim	 foreach (@insns) { eval; }	# remaining instructions [if any]
11191f13597dSJung-uk Kim
11201f13597dSJung-uk Kim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
11211f13597dSJung-uk Kim}
11221f13597dSJung-uk Kim
# Xupdate_avx_32_79($body)
# Emit one AVX message-schedule update (one vector, four dwords) for the
# later part of the schedule, interleaved with four scalar rounds.
#   $body - code ref returning the list of instruction snippets (strings) for
#           one scalar round; it is invoked four times to fill @insns.
# Every snippet is string-eval-ed in this scope, so it can reach the lexicals
# $a..$e declared below.  The SIMD emitters are spread between the snippets;
# the order looks hand-tuned for scheduling (assumption), so do not reorder.
# Side effects: increments $Xi and rotates @X by one position.
11231f13597dSJung-uk Kimsub Xupdate_avx_32_79()
11241f13597dSJung-uk Kim{ use integer;
11251f13597dSJung-uk Kim  my $body = shift;
11267bded2dbSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
	# $a..$e are assigned from @V by the first snippet of every round
11271f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
11281f13597dSJung-uk Kim
11291f13597dSJung-uk Kim	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
11301f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
11311f13597dSJung-uk Kim	 eval(shift(@insns));		# body_20_39
11321f13597dSJung-uk Kim	 eval(shift(@insns));
11331f13597dSJung-uk Kim	 eval(shift(@insns));
11341f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
11351f13597dSJung-uk Kim
11361f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
11371f13597dSJung-uk Kim	 eval(shift(@insns));
	# NOTE(review): the round snippets visible in this file spell rotates
	# as &$_rol / &$_ror, which this pattern does not match, so the guard
	# below appears to always pass - confirm before relying on it.
11381f13597dSJung-uk Kim	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	# @Tx[1] = K + X[-1] is formed before $Kx may be replaced by the next
	# constant (only when $Xi is a multiple of 5)
11397bded2dbSJung-uk Kim	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
11407bded2dbSJung-uk Kim	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
11411f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
11421f13597dSJung-uk Kim	 eval(shift(@insns));
11431f13597dSJung-uk Kim
11441f13597dSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
11451f13597dSJung-uk Kim	 eval(shift(@insns));		# body_20_39
11461f13597dSJung-uk Kim	 eval(shift(@insns));
11471f13597dSJung-uk Kim	 eval(shift(@insns));
11481f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
11491f13597dSJung-uk Kim
	# rotate-left-by-2 is built as (X<<2) | (X>>30)
11501f13597dSJung-uk Kim	&vpsrld	(@Tx[0],@X[0],30);
11511f13597dSJung-uk Kim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
11521f13597dSJung-uk Kim	 eval(shift(@insns));
11531f13597dSJung-uk Kim	 eval(shift(@insns));
11541f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
11551f13597dSJung-uk Kim	 eval(shift(@insns));
11561f13597dSJung-uk Kim
11571f13597dSJung-uk Kim	&vpslld	(@X[0],@X[0],2);
11581f13597dSJung-uk Kim	 eval(shift(@insns));		# body_20_39
11591f13597dSJung-uk Kim	 eval(shift(@insns));
11601f13597dSJung-uk Kim	 eval(shift(@insns));
11611f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
11621f13597dSJung-uk Kim	 eval(shift(@insns));
11631f13597dSJung-uk Kim	 eval(shift(@insns));
11641f13597dSJung-uk Kim	 eval(shift(@insns));		# ror
11651f13597dSJung-uk Kim	 eval(shift(@insns));
11661f13597dSJung-uk Kim
11671f13597dSJung-uk Kim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
11681f13597dSJung-uk Kim	 eval(shift(@insns));		# body_20_39
11691f13597dSJung-uk Kim	 eval(shift(@insns));
11701f13597dSJung-uk Kim	 eval(shift(@insns));
11711f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
11721f13597dSJung-uk Kim	 eval(shift(@insns));
11731f13597dSJung-uk Kim	 eval(shift(@insns));
11741f13597dSJung-uk Kim	 eval(shift(@insns));		# rol
11751f13597dSJung-uk Kim	 eval(shift(@insns));
11761f13597dSJung-uk Kim
11771f13597dSJung-uk Kim	 foreach (@insns) { eval; }	# remaining instructions
11781f13597dSJung-uk Kim
11791f13597dSJung-uk Kim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
11801f13597dSJung-uk Kim}
11811f13597dSJung-uk Kim
# Xuplast_avx_80($body)
# Emit a four-round group that only has to hand the last X[]+K vector to the
# scalar rounds, followed by the end-of-input test and the reload for the
# next 64-byte block.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# Emitted after the rounds: cmp $inp,$num and a je to $done_avx_label; the
# code past the branch reloads the byte-swap mask and K_00_19 into $Kx,
# loads 64 input bytes, byte-swaps the first vector and advances $inp by 64.
# Side effects: resets $Xi to 0.
11821f13597dSJung-uk Kimsub Xuplast_avx_80()
11831f13597dSJung-uk Kim{ use integer;
11841f13597dSJung-uk Kim  my $body = shift;
11851f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# $a..$e are assigned from @V by the first snippet of every round
11861f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
11871f13597dSJung-uk Kim
11881f13597dSJung-uk Kim	 eval(shift(@insns));
11897bded2dbSJung-uk Kim	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
11901f13597dSJung-uk Kim	 eval(shift(@insns));
11911f13597dSJung-uk Kim	 eval(shift(@insns));
11921f13597dSJung-uk Kim	 eval(shift(@insns));
11931f13597dSJung-uk Kim	 eval(shift(@insns));
11941f13597dSJung-uk Kim
11957bded2dbSJung-uk Kim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
11961f13597dSJung-uk Kim
11971f13597dSJung-uk Kim	 foreach (@insns) { eval; }		# remaining instructions
11981f13597dSJung-uk Kim
	# leave the loop once the input pointer has reached the end pointer
11991f13597dSJung-uk Kim	&cmp	($inp,$num);
12007bded2dbSJung-uk Kim	&je	($done_avx_label);
12011f13597dSJung-uk Kim
12021f13597dSJung-uk Kim	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
12037bded2dbSJung-uk Kim	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
12041f13597dSJung-uk Kim	&vmovdqu(@X[-4&7],"0($inp)");		# load input
12051f13597dSJung-uk Kim	&vmovdqu(@X[-3&7],"16($inp)");
12061f13597dSJung-uk Kim	&vmovdqu(@X[-2&7],"32($inp)");
12071f13597dSJung-uk Kim	&vmovdqu(@X[-1&7],"48($inp)");
12081f13597dSJung-uk Kim	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
12091f13597dSJung-uk Kim	&add	($inp,64);
12101f13597dSJung-uk Kim
12111f13597dSJung-uk Kim  $Xi=0;
12121f13597dSJung-uk Kim}
12131f13597dSJung-uk Kim
# Xloop_avx($body)
# Emit four scalar rounds interleaved with the preparation of one input
# vector for the next block.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# SIMD work per call: byte-swap X[($Xi-3)&7] with the mask held in @X[2],
# form X[($Xi-4)&7] + $Kx in the separate register X[$Xi&7] and store that
# sum to 16*$Xi(%rsp).  The source vector is left unmodified because the
# three-operand form writes to a different destination.
# Side effects: increments $Xi; @X is not rotated here.
12141f13597dSJung-uk Kimsub Xloop_avx()
12151f13597dSJung-uk Kim{ use integer;
12161f13597dSJung-uk Kim  my $body = shift;
12171f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# $a..$e are assigned from @V by the first snippet of every round
12181f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
12191f13597dSJung-uk Kim
12201f13597dSJung-uk Kim	 eval(shift(@insns));
12211f13597dSJung-uk Kim	 eval(shift(@insns));
12221f13597dSJung-uk Kim	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
12231f13597dSJung-uk Kim	 eval(shift(@insns));
12241f13597dSJung-uk Kim	 eval(shift(@insns));
12257bded2dbSJung-uk Kim	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
12261f13597dSJung-uk Kim	 eval(shift(@insns));
12271f13597dSJung-uk Kim	 eval(shift(@insns));
12281f13597dSJung-uk Kim	 eval(shift(@insns));
12291f13597dSJung-uk Kim	 eval(shift(@insns));
12301f13597dSJung-uk Kim	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
12311f13597dSJung-uk Kim	 eval(shift(@insns));
12321f13597dSJung-uk Kim	 eval(shift(@insns));
12331f13597dSJung-uk Kim
12341f13597dSJung-uk Kim	foreach (@insns) { eval; }
12351f13597dSJung-uk Kim  $Xi++;
12361f13597dSJung-uk Kim}
12371f13597dSJung-uk Kim
# Xtail_avx($body)
# Emit four scalar rounds with no SIMD instructions interleaved.
#   $body - code ref returning the instruction snippets (strings) for one
#           scalar round; it is invoked four times to fill @insns.
# The snippets are string-eval-ed here so they can reach $a..$e below.
12381f13597dSJung-uk Kimsub Xtail_avx()
12391f13597dSJung-uk Kim{ use integer;
12401f13597dSJung-uk Kim  my $body = shift;
12411f13597dSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
12421f13597dSJung-uk Kim  my ($a,$b,$c,$d,$e);
12431f13597dSJung-uk Kim
12441f13597dSJung-uk Kim	foreach (@insns) { eval; }
12451f13597dSJung-uk Kim}
12461f13597dSJung-uk Kim
# AVX main loop: emit the loop label, then generate the rounds of one block.
# Each Xupdate/Xuplast/Xloop call below emits four scalar rounds, so the
# 17 + 3 calls in this run cover all 80 rounds.
12471f13597dSJung-uk Kim$code.=<<___;
12481f13597dSJung-uk Kim.align	16
12491f13597dSJung-uk Kim.Loop_avx:
12501f13597dSJung-uk Kim___
# The code ref selects the round template that is asked for; the templates
# themselves switch over to the next group based on their own call counter.
12511f13597dSJung-uk Kim	&Xupdate_avx_16_31(\&body_00_19);
12521f13597dSJung-uk Kim	&Xupdate_avx_16_31(\&body_00_19);
12531f13597dSJung-uk Kim	&Xupdate_avx_16_31(\&body_00_19);
12541f13597dSJung-uk Kim	&Xupdate_avx_16_31(\&body_00_19);
12551f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_00_19);
12561f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12571f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12581f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12591f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12601f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12611f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_40_59);
12621f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_40_59);
12631f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_40_59);
12641f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_40_59);
12651f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_40_59);
12661f13597dSJung-uk Kim	&Xupdate_avx_32_79(\&body_20_39);
12671f13597dSJung-uk Kim	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
12681f13597dSJung-uk Kim
# Snapshot the generator state here: the last twelve rounds are emitted
# twice (loop path now, done path later) and both must start from this state.
12691f13597dSJung-uk Kim				$saved_j=$j; @saved_V=@V;
12701f13597dSJung-uk Kim
# Loop path: last twelve rounds interleaved with next-block preparation.
12711f13597dSJung-uk Kim	&Xloop_avx(\&body_20_39);
12721f13597dSJung-uk Kim	&Xloop_avx(\&body_20_39);
12731f13597dSJung-uk Kim	&Xloop_avx(\&body_20_39);
12741f13597dSJung-uk Kim
# Loop path tail: add the working variables into the context at $ctx, store
# them back, recompute the values the first round expects in $B, @T[0] and
# @T[1], and branch back to .Loop_avx.  The same heredoc then opens the
# label held in $done_avx_label, the target of the end-of-input branch.
12751f13597dSJung-uk Kim$code.=<<___;
12761f13597dSJung-uk Kim	add	0($ctx),$A			# update context
12771f13597dSJung-uk Kim	add	4($ctx),@T[0]
12781f13597dSJung-uk Kim	add	8($ctx),$C
12791f13597dSJung-uk Kim	add	12($ctx),$D
12801f13597dSJung-uk Kim	mov	$A,0($ctx)
12811f13597dSJung-uk Kim	add	16($ctx),$E
12821f13597dSJung-uk Kim	mov	@T[0],4($ctx)
12831f13597dSJung-uk Kim	mov	@T[0],$B			# magic seed
12841f13597dSJung-uk Kim	mov	$C,8($ctx)
12857bded2dbSJung-uk Kim	mov	$C,@T[1]
12861f13597dSJung-uk Kim	mov	$D,12($ctx)
12877bded2dbSJung-uk Kim	xor	$D,@T[1]
12881f13597dSJung-uk Kim	mov	$E,16($ctx)
12897bded2dbSJung-uk Kim	and	@T[1],@T[0]
12901f13597dSJung-uk Kim	jmp	.Loop_avx
12911f13597dSJung-uk Kim
12921f13597dSJung-uk Kim.align	16
12937bded2dbSJung-uk Kim$done_avx_label:
12941f13597dSJung-uk Kim___
# Done path: rewind the generator state to the snapshot taken before the
# loop-path copy of the last twelve rounds, then emit those rounds again
# without any SIMD work.
12951f13597dSJung-uk Kim				$j=$saved_j; @V=@saved_V;
12961f13597dSJung-uk Kim
12971f13597dSJung-uk Kim	&Xtail_avx(\&body_20_39);
12981f13597dSJung-uk Kim	&Xtail_avx(\&body_20_39);
12991f13597dSJung-uk Kim	&Xtail_avx(\&body_20_39);
13001f13597dSJung-uk Kim
# Done path tail for the AVX routine: vzeroupper, then the final context
# update.
13011f13597dSJung-uk Kim$code.=<<___;
1302de78d5d8SJung-uk Kim	vzeroupper
13031f13597dSJung-uk Kim
13041f13597dSJung-uk Kim	add	0($ctx),$A			# update context
13051f13597dSJung-uk Kim	add	4($ctx),@T[0]
13061f13597dSJung-uk Kim	add	8($ctx),$C
13071f13597dSJung-uk Kim	mov	$A,0($ctx)
13081f13597dSJung-uk Kim	add	12($ctx),$D
13091f13597dSJung-uk Kim	mov	@T[0],4($ctx)
13101f13597dSJung-uk Kim	add	16($ctx),$E
13111f13597dSJung-uk Kim	mov	$C,8($ctx)
13121f13597dSJung-uk Kim	mov	$D,12($ctx)
13131f13597dSJung-uk Kim	mov	$E,16($ctx)
13141f13597dSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm11 from their save slots below $fp.
13151f13597dSJung-uk Kim$code.=<<___ if ($win64);
1316e71b7053SJung-uk Kim	movaps	-40-6*16($fp),%xmm6
1317e71b7053SJung-uk Kim	movaps	-40-5*16($fp),%xmm7
1318e71b7053SJung-uk Kim	movaps	-40-4*16($fp),%xmm8
1319e71b7053SJung-uk Kim	movaps	-40-3*16($fp),%xmm9
1320e71b7053SJung-uk Kim	movaps	-40-2*16($fp),%xmm10
1321e71b7053SJung-uk Kim	movaps	-40-1*16($fp),%xmm11
13221f13597dSJung-uk Kim___
# Common epilogue: reload the saved general-purpose registers relative to
# $fp, restore %rsp from $fp and return; CFI directives track each step.
13231f13597dSJung-uk Kim$code.=<<___;
1324e71b7053SJung-uk Kim	mov	-40($fp),%r14
1325e71b7053SJung-uk Kim.cfi_restore	%r14
1326e71b7053SJung-uk Kim	mov	-32($fp),%r13
1327e71b7053SJung-uk Kim.cfi_restore	%r13
1328e71b7053SJung-uk Kim	mov	-24($fp),%r12
1329e71b7053SJung-uk Kim.cfi_restore	%r12
1330e71b7053SJung-uk Kim	mov	-16($fp),%rbp
1331e71b7053SJung-uk Kim.cfi_restore	%rbp
1332e71b7053SJung-uk Kim	mov	-8($fp),%rbx
1333e71b7053SJung-uk Kim.cfi_restore	%rbx
1334e71b7053SJung-uk Kim	lea	($fp),%rsp
1335e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
13361f13597dSJung-uk Kim.Lepilogue_avx:
13371f13597dSJung-uk Kim	ret
1338e71b7053SJung-uk Kim.cfi_endproc
13391f13597dSJung-uk Kim.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
13401f13597dSJung-uk Kim___
13417bded2dbSJung-uk Kim
# ---- AVX2 code path ---------------------------------------------------
# Generated only when $avx>1.  The generator state ($Xi, @X, @Tx, $Kx, $j)
# is reset so that it now names 256-bit %ymm registers.
13427bded2dbSJung-uk Kimif ($avx>1) {
13437bded2dbSJung-uk Kimuse integer;
13447bded2dbSJung-uk Kim$Xi=4;					# reset variables
13457bded2dbSJung-uk Kim@X=map("%ymm$_",(4..7,0..3));
13467bded2dbSJung-uk Kim@Tx=map("%ymm$_",(8..10));
13477bded2dbSJung-uk Kim$Kx="%ymm11";
13487bded2dbSJung-uk Kim$j=0;
13497bded2dbSJung-uk Kim
# @ROTX holds the six 32-bit registers that the bodyx_* round templates
# rotate through; $a5 receives "a<<<5" and $t0 is a scratch register in
# those templates.
13507bded2dbSJung-uk Kimmy @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
13517bded2dbSJung-uk Kimmy ($a5,$t0)=("%r12d","%edi");
13527bded2dbSJung-uk Kim
# Initial register roles, in @ROTX order.  $rx counts the rounds handed
# out by bodyx_*; $frame is the pointer the round templates use to
# address the stored X[]+K values.
13537bded2dbSJung-uk Kimmy ($A,$F,$B,$C,$D,$E)=@ROTX;
13547bded2dbSJung-uk Kimmy $rx=0;
13557bded2dbSJung-uk Kimmy $frame="%r13";
13567bded2dbSJung-uk Kim
# Function entry: keep the incoming %rsp in $fp (it becomes the CFA
# register) and push the five callee-saved GPRs used below.
13577bded2dbSJung-uk Kim$code.=<<___;
13587bded2dbSJung-uk Kim.type	sha1_block_data_order_avx2,\@function,3
13597bded2dbSJung-uk Kim.align	16
13607bded2dbSJung-uk Kimsha1_block_data_order_avx2:
13617bded2dbSJung-uk Kim_avx2_shortcut:
1362e71b7053SJung-uk Kim.cfi_startproc
1363e71b7053SJung-uk Kim	mov	%rsp,$fp
1364e71b7053SJung-uk Kim.cfi_def_cfa_register	$fp
13657bded2dbSJung-uk Kim	push	%rbx
1366e71b7053SJung-uk Kim.cfi_push	%rbx
13677bded2dbSJung-uk Kim	push	%rbp
1368e71b7053SJung-uk Kim.cfi_push	%rbp
13697bded2dbSJung-uk Kim	push	%r12
1370e71b7053SJung-uk Kim.cfi_push	%r12
13717bded2dbSJung-uk Kim	push	%r13
1372e71b7053SJung-uk Kim.cfi_push	%r13
13737bded2dbSJung-uk Kim	push	%r14
1374e71b7053SJung-uk Kim.cfi_push	%r14
13757bded2dbSJung-uk Kim	vzeroupper
13767bded2dbSJung-uk Kim___
# Win64 only: reserve 6*16 bytes and save %xmm6-%xmm11 directly below the
# 40 bytes taken by the five pushes above (hence the -40-N*16($fp)
# addressing).
13777bded2dbSJung-uk Kim$code.=<<___ if ($win64);
13787bded2dbSJung-uk Kim	lea	-6*16(%rsp),%rsp
1379e71b7053SJung-uk Kim	vmovaps	%xmm6,-40-6*16($fp)
1380e71b7053SJung-uk Kim	vmovaps	%xmm7,-40-5*16($fp)
1381e71b7053SJung-uk Kim	vmovaps	%xmm8,-40-4*16($fp)
1382e71b7053SJung-uk Kim	vmovaps	%xmm9,-40-3*16($fp)
1383e71b7053SJung-uk Kim	vmovaps	%xmm10,-40-2*16($fp)
1384e71b7053SJung-uk Kim	vmovaps	%xmm11,-40-1*16($fp)
13857bded2dbSJung-uk Kim.Lprologue_avx2:
13867bded2dbSJung-uk Kim___
# Set-up proper:
#  - copy the three arguments out of %rdi/%rsi/%rdx;
#  - drop %rsp by 640 bytes and align it down to 128;
#  - turn $num into an end pointer ($inp + $num*64);
#  - load the five context words into A, F, C, D, E;
#  - load 64 bytes at $inp into the low 128-bit lanes and 64 bytes at
#    $frame (the next block, or the same block when there is none) into
#    the high lanes, byte-swap them, add K_00_19 and store the first four
#    X[]+K vectors at 0..96(%rsp).
# $K_XX_XX points 64 bytes into the constant table, so -64() addresses
# K_00_19 and 64() addresses the byte-swap mask.
13877bded2dbSJung-uk Kim$code.=<<___;
13887bded2dbSJung-uk Kim	mov	%rdi,$ctx		# reassigned argument
13897bded2dbSJung-uk Kim	mov	%rsi,$inp		# reassigned argument
13907bded2dbSJung-uk Kim	mov	%rdx,$num		# reassigned argument
13917bded2dbSJung-uk Kim
13927bded2dbSJung-uk Kim	lea	-640(%rsp),%rsp
13937bded2dbSJung-uk Kim	shl	\$6,$num
13947bded2dbSJung-uk Kim	 lea	64($inp),$frame
13957bded2dbSJung-uk Kim	and	\$-128,%rsp
13967bded2dbSJung-uk Kim	add	$inp,$num
13977bded2dbSJung-uk Kim	lea	K_XX_XX+64(%rip),$K_XX_XX
13987bded2dbSJung-uk Kim
13997bded2dbSJung-uk Kim	mov	0($ctx),$A		# load context
14007bded2dbSJung-uk Kim	 cmp	$num,$frame
14017bded2dbSJung-uk Kim	 cmovae	$inp,$frame		# next or same block
14027bded2dbSJung-uk Kim	mov	4($ctx),$F
14037bded2dbSJung-uk Kim	mov	8($ctx),$C
14047bded2dbSJung-uk Kim	mov	12($ctx),$D
14057bded2dbSJung-uk Kim	mov	16($ctx),$E
14067bded2dbSJung-uk Kim	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
14077bded2dbSJung-uk Kim
14087bded2dbSJung-uk Kim	vmovdqu		($inp),%xmm0
14097bded2dbSJung-uk Kim	vmovdqu		16($inp),%xmm1
14107bded2dbSJung-uk Kim	vmovdqu		32($inp),%xmm2
14117bded2dbSJung-uk Kim	vmovdqu		48($inp),%xmm3
14127bded2dbSJung-uk Kim	lea		64($inp),$inp
14137bded2dbSJung-uk Kim	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
14147bded2dbSJung-uk Kim	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
14157bded2dbSJung-uk Kim	vpshufb		@X[2],@X[-4&7],@X[-4&7]
14167bded2dbSJung-uk Kim	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
14177bded2dbSJung-uk Kim	vpshufb		@X[2],@X[-3&7],@X[-3&7]
14187bded2dbSJung-uk Kim	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
14197bded2dbSJung-uk Kim	vpshufb		@X[2],@X[-2&7],@X[-2&7]
14207bded2dbSJung-uk Kim	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
14217bded2dbSJung-uk Kim	vpshufb		@X[2],@X[-1&7],@X[-1&7]
14227bded2dbSJung-uk Kim
14237bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
14247bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-3&7],@X[1]
14257bded2dbSJung-uk Kim	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
14267bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-2&7],@X[2]
14277bded2dbSJung-uk Kim	vmovdqu	@X[1],32(%rsp)
14287bded2dbSJung-uk Kim	vpaddd	$Kx,@X[-1&7],@X[3]
14297bded2dbSJung-uk Kim	vmovdqu	@X[2],64(%rsp)
14307bded2dbSJung-uk Kim	vmovdqu	@X[3],96(%rsp)
14317bded2dbSJung-uk Kim___
# Generation-time loop for $Xi = 4..7.  Each iteration emits the same
# vector instruction sequence as Xupdate_avx2_16_31 below, but with no
# round instructions interleaved, and stores the resulting X[]+K vector at
# 32*$Xi(%rsp).  $Kx is reloaded from the constant table whenever $Xi is
# a multiple of 5.
14327bded2dbSJung-uk Kimfor (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
14337bded2dbSJung-uk Kim    use integer;
14347bded2dbSJung-uk Kim
14357bded2dbSJung-uk Kim	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
14367bded2dbSJung-uk Kim	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
14377bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
14387bded2dbSJung-uk Kim	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
14397bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
14407bded2dbSJung-uk Kim	&vpsrld	(@Tx[0],@X[0],31);
14417bded2dbSJung-uk Kim	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
14427bded2dbSJung-uk Kim	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
14437bded2dbSJung-uk Kim	&vpaddd	(@X[0],@X[0],@X[0]);
14447bded2dbSJung-uk Kim	&vpsrld	(@Tx[1],@Tx[2],30);
14457bded2dbSJung-uk Kim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
14467bded2dbSJung-uk Kim	&vpslld	(@Tx[2],@Tx[2],2);
14477bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[1]);
14487bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
14497bded2dbSJung-uk Kim	&vpaddd	(@Tx[1],@X[0],$Kx);
14507bded2dbSJung-uk Kim	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
14517bded2dbSJung-uk Kim
14527bded2dbSJung-uk Kim	push(@X,shift(@X));	# "rotate" X[]
14537bded2dbSJung-uk Kim}
# Loop entry.  $frame is set to 128(%rsp) because the round templates
# address X[]+K with a -128 bias.  The four instructions after the label
# establish the entry condition bodyx_00_19 expects: $F=(b&c)^(~b&d) and
# $B holding b rotated right by 2.
14547bded2dbSJung-uk Kim$code.=<<___;
14557bded2dbSJung-uk Kim	lea	128(%rsp),$frame
14567bded2dbSJung-uk Kim	jmp	.Loop_avx2
14577bded2dbSJung-uk Kim.align	32
14587bded2dbSJung-uk Kim.Loop_avx2:
14597bded2dbSJung-uk Kim	rorx	\$2,$F,$B
14607bded2dbSJung-uk Kim	andn	$D,$F,$t0
14617bded2dbSJung-uk Kim	and	$C,$F
14627bded2dbSJung-uk Kim	xor	$t0,$F
14637bded2dbSJung-uk Kim___
# bodyx_00_19()
#
# Round template for SHA-1 rounds 0..19.  Returns a list of eight strings
# of Perl code; callers eval them one at a time, which lets them slot
# vector instructions in between.  The first string binds
# ($a,$f,$b,$c,$d,$e) to the current @ROTX order; the last one rotates
# @ROTX and increments $j.
#
# Each round consumes the $f prepared by the previous round and prepares
# $f for the next one.  The call made when $rx==19 is therefore delegated
# to bodyx_20_39, so that the $f left behind has the b^c^d form the next
# group expects.
#
# The X[]+K operand is read from (32*($j/4)+4*($j%4))%256-128 relative to
# $frame: four consecutive dwords per 32-byte slot.  $frame is advanced by
# 256 after every 32nd round.
14647bded2dbSJung-uk Kimsub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
14657bded2dbSJung-uk Kim	# at start $f=(b&c)^(~b&d), $b>>>=2
14667bded2dbSJung-uk Kim	return &bodyx_20_39() if ($rx==19); $rx++;
14677bded2dbSJung-uk Kim	(
14687bded2dbSJung-uk Kim	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
14697bded2dbSJung-uk Kim
14707bded2dbSJung-uk Kim	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
14717bded2dbSJung-uk Kim	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
14727bded2dbSJung-uk Kim	'&andn	($t0,$a,$c)',			# ~b&d for next round
14737bded2dbSJung-uk Kim
14747bded2dbSJung-uk Kim	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
14757bded2dbSJung-uk Kim	'&rorx	($a5,$a,27)',			# a<<<5
14767bded2dbSJung-uk Kim	'&rorx	($f,$a,2)',			# b>>>2 for next round
14777bded2dbSJung-uk Kim	'&and	($a,$b)',			# b&c for next round
14787bded2dbSJung-uk Kim
14797bded2dbSJung-uk Kim	'&add	($e,$a5)',			# e+=a<<<5
14807bded2dbSJung-uk Kim	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
14817bded2dbSJung-uk Kim
14827bded2dbSJung-uk Kim	'unshift(@ROTX,pop(@ROTX)); $j++;'
14837bded2dbSJung-uk Kim	)
14847bded2dbSJung-uk Kim}
14857bded2dbSJung-uk Kim
# bodyx_20_39()
#
# Round template for the b^c^d rounds (20..39, and reused by the callers
# for the final 20 rounds).  Returns a list of seven strings of Perl code
# to be eval'ed by the caller, in the same format as bodyx_00_19.
#
# The call made when $rx==39 is delegated to bodyx_40_59, because that
# round has to leave behind the $f form the 40..59 group expects.  The
# instructions that only prepare state for a following round are
# suppressed once $j reaches 79, i.e. in the last round of a block.
14867bded2dbSJung-uk Kimsub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
14877bded2dbSJung-uk Kim	# on entry $f=b^c^d, $b>>>=2
14887bded2dbSJung-uk Kim	return &bodyx_40_59() if ($rx==39); $rx++;
14897bded2dbSJung-uk Kim	(
14907bded2dbSJung-uk Kim	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
14917bded2dbSJung-uk Kim
14927bded2dbSJung-uk Kim	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
14937bded2dbSJung-uk Kim	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
14947bded2dbSJung-uk Kim
14957bded2dbSJung-uk Kim	'&lea	($e,"($e,$f)")',		# e+=b^c^d
14967bded2dbSJung-uk Kim	'&rorx	($a5,$a,27)',			# a<<<5
14977bded2dbSJung-uk Kim	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
14987bded2dbSJung-uk Kim	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
14997bded2dbSJung-uk Kim
15007bded2dbSJung-uk Kim	'&add	($e,$a5)',			# e+=a<<<5
15017bded2dbSJung-uk Kim	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
15027bded2dbSJung-uk Kim
15037bded2dbSJung-uk Kim	'unshift(@ROTX,pop(@ROTX)); $j++;'
15047bded2dbSJung-uk Kim	)
15057bded2dbSJung-uk Kim}
15067bded2dbSJung-uk Kim
# bodyx_40_59()
#
# Round template for SHA-1 rounds 40..59.  Returns a list of ten strings
# of Perl code to be eval'ed by the caller, in the same format as
# bodyx_00_19.  $rx is incremented unconditionally.
#
# The $j conditions handle the two edges of the group:
#  - $j==39 (this template is reached through bodyx_20_39's delegation):
#    the incoming $f is still b^c^d, so the "^c" correction is skipped;
#  - $j==59: the next round belongs to the b^c^d group, so $t0 is not
#    prepared and $f is finished with xor instead of and.
15077bded2dbSJung-uk Kimsub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
15087bded2dbSJung-uk Kim	# on entry $f=((b^c)&(c^d)), $b>>>=2
15097bded2dbSJung-uk Kim	$rx++;
15107bded2dbSJung-uk Kim	(
15117bded2dbSJung-uk Kim	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
15127bded2dbSJung-uk Kim
15137bded2dbSJung-uk Kim	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
15147bded2dbSJung-uk Kim	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
15157bded2dbSJung-uk Kim	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
15167bded2dbSJung-uk Kim	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
15177bded2dbSJung-uk Kim	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
15187bded2dbSJung-uk Kim
15197bded2dbSJung-uk Kim	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
15207bded2dbSJung-uk Kim	'&rorx	($a5,$a,27)',			# a<<<5
15217bded2dbSJung-uk Kim	'&rorx	($f,$a,2)',			# b>>>2 in next round
15227bded2dbSJung-uk Kim	'&xor	($a,$b)',			# b^c for next round
15237bded2dbSJung-uk Kim
15247bded2dbSJung-uk Kim	'&add	($e,$a5)',			# e+=a<<<5
15257bded2dbSJung-uk Kim	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
15267bded2dbSJung-uk Kim	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
15277bded2dbSJung-uk Kim
15287bded2dbSJung-uk Kim	'unshift(@ROTX,pop(@ROTX)); $j++;'
15297bded2dbSJung-uk Kim	)
15307bded2dbSJung-uk Kim}
15317bded2dbSJung-uk Kim
# Xupdate_avx2_16_31(\&body)
#
# Emits five integer rounds interleaved with the vector instructions that
# compute one more message-schedule vector.
#
#   body - reference to one of the bodyx_* templates; it is called five
#          times and the returned code strings are eval'ed one by one
#          between the vector instructions.
#
# The vector part builds "X[0]" from "X[-16]", "X[-14]", "X[-8]" and
# "X[-3]", rotates it left by one and applies the
# "X[0]"^=("X[0]">>96)<<<2 correction.  It then adds the round constant
# $Kx and stores the sum at 32*$Xi(%rsp) for the integer rounds.  $Kx is
# reloaded from the constant table when $Xi is a multiple of 5.
#
# Snippets left over after the last vector instruction are eval'ed at the
# end.  Side effects: $Xi is incremented and @X is rotated by one; the
# eval'ed snippets also advance $j, $frame bookkeeping and @ROTX.
# The calls visible in this file use the '&' form, which bypasses the
# empty prototype.
1532e71b7053SJung-uk Kimsub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
15337bded2dbSJung-uk Kim{ use integer;
15347bded2dbSJung-uk Kim  my $body = shift;
15357bded2dbSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  # lexicals that the eval'ed snippets assign from @ROTX
15367bded2dbSJung-uk Kim  my ($a,$b,$c,$d,$e);
15377bded2dbSJung-uk Kim
15387bded2dbSJung-uk Kim	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
15397bded2dbSJung-uk Kim	 eval(shift(@insns));
15407bded2dbSJung-uk Kim	 eval(shift(@insns));
15417bded2dbSJung-uk Kim	 eval(shift(@insns));
15427bded2dbSJung-uk Kim	 eval(shift(@insns));
15437bded2dbSJung-uk Kim
15447bded2dbSJung-uk Kim	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
15457bded2dbSJung-uk Kim	 eval(shift(@insns));
15467bded2dbSJung-uk Kim	 eval(shift(@insns));
15477bded2dbSJung-uk Kim	 eval(shift(@insns));
15487bded2dbSJung-uk Kim
15497bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
15507bded2dbSJung-uk Kim	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
15517bded2dbSJung-uk Kim	 eval(shift(@insns));
15527bded2dbSJung-uk Kim	 eval(shift(@insns));
15537bded2dbSJung-uk Kim
15547bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
15557bded2dbSJung-uk Kim	 eval(shift(@insns));
15567bded2dbSJung-uk Kim	 eval(shift(@insns));
15577bded2dbSJung-uk Kim	 eval(shift(@insns));
15587bded2dbSJung-uk Kim	 eval(shift(@insns));
15597bded2dbSJung-uk Kim
15607bded2dbSJung-uk Kim	&vpsrld	(@Tx[0],@X[0],31);
15617bded2dbSJung-uk Kim	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
15627bded2dbSJung-uk Kim	 eval(shift(@insns));
15637bded2dbSJung-uk Kim	 eval(shift(@insns));
15647bded2dbSJung-uk Kim	 eval(shift(@insns));
15657bded2dbSJung-uk Kim
15667bded2dbSJung-uk Kim	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
15677bded2dbSJung-uk Kim	&vpaddd	(@X[0],@X[0],@X[0]);
15687bded2dbSJung-uk Kim	 eval(shift(@insns));
15697bded2dbSJung-uk Kim	 eval(shift(@insns));
15707bded2dbSJung-uk Kim
15717bded2dbSJung-uk Kim	&vpsrld	(@Tx[1],@Tx[2],30);
15727bded2dbSJung-uk Kim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
15737bded2dbSJung-uk Kim	 eval(shift(@insns));
15747bded2dbSJung-uk Kim	 eval(shift(@insns));
15757bded2dbSJung-uk Kim
15767bded2dbSJung-uk Kim	&vpslld	(@Tx[2],@Tx[2],2);
15777bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[1]);
15787bded2dbSJung-uk Kim	 eval(shift(@insns));
15797bded2dbSJung-uk Kim	 eval(shift(@insns));
15807bded2dbSJung-uk Kim
15817bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
15827bded2dbSJung-uk Kim	 eval(shift(@insns));
15837bded2dbSJung-uk Kim	 eval(shift(@insns));
15847bded2dbSJung-uk Kim	 eval(shift(@insns));
15857bded2dbSJung-uk Kim
15867bded2dbSJung-uk Kim	&vpaddd	(@Tx[1],@X[0],$Kx);
15877bded2dbSJung-uk Kim	 eval(shift(@insns));
15887bded2dbSJung-uk Kim	 eval(shift(@insns));
15897bded2dbSJung-uk Kim	 eval(shift(@insns));
15907bded2dbSJung-uk Kim	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
15917bded2dbSJung-uk Kim
15927bded2dbSJung-uk Kim	 foreach (@insns) { eval; }	# remaining instructions [if any]
15937bded2dbSJung-uk Kim
15947bded2dbSJung-uk Kim	$Xi++;
15957bded2dbSJung-uk Kim	push(@X,shift(@X));	# "rotate" X[]
15967bded2dbSJung-uk Kim}
15977bded2dbSJung-uk Kim
# Xupdate_avx2_32_79(\&body)
#
# Emits five integer rounds interleaved with the vector instructions that
# compute one more message-schedule vector for the later part of the
# schedule.
#
#   body - reference to one of the bodyx_* templates; it is called five
#          times and the returned code strings are eval'ed one by one
#          between the vector instructions.
#
# The vector part xors "X[-32]" (the old contents of "X[0]"), "X[-28]",
# "X[-16]" and "X[-6]", then rotates the result left by two.  It adds the
# round constant $Kx and stores the sum at 32*$Xi(%rsp).  $Kx is reloaded
# from the constant table when $Xi is a multiple of 5.
#
# Snippets left over after the store are eval'ed at the end.  Side
# effects: $Xi is incremented and @X is rotated by one; the eval'ed
# snippets also advance $j and @ROTX.
15987bded2dbSJung-uk Kimsub Xupdate_avx2_32_79()
15997bded2dbSJung-uk Kim{ use integer;
16007bded2dbSJung-uk Kim  my $body = shift;
16017bded2dbSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  # lexicals that the eval'ed snippets assign from @ROTX
16027bded2dbSJung-uk Kim  my ($a,$b,$c,$d,$e);
16037bded2dbSJung-uk Kim
16047bded2dbSJung-uk Kim	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
16057bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
16067bded2dbSJung-uk Kim	 eval(shift(@insns));
16077bded2dbSJung-uk Kim	 eval(shift(@insns));
16087bded2dbSJung-uk Kim
16097bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
16107bded2dbSJung-uk Kim	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
16117bded2dbSJung-uk Kim	 eval(shift(@insns));
16127bded2dbSJung-uk Kim	 eval(shift(@insns));
16137bded2dbSJung-uk Kim	 eval(shift(@insns));
16147bded2dbSJung-uk Kim
16157bded2dbSJung-uk Kim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
16167bded2dbSJung-uk Kim	 eval(shift(@insns));
16177bded2dbSJung-uk Kim	 eval(shift(@insns));
16187bded2dbSJung-uk Kim	 eval(shift(@insns));
16197bded2dbSJung-uk Kim
16207bded2dbSJung-uk Kim	&vpsrld	(@Tx[0],@X[0],30);
16217bded2dbSJung-uk Kim	&vpslld	(@X[0],@X[0],2);
16227bded2dbSJung-uk Kim	 eval(shift(@insns));
16237bded2dbSJung-uk Kim	 eval(shift(@insns));
16247bded2dbSJung-uk Kim	 eval(shift(@insns));
16257bded2dbSJung-uk Kim
16267bded2dbSJung-uk Kim	#&vpslld	(@X[0],@X[0],2);
16277bded2dbSJung-uk Kim	 eval(shift(@insns));
16287bded2dbSJung-uk Kim	 eval(shift(@insns));
16297bded2dbSJung-uk Kim	 eval(shift(@insns));
16307bded2dbSJung-uk Kim
16317bded2dbSJung-uk Kim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
16327bded2dbSJung-uk Kim	 eval(shift(@insns));
16337bded2dbSJung-uk Kim	 eval(shift(@insns));
16347bded2dbSJung-uk Kim	 eval(shift(@insns));
16357bded2dbSJung-uk Kim	 eval(shift(@insns));
16367bded2dbSJung-uk Kim
16377bded2dbSJung-uk Kim	&vpaddd	(@Tx[1],@X[0],$Kx);
16387bded2dbSJung-uk Kim	 eval(shift(@insns));
16397bded2dbSJung-uk Kim	 eval(shift(@insns));
16407bded2dbSJung-uk Kim	 eval(shift(@insns));
16417bded2dbSJung-uk Kim	 eval(shift(@insns));
16427bded2dbSJung-uk Kim
16437bded2dbSJung-uk Kim	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
16447bded2dbSJung-uk Kim
16457bded2dbSJung-uk Kim	 foreach (@insns) { eval; }	# remaining instructions
16467bded2dbSJung-uk Kim
16477bded2dbSJung-uk Kim	$Xi++;
16487bded2dbSJung-uk Kim	push(@X,shift(@X));	# "rotate" X[]
16497bded2dbSJung-uk Kim}
16507bded2dbSJung-uk Kim
# Xloop_avx2(\&body)
#
# Emits five integer rounds with no vector work interleaved: the body
# template is called five times and every returned code string is
# eval'ed in order.  Callers place their own vector instructions between
# successive Xloop_avx2 calls.
# NOTE(review): five calls yield 35, 40 or 50 snippets depending on the
# template, so the "32 instructions" remark below looks stale.
16517bded2dbSJung-uk Kimsub Xloop_avx2()
16527bded2dbSJung-uk Kim{ use integer;
16537bded2dbSJung-uk Kim  my $body = shift;
16547bded2dbSJung-uk Kim  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  # lexicals that the eval'ed snippets assign from @ROTX
16557bded2dbSJung-uk Kim  my ($a,$b,$c,$d,$e);
16567bded2dbSJung-uk Kim
16577bded2dbSJung-uk Kim	 foreach (@insns) { eval; }
16587bded2dbSJung-uk Kim}
16597bded2dbSJung-uk Kim
# Body of .Loop_avx2, first pass: 80 rounds in 16 groups of five.
# The first twelve groups (rounds 0..59) are interleaved with
# Xupdate_avx2_32_79, which extends the stored X[]+K area from slot
# $Xi=8 up to slot 19.  The last four groups run with no vector work.
# The templates switch themselves at the group boundaries (see the $rx
# checks in bodyx_00_19 / bodyx_20_39).
# align32 is defined outside this section; presumably it pads the output
# to a 32-byte boundary -- confirm against its definition.
16607bded2dbSJung-uk Kim	&align32();
16617bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_00_19);
16627bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_00_19);
16637bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_00_19);
16647bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_00_19);
16657bded2dbSJung-uk Kim
16667bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_20_39);
16677bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_20_39);
16687bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_20_39);
16697bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_20_39);
16707bded2dbSJung-uk Kim
16717bded2dbSJung-uk Kim	&align32();
16727bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_40_59);
16737bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_40_59);
16747bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_40_59);
16757bded2dbSJung-uk Kim	&Xupdate_avx2_32_79(\&bodyx_40_59);
16767bded2dbSJung-uk Kim
16777bded2dbSJung-uk Kim	&Xloop_avx2(\&bodyx_20_39);
16787bded2dbSJung-uk Kim	&Xloop_avx2(\&bodyx_20_39);
16797bded2dbSJung-uk Kim	&Xloop_avx2(\&bodyx_20_39);
16807bded2dbSJung-uk Kim	&Xloop_avx2(\&bodyx_20_39);
16817bded2dbSJung-uk Kim
# After the first 80 rounds:
#  - pick the address that will feed the high lanes of the next load
#    ($inp+128, or $inp when that reaches or passes the end pointer);
#    %rdi keeps the unclamped $inp+128 for the test made further down;
#  - add the working variables into the context and store it;
#  - move the values back into the A/F/C/D/E registers, since @ROTX has
#    been rotated by the rounds (mapping given in the emitted comment);
#  - leave through .Ldone_avx2 when $inp equals the end pointer.
16827bded2dbSJung-uk Kim$code.=<<___;
16837bded2dbSJung-uk Kim	lea	128($inp),$frame
16847bded2dbSJung-uk Kim	lea	128($inp),%rdi			# borrow $t0
16857bded2dbSJung-uk Kim	cmp	$num,$frame
16867bded2dbSJung-uk Kim	cmovae	$inp,$frame			# next or previous block
16877bded2dbSJung-uk Kim
16887bded2dbSJung-uk Kim	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
16897bded2dbSJung-uk Kim	add	0($ctx),@ROTX[0]		# update context
16907bded2dbSJung-uk Kim	add	4($ctx),@ROTX[1]
16917bded2dbSJung-uk Kim	add	8($ctx),@ROTX[3]
16927bded2dbSJung-uk Kim	mov	@ROTX[0],0($ctx)
16937bded2dbSJung-uk Kim	add	12($ctx),@ROTX[4]
16947bded2dbSJung-uk Kim	mov	@ROTX[1],4($ctx)
16957bded2dbSJung-uk Kim	 mov	@ROTX[0],$A			# A=d
16967bded2dbSJung-uk Kim	add	16($ctx),@ROTX[5]
16977bded2dbSJung-uk Kim	 mov	@ROTX[3],$a5
16987bded2dbSJung-uk Kim	mov	@ROTX[3],8($ctx)
16997bded2dbSJung-uk Kim	 mov	@ROTX[4],$D			# D=b
17007bded2dbSJung-uk Kim	 #xchg	@ROTX[5],$F			# F=c, C=f
17017bded2dbSJung-uk Kim	mov	@ROTX[4],12($ctx)
17027bded2dbSJung-uk Kim	 mov	@ROTX[1],$F			# F=e
17037bded2dbSJung-uk Kim	mov	@ROTX[5],16($ctx)
17047bded2dbSJung-uk Kim	#mov	$F,16($ctx)
17057bded2dbSJung-uk Kim	 mov	@ROTX[5],$E			# E=c
17067bded2dbSJung-uk Kim	 mov	$a5,$C				# C=f
17077bded2dbSJung-uk Kim	 #xchg	$F,$E				# E=c, F=e
17087bded2dbSJung-uk Kim
17097bded2dbSJung-uk Kim	cmp	$num,$inp
17107bded2dbSJung-uk Kim	je	.Ldone_avx2
17117bded2dbSJung-uk Kim___
17127bded2dbSJung-uk Kim
# Restart the message-schedule bookkeeping for the next pair of blocks.
17137bded2dbSJung-uk Kim$Xi=4;				# reset variables
17147bded2dbSJung-uk Kim@X=map("%ymm$_",(4..7,0..3));
17157bded2dbSJung-uk Kim
# Load the next input unless %rdi (the $inp+128 computed earlier) is
# beyond the end pointer: the low lanes come from -64(%rdi)..-16(%rdi),
# the high lanes from $frame.  At .Last_avx2, $frame is set to
# 128+16(%rsp); the extra 16 makes the round templates read the upper
# half of each 32-byte X[]+K slot.  $F/$B are primed as at .Loop_avx2 and
# $inp is advanced by 128 (sub of -128).
17167bded2dbSJung-uk Kim$code.=<<___;
17177bded2dbSJung-uk Kim	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
17187bded2dbSJung-uk Kim	cmp	$num,%rdi			# borrowed $t0
17197bded2dbSJung-uk Kim	ja	.Last_avx2
17207bded2dbSJung-uk Kim
17217bded2dbSJung-uk Kim	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
17227bded2dbSJung-uk Kim	vmovdqu		-48(%rdi),%xmm1
17237bded2dbSJung-uk Kim	vmovdqu		-32(%rdi),%xmm2
17247bded2dbSJung-uk Kim	vmovdqu		-16(%rdi),%xmm3
17257bded2dbSJung-uk Kim	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
17267bded2dbSJung-uk Kim	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
17277bded2dbSJung-uk Kim	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
17287bded2dbSJung-uk Kim	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
17297bded2dbSJung-uk Kim	jmp	.Last_avx2
17307bded2dbSJung-uk Kim
17317bded2dbSJung-uk Kim.align	32
17327bded2dbSJung-uk Kim.Last_avx2:
17337bded2dbSJung-uk Kim	lea	128+16(%rsp),$frame
17347bded2dbSJung-uk Kim	rorx	\$2,$F,$B
17357bded2dbSJung-uk Kim	andn	$D,$F,$t0
17367bded2dbSJung-uk Kim	and	$C,$F
17377bded2dbSJung-uk Kim	xor	$t0,$F
17387bded2dbSJung-uk Kim	sub	\$-128,$inp
17397bded2dbSJung-uk Kim___
# Second pass: another 80 rounds in 16 groups of five, reading the X[]+K
# values through the $frame set at .Last_avx2.  The round counters and
# @ROTX are reset first so the templates start again from round 0.
# The vector work for the freshly loaded input is slotted in between the
# groups: byte swap, add K_00_19, store slots 0..96(%rsp), and then four
# Xupdate_avx2_16_31 steps ($Xi = 4..7).
17407bded2dbSJung-uk Kim	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
17417bded2dbSJung-uk Kim
17427bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_00_19);
17437bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_00_19);
17447bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_00_19);
17457bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_00_19);
17467bded2dbSJung-uk Kim
17477bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_20_39);
17487bded2dbSJung-uk Kim	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
17497bded2dbSJung-uk Kim	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
17507bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_20_39);
17517bded2dbSJung-uk Kim	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
17527bded2dbSJung-uk Kim	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
17537bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_20_39);
17547bded2dbSJung-uk Kim	  &vmovdqu	("0(%rsp)",@Tx[0]);
17557bded2dbSJung-uk Kim	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
17567bded2dbSJung-uk Kim	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
17577bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_20_39);
17587bded2dbSJung-uk Kim	  &vmovdqu	("32(%rsp)",@Tx[1]);
17597bded2dbSJung-uk Kim	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
17607bded2dbSJung-uk Kim	  &vpaddd	(@X[2],@X[-2&7],$Kx);
17617bded2dbSJung-uk Kim
17627bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_40_59);
17637bded2dbSJung-uk Kim	&align32	();
17647bded2dbSJung-uk Kim	  &vmovdqu	("64(%rsp)",@X[2]);
17657bded2dbSJung-uk Kim	  &vpaddd	(@X[3],@X[-1&7],$Kx);
17667bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_40_59);
17677bded2dbSJung-uk Kim	  &vmovdqu	("96(%rsp)",@X[3]);
17687bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_40_59);
17697bded2dbSJung-uk Kim	&Xupdate_avx2_16_31(\&bodyx_40_59);
17707bded2dbSJung-uk Kim
17717bded2dbSJung-uk Kim	&Xupdate_avx2_16_31(\&bodyx_20_39);
17727bded2dbSJung-uk Kim	&Xupdate_avx2_16_31(\&bodyx_20_39);
17737bded2dbSJung-uk Kim	&Xupdate_avx2_16_31(\&bodyx_20_39);
17747bded2dbSJung-uk Kim	&Xloop_avx2	(\&bodyx_20_39);
17757bded2dbSJung-uk Kim
# End of the second pass: point $frame back at 128(%rsp), fold the
# working variables into the context (same register shuffle as after the
# first pass) and loop back to .Loop_avx2 while $inp is at or below the
# end pointer.  .Ldone_avx2 starts the common exit path.
17767bded2dbSJung-uk Kim$code.=<<___;
17777bded2dbSJung-uk Kim	lea	128(%rsp),$frame
17787bded2dbSJung-uk Kim
17797bded2dbSJung-uk Kim	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
17807bded2dbSJung-uk Kim	add	0($ctx),@ROTX[0]		# update context
17817bded2dbSJung-uk Kim	add	4($ctx),@ROTX[1]
17827bded2dbSJung-uk Kim	add	8($ctx),@ROTX[3]
17837bded2dbSJung-uk Kim	mov	@ROTX[0],0($ctx)
17847bded2dbSJung-uk Kim	add	12($ctx),@ROTX[4]
17857bded2dbSJung-uk Kim	mov	@ROTX[1],4($ctx)
17867bded2dbSJung-uk Kim	 mov	@ROTX[0],$A			# A=d
17877bded2dbSJung-uk Kim	add	16($ctx),@ROTX[5]
17887bded2dbSJung-uk Kim	 mov	@ROTX[3],$a5
17897bded2dbSJung-uk Kim	mov	@ROTX[3],8($ctx)
17907bded2dbSJung-uk Kim	 mov	@ROTX[4],$D			# D=b
17917bded2dbSJung-uk Kim	 #xchg	@ROTX[5],$F			# F=c, C=f
17927bded2dbSJung-uk Kim	mov	@ROTX[4],12($ctx)
17937bded2dbSJung-uk Kim	 mov	@ROTX[1],$F			# F=e
17947bded2dbSJung-uk Kim	mov	@ROTX[5],16($ctx)
17957bded2dbSJung-uk Kim	#mov	$F,16($ctx)
17967bded2dbSJung-uk Kim	 mov	@ROTX[5],$E			# E=c
17977bded2dbSJung-uk Kim	 mov	$a5,$C				# C=f
17987bded2dbSJung-uk Kim	 #xchg	$F,$E				# E=c, F=e
17997bded2dbSJung-uk Kim
18007bded2dbSJung-uk Kim	cmp	$num,$inp
18017bded2dbSJung-uk Kim	jbe	.Loop_avx2
18027bded2dbSJung-uk Kim
18037bded2dbSJung-uk Kim.Ldone_avx2:
18047bded2dbSJung-uk Kim	vzeroupper
18057bded2dbSJung-uk Kim___
# Win64 only: reload %xmm6-%xmm11 from the slots written in the prologue.
18067bded2dbSJung-uk Kim$code.=<<___ if ($win64);
1807e71b7053SJung-uk Kim	movaps	-40-6*16($fp),%xmm6
1808e71b7053SJung-uk Kim	movaps	-40-5*16($fp),%xmm7
1809e71b7053SJung-uk Kim	movaps	-40-4*16($fp),%xmm8
1810e71b7053SJung-uk Kim	movaps	-40-3*16($fp),%xmm9
1811e71b7053SJung-uk Kim	movaps	-40-2*16($fp),%xmm10
1812e71b7053SJung-uk Kim	movaps	-40-1*16($fp),%xmm11
18137bded2dbSJung-uk Kim___
# Reload the pushed GPRs by offset from $fp (the entry %rsp), then
# restore %rsp from $fp and return.
18147bded2dbSJung-uk Kim$code.=<<___;
1815e71b7053SJung-uk Kim	mov	-40($fp),%r14
1816e71b7053SJung-uk Kim.cfi_restore	%r14
1817e71b7053SJung-uk Kim	mov	-32($fp),%r13
1818e71b7053SJung-uk Kim.cfi_restore	%r13
1819e71b7053SJung-uk Kim	mov	-24($fp),%r12
1820e71b7053SJung-uk Kim.cfi_restore	%r12
1821e71b7053SJung-uk Kim	mov	-16($fp),%rbp
1822e71b7053SJung-uk Kim.cfi_restore	%rbp
1823e71b7053SJung-uk Kim	mov	-8($fp),%rbx
1824e71b7053SJung-uk Kim.cfi_restore	%rbx
1825e71b7053SJung-uk Kim	lea	($fp),%rsp
1826e71b7053SJung-uk Kim.cfi_def_cfa_register	%rsp
18277bded2dbSJung-uk Kim.Lepilogue_avx2:
18287bded2dbSJung-uk Kim	ret
1829e71b7053SJung-uk Kim.cfi_endproc
18307bded2dbSJung-uk Kim.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
18317bded2dbSJung-uk Kim___
18327bded2dbSJung-uk Kim}
18331f13597dSJung-uk Kim}
# Constant table.  Each of the four SHA-1 round constants is replicated
# into eight dwords (32 bytes), so it can be loaded as a whole %ymm
# vector; the byte-swap shuffle mask follows at offset 128, also 32 bytes
# wide.  The AVX2 code addresses the table through a base of K_XX_XX+64,
# hence its -64() and 64() displacements.
# NOTE(review): the final .byte row is a full 16-byte reversal mask; its
# user is not visible in this part of the file -- confirm.
18341f13597dSJung-uk Kim$code.=<<___;
18351f13597dSJung-uk Kim.align	64
18361f13597dSJung-uk KimK_XX_XX:
18371f13597dSJung-uk Kim.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
18387bded2dbSJung-uk Kim.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
18397bded2dbSJung-uk Kim.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
18401f13597dSJung-uk Kim.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
18411f13597dSJung-uk Kim.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
18427bded2dbSJung-uk Kim.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
18437bded2dbSJung-uk Kim.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
18441f13597dSJung-uk Kim.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
18451f13597dSJung-uk Kim.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
18467bded2dbSJung-uk Kim.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
18477bded2dbSJung-uk Kim.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
18481f13597dSJung-uk Kim___
18491f13597dSJung-uk Kim}}}
# Identification string placed in the generated output, followed by
# padding to a 64-byte boundary.  This part is emitted unconditionally.
1850db522d3aSSimon L. B. Nielsen$code.=<<___;
1851db522d3aSSimon L. B. Nielsen.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
18521f13597dSJung-uk Kim.align	64
1853db522d3aSSimon L. B. Nielsen___
1854db522d3aSSimon L. B. Nielsen
18551f13597dSJung-uk Kim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
18561f13597dSJung-uk Kim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handlers.  The four names below are the
# registers that carry the handler's arguments (rec, frame, context,
# disp, in that order).
18571f13597dSJung-uk Kimif ($win64) {
18581f13597dSJung-uk Kim$rec="%rcx";
18591f13597dSJung-uk Kim$frame="%rdx";
18601f13597dSJung-uk Kim$context="%r8";
18611f13597dSJung-uk Kim$disp="%r9";
18621f13597dSJung-uk Kim
# se_handler: unwind helper keyed on the .Lprologue/.Lepilogue labels.
#  - Rip below .Lprologue: nothing has been saved yet; continue to the
#    common tail with %rax = context->Rax.
#  - Rip at or past .Lepilogue: continue with %rax = context->Rsp.
#  - otherwise: fetch the saved stack pointer from 64(%rax), read the
#    five saved GPRs from just below it and write them back into the
#    CONTEXT record.
# All paths end in .Lcommon_seh_tail, which lives in ssse3_handler.
18631f13597dSJung-uk Kim$code.=<<___;
18641f13597dSJung-uk Kim.extern	__imp_RtlVirtualUnwind
18651f13597dSJung-uk Kim.type	se_handler,\@abi-omnipotent
18661f13597dSJung-uk Kim.align	16
18671f13597dSJung-uk Kimse_handler:
18681f13597dSJung-uk Kim	push	%rsi
18691f13597dSJung-uk Kim	push	%rdi
18701f13597dSJung-uk Kim	push	%rbx
18711f13597dSJung-uk Kim	push	%rbp
18721f13597dSJung-uk Kim	push	%r12
18731f13597dSJung-uk Kim	push	%r13
18741f13597dSJung-uk Kim	push	%r14
18751f13597dSJung-uk Kim	push	%r15
18761f13597dSJung-uk Kim	pushfq
18771f13597dSJung-uk Kim	sub	\$64,%rsp
18781f13597dSJung-uk Kim
18791f13597dSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
18801f13597dSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
18811f13597dSJung-uk Kim
18821f13597dSJung-uk Kim	lea	.Lprologue(%rip),%r10
18831f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<.Lprologue
18841f13597dSJung-uk Kim	jb	.Lcommon_seh_tail
18851f13597dSJung-uk Kim
18861f13597dSJung-uk Kim	mov	152($context),%rax	# pull context->Rsp
18871f13597dSJung-uk Kim
18881f13597dSJung-uk Kim	lea	.Lepilogue(%rip),%r10
18891f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
18901f13597dSJung-uk Kim	jae	.Lcommon_seh_tail
18911f13597dSJung-uk Kim
18921f13597dSJung-uk Kim	mov	`16*4`(%rax),%rax	# pull saved stack pointer
18931f13597dSJung-uk Kim
18941f13597dSJung-uk Kim	mov	-8(%rax),%rbx
18951f13597dSJung-uk Kim	mov	-16(%rax),%rbp
18961f13597dSJung-uk Kim	mov	-24(%rax),%r12
18971f13597dSJung-uk Kim	mov	-32(%rax),%r13
18987bded2dbSJung-uk Kim	mov	-40(%rax),%r14
18991f13597dSJung-uk Kim	mov	%rbx,144($context)	# restore context->Rbx
19001f13597dSJung-uk Kim	mov	%rbp,160($context)	# restore context->Rbp
19011f13597dSJung-uk Kim	mov	%r12,216($context)	# restore context->R12
19021f13597dSJung-uk Kim	mov	%r13,224($context)	# restore context->R13
19037bded2dbSJung-uk Kim	mov	%r14,232($context)	# restore context->R14
19041f13597dSJung-uk Kim
19051f13597dSJung-uk Kim	jmp	.Lcommon_seh_tail
19061f13597dSJung-uk Kim.size	se_handler,.-se_handler
19077bded2dbSJung-uk Kim___
19081f13597dSJung-uk Kim
# shaext_handler (emitted only when $shaext is set): unwind helper keyed
# on the .Lprologue_shaext/.Lepilogue_shaext labels.  When Rip lies
# between them it copies 8 quadwords (64 bytes) from -8-4*16(%rax) into
# the CONTEXT record starting at Xmm6.  %rax is context->Rax throughout;
# it is never replaced by Rsp here.  No general-purpose registers are
# restored.  All paths end in .Lcommon_seh_tail.
19097bded2dbSJung-uk Kim$code.=<<___ if ($shaext);
19107bded2dbSJung-uk Kim.type	shaext_handler,\@abi-omnipotent
19117bded2dbSJung-uk Kim.align	16
19127bded2dbSJung-uk Kimshaext_handler:
19137bded2dbSJung-uk Kim	push	%rsi
19147bded2dbSJung-uk Kim	push	%rdi
19157bded2dbSJung-uk Kim	push	%rbx
19167bded2dbSJung-uk Kim	push	%rbp
19177bded2dbSJung-uk Kim	push	%r12
19187bded2dbSJung-uk Kim	push	%r13
19197bded2dbSJung-uk Kim	push	%r14
19207bded2dbSJung-uk Kim	push	%r15
19217bded2dbSJung-uk Kim	pushfq
19227bded2dbSJung-uk Kim	sub	\$64,%rsp
19237bded2dbSJung-uk Kim
19247bded2dbSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
19257bded2dbSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
19267bded2dbSJung-uk Kim
19277bded2dbSJung-uk Kim	lea	.Lprologue_shaext(%rip),%r10
19287bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<.Lprologue
19297bded2dbSJung-uk Kim	jb	.Lcommon_seh_tail
19307bded2dbSJung-uk Kim
19317bded2dbSJung-uk Kim	lea	.Lepilogue_shaext(%rip),%r10
19327bded2dbSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
19337bded2dbSJung-uk Kim	jae	.Lcommon_seh_tail
19347bded2dbSJung-uk Kim
19357bded2dbSJung-uk Kim	lea	-8-4*16(%rax),%rsi
19367bded2dbSJung-uk Kim	lea	512($context),%rdi	# &context.Xmm6
19377bded2dbSJung-uk Kim	mov	\$8,%ecx
19387bded2dbSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
19397bded2dbSJung-uk Kim
19407bded2dbSJung-uk Kim	jmp	.Lcommon_seh_tail
19417bded2dbSJung-uk Kim.size	shaext_handler,.-shaext_handler
19427bded2dbSJung-uk Kim___
19437bded2dbSJung-uk Kim
19447bded2dbSJung-uk Kim$code.=<<___;
19451f13597dSJung-uk Kim.type	ssse3_handler,\@abi-omnipotent
19461f13597dSJung-uk Kim.align	16
19471f13597dSJung-uk Kimssse3_handler:
19481f13597dSJung-uk Kim	push	%rsi
19491f13597dSJung-uk Kim	push	%rdi
19501f13597dSJung-uk Kim	push	%rbx
19511f13597dSJung-uk Kim	push	%rbp
19521f13597dSJung-uk Kim	push	%r12
19531f13597dSJung-uk Kim	push	%r13
19541f13597dSJung-uk Kim	push	%r14
19551f13597dSJung-uk Kim	push	%r15
19561f13597dSJung-uk Kim	pushfq
19571f13597dSJung-uk Kim	sub	\$64,%rsp
19581f13597dSJung-uk Kim
19591f13597dSJung-uk Kim	mov	120($context),%rax	# pull context->Rax
19601f13597dSJung-uk Kim	mov	248($context),%rbx	# pull context->Rip
19611f13597dSJung-uk Kim
19621f13597dSJung-uk Kim	mov	8($disp),%rsi		# disp->ImageBase
19631f13597dSJung-uk Kim	mov	56($disp),%r11		# disp->HandlerData
19641f13597dSJung-uk Kim
19651f13597dSJung-uk Kim	mov	0(%r11),%r10d		# HandlerData[0]
19661f13597dSJung-uk Kim	lea	(%rsi,%r10),%r10	# prologue label
19671f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip<prologue label
19681f13597dSJung-uk Kim	jb	.Lcommon_seh_tail
19691f13597dSJung-uk Kim
1970e71b7053SJung-uk Kim	mov	208($context),%rax	# pull context->R11
19711f13597dSJung-uk Kim
19721f13597dSJung-uk Kim	mov	4(%r11),%r10d		# HandlerData[1]
19731f13597dSJung-uk Kim	lea	(%rsi,%r10),%r10	# epilogue label
19741f13597dSJung-uk Kim	cmp	%r10,%rbx		# context->Rip>=epilogue label
19751f13597dSJung-uk Kim	jae	.Lcommon_seh_tail
19761f13597dSJung-uk Kim
19777bded2dbSJung-uk Kim	lea	-40-6*16(%rax),%rsi
19781f13597dSJung-uk Kim	lea	512($context),%rdi	# &context.Xmm6
19797bded2dbSJung-uk Kim	mov	\$12,%ecx
19801f13597dSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
19811f13597dSJung-uk Kim
19821f13597dSJung-uk Kim	mov	-8(%rax),%rbx
19831f13597dSJung-uk Kim	mov	-16(%rax),%rbp
19841f13597dSJung-uk Kim	mov	-24(%rax),%r12
19857bded2dbSJung-uk Kim	mov	-32(%rax),%r13
19867bded2dbSJung-uk Kim	mov	-40(%rax),%r14
19871f13597dSJung-uk Kim	mov	%rbx,144($context)	# restore context->Rbx
19881f13597dSJung-uk Kim	mov	%rbp,160($context)	# restore context->Rbp
1989e71b7053SJung-uk Kim	mov	%r12,216($context)	# restore context->R12
1990e71b7053SJung-uk Kim	mov	%r13,224($context)	# restore context->R13
1991e71b7053SJung-uk Kim	mov	%r14,232($context)	# restore context->R14
19921f13597dSJung-uk Kim
19931f13597dSJung-uk Kim.Lcommon_seh_tail:
19941f13597dSJung-uk Kim	mov	8(%rax),%rdi
19951f13597dSJung-uk Kim	mov	16(%rax),%rsi
19961f13597dSJung-uk Kim	mov	%rax,152($context)	# restore context->Rsp
19971f13597dSJung-uk Kim	mov	%rsi,168($context)	# restore context->Rsi
19981f13597dSJung-uk Kim	mov	%rdi,176($context)	# restore context->Rdi
19991f13597dSJung-uk Kim
20001f13597dSJung-uk Kim	mov	40($disp),%rdi		# disp->ContextRecord
20011f13597dSJung-uk Kim	mov	$context,%rsi		# context
20021f13597dSJung-uk Kim	mov	\$154,%ecx		# sizeof(CONTEXT)
20031f13597dSJung-uk Kim	.long	0xa548f3fc		# cld; rep movsq
20041f13597dSJung-uk Kim
20051f13597dSJung-uk Kim	mov	$disp,%rsi
20061f13597dSJung-uk Kim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
20071f13597dSJung-uk Kim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
20081f13597dSJung-uk Kim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
20091f13597dSJung-uk Kim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
20101f13597dSJung-uk Kim	mov	40(%rsi),%r10		# disp->ContextRecord
20111f13597dSJung-uk Kim	lea	56(%rsi),%r11		# &disp->HandlerData
20121f13597dSJung-uk Kim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
20131f13597dSJung-uk Kim	mov	%r10,32(%rsp)		# arg5
20141f13597dSJung-uk Kim	mov	%r11,40(%rsp)		# arg6
20151f13597dSJung-uk Kim	mov	%r12,48(%rsp)		# arg7
20161f13597dSJung-uk Kim	mov	%rcx,56(%rsp)		# arg8, (NULL)
20171f13597dSJung-uk Kim	call	*__imp_RtlVirtualUnwind(%rip)
20181f13597dSJung-uk Kim
20191f13597dSJung-uk Kim	mov	\$1,%eax		# ExceptionContinueSearch
20201f13597dSJung-uk Kim	add	\$64,%rsp
20211f13597dSJung-uk Kim	popfq
20221f13597dSJung-uk Kim	pop	%r15
20231f13597dSJung-uk Kim	pop	%r14
20241f13597dSJung-uk Kim	pop	%r13
20251f13597dSJung-uk Kim	pop	%r12
20261f13597dSJung-uk Kim	pop	%rbp
20271f13597dSJung-uk Kim	pop	%rbx
20281f13597dSJung-uk Kim	pop	%rdi
20291f13597dSJung-uk Kim	pop	%rsi
20301f13597dSJung-uk Kim	ret
20311f13597dSJung-uk Kim.size	ssse3_handler,.-ssse3_handler
20321f13597dSJung-uk Kim
20331f13597dSJung-uk Kim.section	.pdata
20341f13597dSJung-uk Kim.align	4
20351f13597dSJung-uk Kim	.rva	.LSEH_begin_sha1_block_data_order
20361f13597dSJung-uk Kim	.rva	.LSEH_end_sha1_block_data_order
20371f13597dSJung-uk Kim	.rva	.LSEH_info_sha1_block_data_order
20387bded2dbSJung-uk Kim___
20397bded2dbSJung-uk Kim$code.=<<___ if ($shaext);
20407bded2dbSJung-uk Kim	.rva	.LSEH_begin_sha1_block_data_order_shaext
20417bded2dbSJung-uk Kim	.rva	.LSEH_end_sha1_block_data_order_shaext
20427bded2dbSJung-uk Kim	.rva	.LSEH_info_sha1_block_data_order_shaext
20437bded2dbSJung-uk Kim___
20447bded2dbSJung-uk Kim$code.=<<___;
20451f13597dSJung-uk Kim	.rva	.LSEH_begin_sha1_block_data_order_ssse3
20461f13597dSJung-uk Kim	.rva	.LSEH_end_sha1_block_data_order_ssse3
20471f13597dSJung-uk Kim	.rva	.LSEH_info_sha1_block_data_order_ssse3
20481f13597dSJung-uk Kim___
20491f13597dSJung-uk Kim$code.=<<___ if ($avx);
20501f13597dSJung-uk Kim	.rva	.LSEH_begin_sha1_block_data_order_avx
20511f13597dSJung-uk Kim	.rva	.LSEH_end_sha1_block_data_order_avx
20521f13597dSJung-uk Kim	.rva	.LSEH_info_sha1_block_data_order_avx
20531f13597dSJung-uk Kim___
20547bded2dbSJung-uk Kim$code.=<<___ if ($avx>1);
20557bded2dbSJung-uk Kim	.rva	.LSEH_begin_sha1_block_data_order_avx2
20567bded2dbSJung-uk Kim	.rva	.LSEH_end_sha1_block_data_order_avx2
20577bded2dbSJung-uk Kim	.rva	.LSEH_info_sha1_block_data_order_avx2
20587bded2dbSJung-uk Kim___
20591f13597dSJung-uk Kim$code.=<<___;
20601f13597dSJung-uk Kim.section	.xdata
20611f13597dSJung-uk Kim.align	8
20621f13597dSJung-uk Kim.LSEH_info_sha1_block_data_order:
20631f13597dSJung-uk Kim	.byte	9,0,0,0
20641f13597dSJung-uk Kim	.rva	se_handler
20657bded2dbSJung-uk Kim___
20667bded2dbSJung-uk Kim$code.=<<___ if ($shaext);
20677bded2dbSJung-uk Kim.LSEH_info_sha1_block_data_order_shaext:
20687bded2dbSJung-uk Kim	.byte	9,0,0,0
20697bded2dbSJung-uk Kim	.rva	shaext_handler
20707bded2dbSJung-uk Kim___
20717bded2dbSJung-uk Kim$code.=<<___;
20721f13597dSJung-uk Kim.LSEH_info_sha1_block_data_order_ssse3:
20731f13597dSJung-uk Kim	.byte	9,0,0,0
20741f13597dSJung-uk Kim	.rva	ssse3_handler
20751f13597dSJung-uk Kim	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
20761f13597dSJung-uk Kim___
20771f13597dSJung-uk Kim$code.=<<___ if ($avx);
20781f13597dSJung-uk Kim.LSEH_info_sha1_block_data_order_avx:
20791f13597dSJung-uk Kim	.byte	9,0,0,0
20801f13597dSJung-uk Kim	.rva	ssse3_handler
20811f13597dSJung-uk Kim	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
20821f13597dSJung-uk Kim___
20837bded2dbSJung-uk Kim$code.=<<___ if ($avx>1);
20847bded2dbSJung-uk Kim.LSEH_info_sha1_block_data_order_avx2:
20857bded2dbSJung-uk Kim	.byte	9,0,0,0
20867bded2dbSJung-uk Kim	.rva	ssse3_handler
20877bded2dbSJung-uk Kim	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
20887bded2dbSJung-uk Kim___
20891f13597dSJung-uk Kim}
20901f13597dSJung-uk Kim
2091db522d3aSSimon L. B. Nielsen####################################################################
2092db522d3aSSimon L. B. Nielsen
# sha1rnds4($operands)
#
# Hand-assemble a register-to-register `sha1rnds4` instruction so that the
# generated code does not depend on the assembler knowing the mnemonic.
#
# $operands is the AT&T-syntax operand text that followed the mnemonic,
# i.e. "\$imm,%xmmSRC,%xmmDST" (trailing text such as a comment is allowed).
#
# Returns a ".byte" directive carrying 0x0f,0x3a,0xcc, the ModR/M byte and
# the immediate when both operands are %xmm0..%xmm7; otherwise returns the
# instruction unchanged ("sha1rnds4\t$operands") for the assembler to handle.
sub sha1rnds4 {
    my ($operands) = @_;

    # The (?![0-9]) guard keeps two-digit registers such as %xmm12 from being
    # matched on their first digit and silently encoded as %xmm1.
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])(?![0-9])/) {
        my ($imm, $src, $dst) = ($1, $2, $3);

        my @opcode = (0x0f, 0x3a, 0xcc);
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);   # ModR/M
        # Immediates with a leading 0 (0x.., 0b.., octal) go through oct();
        # anything else is emitted as written.
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;

        return ".byte\t" . join(',', @opcode);
    }

    return "sha1rnds4\t" . $operands;
}
21047bded2dbSJung-uk Kim
# sha1op38($instr, $operands)
#
# Hand-assemble the two-operand SHA-1 instructions that live in the 0x0f,0x38
# opcode map (sha1nexte, sha1msg1, sha1msg2) when both operands are XMM
# registers.
#
# $instr    - mnemonic as found in the generated code.
# $operands - AT&T-syntax operand text, "%xmmSRC,%xmmDST".
#
# Returns a ".byte" directive (optional REX prefix, 0x0f,0x38, the opcode
# byte and ModR/M) for a known mnemonic with %xmm0..%xmm15 operands;
# otherwise returns "$instr\t$operands" so the line reaches the assembler
# essentially untouched.
sub sha1op38 {
    my ($instr, $operands) = @_;
    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );

    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($rm, $reg) = ($1, $2);

        # REX can only extend the register number by one bit; anything above
        # 15 would be masked into a different register, so leave it alone.
        if ($rm < 16 && $reg < 16) {
            my @opcode = (0x0f, 0x38);
            my $rex = 0;
            $rex |= 0x04 if ($reg >= 8);                # REX.R extends ModR/M.reg
            $rex |= 0x01 if ($rm >= 8);                 # REX.B extends ModR/M.rm
            unshift @opcode, 0x40 | $rex if ($rex);
            push @opcode, $opcodelet{$instr};
            push @opcode, 0xc0 | ($rm & 7) | (($reg & 7) << 3);    # ModR/M
            return ".byte\t" . join(',', @opcode);
        }
    }

    return $instr . "\t" . $operands;
}
21257bded2dbSJung-uk Kim
# Post-process the accumulated $code one line at a time and print the result.
foreach (split("\n",$code)) {
	# Replace every `...` span with the value of its contents evaluated
	# as a Perl expression.
	s/\`([^\`]*)\`/eval $1/ge;

	# Run SHA-1 extension mnemonics through the hand encoders: sha1rnds4
	# has its own, and any other sha1* token followed by whitespace is
	# handed to sha1op38, which passes unknown mnemonics through.
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge;

	print $_,"\n";
}
# Check close so a failure to flush the output is reported, not lost.
close STDOUT or die "error closing STDOUT: $!";
2135