#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code,
# but I dismissed it, reasoning that x86_64 offers enough register
# bank capacity to fully utilize SHA-1 parallelism. Therefore this
# fresh implementation:-) However! While 64-bit code does perform
# better on Opteron, I failed to beat 32-bit assembler on EM64T core.
# Well, x86_64 does offer a larger *addressable* bank, but the
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as the 64-bit one. The performance
# improvement is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between the instructions producing input to an 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in OpenSSL source, to the SIMD unit. See
# the sha1-586.pl module for background and implementation details.
# The only difference from 32-bit code is that 64-bit code doesn't
# have to spill @X[] elements to free temporary registers.
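#
# For reference, the schedule being offloaded is the recurrence from
# the NIST specification, W[t] = (W[t-3]^W[t-8]^W[t-14]^W[t-16])<<<1
# for t=16..79, evaluated four elements at a time in %xmm registers.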

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading a pair of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance improvement
# until a crucial hint regarding the number of Xupdate iterations to
# pre-compute in advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
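
# Note that $avx ends up 0, 1 or 2: 0 disables the AVX code paths
# altogether, 1 enables the AVX path, and 2 additionally enables the
# AVX2+BMI path guarded by ($avx>1) below.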

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);
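
# For reference, the bodies below implement the per-round logic from
# the NIST specification: rounds 0-19 use Ch(b,c,d)=(b&c)|(~b&d),
# computed as ((c^d)&b)^d to save an instruction; rounds 20-39 and
# 60-79 use Parity(b,c,d)=b^c^d; rounds 40-59 use Maj(b,c,d), computed
# as (c&d)+(b&(c^d)) - the two terms are bitwise disjoint, so they can
# be accumulated into $e separately.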

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
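
# For example, &psrldq(@Tx[0],4) with @Tx[0] being "%xmm8" appends the
# line "\tpsrldq\t\$4,%xmm8\n" to $code: the last argument is popped,
# prefixed with '$' when it is numeric, and the remaining operands are
# joined in reverse (AT&T) order.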

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
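
# Note: in rounds 16-31 the input W[t-3] overlaps the very vector being
# computed, which is why the last element is synthesized through the
# pslldq/psrld sequence above. From round 32 onwards the equivalent
# recurrence W[t]=(W[t-6]^W[t-16]^W[t-28]^W[t-32])<<<2 is used instead,
# so Xupdate_ssse3_32_79 below can operate on whole vectors.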

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
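
# The body_* routines above return lists of instruction strings rather
# than emitting code directly; the Xupdate_* routines interleave them
# with the SIMD message-schedule updates by eval()ing one string per
# slot, hiding SIMD latency behind the scalar round computation.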
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
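
# Note: rorx, being a BMI2 instruction with an explicit destination,
# rotates without disturbing either the source register or the flags,
# which is what lets b>>>2 for the next round be computed off the
# critical paths quoted in the comments above.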

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
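
# For example, "sha1rnds4	$3,%xmm1,%xmm0" is rewritten by the loop
# below as ".byte	0x0f,0x3a,0xcc,0xc1,3", the manual encoding for
# assemblers that do not know the SHA extension mnemonics yet.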

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
