#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind the
# 32-bit assembler, which originally made it hard to motivate the
# effort. There was a suggestion to mechanically translate the 32-bit
# code, but I dismissed it, reasoning that x86_64 offers enough
# register bank capacity to fully utilize SHA-1 parallelism. Therefore
# this fresh implementation:-) However! While the 64-bit code does
# perform better on Opteron, I failed to beat the 32-bit assembler on
# an EM64T core. Well, x86_64 does offer a larger *addressable* bank,
# but the out-of-order core reaches for even more registers through
# dynamic aliasing, and the EM64T core must have managed to run-time
# optimize even 32-bit code just as well as the 64-bit code. The
# performance improvement is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between the instructions producing input for a 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification (Xupdate in OpenSSL source), to the SIMD unit. See the
# sha1-586.pl module for background and implementation details. The
# only difference from the 32-bit code is that the 64-bit code doesn't
# have to spill @X[] elements to free temporary registers.
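#
# For reference, the schedule being offloaded is, in scalar form,
# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1 for t = 16..79.
# A minimal Perl sketch of that recurrence (illustrative only, not
# used by this module; assumes @W already holds words 0..t-1):
#
#	sub Wt_ref {
#	    my @W = @_;
#	    my $w = $W[-3] ^ $W[-8] ^ $W[-14] ^ $W[-16];
#	    return (($w << 1) | ($w >> 31)) & 0xffffffff;	# <<< 1
#	}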

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
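
# Typical invocation (illustrative; the exact flavour is dictated by
# the build system's perlasm scheme, e.g. elf, macosx, mingw64, nasm,
# masm):
#
#	perl sha1-x86_64.pl elf > sha1-x86_64.s
# or, with the output file passed explicitly:
#	perl sha1-x86_64.pl nasm sha1-x86_64.asm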

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}
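
# A note on rounds 0..19: the F function is Ch(b,c,d) = (b&c)|(~b&d),
# and BODY_00_19 evaluates it with a single temporary via the
# equivalent form ((c^d)&b)^d, which is exactly the xor/and/xor
# sequence performed on $t0 above.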

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}
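
# Rounds 20..39 and 60..79 share the same F function, Parity(b,c,d) =
# b^c^d, and differ only in the round constant, hence the single
# BODY_20_39 with $K selected on $i<40 (0x6ed9eba1 vs. 0xca62c1d6).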

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));
}
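
# Rounds 40..59 use Maj(b,c,d) = (b&c)|(b&d)|(c&d). BODY_40_59 splits
# it as (c&d) + (b&(c^d)): the two terms are bitwise disjoint, so
# adding each into $e separately (the add $t0,$e / add $t1,$e pair
# above) yields the same result as adding Maj once.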

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r8
	mov	4(%r8),%r8d
	bt	\$9,%r8d
	jnc	.Lialu
___
$code.=<<___ if ($avx);
	bt	\$28,%r8d
	jc	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
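
# For example, the &psrldq(@Tx[0],4) call below expands to the line
# "psrldq	$4,%xmm8": the last Perl argument becomes the first
# (AT&T-order) operand, and bare numbers are turned into immediates.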

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
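
# Each Xupdate_ssse3_16_31 call computes four schedule words at once.
# The fourth word's "X[-3]" term is W[t], produced in lane 0 of the
# very same vector, so it is initially left out (the psrldq above
# yields only three dwords); since rol(x^y,1) == rol(x,1)^rol(y,1),
# the pslldq/psrld/pslld/pxor tail then patches lane 3 with the
# missing contribution, i.e. the pre-rotation lane 0 rotated by 2.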

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
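
# The body_* routines above do not emit code directly: each returns a
# list of Perl snippets, one per instruction of an integer round. The
# Xupdate_*/Xloop_* helpers eval those snippets one at a time between
# their own SIMD instructions, which is how four integer rounds get
# interleaved with each vector message-schedule update below.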
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
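
# Note: in this code path rol/ror are emitted as shld/shrd with the
# destination register passed twice by the thunks above; a
# double-precision shift of a register with itself is equivalent to
# the corresponding rotate (e.g. "shld $5,%eax,%eax" acts as
# "rol $5,%eax").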

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroupper

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	64(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

####################################################################

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;