1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer a larger *addressable* bank, but an out-of-order
# core reaches for even more registers through dynamic aliasing, and
# the EM64T core must have managed to run-time optimize even 32-bit
# code just as well as the 64-bit one. Performance improvement is
# summarized in the following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
# Command line: [flavour] [output].  A single argument containing a dot
# is treated as the output file name rather than a flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# The Win64 SEH handler and tables are emitted only for nasm/masm/mingw64
# flavours or when the output file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm.
# $dir is explicitly empty when $0 carries no directory component, rather
# than depending on whatever $1 happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The interpreter, translator and output paths are quoted so
# that paths containing spaces survive the shell, and a failure to start
# the pipe is fatal instead of silently discarding the generated code.
# The output argument is only appended when one was given, so the
# translator sees the same argument list as before.
my $xlate_cmd="\"$^X\" \"$xlate\" $flavour";
$xlate_cmd.=" \"$output\"" if (defined($output) && $output ne "");
open STDOUT,"| $xlate_cmd" or die "can't call $xlate: $!";
51
# Registers in which the three arguments arrive (the function prologue
# copies them out of %rdi, %rsi and %rdx).
($ctx,$inp,$num)=("%rdi","%rsi","%rdx");	# 1st, 2nd, 3rd arg

# reassign arguments in order to produce more compact code
($ctx,$inp,$num)=("%r8","%r9","%r10");

# Temporaries used inside the round bodies.
($t0,$t1,$t2)=("%eax","%ebx","%ecx");

# Two registers that alternate between "current" and "next" message
# word; every BODY_* call rotates this list by one.
@xi=("%edx","%ebp");

# The five 32-bit working variables.  @V is rotated after every round
# by the driver loops, so the round bodies never move data between them.
($A,$B,$C,$D,$E)=("%esi","%edi","%r11d","%r12d","%r13d");

@V=($A,$B,$C,$D,$E);
72
# Append the assembly for one round with constant 0x5a827999 to $code.
#
# Arguments: round index $i followed by the register names currently
# playing the roles a, b, c, d, e.  The emitted code adds
# rol(a,5) + (((c^d)&b)^d) + 0x5a827999 + current word to e and rotates
# b left by 30.
#
# While doing so it also prepares the word for round $i+1 in $xi[1]:
# for $i<15 it is loaded from the input block, byte-swapped and stored
# in the 16-word buffer at (%rsp); from $i=15 on it is the XOR of the
# buffer slots j, j+2, j+8 and j+13 (mod 16), rotated left by one and
# written back to slot j.  Round 0 additionally loads its own word.
#
# Finally @xi is rotated so that the word just prepared becomes $xi[0]
# for the next call.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

if ($i==0) {
$code.=<<___;
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
}

if ($i<15) {
$code.=<<___;
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
} else {
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
}

# rotate the word registers: last element moves to the front
my $tail=pop(@xi);
unshift(@xi,$tail);
}
116
# Append the assembly for one "parity" round (b^c^d) to $code.  The
# driver uses it for rounds 20..39 and again for rounds 60..79.
#
# Arguments: round index $i followed by the register names currently
# playing the roles a, b, c, d, e.  The round constant is 0x6ed9eba1
# for $i<40 and 0xca62c1d6 otherwise.
#
# For every round but the last, the word for round $i+1 is computed in
# $xi[1] from buffer slots j, j+2, j+8 and j+13 (mod 16) and rotated
# left by one.  It is written back to the buffer only while $i<76: the
# nearest later read of a slot is three rounds ahead, so words produced
# from round 76 on are never read from the stack again.  Round 79 emits
# just the state update, as there is no further word to prepare.
#
# Finally @xi is rotated so that the word just prepared becomes $xi[0]
# for the next call.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=0xca62c1d6;
$K=0x6ed9eba1 if ($i<40);

if ($i<79) {
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
	if ($i<76) {
$code.=<<___;
	mov	$xi[1],`4*($j%16)`(%rsp)
___
	}
} elsif ($i==79) {
$code.=<<___;
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
}

# rotate the word registers: last element moves to the front
my $tail=pop(@xi);
unshift(@xi,$tail);
}
153
# Append the assembly for one round with constant 0x8f1bbcdc to $code.
#
# Arguments: round index $i followed by the register names currently
# playing the roles a, b, c, d, e.  The emitted code adds
# rol(a,5) + (c&d) + ((c^d)&b) + 0x8f1bbcdc + current word to e and
# rotates b left by 30.  The two boolean terms never have a bit in
# common, so adding them separately yields the majority of b, c and d.
#
# The word for round $i+1 is computed in $xi[1] from buffer slots
# j, j+2, j+8 and j+13 (mod 16), rotated left by one and stored back
# in slot j.
#
# Finally @xi is rotated so that the word just prepared becomes $xi[0]
# for the next call.
sub BODY_40_59 {
my $i=shift;
my ($a,$b,$c,$d,$e)=@_;
my $j=$i+1;

my $round=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
$code.=$round;

# rotate the word registers: last element moves to the front
my $tail=pop(@xi);
unshift(@xi,$tail);
}
179
# Function prologue.  Saves %rbx, %rbp, %r12 and %r13, remembers the
# resulting stack pointer in %r11, carves out 8+16*4 bytes, aligns %rsp
# down to 64 bytes and parks the remembered pointer at 16*4(%rsp), just
# above the 16-word message buffer.  The arguments are moved to their
# reassigned registers and the five state words are loaded from the
# context.
$code.=<<___;
.text

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E

.align	4
.Lloop:
___

# Unroll all 80 rounds.  Each table entry gives the first round index
# NOT handled by the generator next to it; $i keeps counting across
# entries.  After every round the register roles in @V rotate by one
# position (last element moves to the front).
$i=0;
foreach my $stage ([20,\&BODY_00_19],
		   [40,\&BODY_20_39],
		   [60,\&BODY_40_59],
		   [80,\&BODY_20_39]) {
	my ($limit,$emit)=@$stage;
	while ($i<$limit) {
		$emit->($i,@V);
		unshift(@V,pop(@V));
		$i++;
	}
}

# Fold the working variables back into the context, advance the input
# pointer by one 64-byte block and loop while the block counter is not
# zero.  The epilogue reloads the pointer saved at 16*4(%rsp), restores
# the four pushed registers through it and resets %rsp.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order

.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
242
# Win64 only: structured-exception handler plus the .pdata/.xdata
# records that attach it to sha1_block_data_order.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# If the faulting Rip lies between .Lprologue and .Lepilogue, the handler
# fetches the stack pointer stored at 16*4(%rsp), and from it the saved
# %rbx, %rbp, %r12 and %r13, and writes them into the CONTEXT record.
# In every case it then restores Rsp, Rsi and Rdi in the record, copies
# the record to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order

.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
}
345
346####################################################################
347
# Replace every backtick-quoted arithmetic expression left in the
# generated text (e.g. `16*4`) with the result of evaluating it.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into the translator.  Buffered write errors and a
# non-zero exit status of the translator only surface when the pipe is
# closed, so a failed close has to abort the script; otherwise a
# truncated or missing output file would go unnoticed by the build.
close STDOUT
	or die "error closing STDOUT: ".($! ? $! : "child exited with status $?");
351