#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler-generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module, I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on the stack, but
# access to it is scheduled for L2 latency and staged through the 32
# least significant bits of %l0-%l7. The latter is done to achieve
# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
# which is pretty good [the optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's no faster than the 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer severe decay when
# running 4x as many threads as there are physical cores, and that it
# leaves gcc [3.4] behind by over a 4x factor! Compared to SHA256,
# single-thread performance is only 10% better, but overall throughput
# at the maximum number of threads for a given CPU exceeds the
# corresponding SHA256 figure by 30% [again, the optimal coefficient
# is 50%].
#
# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#	in-order, i.e. a load instruction has to complete before the
#	next instruction in the given thread is executed, even if the
#	latter does not depend on the load result! This means that on
#	T1 two 32-bit loads are always slower than one 64-bit load.
#	Once again this is unlike pre-T1 UltraSPARC, where, if
#	scheduled appropriately, 2x32-bit loads can be as fast as
#	1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. A multi-process benchmark
# saturates at 11.5x the single-process result on an 8-core processor,
# or ~11/16GBps per 2.85GHz socket.

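# For reference, the generated code implements the standard FIPS 180-4
# round; roughly, in Perl-ish pseudo-code (illustrative only, not part
# of this module):
#
#	$T1 = $h + Sigma1($e) + Ch($e,$f,$g) + $K[$i] + $X[$i];
#	$T2 = Sigma0($a) + Maj($a,$b,$c);
#	$d += $T1;
#	$h  = $T1 + $T2;
#
# with all additions modulo 2^32 (SHA256) or 2^64 (SHA512).
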
$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";	# round's T1 accumulator
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";	# 1st arg: context
$inp="%i1";	# 2nd arg: input
$len="%i2";	# 3rd arg: number of blocks
$Ktbl="%i3";	# K table pointer
$tmp31="%i4";	# input misalignment in bits
$tmp32="%i5";	# complementary shift count
########### SHA256
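# The SHA256 $Xload fetches the 64-byte input block with eight 64-bit
# ldx instructions, packing two message words per register, and on
# misaligned input realigns the data by shifting adjacent registers
# together before falling through to .Laligned.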
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
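# The SHA512 $Xload loads the input as 32-bit halves into %l0-%l7 (to
# preserve 32-/64-bit ABI duality), reassembles each 64-bit X[i] with
# sllx/srlx/or, and spills it to the stack frame for the later rounds.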
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
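# BODY_00_15 emits one round: T1 accumulates h+X[i]+Ch(e,f,g)+
# Sigma1(e)+K[i], h is recomputed as Sigma0(a)+Maj(a,b,c)+T1, and d is
# advanced by T1. SPARCv9 has no rotate instruction, so each rotation
# is synthesized as an $SRL/$SLL pair whose non-overlapping results
# are xor-ed together.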
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
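# SHA256 message schedule: X[i] = sigma1(X[i-2]) + X[i-7] +
# sigma0(X[i-15]) + X[i-16]. Two 32-bit words share each 64-bit
# register, so odd and even rounds pick their operands from the lower
# and upper halves respectively and merge the updated word back in.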
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
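# SHA512 message schedule, the same recurrence over 64-bit words:
# operands are reassembled from the 32-bit halves staged in %l0-%l7,
# and the updated X[i] is written back to its stack slot.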
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	 ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	 std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	 ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	 st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
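# At this point $inp has been rounded down to $align, $tmp31 holds the
# original misalignment in bits, and $len has been converted from a
# block count into the end-of-input pointer tested at the bottom of
# .Lloop.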
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
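# Round variables rotate through @V rather than being moved between
# registers: after each round the register that held h takes over the
# role of a, and every other variable shifts down one position.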
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK		! 12 LSBs of K[i] identify the last round
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS capable.
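# VIS instructions use the IMPDEP1 encoding: op=2 in bits 31-30, rd in
# bits 29-25, op3=0x36 in bits 24-19, rs1 in bits 18-14, the VIS opf in
# bits 13-5 and rs2 in bits 4-0 -- hence the
# 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2 template below.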
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
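# Illustrative example, assuming opf 0x048 for faligndata as in the
# table above: "faligndata %f0,%f2,%f4" is emitted as
# ".word 0x89b00902".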
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
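# alignaddr operates on integer registers, so %bias maps %g/%o/%l/%i
# register names onto their 0-31 encoding; 0x81b00300 is the same
# IMPDEP1 template with opf 0x018 (alignaddr) pre-shifted into place.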

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";