1#! /usr/bin/env perl
2# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15#
16# Hardware SPARC T4 support by David S. Miller
17# ====================================================================
18
19# SHA256 performance improvement over compiler generated code varies
20# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
21# build]. Just like in SHA1 module I aim to ensure scalability on
22# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
23
24# SHA512 on pre-T1 UltraSPARC.
25#
26# Performance is >75% better than 64-bit code generated by Sun C and
27# over 2x than 32-bit code. X[16] resides on stack, but access to it
28# is scheduled for L2 latency and staged through 32 least significant
29# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
30# duality. Nevetheless it's ~40% faster than SHA256, which is pretty
31# good [optimal coefficient is 50%].
32#
33# SHA512 on UltraSPARC T1.
34#
35# It's not any faster than 64-bit code generated by Sun C 5.8. This is
36# because 64-bit code generator has the advantage of using 64-bit
37# loads(*) to access X[16], which I consciously traded for 32-/64-bit
38# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
39# code by 60%, not to mention that it doesn't suffer from severe decay
40# when running 4 times physical cores threads and that it leaves gcc
41# [3.4] behind by over 4x factor! If compared to SHA256, single thread
42# performance is only 10% better, but overall throughput for maximum
43# amount of threads for given CPU exceeds corresponding one of SHA256
44# by 30% [again, optimal coefficient is 50%].
45#
46# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
47#	in-order, i.e. load instruction has to complete prior next
48#	instruction in given thread is executed, even if the latter is
49#	not dependent on load result! This means that on T1 two 32-bit
50#	loads are always slower than one 64-bit load. Once again this
51#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
52#	2x32-bit loads can be as fast as 1x64-bit ones.
53#
54# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
55# which is 9.3x/11.1x faster than software. Multi-process benchmark
56# saturates at 11.5x single-process result on 8-core processor, or
57# ~11/16GBps per 2.85GHz socket.
58
59$output=pop;
60open STDOUT,">$output";
61
62if ($output =~ /512/) {
63	$label="512";
64	$SZ=8;
65	$LD="ldx";		# load from memory
66	$ST="stx";		# store to memory
67	$SLL="sllx";		# shift left logical
68	$SRL="srlx";		# shift right logical
69	@Sigma0=(28,34,39);
70	@Sigma1=(14,18,41);
71	@sigma0=( 7, 1, 8);	# right shift first
72	@sigma1=( 6,19,61);	# right shift first
73	$lastK=0x817;
74	$rounds=80;
75	$align=4;
76
77	$locals=16*$SZ;		# X[16]
78
79	$A="%o0";
80	$B="%o1";
81	$C="%o2";
82	$D="%o3";
83	$E="%o4";
84	$F="%o5";
85	$G="%g1";
86	$H="%o7";
87	@V=($A,$B,$C,$D,$E,$F,$G,$H);
88} else {
89	$label="256";
90	$SZ=4;
91	$LD="ld";		# load from memory
92	$ST="st";		# store to memory
93	$SLL="sll";		# shift left logical
94	$SRL="srl";		# shift right logical
95	@Sigma0=( 2,13,22);
96	@Sigma1=( 6,11,25);
97	@sigma0=( 3, 7,18);	# right shift first
98	@sigma1=(10,17,19);	# right shift first
99	$lastK=0x8f2;
100	$rounds=64;
101	$align=8;
102
103	$locals=0;		# X[16] is register resident
104	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
105
106	$A="%l0";
107	$B="%l1";
108	$C="%l2";
109	$D="%l3";
110	$E="%l4";
111	$F="%l5";
112	$G="%l6";
113	$H="%l7";
114	@V=($A,$B,$C,$D,$E,$F,$G,$H);
115}
116$T1="%g2";
117$tmp0="%g3";
118$tmp1="%g4";
119$tmp2="%g5";
120
121$ctx="%i0";
122$inp="%i1";
123$len="%i2";
124$Ktbl="%i3";
125$tmp31="%i4";
126$tmp32="%i5";
127
128########### SHA256
129$Xload = sub {
130my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
131
132    if ($i==0) {
133$code.=<<___;
134	ldx	[$inp+0],@X[0]
135	ldx	[$inp+16],@X[2]
136	ldx	[$inp+32],@X[4]
137	ldx	[$inp+48],@X[6]
138	ldx	[$inp+8],@X[1]
139	ldx	[$inp+24],@X[3]
140	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
141	ldx	[$inp+40],@X[5]
142	bz,pt	%icc,.Laligned
143	ldx	[$inp+56],@X[7]
144
145	sllx	@X[0],$tmp31,@X[0]
146	ldx	[$inp+64],$T1
147___
148for($j=0;$j<7;$j++)
149{   $code.=<<___;
150	srlx	@X[$j+1],$tmp32,$tmp1
151	sllx	@X[$j+1],$tmp31,@X[$j+1]
152	or	$tmp1,@X[$j],@X[$j]
153___
154}
155$code.=<<___;
156	srlx	$T1,$tmp32,$T1
157	or	$T1,@X[7],@X[7]
158.Laligned:
159___
160    }
161
162    if ($i&1) {
163	$code.="\tadd	@X[$i/2],$h,$T1\n";
164    } else {
165	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
166    }
167} if ($SZ==4);
168
169########### SHA512
170$Xload = sub {
171my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
172my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
173
174$code.=<<___ if ($i==0);
175	ld	[$inp+0],%l0
176	ld	[$inp+4],%l1
177	ld	[$inp+8],%l2
178	ld	[$inp+12],%l3
179	ld	[$inp+16],%l4
180	ld	[$inp+20],%l5
181	ld	[$inp+24],%l6
182	cmp	$tmp31,0
183	ld	[$inp+28],%l7
184___
185$code.=<<___ if ($i<15);
186	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
187	add	$tmp31,32,$tmp0
188	sllx	@pair[0],$tmp0,$tmp1
189	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
190	srlx	@pair[2],$tmp32,@pair[1]
191	or	$tmp1,$tmp2,$tmp2
192	or	@pair[1],$tmp2,$tmp2
193	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
194	add	$h,$tmp2,$T1
195	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
196___
197$code.=<<___ if ($i==12);
198	bnz,a,pn	%icc,.+8
199	ld	[$inp+128],%l0
200___
201$code.=<<___ if ($i==15);
202	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
203	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
204	add	$tmp31,32,$tmp0
205	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
206	sllx	@pair[0],$tmp0,$tmp1
207	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
208	srlx	@pair[2],$tmp32,@pair[1]
209	or	$tmp1,$tmp2,$tmp2
210	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
211	or	@pair[1],$tmp2,$tmp2
212	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
213	add	$h,$tmp2,$T1
214	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
215	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
216	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
217	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
218___
219} if ($SZ==8);
220
221########### common
222sub BODY_00_15 {
223my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
224
225    if ($i<16) {
226	&$Xload(@_);
227    } else {
228	$code.="\tadd	$h,$T1,$T1\n";
229    }
230
231$code.=<<___;
232	$SRL	$e,@Sigma1[0],$h	!! $i
233	xor	$f,$g,$tmp2
234	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
235	and	$e,$tmp2,$tmp2
236	$SRL	$e,@Sigma1[1],$tmp0
237	xor	$tmp1,$h,$h
238	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
239	xor	$tmp0,$h,$h
240	$SRL	$e,@Sigma1[2],$tmp0
241	xor	$tmp1,$h,$h
242	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
243	xor	$tmp0,$h,$h
244	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
245	xor	$tmp1,$h,$tmp0		! Sigma1(e)
246
247	$SRL	$a,@Sigma0[0],$h
248	add	$tmp2,$T1,$T1
249	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
250	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
251	add	$tmp0,$T1,$T1
252	$SRL	$a,@Sigma0[1],$tmp0
253	xor	$tmp1,$h,$h
254	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
255	xor	$tmp0,$h,$h
256	$SRL	$a,@Sigma0[2],$tmp0
257	xor	$tmp1,$h,$h
258	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
259	xor	$tmp0,$h,$h
260	xor	$tmp1,$h,$h		! Sigma0(a)
261
262	or	$a,$b,$tmp0
263	and	$a,$b,$tmp1
264	and	$c,$tmp0,$tmp0
265	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
266	add	$tmp2,$T1,$T1		! +=K[$i]
267	add	$tmp1,$h,$h
268
269	add	$T1,$d,$d
270	add	$T1,$h,$h
271___
272}
273
274########### SHA256
275$BODY_16_XX = sub {
276my $i=@_[0];
277my $xi;
278
279    if ($i&1) {
280	$xi=$tmp32;
281	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
282    } else {
283	$xi=@X[(($i+1)/2)%8];
284    }
285$code.=<<___;
286	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
287	sll	$xi,`32-@sigma0[2]`,$tmp1
288	srl	$xi,@sigma0[1],$tmp0
289	xor	$tmp1,$T1,$T1
290	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
291	xor	$tmp0,$T1,$T1
292	srl	$xi,@sigma0[2],$tmp0
293	xor	$tmp1,$T1,$T1
294___
295    if ($i&1) {
296	$xi=@X[(($i+14)/2)%8];
297    } else {
298	$xi=$tmp32;
299	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
300    }
301$code.=<<___;
302	srl	$xi,@sigma1[0],$tmp2
303	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
304	sll	$xi,`32-@sigma1[2]`,$tmp1
305	srl	$xi,@sigma1[1],$tmp0
306	xor	$tmp1,$tmp2,$tmp2
307	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
308	xor	$tmp0,$tmp2,$tmp2
309	srl	$xi,@sigma1[2],$tmp0
310	xor	$tmp1,$tmp2,$tmp2
311___
312    if ($i&1) {
313	$xi=@X[($i/2)%8];
314$code.=<<___;
315	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
316	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
317	srl	@X[($i/2)%8],0,$tmp0
318	add	$tmp2,$tmp1,$tmp1
319	add	$xi,$T1,$T1			! +=X[i]
320	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
321	add	$tmp1,$T1,$T1
322
323	srl	$T1,0,$T1
324	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
325___
326    } else {
327	$xi=@X[(($i+9)/2)%8];
328$code.=<<___;
329	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
330	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
331	add	$xi,$T1,$T1			! +=X[i+9]
332	add	$tmp2,$tmp1,$tmp1
333	srl	@X[($i/2)%8],0,@X[($i/2)%8]
334	add	$tmp1,$T1,$T1
335
336	sllx	$T1,32,$tmp0
337	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
338___
339    }
340    &BODY_00_15(@_);
341} if ($SZ==4);
342
343########### SHA512
344$BODY_16_XX = sub {
345my $i=@_[0];
346my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
347
348$code.=<<___;
349	sllx	%l2,32,$tmp0		!! Xupdate($i)
350	or	%l3,$tmp0,$tmp0
351
352	srlx	$tmp0,@sigma0[0],$T1
353	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
354	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
355	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
356	srlx	$tmp0,@sigma0[1],$tmp0
357	xor	$tmp1,$T1,$T1
358	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
359	xor	$tmp0,$T1,$T1
360	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
361	xor	$tmp1,$T1,$T1
362	sllx	%l6,32,$tmp2
363	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
364	or	%l7,$tmp2,$tmp2
365
366	srlx	$tmp2,@sigma1[0],$tmp1
367	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
368	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
369	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
370	srlx	$tmp2,@sigma1[1],$tmp2
371	xor	$tmp0,$tmp1,$tmp1
372	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
373	xor	$tmp2,$tmp1,$tmp1
374	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
375	xor	$tmp0,$tmp1,$tmp1
376	sllx	%l4,32,$tmp0
377	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
378	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
379	or	%l5,$tmp0,$tmp0
380	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
381
382	sllx	%l0,32,$tmp2
383	add	$tmp1,$T1,$T1
384	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
385	or	%l1,$tmp2,$tmp2
386	add	$tmp0,$T1,$T1		! +=X[$i+9]
387	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
388	add	$tmp2,$T1,$T1		! +=X[$i]
389	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
390___
391    &BODY_00_15(@_);
392} if ($SZ==8);
393
394$code.=<<___;
395#include "sparc_arch.h"
396
397#ifdef __arch64__
398.register	%g2,#scratch
399.register	%g3,#scratch
400#endif
401
402.section	".text",#alloc,#execinstr
403
404.align	64
405K${label}:
406.type	K${label},#object
407___
408if ($SZ==4) {
409$code.=<<___;
410	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
411	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
412	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
413	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
414	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
415	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
416	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
417	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
418	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
419	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
420	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
421	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
422	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
423	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
424	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
425	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
426___
427} else {
428$code.=<<___;
429	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
430	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
431	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
432	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
433	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
434	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
435	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
436	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
437	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
438	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
439	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
440	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
441	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
442	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
443	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
444	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
445	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
446	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
447	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
448	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
449	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
450	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
451	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
452	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
453	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
454	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
455	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
456	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
457	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
458	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
459	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
460	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
461	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
462	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
463	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
464	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
465	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
466	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
467	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
468	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
469___
470}
471$code.=<<___;
472.size	K${label},.-K${label}
473
474#ifdef __PIC__
475SPARC_PIC_THUNK(%g1)
476#endif
477
478.globl	sha${label}_block_data_order
479.align	32
480sha${label}_block_data_order:
481	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
482	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
483
484	andcc	%g1, CFR_SHA${label}, %g0
485	be	.Lsoftware
486	nop
487___
488$code.=<<___ if ($SZ==8); 		# SHA512
489	ldd	[%o0 + 0x00], %f0	! load context
490	ldd	[%o0 + 0x08], %f2
491	ldd	[%o0 + 0x10], %f4
492	ldd	[%o0 + 0x18], %f6
493	ldd	[%o0 + 0x20], %f8
494	ldd	[%o0 + 0x28], %f10
495	andcc	%o1, 0x7, %g0
496	ldd	[%o0 + 0x30], %f12
497	bne,pn	%icc, .Lhwunaligned
498	 ldd	[%o0 + 0x38], %f14
499
500.Lhwaligned_loop:
501	ldd	[%o1 + 0x00], %f16
502	ldd	[%o1 + 0x08], %f18
503	ldd	[%o1 + 0x10], %f20
504	ldd	[%o1 + 0x18], %f22
505	ldd	[%o1 + 0x20], %f24
506	ldd	[%o1 + 0x28], %f26
507	ldd	[%o1 + 0x30], %f28
508	ldd	[%o1 + 0x38], %f30
509	ldd	[%o1 + 0x40], %f32
510	ldd	[%o1 + 0x48], %f34
511	ldd	[%o1 + 0x50], %f36
512	ldd	[%o1 + 0x58], %f38
513	ldd	[%o1 + 0x60], %f40
514	ldd	[%o1 + 0x68], %f42
515	ldd	[%o1 + 0x70], %f44
516	subcc	%o2, 1, %o2		! done yet?
517	ldd	[%o1 + 0x78], %f46
518	add	%o1, 0x80, %o1
519	prefetch [%o1 + 63], 20
520	prefetch [%o1 + 64+63], 20
521
522	.word	0x81b02860		! SHA512
523
524	bne,pt	SIZE_T_CC, .Lhwaligned_loop
525	nop
526
527.Lhwfinish:
528	std	%f0, [%o0 + 0x00]	! store context
529	std	%f2, [%o0 + 0x08]
530	std	%f4, [%o0 + 0x10]
531	std	%f6, [%o0 + 0x18]
532	std	%f8, [%o0 + 0x20]
533	std	%f10, [%o0 + 0x28]
534	std	%f12, [%o0 + 0x30]
535	retl
536	 std	%f14, [%o0 + 0x38]
537
538.align	16
539.Lhwunaligned:
540	alignaddr %o1, %g0, %o1
541
542	ldd	[%o1 + 0x00], %f18
543.Lhwunaligned_loop:
544	ldd	[%o1 + 0x08], %f20
545	ldd	[%o1 + 0x10], %f22
546	ldd	[%o1 + 0x18], %f24
547	ldd	[%o1 + 0x20], %f26
548	ldd	[%o1 + 0x28], %f28
549	ldd	[%o1 + 0x30], %f30
550	ldd	[%o1 + 0x38], %f32
551	ldd	[%o1 + 0x40], %f34
552	ldd	[%o1 + 0x48], %f36
553	ldd	[%o1 + 0x50], %f38
554	ldd	[%o1 + 0x58], %f40
555	ldd	[%o1 + 0x60], %f42
556	ldd	[%o1 + 0x68], %f44
557	ldd	[%o1 + 0x70], %f46
558	ldd	[%o1 + 0x78], %f48
559	subcc	%o2, 1, %o2		! done yet?
560	ldd	[%o1 + 0x80], %f50
561	add	%o1, 0x80, %o1
562	prefetch [%o1 + 63], 20
563	prefetch [%o1 + 64+63], 20
564
565	faligndata %f18, %f20, %f16
566	faligndata %f20, %f22, %f18
567	faligndata %f22, %f24, %f20
568	faligndata %f24, %f26, %f22
569	faligndata %f26, %f28, %f24
570	faligndata %f28, %f30, %f26
571	faligndata %f30, %f32, %f28
572	faligndata %f32, %f34, %f30
573	faligndata %f34, %f36, %f32
574	faligndata %f36, %f38, %f34
575	faligndata %f38, %f40, %f36
576	faligndata %f40, %f42, %f38
577	faligndata %f42, %f44, %f40
578	faligndata %f44, %f46, %f42
579	faligndata %f46, %f48, %f44
580	faligndata %f48, %f50, %f46
581
582	.word	0x81b02860		! SHA512
583
584	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
585	for	%f50, %f50, %f18	! %f18=%f50
586
587	ba	.Lhwfinish
588	nop
589___
590$code.=<<___ if ($SZ==4); 		# SHA256
591	ld	[%o0 + 0x00], %f0
592	ld	[%o0 + 0x04], %f1
593	ld	[%o0 + 0x08], %f2
594	ld	[%o0 + 0x0c], %f3
595	ld	[%o0 + 0x10], %f4
596	ld	[%o0 + 0x14], %f5
597	andcc	%o1, 0x7, %g0
598	ld	[%o0 + 0x18], %f6
599	bne,pn	%icc, .Lhwunaligned
600	 ld	[%o0 + 0x1c], %f7
601
602.Lhwloop:
603	ldd	[%o1 + 0x00], %f8
604	ldd	[%o1 + 0x08], %f10
605	ldd	[%o1 + 0x10], %f12
606	ldd	[%o1 + 0x18], %f14
607	ldd	[%o1 + 0x20], %f16
608	ldd	[%o1 + 0x28], %f18
609	ldd	[%o1 + 0x30], %f20
610	subcc	%o2, 1, %o2		! done yet?
611	ldd	[%o1 + 0x38], %f22
612	add	%o1, 0x40, %o1
613	prefetch [%o1 + 63], 20
614
615	.word	0x81b02840		! SHA256
616
617	bne,pt	SIZE_T_CC, .Lhwloop
618	nop
619
620.Lhwfinish:
621	st	%f0, [%o0 + 0x00]	! store context
622	st	%f1, [%o0 + 0x04]
623	st	%f2, [%o0 + 0x08]
624	st	%f3, [%o0 + 0x0c]
625	st	%f4, [%o0 + 0x10]
626	st	%f5, [%o0 + 0x14]
627	st	%f6, [%o0 + 0x18]
628	retl
629	 st	%f7, [%o0 + 0x1c]
630
631.align	8
632.Lhwunaligned:
633	alignaddr %o1, %g0, %o1
634
635	ldd	[%o1 + 0x00], %f10
636.Lhwunaligned_loop:
637	ldd	[%o1 + 0x08], %f12
638	ldd	[%o1 + 0x10], %f14
639	ldd	[%o1 + 0x18], %f16
640	ldd	[%o1 + 0x20], %f18
641	ldd	[%o1 + 0x28], %f20
642	ldd	[%o1 + 0x30], %f22
643	ldd	[%o1 + 0x38], %f24
644	subcc	%o2, 1, %o2		! done yet?
645	ldd	[%o1 + 0x40], %f26
646	add	%o1, 0x40, %o1
647	prefetch [%o1 + 63], 20
648
649	faligndata %f10, %f12, %f8
650	faligndata %f12, %f14, %f10
651	faligndata %f14, %f16, %f12
652	faligndata %f16, %f18, %f14
653	faligndata %f18, %f20, %f16
654	faligndata %f20, %f22, %f18
655	faligndata %f22, %f24, %f20
656	faligndata %f24, %f26, %f22
657
658	.word	0x81b02840		! SHA256
659
660	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
661	for	%f26, %f26, %f10	! %f10=%f26
662
663	ba	.Lhwfinish
664	nop
665___
666$code.=<<___;
667.align	16
668.Lsoftware:
669	save	%sp,-STACK_FRAME-$locals,%sp
670	and	$inp,`$align-1`,$tmp31
671	sllx	$len,`log(16*$SZ)/log(2)`,$len
672	andn	$inp,`$align-1`,$inp
673	sll	$tmp31,3,$tmp31
674	add	$inp,$len,$len
675___
676$code.=<<___ if ($SZ==8); # SHA512
677	mov	32,$tmp32
678	sub	$tmp32,$tmp31,$tmp32
679___
680$code.=<<___;
681.Lpic:	call	.+8
682	add	%o7,K${label}-.Lpic,$Ktbl
683
684	$LD	[$ctx+`0*$SZ`],$A
685	$LD	[$ctx+`1*$SZ`],$B
686	$LD	[$ctx+`2*$SZ`],$C
687	$LD	[$ctx+`3*$SZ`],$D
688	$LD	[$ctx+`4*$SZ`],$E
689	$LD	[$ctx+`5*$SZ`],$F
690	$LD	[$ctx+`6*$SZ`],$G
691	$LD	[$ctx+`7*$SZ`],$H
692
693.Lloop:
694___
695for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
696$code.=".L16_xx:\n";
697for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
698$code.=<<___;
699	and	$tmp2,0xfff,$tmp2
700	cmp	$tmp2,$lastK
701	bne	.L16_xx
702	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16
703
704___
705$code.=<<___ if ($SZ==4); # SHA256
706	$LD	[$ctx+`0*$SZ`],@X[0]
707	$LD	[$ctx+`1*$SZ`],@X[1]
708	$LD	[$ctx+`2*$SZ`],@X[2]
709	$LD	[$ctx+`3*$SZ`],@X[3]
710	$LD	[$ctx+`4*$SZ`],@X[4]
711	$LD	[$ctx+`5*$SZ`],@X[5]
712	$LD	[$ctx+`6*$SZ`],@X[6]
713	$LD	[$ctx+`7*$SZ`],@X[7]
714
715	add	$A,@X[0],$A
716	$ST	$A,[$ctx+`0*$SZ`]
717	add	$B,@X[1],$B
718	$ST	$B,[$ctx+`1*$SZ`]
719	add	$C,@X[2],$C
720	$ST	$C,[$ctx+`2*$SZ`]
721	add	$D,@X[3],$D
722	$ST	$D,[$ctx+`3*$SZ`]
723	add	$E,@X[4],$E
724	$ST	$E,[$ctx+`4*$SZ`]
725	add	$F,@X[5],$F
726	$ST	$F,[$ctx+`5*$SZ`]
727	add	$G,@X[6],$G
728	$ST	$G,[$ctx+`6*$SZ`]
729	add	$H,@X[7],$H
730	$ST	$H,[$ctx+`7*$SZ`]
731___
732$code.=<<___ if ($SZ==8); # SHA512
733	ld	[$ctx+`0*$SZ+0`],%l0
734	ld	[$ctx+`0*$SZ+4`],%l1
735	ld	[$ctx+`1*$SZ+0`],%l2
736	ld	[$ctx+`1*$SZ+4`],%l3
737	ld	[$ctx+`2*$SZ+0`],%l4
738	ld	[$ctx+`2*$SZ+4`],%l5
739	ld	[$ctx+`3*$SZ+0`],%l6
740
741	sllx	%l0,32,$tmp0
742	ld	[$ctx+`3*$SZ+4`],%l7
743	sllx	%l2,32,$tmp1
744	or	%l1,$tmp0,$tmp0
745	or	%l3,$tmp1,$tmp1
746	add	$tmp0,$A,$A
747	add	$tmp1,$B,$B
748	$ST	$A,[$ctx+`0*$SZ`]
749	sllx	%l4,32,$tmp2
750	$ST	$B,[$ctx+`1*$SZ`]
751	sllx	%l6,32,$T1
752	or	%l5,$tmp2,$tmp2
753	or	%l7,$T1,$T1
754	add	$tmp2,$C,$C
755	$ST	$C,[$ctx+`2*$SZ`]
756	add	$T1,$D,$D
757	$ST	$D,[$ctx+`3*$SZ`]
758
759	ld	[$ctx+`4*$SZ+0`],%l0
760	ld	[$ctx+`4*$SZ+4`],%l1
761	ld	[$ctx+`5*$SZ+0`],%l2
762	ld	[$ctx+`5*$SZ+4`],%l3
763	ld	[$ctx+`6*$SZ+0`],%l4
764	ld	[$ctx+`6*$SZ+4`],%l5
765	ld	[$ctx+`7*$SZ+0`],%l6
766
767	sllx	%l0,32,$tmp0
768	ld	[$ctx+`7*$SZ+4`],%l7
769	sllx	%l2,32,$tmp1
770	or	%l1,$tmp0,$tmp0
771	or	%l3,$tmp1,$tmp1
772	add	$tmp0,$E,$E
773	add	$tmp1,$F,$F
774	$ST	$E,[$ctx+`4*$SZ`]
775	sllx	%l4,32,$tmp2
776	$ST	$F,[$ctx+`5*$SZ`]
777	sllx	%l6,32,$T1
778	or	%l5,$tmp2,$tmp2
779	or	%l7,$T1,$T1
780	add	$tmp2,$G,$G
781	$ST	$G,[$ctx+`6*$SZ`]
782	add	$T1,$H,$H
783	$ST	$H,[$ctx+`7*$SZ`]
784___
785$code.=<<___;
786	add	$inp,`16*$SZ`,$inp		! advance inp
787	cmp	$inp,$len
788	bne	SIZE_T_CC,.Lloop
789	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl
790
791	ret
792	restore
793.type	sha${label}_block_data_order,#function
794.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
795.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
796.align	4
797___
798
799# Purpose of these subroutines is to explicitly encode VIS instructions,
800# so that one can compile the module without having to specify VIS
801# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
802# Idea is to reserve for option to produce "universal" binary and let
803# programmer detect if current CPU is VIS capable at run-time.
804sub unvis {
805my ($mnemonic,$rs1,$rs2,$rd)=@_;
806my $ref,$opf;
807my %visopf = (	"faligndata"	=> 0x048,
808		"for"		=> 0x07c	);
809
810    $ref = "$mnemonic\t$rs1,$rs2,$rd";
811
812    if ($opf=$visopf{$mnemonic}) {
813	foreach ($rs1,$rs2,$rd) {
814	    return $ref if (!/%f([0-9]{1,2})/);
815	    $_=$1;
816	    if ($1>=32) {
817		return $ref if ($1&1);
818		# re-encode for upper double register addressing
819		$_=($1|$1>>5)&31;
820	    }
821	}
822
823	return	sprintf ".word\t0x%08x !%s",
824			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
825			$ref;
826    } else {
827	return $ref;
828    }
829}
830sub unalignaddr {
831my ($mnemonic,$rs1,$rs2,$rd)=@_;
832my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
833my $ref="$mnemonic\t$rs1,$rs2,$rd";
834
835    foreach ($rs1,$rs2,$rd) {
836	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
837	else			{ return $ref; }
838    }
839    return  sprintf ".word\t0x%08x !%s",
840		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
841		    $ref;
842}
843
844foreach (split("\n",$code)) {
845	s/\`([^\`]*)\`/eval $1/ge;
846
847	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
848		&unvis($1,$2,$3,$4)
849	 /ge;
850	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
851		&unalignaddr($1,$2,$3,$4)
852	 /ge;
853
854	print $_,"\n";
855}
856
857close STDOUT;
858