1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# SHA256/512 block procedure for PA-RISC.
18
19# June 2009.
20#
21# SHA256 performance is >75% better than gcc 3.2 generated code on
22# PA-7100LC. Compared to code generated by vendor compiler this
23# implementation is almost 70% faster in 64-bit build, but delivers
24# virtually same performance in 32-bit build on PA-8600.
25#
26# SHA512 performance is >2.9x better than gcc 3.2 generated code on
27# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
28# code is executed on PA-RISC 2.0 processor and switches to 64-bit
29# code path delivering adequate performance even in "blended" 32-bit
30# build. Though 64-bit code is not any faster than code generated by
31# vendor compiler on PA-8600...
32#
33# Special thanks to polarhome.com for providing HP-UX account.
34
35$flavour = shift;
36$output = shift;
37open STDOUT,">$output";
38
39if ($flavour =~ /64/) {
40	$LEVEL		="2.0W";
41	$SIZE_T		=8;
42	$FRAME_MARKER	=80;
43	$SAVED_RP	=16;
44	$PUSH		="std";
45	$PUSHMA		="std,ma";
46	$POP		="ldd";
47	$POPMB		="ldd,mb";
48} else {
49	$LEVEL		="1.0";
50	$SIZE_T		=4;
51	$FRAME_MARKER	=48;
52	$SAVED_RP	=20;
53	$PUSH		="stw";
54	$PUSHMA		="stwm";
55	$POP		="ldw";
56	$POPMB		="ldwm";
57}
58
59if ($output =~ /512/) {
60	$func="sha512_block_data_order";
61	$SZ=8;
62	@Sigma0=(28,34,39);
63	@Sigma1=(14,18,41);
64	@sigma0=(1,  8, 7);
65	@sigma1=(19,61, 6);
66	$rounds=80;
67	$LAST10BITS=0x017;
68	$LD="ldd";
69	$LDM="ldd,ma";
70	$ST="std";
71} else {
72	$func="sha256_block_data_order";
73	$SZ=4;
74	@Sigma0=( 2,13,22);
75	@Sigma1=( 6,11,25);
76	@sigma0=( 7,18, 3);
77	@sigma1=(17,19,10);
78	$rounds=64;
79	$LAST10BITS=0x0f2;
80	$LD="ldw";
81	$LDM="ldwm";
82	$ST="stw";
83}
84
85$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
86				#                 [+ argument transfer]
87$XOFF=16*$SZ+32;		# local variables
88$FRAME+=$XOFF;
89$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
90
91$ctx="%r26";	# zapped by $a0
92$inp="%r25";	# zapped by $a1
93$num="%r24";	# zapped by $t0
94
95$a0 ="%r26";
96$a1 ="%r25";
97$t0 ="%r24";
98$t1 ="%r29";
99$Tbl="%r31";
100
101@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
102
103@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
104    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
105
106sub ROUND_00_15 {
107my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
108$code.=<<___;
109	_ror	$e,$Sigma1[0],$a0
110	and	$f,$e,$t0
111	_ror	$e,$Sigma1[1],$a1
112	addl	$t1,$h,$h
113	andcm	$g,$e,$t1
114	xor	$a1,$a0,$a0
115	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
116	or	$t0,$t1,$t1		; Ch(e,f,g)
117	addl	@X[$i%16],$h,$h
118	xor	$a0,$a1,$a1		; Sigma1(e)
119	addl	$t1,$h,$h
120	_ror	$a,$Sigma0[0],$a0
121	addl	$a1,$h,$h
122
123	_ror	$a,$Sigma0[1],$a1
124	and	$a,$b,$t0
125	and	$a,$c,$t1
126	xor	$a1,$a0,$a0
127	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
128	xor	$t1,$t0,$t0
129	and	$b,$c,$t1
130	xor	$a0,$a1,$a1		; Sigma0(a)
131	addl	$h,$d,$d
132	xor	$t1,$t0,$t0		; Maj(a,b,c)
133	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
134	addl	$a1,$h,$h
135	addl	$t0,$h,$h
136
137___
138}
139
140sub ROUND_16_xx {
141my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
142$i-=16;
143$code.=<<___;
144	_ror	@X[($i+1)%16],$sigma0[0],$a0
145	_ror	@X[($i+1)%16],$sigma0[1],$a1
146	addl	@X[($i+9)%16],@X[$i],@X[$i]
147	_ror	@X[($i+14)%16],$sigma1[0],$t0
148	_ror	@X[($i+14)%16],$sigma1[1],$t1
149	xor	$a1,$a0,$a0
150	_shr	@X[($i+1)%16],$sigma0[2],$a1
151	xor	$t1,$t0,$t0
152	_shr	@X[($i+14)%16],$sigma1[2],$t1
153	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
154	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
155	$LDM	$SZ($Tbl),$t1
156	addl	$a0,@X[$i],@X[$i]
157	addl	$t0,@X[$i],@X[$i]
158___
159$code.=<<___ if ($i==15);
160	extru	$t1,31,10,$a1
161	comiclr,<> $LAST10BITS,$a1,%r0
162	ldo	1($Tbl),$Tbl		; signal end of $Tbl
163___
164&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
165}
166
167$code=<<___;
168	.LEVEL	$LEVEL
169	.SPACE	\$TEXT\$
170	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
171
172	.ALIGN	64
173L\$table
174___
175$code.=<<___ if ($SZ==8);
176	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
177	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
178	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
179	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
180	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
181	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
182	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
183	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
184	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
185	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
186	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
187	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
188	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
189	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
190	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
191	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
192	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
193	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
194	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
195	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
196	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
197	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
198	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
199	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
200	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
201	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
202	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
203	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
204	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
205	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
206	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
207	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
208	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
209	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
210	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
211	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
212	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
213	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
214	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
215	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
216___
217$code.=<<___ if ($SZ==4);
218	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
219	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
220	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
221	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
222	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
223	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
224	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
225	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
226	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
227	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
228	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
229	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
230	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
231	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
232	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
233	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
234___
235$code.=<<___;
236
237	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
238	.ALIGN	64
239$func
240	.PROC
241	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
242	.ENTRY
243	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
244	$PUSHMA	%r3,$FRAME(%sp)
245	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
246	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
247	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
248	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
249	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
250	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
251	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
252	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
253	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
254	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
255	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
256	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
257	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
258	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
259	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
260
261	_shl	$num,`log(16*$SZ)/log(2)`,$num
262	addl	$inp,$num,$num		; $num to point at the end of $inp
263
264	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
265	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
266	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
267
268	blr	%r0,$Tbl
269	ldi	3,$t1
270L\$pic
271	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
272	ldo	L\$table-L\$pic($Tbl),$Tbl
273___
274$code.=<<___ if ($SZ==8 && $SIZE_T==4);
275	ldi	31,$t1
276	mtctl	$t1,%cr11
277	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
278	b	L\$parisc1
279	nop
280___
281$code.=<<___;
282	$LD	`0*$SZ`($ctx),$A	; load context
283	$LD	`1*$SZ`($ctx),$B
284	$LD	`2*$SZ`($ctx),$C
285	$LD	`3*$SZ`($ctx),$D
286	$LD	`4*$SZ`($ctx),$E
287	$LD	`5*$SZ`($ctx),$F
288	$LD	`6*$SZ`($ctx),$G
289	$LD	`7*$SZ`($ctx),$H
290
291	extru	$inp,31,`log($SZ)/log(2)`,$t0
292	sh3addl	$t0,%r0,$t0
293	subi	`8*$SZ`,$t0,$t0
294	mtctl	$t0,%cr11		; load %sar with align factor
295
296L\$oop
297	ldi	`$SZ-1`,$t0
298	$LDM	$SZ($Tbl),$t1
299	andcm	$inp,$t0,$t0		; align $inp
300___
301	for ($i=0;$i<15;$i++) {		# load input block
302	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
303$code.=<<___;
304	cmpb,*=	$inp,$t0,L\$aligned
305	$LD	`$SZ*15`($t0),@X[15]
306	$LD	`$SZ*16`($t0),@X[16]
307___
308	for ($i=0;$i<16;$i++) {		# align data
309	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
310$code.=<<___;
311L\$aligned
312	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
313___
314
315for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
316$code.=<<___;
317L\$rounds
318	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
319___
320for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
321$code.=<<___;
322	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
323	nop
324
325	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
326	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
327	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
328	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl
329
330	$LD	`0*$SZ`($ctx),@X[0]	; load context
331	$LD	`1*$SZ`($ctx),@X[1]
332	$LD	`2*$SZ`($ctx),@X[2]
333	$LD	`3*$SZ`($ctx),@X[3]
334	$LD	`4*$SZ`($ctx),@X[4]
335	$LD	`5*$SZ`($ctx),@X[5]
336	addl	@X[0],$A,$A
337	$LD	`6*$SZ`($ctx),@X[6]
338	addl	@X[1],$B,$B
339	$LD	`7*$SZ`($ctx),@X[7]
340	ldo	`16*$SZ`($inp),$inp	; advance $inp
341
342	$ST	$A,`0*$SZ`($ctx)	; save context
343	addl	@X[2],$C,$C
344	$ST	$B,`1*$SZ`($ctx)
345	addl	@X[3],$D,$D
346	$ST	$C,`2*$SZ`($ctx)
347	addl	@X[4],$E,$E
348	$ST	$D,`3*$SZ`($ctx)
349	addl	@X[5],$F,$F
350	$ST	$E,`4*$SZ`($ctx)
351	addl	@X[6],$G,$G
352	$ST	$F,`5*$SZ`($ctx)
353	addl	@X[7],$H,$H
354	$ST	$G,`6*$SZ`($ctx)
355	$ST	$H,`7*$SZ`($ctx)
356
357	cmpb,*<>,n $inp,$num,L\$oop
358	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
359___
360if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
361{{
362$code.=<<___;
363	b	L\$done
364	nop
365
366	.ALIGN	64
367L\$parisc1
368___
369
370@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
371      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
372   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
373     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
374$a0 ="%r17";
375$a1 ="%r18";
376$a2 ="%r19";
377$a3 ="%r20";
378$t0 ="%r21";
379$t1 ="%r22";
380$t2 ="%r28";
381$t3 ="%r29";
382$Tbl="%r31";
383
384@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
385
386sub ROUND_00_15_pa1 {
387my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
388       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
389my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
390
391$code.=<<___ if (!$flag);
392	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
393	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
394___
395$code.=<<___;
396	shd	$ehi,$elo,$Sigma1[0],$t0
397	 add	$Xlo,$hlo,$hlo
398	shd	$elo,$ehi,$Sigma1[0],$t1
399	 addc	$Xhi,$hhi,$hhi		; h += X[i]
400	shd	$ehi,$elo,$Sigma1[1],$t2
401	 ldwm	8($Tbl),$Xhi
402	shd	$elo,$ehi,$Sigma1[1],$t3
403	 ldw	-4($Tbl),$Xlo		; load K[i]
404	xor	$t2,$t0,$t0
405	xor	$t3,$t1,$t1
406	 and	$flo,$elo,$a0
407	 and	$fhi,$ehi,$a1
408	shd	$ehi,$elo,$Sigma1[2],$t2
409	 andcm	$glo,$elo,$a2
410	shd	$elo,$ehi,$Sigma1[2],$t3
411	 andcm	$ghi,$ehi,$a3
412	xor	$t2,$t0,$t0
413	xor	$t3,$t1,$t1		; Sigma1(e)
414	add	$Xlo,$hlo,$hlo
415	 xor	$a2,$a0,$a0
416	addc	$Xhi,$hhi,$hhi		; h += K[i]
417	 xor	$a3,$a1,$a1		; Ch(e,f,g)
418
419	 add	$t0,$hlo,$hlo
420	shd	$ahi,$alo,$Sigma0[0],$t0
421	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
422	shd	$alo,$ahi,$Sigma0[0],$t1
423	 add	$a0,$hlo,$hlo
424	shd	$ahi,$alo,$Sigma0[1],$t2
425	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
426	shd	$alo,$ahi,$Sigma0[1],$t3
427
428	xor	$t2,$t0,$t0
429	xor	$t3,$t1,$t1
430	shd	$ahi,$alo,$Sigma0[2],$t2
431	and	$alo,$blo,$a0
432	shd	$alo,$ahi,$Sigma0[2],$t3
433	and	$ahi,$bhi,$a1
434	xor	$t2,$t0,$t0
435	xor	$t3,$t1,$t1		; Sigma0(a)
436
437	and	$alo,$clo,$a2
438	and	$ahi,$chi,$a3
439	xor	$a2,$a0,$a0
440	 add	$hlo,$dlo,$dlo
441	xor	$a3,$a1,$a1
442	 addc	$hhi,$dhi,$dhi		; d += h
443	and	$blo,$clo,$a2
444	 add	$t0,$hlo,$hlo
445	and	$bhi,$chi,$a3
446	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
447	xor	$a2,$a0,$a0
448	 add	$a0,$hlo,$hlo
449	xor	$a3,$a1,$a1		; Maj(a,b,c)
450	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)
451
452___
453$code.=<<___ if ($i==15 && $flag);
454	extru	$Xlo,31,10,$Xlo
455	comiclr,= $LAST10BITS,$Xlo,%r0
456	b	L\$rounds_pa1
457	nop
458___
459push(@X,shift(@X)); push(@X,shift(@X));
460}
461
462sub ROUND_16_xx_pa1 {
463my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
464my ($i)=shift;
465$i-=16;
466$code.=<<___;
467	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
468	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
469	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
470	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
471	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
472	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
473	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
474	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
475	 add	$a0,$Xlo,$Xlo
476	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
477	 addc	$a1,$Xhi,$Xhi
478	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
479	xor	$t2,$t0,$t0
480	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
481	xor	$t3,$t1,$t1
482	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
483	xor	$t2,$t0,$t0
484	 shd	$a3,$a2,$sigma1[0],$a0
485	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
486	 shd	$a2,$a3,$sigma1[0],$a1
487	add	$t0,$Xlo,$Xlo
488	 shd	$a3,$a2,$sigma1[1],$t2
489	addc	$t1,$Xhi,$Xhi
490	 shd	$a2,$a3,$sigma1[1],$t3
491	xor	$t2,$a0,$a0
492	shd	$a3,$a2,$sigma1[2],$t2
493	xor	$t3,$a1,$a1
494	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
495	xor	$t2,$a0,$a0
496	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
497	add	$a0,$Xlo,$Xlo
498	addc	$a1,$Xhi,$Xhi
499
500	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
501	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
502___
503&ROUND_00_15_pa1($i,@_,1);
504}
505$code.=<<___;
506	ldw	`0*4`($ctx),$Ahi		; load context
507	ldw	`1*4`($ctx),$Alo
508	ldw	`2*4`($ctx),$Bhi
509	ldw	`3*4`($ctx),$Blo
510	ldw	`4*4`($ctx),$Chi
511	ldw	`5*4`($ctx),$Clo
512	ldw	`6*4`($ctx),$Dhi
513	ldw	`7*4`($ctx),$Dlo
514	ldw	`8*4`($ctx),$Ehi
515	ldw	`9*4`($ctx),$Elo
516	ldw	`10*4`($ctx),$Fhi
517	ldw	`11*4`($ctx),$Flo
518	ldw	`12*4`($ctx),$Ghi
519	ldw	`13*4`($ctx),$Glo
520	ldw	`14*4`($ctx),$Hhi
521	ldw	`15*4`($ctx),$Hlo
522
523	extru	$inp,31,2,$t0
524	sh3addl	$t0,%r0,$t0
525	subi	32,$t0,$t0
526	mtctl	$t0,%cr11		; load %sar with align factor
527
528L\$oop_pa1
529	extru	$inp,31,2,$a3
530	comib,=	0,$a3,L\$aligned_pa1
531	sub	$inp,$a3,$inp
532
533	ldw	`0*4`($inp),$X[0]
534	ldw	`1*4`($inp),$X[1]
535	ldw	`2*4`($inp),$t2
536	ldw	`3*4`($inp),$t3
537	ldw	`4*4`($inp),$a0
538	ldw	`5*4`($inp),$a1
539	ldw	`6*4`($inp),$a2
540	ldw	`7*4`($inp),$a3
541	vshd	$X[0],$X[1],$X[0]
542	vshd	$X[1],$t2,$X[1]
543	stw	$X[0],`-$XOFF+0*4`(%sp)
544	ldw	`8*4`($inp),$t0
545	vshd	$t2,$t3,$t2
546	stw	$X[1],`-$XOFF+1*4`(%sp)
547	ldw	`9*4`($inp),$t1
548	vshd	$t3,$a0,$t3
549___
550{
551my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
552for ($i=2;$i<=(128/4-8);$i++) {
553$code.=<<___;
554	stw	$t[0],`-$XOFF+$i*4`(%sp)
555	ldw	`(8+$i)*4`($inp),$t[0]
556	vshd	$t[1],$t[2],$t[1]
557___
558push(@t,shift(@t));
559}
560for (;$i<(128/4-1);$i++) {
561$code.=<<___;
562	stw	$t[0],`-$XOFF+$i*4`(%sp)
563	vshd	$t[1],$t[2],$t[1]
564___
565push(@t,shift(@t));
566}
567$code.=<<___;
568	b	L\$collected_pa1
569	stw	$t[0],`-$XOFF+$i*4`(%sp)
570
571___
572}
573$code.=<<___;
574L\$aligned_pa1
575	ldw	`0*4`($inp),$X[0]
576	ldw	`1*4`($inp),$X[1]
577	ldw	`2*4`($inp),$t2
578	ldw	`3*4`($inp),$t3
579	ldw	`4*4`($inp),$a0
580	ldw	`5*4`($inp),$a1
581	ldw	`6*4`($inp),$a2
582	ldw	`7*4`($inp),$a3
583	stw	$X[0],`-$XOFF+0*4`(%sp)
584	ldw	`8*4`($inp),$t0
585	stw	$X[1],`-$XOFF+1*4`(%sp)
586	ldw	`9*4`($inp),$t1
587___
588{
589my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
590for ($i=2;$i<(128/4-8);$i++) {
591$code.=<<___;
592	stw	$t[0],`-$XOFF+$i*4`(%sp)
593	ldw	`(8+$i)*4`($inp),$t[0]
594___
595push(@t,shift(@t));
596}
597for (;$i<128/4;$i++) {
598$code.=<<___;
599	stw	$t[0],`-$XOFF+$i*4`(%sp)
600___
601push(@t,shift(@t));
602}
603$code.="L\$collected_pa1\n";
604}
605
606for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
607$code.="L\$rounds_pa1\n";
608for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
609
610$code.=<<___;
611	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
612	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
613	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
614	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl
615
616	ldw	`0*4`($ctx),$t1		; update context
617	ldw	`1*4`($ctx),$t0
618	ldw	`2*4`($ctx),$t3
619	ldw	`3*4`($ctx),$t2
620	ldw	`4*4`($ctx),$a1
621	ldw	`5*4`($ctx),$a0
622	ldw	`6*4`($ctx),$a3
623	add	$t0,$Alo,$Alo
624	ldw	`7*4`($ctx),$a2
625	addc	$t1,$Ahi,$Ahi
626	ldw	`8*4`($ctx),$t1
627	add	$t2,$Blo,$Blo
628	ldw	`9*4`($ctx),$t0
629	addc	$t3,$Bhi,$Bhi
630	ldw	`10*4`($ctx),$t3
631	add	$a0,$Clo,$Clo
632	ldw	`11*4`($ctx),$t2
633	addc	$a1,$Chi,$Chi
634	ldw	`12*4`($ctx),$a1
635	add	$a2,$Dlo,$Dlo
636	ldw	`13*4`($ctx),$a0
637	addc	$a3,$Dhi,$Dhi
638	ldw	`14*4`($ctx),$a3
639	add	$t0,$Elo,$Elo
640	ldw	`15*4`($ctx),$a2
641	addc	$t1,$Ehi,$Ehi
642	stw	$Ahi,`0*4`($ctx)
643	add	$t2,$Flo,$Flo
644	stw	$Alo,`1*4`($ctx)
645	addc	$t3,$Fhi,$Fhi
646	stw	$Bhi,`2*4`($ctx)
647	add	$a0,$Glo,$Glo
648	stw	$Blo,`3*4`($ctx)
649	addc	$a1,$Ghi,$Ghi
650	stw	$Chi,`4*4`($ctx)
651	add	$a2,$Hlo,$Hlo
652	stw	$Clo,`5*4`($ctx)
653	addc	$a3,$Hhi,$Hhi
654	stw	$Dhi,`6*4`($ctx)
655	ldo	`16*$SZ`($inp),$inp	; advance $inp
656	stw	$Dlo,`7*4`($ctx)
657	stw	$Ehi,`8*4`($ctx)
658	stw	$Elo,`9*4`($ctx)
659	stw	$Fhi,`10*4`($ctx)
660	stw	$Flo,`11*4`($ctx)
661	stw	$Ghi,`12*4`($ctx)
662	stw	$Glo,`13*4`($ctx)
663	stw	$Hhi,`14*4`($ctx)
664	comb,=	$inp,$num,L\$done
665	stw	$Hlo,`15*4`($ctx)
666	b	L\$oop_pa1
667	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
668L\$done
669___
670}}
671$code.=<<___;
672	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
673	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
674	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
675	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
676	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
677	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
678	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
679	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
680	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
681	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
682	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
683	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
684	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
685	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
686	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
687	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
688	bv	(%r2)
689	.EXIT
690	$POPMB	-$FRAME(%sp),%r3
691	.PROCEND
692	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
693___
694
695# Explicitly encode PA-RISC 2.0 instructions used in this module, so
696# that it can be compiled with .LEVEL 1.0. It should be noted that I
697# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
698# directive...
699
700my $ldd = sub {
701  my ($mod,$args) = @_;
702  my $orig = "ldd$mod\t$args";
703
704    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
705    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
706	$opcode|=(1<<3) if ($mod =~ /^,m/);
707	$opcode|=(1<<2) if ($mod =~ /^,mb/);
708	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
709    }
710    else { "\t".$orig; }
711};
712
713my $std = sub {
714  my ($mod,$args) = @_;
715  my $orig = "std$mod\t$args";
716
717    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
718    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
719	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
720    }
721    else { "\t".$orig; }
722};
723
724my $extrd = sub {
725  my ($mod,$args) = @_;
726  my $orig = "extrd$mod\t$args";
727
728    # I only have ",u" completer, it's implicitly encoded...
729    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
730    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
731	my $len=32-$3;
732	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
733	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
734	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
735    }
736    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
737    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
738	my $len=32-$2;
739	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
740	$opcode |= (1<<13) if ($mod =~ /,\**=/);
741	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
742    }
743    else { "\t".$orig; }
744};
745
746my $shrpd = sub {
747  my ($mod,$args) = @_;
748  my $orig = "shrpd$mod\t$args";
749
750    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
751    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
752	my $cpos=63-$3;
753	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
754	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
755    }
756    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
757    {	sprintf "\t.WORD\t0x%08x\t; %s",
758		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
759    }
760    else { "\t".$orig; }
761};
762
763sub assemble {
764  my ($mnemonic,$mod,$args)=@_;
765  my $opcode = eval("\$$mnemonic");
766
767    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
768}
769
770if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
771	=~ /GNU assembler/) {
772    $gnuas = 1;
773}
774
775foreach (split("\n",$code)) {
776	s/\`([^\`]*)\`/eval $1/ge;
777
778	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
779		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
780		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
781	# translate made up instructions: _ror, _shr, _align, _shl
782	s/_ror(\s+)(%r[0-9]+),/
783		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or
784
785	s/_shr(\s+%r[0-9]+),([0-9]+),/
786		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
787		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or
788
789	s/_align(\s+%r[0-9]+,%r[0-9]+),/
790		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or
791
792	s/_shl(\s+%r[0-9]+),([0-9]+),/
793		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
794		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
795
796	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
797
798	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
799	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
800	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
801	s/cmpb,\*/comb,/ 		if ($SIZE_T==4);
802	s/\bbv\b/bve/    		if ($SIZE_T==8);
803
804	print $_,"\n";
805}
806
807close STDOUT or die "error closing STDOUT: $!";
808