1#! /usr/bin/env perl
2# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# RC4 for PA-RISC.
18
19# June 2009.
20#
21# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
22# For reference, [4x] unrolled loop is >40% faster than folded one.
23# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
24# is believed to be not sufficient to justify the effort...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
29
30$flavour = shift;
31$output = shift;
32open STDOUT,">$output";
33
34if ($flavour =~ /64/) {
35	$LEVEL		="2.0W";
36	$SIZE_T		=8;
37	$FRAME_MARKER	=80;
38	$SAVED_RP	=16;
39	$PUSH		="std";
40	$PUSHMA		="std,ma";
41	$POP		="ldd";
42	$POPMB		="ldd,mb";
43} else {
44	$LEVEL		="1.0";
45	$SIZE_T		=4;
46	$FRAME_MARKER	=48;
47	$SAVED_RP	=20;
48	$PUSH		="stw";
49	$PUSHMA		="stwm";
50	$POP		="ldw";
51	$POPMB		="ldwm";
52}
53
54$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
55				#                [+ argument transfer]
56$SZ=1;				# defaults to RC4_CHAR
57if (open CONF,"<${dir}../../opensslconf.h") {
58    while(<CONF>) {
59	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
60	    $SZ = ($1=~/char$/) ? 1 : 4;
61	    last;
62	}
63    }
64    close CONF;
65}
66
67if ($SZ==1) {	# RC4_CHAR
68    $LD="ldb";
69    $LDX="ldbx";
70    $MKX="addl";
71    $ST="stb";
72} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
73    $LD="ldw";
74    $LDX="ldwx,s";
75    $MKX="sh2addl";
76    $ST="stw";
77}
78
79$key="%r26";
80$len="%r25";
81$inp="%r24";
82$out="%r23";
83
84@XX=("%r19","%r20");
85@TX=("%r21","%r22");
86$YY="%r28";
87$TY="%r29";
88
89$acc="%r1";
90$ix="%r2";
91$iy="%r3";
92$dat0="%r4";
93$dat1="%r5";
94$rem="%r6";
95$mask="%r31";
96
97sub unrolledloopbody {
98for ($i=0;$i<4;$i++) {
99$code.=<<___;
100	ldo	1($XX[0]),$XX[1]
101	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
102	and	$mask,$XX[1],$XX[1]
103	$LDX	$YY($key),$TY
104	$MKX	$YY,$key,$ix
105	$LDX	$XX[1]($key),$TX[1]
106	$MKX	$XX[0],$key,$iy
107	$ST	$TX[0],0($ix)
108	comclr,<> $XX[1],$YY,%r0	; conditional
109	copy	$TX[0],$TX[1]		; move
110	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
111	$ST	$TY,0($iy)
112	addl	$TX[0],$TY,$TY
113	addl	$TX[1],$YY,$YY
114	and	$mask,$TY,$TY
115	and	$mask,$YY,$YY
116___
117push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
118} }
119
120sub foldedloop {
121my ($label,$count)=@_;
122$code.=<<___;
123$label
124	$MKX	$YY,$key,$iy
125	$LDX	$YY($key),$TY
126	$MKX	$XX[0],$key,$ix
127	$ST	$TX[0],0($iy)
128	ldo	1($XX[0]),$XX[0]
129	$ST	$TY,0($ix)
130	addl	$TX[0],$TY,$TY
131	ldbx	$inp($out),$dat1
132	and	$mask,$TY,$TY
133	and	$mask,$XX[0],$XX[0]
134	$LDX	$TY($key),$acc
135	$LDX	$XX[0]($key),$TX[0]
136	ldo	1($out),$out
137	xor	$dat1,$acc,$acc
138	addl	$TX[0],$YY,$YY
139	stb	$acc,-1($out)
140	addib,<> -1,$count,$label	; $count is always small
141	and	$mask,$YY,$YY
142___
143}
144
145$code=<<___;
146	.LEVEL	$LEVEL
147	.SPACE	\$TEXT\$
148	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
149
150	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
151RC4
152	.PROC
153	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
154	.ENTRY
155	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
156	$PUSHMA	%r3,$FRAME(%sp)
157	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
158	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
159	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
160
161	cmpib,*= 0,$len,L\$abort
162	sub	$inp,$out,$inp		; distance between $inp and $out
163
164	$LD	`0*$SZ`($key),$XX[0]
165	$LD	`1*$SZ`($key),$YY
166	ldo	`2*$SZ`($key),$key
167
168	ldi	0xff,$mask
169	ldi	3,$dat0
170
171	ldo	1($XX[0]),$XX[0]	; warm up loop
172	and	$mask,$XX[0],$XX[0]
173	$LDX	$XX[0]($key),$TX[0]
174	addl	$TX[0],$YY,$YY
175	cmpib,*>>= 6,$len,L\$oop1	; is $len large enough to bother?
176	and	$mask,$YY,$YY
177
178	and,<>	$out,$dat0,$rem		; is $out aligned?
179	b	L\$alignedout
180	subi	4,$rem,$rem
181	sub	$len,$rem,$len
182___
183&foldedloop("L\$alignout",$rem);	# process till $out is aligned
184
185$code.=<<___;
186L\$alignedout				; $len is at least 4 here
187	and,<>	$inp,$dat0,$acc		; is $inp aligned?
188	b	L\$oop4
189	sub	$inp,$acc,$rem		; align $inp
190
191	sh3addl	$acc,%r0,$acc
192	subi	32,$acc,$acc
193	mtctl	$acc,%cr11		; load %sar with vshd align factor
194	ldwx	$rem($out),$dat0
195	ldo	4($rem),$rem
196L\$oop4misalignedinp
197___
198&unrolledloopbody();
199$code.=<<___;
200	$LDX	$TY($key),$ix
201	ldwx	$rem($out),$dat1
202	ldo	-4($len),$len
203	or	$ix,$acc,$acc		; last piece, no need to dep
204	vshd	$dat0,$dat1,$iy		; align data
205	copy	$dat1,$dat0
206	xor	$iy,$acc,$acc
207	stw	$acc,0($out)
208	cmpib,*<< 3,$len,L\$oop4misalignedinp
209	ldo	4($out),$out
210	cmpib,*= 0,$len,L\$done
211	nop
212	b	L\$oop1
213	nop
214
215	.ALIGN	8
216L\$oop4
217___
218&unrolledloopbody();
219$code.=<<___;
220	$LDX	$TY($key),$ix
221	ldwx	$inp($out),$dat0
222	ldo	-4($len),$len
223	or	$ix,$acc,$acc		; last piece, no need to dep
224	xor	$dat0,$acc,$acc
225	stw	$acc,0($out)
226	cmpib,*<< 3,$len,L\$oop4
227	ldo	4($out),$out
228	cmpib,*= 0,$len,L\$done
229	nop
230___
231&foldedloop("L\$oop1",$len);
232$code.=<<___;
233L\$done
234	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
235	ldo	-1($XX[0]),$XX[0]	; chill out loop
236	sub	$YY,$TX[0],$YY
237	and	$mask,$XX[0],$XX[0]
238	and	$mask,$YY,$YY
239	$ST	$XX[0],`-2*$SZ`($key)
240	$ST	$YY,`-1*$SZ`($key)
241	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
242	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
243	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
244L\$abort
245	bv	(%r2)
246	.EXIT
247	$POPMB	-$FRAME(%sp),%r3
248	.PROCEND
249___
250
251$code.=<<___;
252
253	.EXPORT	RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
254	.ALIGN	8
255RC4_set_key
256	.PROC
257	.CALLINFO	NO_CALLS
258	.ENTRY
259	$ST	%r0,`0*$SZ`($key)
260	$ST	%r0,`1*$SZ`($key)
261	ldo	`2*$SZ`($key),$key
262	copy	%r0,@XX[0]
263L\$1st
264	$ST	@XX[0],0($key)
265	ldo	1(@XX[0]),@XX[0]
266	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
267	ldo	$SZ($key),$key
268
269	ldo	`-256*$SZ`($key),$key	; rewind $key
270	addl	$len,$inp,$inp		; $inp to point at the end
271	sub	%r0,$len,%r23		; inverse index
272	copy	%r0,@XX[0]
273	copy	%r0,@XX[1]
274	ldi	0xff,$mask
275
276L\$2nd
277	$LDX	@XX[0]($key),@TX[0]
278	ldbx	%r23($inp),@TX[1]
279	addi,nuv 1,%r23,%r23		; increment and conditional
280	sub	%r0,$len,%r23		; inverse index
281	addl	@TX[0],@XX[1],@XX[1]
282	addl	@TX[1],@XX[1],@XX[1]
283	and	$mask,@XX[1],@XX[1]
284	$MKX	@XX[0],$key,$TY
285	$LDX	@XX[1]($key),@TX[1]
286	$MKX	@XX[1],$key,$YY
287	ldo	1(@XX[0]),@XX[0]
288	$ST	@TX[0],0($YY)
289	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
290	$ST	@TX[1],0($TY)
291
292	bv,n	(%r2)
293	.EXIT
294	nop
295	.PROCEND
296
297	.EXPORT	RC4_options,ENTRY
298	.ALIGN	8
299RC4_options
300	.PROC
301	.CALLINFO	NO_CALLS
302	.ENTRY
303	blr	%r0,%r28
304	ldi	3,%r1
305L\$pic
306	andcm	%r28,%r1,%r28
307	bv	(%r2)
308	.EXIT
309	ldo	L\$opts-L\$pic(%r28),%r28
310	.PROCEND
311	.ALIGN	8
312L\$opts
313	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
314	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
315___
316
317if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
318	=~ /GNU assembler/) {
319    $gnuas = 1;
320}
321
322foreach(split("\n",$code)) {
323	s/\`([^\`]*)\`/eval $1/ge;
324
325	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
326	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
327	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
328	s/cmpib,\*/comib,/		if ($SIZE_T==4);
329	s/\bbv\b/bve/			if ($SIZE_T==8);
330
331	print $_,"\n";
332}
333close STDOUT;
334