1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# February 2009
11#
12# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
13# "cluster" Address Generation Interlocks, so that one pipeline stall
14# resolves several dependencies.
15
16# November 2010.
17#
18# Adapt for -m31 build. If kernel supports what's called "highgprs"
19# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
20# instructions and achieve "64-bit" performance even in 31-bit legacy
21# application context. The feature is not specific to any particular
22# processor, as long as it's "z-CPU". Latter implies that the code
23# remains z/Architecture specific. On z990 it was measured to perform
24# 50% better than code generated by gcc 4.3.
25
# The first command-line argument names the target flavour.
$flavour = shift;

# Flavours matching /3[12]/ use 4-byte register save slots and the plain
# stm/lm mnemonics; every other flavour uses 8-byte slots and the "g"
# suffixed forms (stmg/lmg).
($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
35
# Scan the remaining arguments for the output file name: the first one of
# the form "name.ext" (word characters and dashes, then a dot and a word
# extension) is taken; arguments before it are discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found.  An
# unconditional open with an empty name fails and leaves STDOUT closed, so
# the generated code would be silently thrown away; without a name the code
# now simply goes to the inherited STDOUT.  A failed open of a real name is
# reported instead of being ignored.
if ($output) {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}
38
# Register aliases shared by the routines below: $rp is the register the
# generated code branches to on return, $sp is the base register for the
# register save/restore slots.
($rp, $sp) = ("%r14", "%r15");

# All generated assembly accumulates in $code; it starts in the text section.
$code = ".text\n\n";
45
# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Generates the RC4 routine.  As addressed by the code below, the key
# structure holds the x index in byte 0, the y index in byte 1 and a
# byte-wide state table starting at offset 2.  Input is processed eight
# bytes per .Loop8 pass; the remaining len&7 bytes are handled one at a
# time in .Loop1.
{
# Scratch and argument registers.
($acc,$cnt) = ("%r0","%r1");
($key,$len,$inp,$out) = ("%r2","%r3","%r4","%r5");

# @XX holds the current and the next x index, @TX the table bytes loaded
# for them; the two entries of each pair trade places after every unrolled
# step.  $YY is the y index, $TY the index of the pending keystream byte.
@XX = ("%r6","%r7");
@TX = ("%r8","%r9");
($YY,$TY) = ("%r10","%r11");

# Prologue: save %r6-%r11 at 6*$SIZE_T off the stack pointer.
$code.=<<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___

# 31/32-bit flavours only: widen len from 32 to 64 bits before the 64-bit
# shift and count instructions below use it.
$code .= "\tllgfr\t$len,$len\n" if ($flavour =~ /3[12]/);

# Load x and y, advance x by one (mod 256), preload table[x] and compute the
# number of 8-byte blocks; with zero blocks go straight to the byte loop.
$code.=<<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___

# Eight unrolled RC4 steps.  The keystream byte selected by step N is only
# fetched during step N+1 (and after the loop for the last one), being
# shifted into $acc so that $acc ends up holding eight keystream bytes.
$i = 0;
while ($i < 8) {

# y += table[x]; next x = x + 1 (both mod 256).
$code.=<<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___

# Fetch the keystream byte indexed by the previous step's $TY: the first
# fetch initialises $acc, later ones shift it left and insert a new byte.
if ($i == 1) {
	$code .= "\tllgc\t$acc,2($TY,$key)\n";
} elsif ($i > 1) {
	$code .= "\tsllg\t$acc,$acc,8\n";
	$code .= "\tic\t$acc,2($TY,$key)\n";
}

# Swap table[x] and table[y], preload table[next x] (replaced by the value
# just written to table[y] when next x equals y), and form the index of
# this step's keystream byte in $TY.
$code.=<<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___

# "rotate" registers: with two entries per pair this is a plain swap.
@TX = reverse(@TX);
@XX = reverse(@XX);

$i++;
}

# End of a .Loop8 pass: insert the eighth keystream byte, XOR the eight
# bytes with eight bytes of input, store them and advance both pointers.
$code.=<<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

___

# Tail: len &= 7; leave if nothing remains, else process single bytes.
$code.=<<___;
.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

___

# Epilogue: step x back by one (it was advanced ahead of use), write x and
# y back into the key structure, restore %r6-%r11 and return.
$code.=<<___;
.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}
160
# void private_RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Generates the key-schedule routine, exported as private_RC4_set_key.  The
# emitted code clears the two index bytes at the start of the key structure,
# fills the 256-byte table at offset 2 with 0..255, and then makes a single
# 256-step mixing pass over the table driven by the len key bytes at inp,
# which are reused cyclically.
{
($cnt,$idx) = ("%r0","%r1");		# loop counter, running table index
($key,$len,$inp) = ("%r2","%r3","%r4");	# arguments
($acc,$dat) = ("%r5","%r6");		# table byte, key byte / swap partner
($ikey,$iinp) = ("%r7","%r8");		# table position minus 256, key offset

# Save %r6-%r8, zero both index bytes with one halfword store, then write
# table[i] = i for i = 0..255.
$code.=<<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

___

# Mixing pass.  $ikey counts up from -256, so the table position is
# $ikey+256 and the pass ends when the low byte of $ikey becomes zero.
# Each step does idx = (idx + table[pos] + inp[$iinp]) & 255 and swaps
# table[pos] with table[idx]; because $ikey has already been incremented
# by the time of the store, the displacement there is 2+256-1.  $cnt counts
# the key bytes left; when it runs out, $cnt and $iinp restart from the
# beginning of the key.
$code.=<<___;
	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}
217
# const char *RC4_options()
#
# Emits RC4_options, which loads the address of the description string
# (placed in .rodata under the local label .Loptions) into %r2 and returns.
$code .= join("\n",
	".globl\tRC4_options",
	".type\tRC4_options,\@function",
	".align\t16",
	"RC4_options:",
	"\tlarl\t%r2,.Loptions",
	"\tbr\t%r14",
	".size\tRC4_options,.-RC4_options",
	".section\t.rodata",
	".Loptions:",
	".align\t8",
	qq{.string\t"rc4(8x,char)"},
) . "\n";
232
# Write out the accumulated assembly.  Both the write and the final flush
# are checked, so that a failed or short write (e.g. a full disk) aborts
# with an error instead of leaving a truncated file behind while the
# script still exits successfully.
print STDOUT $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";	# force flush
235