1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# December 2015
18#
19# ChaCha20 for s390x.
20#
21# 3 times faster than compiler-generated code.
22
23$flavour = shift;
24
25if ($flavour =~ /3[12]/) {
26	$SIZE_T=4;
27	$g="";
28} else {
29	$SIZE_T=8;
30	$g="g";
31}
32
33while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
34open STDOUT,">$output";
35
36sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
37{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
38    $code .= "\t$opcode\t".join(',',@_)."\n";
39}
40
41my $sp="%r15";
42
43my $stdframe=16*$SIZE_T+4*8;
44my $frame=$stdframe+4*20;
45
46my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
47
48my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
49my @t=map("%r$_",(8,9));
50
51sub ROUND {
52my ($a0,$b0,$c0,$d0)=@_;
53my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
54my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
55my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
56my ($xc,$xc_)=map("\"$_\"",@t);
57my @x=map("\"$_\"",@x);
58
59	# Consider order in which variables are addressed by their
60	# index:
61	#
62	#	a   b   c   d
63	#
64	#	0   4   8  12 < even round
65	#	1   5   9  13
66	#	2   6  10  14
67	#	3   7  11  15
68	#	0   5  10  15 < odd round
69	#	1   6  11  12
70	#	2   7   8  13
71	#	3   4   9  14
72	#
73	# 'a', 'b' and 'd's are permanently allocated in registers,
74	# @x[0..7,12..15], while 'c's are maintained in memory. If
75	# you observe 'c' column, you'll notice that pair of 'c's is
76	# invariant between rounds. This means that we have to reload
77	# them once per round, in the middle. This is why you'll see
78	# 'c' stores and loads in the middle, but none in the beginning
79	# or end.
80
81	(
82	"&alr	(@x[$a0],@x[$b0])",	# Q1
83	 "&alr	(@x[$a1],@x[$b1])",	# Q2
84	"&xr	(@x[$d0],@x[$a0])",
85	 "&xr	(@x[$d1],@x[$a1])",
86	"&rll	(@x[$d0],@x[$d0],16)",
87	 "&rll	(@x[$d1],@x[$d1],16)",
88
89	"&alr	($xc,@x[$d0])",
90	 "&alr	($xc_,@x[$d1])",
91	"&xr	(@x[$b0],$xc)",
92	 "&xr	(@x[$b1],$xc_)",
93	"&rll	(@x[$b0],@x[$b0],12)",
94	 "&rll	(@x[$b1],@x[$b1],12)",
95
96	"&alr	(@x[$a0],@x[$b0])",
97	 "&alr	(@x[$a1],@x[$b1])",
98	"&xr	(@x[$d0],@x[$a0])",
99	 "&xr	(@x[$d1],@x[$a1])",
100	"&rll	(@x[$d0],@x[$d0],8)",
101	 "&rll	(@x[$d1],@x[$d1],8)",
102
103	"&alr	($xc,@x[$d0])",
104	 "&alr	($xc_,@x[$d1])",
105	"&xr	(@x[$b0],$xc)",
106	 "&xr	(@x[$b1],$xc_)",
107	"&rll	(@x[$b0],@x[$b0],7)",
108	 "&rll	(@x[$b1],@x[$b1],7)",
109
110	"&stm	($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')",	# reload pair of 'c's
111	"&lm	($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
112
113	"&alr	(@x[$a2],@x[$b2])",	# Q3
114	 "&alr	(@x[$a3],@x[$b3])",	# Q4
115	"&xr	(@x[$d2],@x[$a2])",
116	 "&xr	(@x[$d3],@x[$a3])",
117	"&rll	(@x[$d2],@x[$d2],16)",
118	 "&rll	(@x[$d3],@x[$d3],16)",
119
120	"&alr	($xc,@x[$d2])",
121	 "&alr	($xc_,@x[$d3])",
122	"&xr	(@x[$b2],$xc)",
123	 "&xr	(@x[$b3],$xc_)",
124	"&rll	(@x[$b2],@x[$b2],12)",
125	 "&rll	(@x[$b3],@x[$b3],12)",
126
127	"&alr	(@x[$a2],@x[$b2])",
128	 "&alr	(@x[$a3],@x[$b3])",
129	"&xr	(@x[$d2],@x[$a2])",
130	 "&xr	(@x[$d3],@x[$a3])",
131	"&rll	(@x[$d2],@x[$d2],8)",
132	 "&rll	(@x[$d3],@x[$d3],8)",
133
134	"&alr	($xc,@x[$d2])",
135	 "&alr	($xc_,@x[$d3])",
136	"&xr	(@x[$b2],$xc)",
137	 "&xr	(@x[$b3],$xc_)",
138	"&rll	(@x[$b2],@x[$b2],7)",
139	 "&rll	(@x[$b3],@x[$b3],7)"
140	);
141}
142
143$code.=<<___;
144.text
145
146.globl	ChaCha20_ctr32
147.type	ChaCha20_ctr32,\@function
148.align	32
149ChaCha20_ctr32:
150	lt${g}r	$len,$len			# $len==0?
151	bzr	%r14
152	a${g}hi	$len,-64
153	l${g}hi	%r1,-$frame
154	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
155	sl${g}r	$out,$inp			# difference
156	la	$len,0($inp,$len)		# end of input minus 64
157	larl	%r7,.Lsigma
158	lgr	%r0,$sp
159	la	$sp,0(%r1,$sp)
160	st${g}	%r0,0($sp)
161
162	lmg	%r8,%r11,0($key)		# load key
163	lmg	%r12,%r13,0($counter)		# load counter
164	lmg	%r6,%r7,0(%r7)			# load sigma constant
165
166	la	%r14,0($inp)
167	st${g}	$out,$frame+3*$SIZE_T($sp)
168	st${g}	$len,$frame+4*$SIZE_T($sp)
169	stmg	%r6,%r13,$stdframe($sp)		# copy key schedule to stack
170	srlg	@x[12],%r12,32			# 32-bit counter value
171	j	.Loop_outer
172
173.align	16
174.Loop_outer:
175	lm	@x[0],@x[7],$stdframe+4*0($sp)		# load x[0]-x[7]
176	lm	@t[0],@t[1],$stdframe+4*10($sp)		# load x[10]-x[11]
177	lm	@x[13],@x[15],$stdframe+4*13($sp)	# load x[13]-x[15]
178	stm	@t[0],@t[1],$stdframe+4*8+4*10($sp)	# offload x[10]-x[11]
179	lm	@t[0],@t[1],$stdframe+4*8($sp)		# load x[8]-x[9]
180	st	@x[12],$stdframe+4*12($sp)		# save counter
181	st${g}	%r14,$frame+2*$SIZE_T($sp)		# save input pointer
182	lhi	%r14,10
183	j	.Loop
184
185.align	4
186.Loop:
187___
188	foreach (&ROUND(0, 4, 8,12)) { eval; }
189	foreach (&ROUND(0, 5,10,15)) { eval; }
190$code.=<<___;
191	brct	%r14,.Loop
192
193	l${g}	%r14,$frame+2*$SIZE_T($sp)		# pull input pointer
194	stm	@t[0],@t[1],$stdframe+4*8+4*8($sp)	# offload x[8]-x[9]
195	lm${g}	@t[0],@t[1],$frame+3*$SIZE_T($sp)
196
197	al	@x[0],$stdframe+4*0($sp)	# accumulate key schedule
198	al	@x[1],$stdframe+4*1($sp)
199	al	@x[2],$stdframe+4*2($sp)
200	al	@x[3],$stdframe+4*3($sp)
201	al	@x[4],$stdframe+4*4($sp)
202	al	@x[5],$stdframe+4*5($sp)
203	al	@x[6],$stdframe+4*6($sp)
204	al	@x[7],$stdframe+4*7($sp)
205	lrvr	@x[0],@x[0]
206	lrvr	@x[1],@x[1]
207	lrvr	@x[2],@x[2]
208	lrvr	@x[3],@x[3]
209	lrvr	@x[4],@x[4]
210	lrvr	@x[5],@x[5]
211	lrvr	@x[6],@x[6]
212	lrvr	@x[7],@x[7]
213	al	@x[12],$stdframe+4*12($sp)
214	al	@x[13],$stdframe+4*13($sp)
215	al	@x[14],$stdframe+4*14($sp)
216	al	@x[15],$stdframe+4*15($sp)
217	lrvr	@x[12],@x[12]
218	lrvr	@x[13],@x[13]
219	lrvr	@x[14],@x[14]
220	lrvr	@x[15],@x[15]
221
222	la	@t[0],0(@t[0],%r14)		# reconstruct output pointer
223	cl${g}r	%r14,@t[1]
224	jh	.Ltail
225
226	x	@x[0],4*0(%r14)			# xor with input
227	x	@x[1],4*1(%r14)
228	st	@x[0],4*0(@t[0])		# store output
229	x	@x[2],4*2(%r14)
230	st	@x[1],4*1(@t[0])
231	x	@x[3],4*3(%r14)
232	st	@x[2],4*2(@t[0])
233	x	@x[4],4*4(%r14)
234	st	@x[3],4*3(@t[0])
235	 lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)	# load x[8]-x[11]
236	x	@x[5],4*5(%r14)
237	st	@x[4],4*4(@t[0])
238	x	@x[6],4*6(%r14)
239	 al	@x[0],$stdframe+4*8($sp)
240	st	@x[5],4*5(@t[0])
241	x	@x[7],4*7(%r14)
242	 al	@x[1],$stdframe+4*9($sp)
243	st	@x[6],4*6(@t[0])
244	x	@x[12],4*12(%r14)
245	 al	@x[2],$stdframe+4*10($sp)
246	st	@x[7],4*7(@t[0])
247	x	@x[13],4*13(%r14)
248	 al	@x[3],$stdframe+4*11($sp)
249	st	@x[12],4*12(@t[0])
250	x	@x[14],4*14(%r14)
251	st	@x[13],4*13(@t[0])
252	x	@x[15],4*15(%r14)
253	st	@x[14],4*14(@t[0])
254	 lrvr	@x[0],@x[0]
255	st	@x[15],4*15(@t[0])
256	 lrvr	@x[1],@x[1]
257	 lrvr	@x[2],@x[2]
258	 lrvr	@x[3],@x[3]
259	lhi	@x[12],1
260	 x	@x[0],4*8(%r14)
261	al	@x[12],$stdframe+4*12($sp)	# increment counter
262	 x	@x[1],4*9(%r14)
263	 st	@x[0],4*8(@t[0])
264	 x	@x[2],4*10(%r14)
265	 st	@x[1],4*9(@t[0])
266	 x	@x[3],4*11(%r14)
267	 st	@x[2],4*10(@t[0])
268	 st	@x[3],4*11(@t[0])
269
270	cl${g}r	%r14,@t[1]			# done yet?
271	la	%r14,64(%r14)
272	jl	.Loop_outer
273
274.Ldone:
275	xgr	%r0,%r0
276	xgr	%r1,%r1
277	xgr	%r2,%r2
278	xgr	%r3,%r3
279	stmg	%r0,%r3,$stdframe+4*4($sp)	# wipe key copy
280	stmg	%r0,%r3,$stdframe+4*12($sp)
281
282	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
283	br	%r14
284
285.align	16
286.Ltail:
287	la	@t[1],64($t[1])
288	stm	@x[0],@x[7],$stdframe+4*0($sp)
289	sl${g}r	@t[1],%r14
290	lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)
291	l${g}hi	@x[6],0
292	stm	@x[12],@x[15],$stdframe+4*12($sp)
293	al	@x[0],$stdframe+4*8($sp)
294	al	@x[1],$stdframe+4*9($sp)
295	al	@x[2],$stdframe+4*10($sp)
296	al	@x[3],$stdframe+4*11($sp)
297	lrvr	@x[0],@x[0]
298	lrvr	@x[1],@x[1]
299	lrvr	@x[2],@x[2]
300	lrvr	@x[3],@x[3]
301	stm	@x[0],@x[3],$stdframe+4*8($sp)
302
303.Loop_tail:
304	llgc	@x[4],0(@x[6],%r14)
305	llgc	@x[5],$stdframe(@x[6],$sp)
306	xr	@x[5],@x[4]
307	stc	@x[5],0(@x[6],@t[0])
308	la	@x[6],1(@x[6])
309	brct	@t[1],.Loop_tail
310
311	j	.Ldone
312.size	ChaCha20_ctr32,.-ChaCha20_ctr32
313
314.align	32
315.Lsigma:
316.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
317.asciz	"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
318.align	4
319___
320
321foreach (split("\n",$code)) {
322	s/\`([^\`]*)\`/eval $1/ge;
323
324	print $_,"\n";
325}
326close STDOUT or die "error closing STDOUT: $!";
327