#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for s390x.
#
# June 2017.
#
# Below code is [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of round. Since number of
# rounds is even, last round writes to A[][] and everything works out.
# In a nutshell it's transliteration of x86_64 module, because both
# architectures have similar capabilities/limitations. Performance
# measurement is problematic as I don't have access to an idle system.
# It looks like z13 processes one byte [out of long message] in ~14
# cycles. At least the result is consistent with estimate based on
# number of instructions and assumed instruction issue rate. It's ~2.5x
# faster than compiler-generated code.

# Command-line handling and generator-wide configuration.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A flavour containing "31" or "32" selects 4-byte pointers and the plain
# (non-"g") load/store mnemonics; anything else, including no flavour at
# all, selects the 64-bit variants.
if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Redirect the generated assembly to the requested file.  Use three-argument
# open so the file name is never parsed as a mode, and fail loudly instead of
# silently emitting the code to the inherited STDOUT.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Byte offsets of the 25 state lanes: $A[$y][$x] == 8*(5*$y+$x).
my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));

# Register allocation used by the round function.
my @C = map("%r$_",(0,1,5..7));		# %r0,%r1,%r5,%r6,%r7
my @D = map("%r$_",(8..12));		# %r8..%r12
my @T = map("%r$_",(13..14));		# %r13,%r14 (scratch)
my ($src,$dst,$iotas) = map("%r$_",(2..4));
my $sp = "%r15";

# $stdframe is presumably the ABI-mandated register save area (TODO confirm
# against the s390x ELF ABI); $frame adds 25 eight-byte slots on top of it,
# which the entry points below pass as the temporary state buffer
# ($stdframe($sp)).
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+25*8;

# Rotation amounts indexed the same way as @A, i.e. $rhotates[$y][$x].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# __KeccakF1600: internal round loop.  Reads the state through $src, writes
# each round's result through $dst and swaps the two pointers at the end of
# every round.  One round constant is consumed per iteration through $iotas;
# the loop ends when the low 8 bits of $iotas become zero, which relies on
# the placement of the iotas table emitted further down.  %r14 is saved to
# and restored from the caller-provided frame because it doubles as @T[1].
{ my @C = @C;	# copy, because we mess them up...
  my @D = @D;

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	st${g}	%r14,$SIZE_T*14($sp)
	lg	@C[0],$A[4][0]($src)
	lg	@C[1],$A[4][1]($src)
	lg	@C[2],$A[4][2]($src)
	lg	@C[3],$A[4][3]($src)
	lg	@C[4],$A[4][4]($src)
	larl	$iotas,iotas
	j	.Loop

.align	16
.Loop:
	lg	@D[0],$A[0][0]($src)
	lg	@D[1],$A[1][1]($src)
	lg	@D[2],$A[2][2]($src)
	lg	@D[3],$A[3][3]($src)

	xgr	@C[0],@D[0]
	xg	@C[1],$A[0][1]($src)
	xg	@C[2],$A[0][2]($src)
	xg	@C[3],$A[0][3]($src)
	lgr	@D[4],@C[4]
	xg	@C[4],$A[0][4]($src)

	xg	@C[0],$A[1][0]($src)
	xgr	@C[1],@D[1]
	xg	@C[2],$A[1][2]($src)
	xg	@C[3],$A[1][3]($src)
	xg	@C[4],$A[1][4]($src)

	xg	@C[0],$A[2][0]($src)
	xg	@C[1],$A[2][1]($src)
	xgr	@C[2],@D[2]
	xg	@C[3],$A[2][3]($src)
	xg	@C[4],$A[2][4]($src)

	xg	@C[0],$A[3][0]($src)
	xg	@C[1],$A[3][1]($src)
	xg	@C[2],$A[3][2]($src)
	xgr	@C[3],@D[3]
	xg	@C[4],$A[3][4]($src)

	lgr	@T[0],@C[2]
	rllg	@C[2],@C[2],1
	xgr	@C[2],@C[0]		# D[1] = ROL64(C[2], 1) ^ C[0]

	rllg	@C[0],@C[0],1
	xgr	@C[0],@C[3]		# D[4] = ROL64(C[0], 1) ^ C[3]

	rllg	@C[3],@C[3],1
	xgr	@C[3],@C[1]		# D[2] = ROL64(C[3], 1) ^ C[1]

	rllg	@C[1],@C[1],1
	xgr	@C[1],@C[4]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rllg	@C[4],@C[4],1
	xgr	@C[4],@T[0]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	# The D[] values were just computed in place in the C registers
	# (D[0] in C[1], ..., D[3] in C[4], D[4] in C[0]), while the old D
	# registers still hold the diagonal lanes A[0][0], A[1][1], A[2][2],
	# A[3][3] and the copy of A[4][4].  Swap the Perl-side names so the
	# following code can refer to them naturally.
	(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xgr	@C[1],@D[1]
	xgr	@C[2],@D[2]
	xgr	@C[3],@D[3]
	 rllg	@C[1],@C[1],$rhotates[1][1]
	xgr	@C[4],@D[4]
	 rllg	@C[2],@C[2],$rhotates[2][2]
	xgr	@C[0],@D[0]

	lgr	@T[0],@C[1]
	ogr	@C[1],@C[2]
	 rllg	@C[3],@C[3],$rhotates[3][3]
	xgr	@C[1],@C[0]		#	    C[0] ^ ( C[1] | C[2])
	 rllg	@C[4],@C[4],$rhotates[4][4]
	xg	@C[1],0($iotas)
	la	$iotas,8($iotas)
	stg	@C[1],$A[0][0]($dst)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]

	lgr	@T[1],@C[4]
	ngr	@C[4],@C[3]
	 lghi	@C[1],-1		# no 'not' instruction :-(
	xgr	@C[4],@C[2]		#	    C[2] ^ ( C[4] & C[3])
	 xgr	@C[2],@C[1]		# not	@C[2]
	stg	@C[4],$A[0][2]($dst)	# R[0][2] = C[2] ^ ( C[4] & C[3])
	 ogr	@C[2],@C[3]
	 xgr	@C[2],@T[0]		#	    C[1] ^ (~C[2] | C[3])

	ngr	@T[0],@C[0]
	 stg	@C[2],$A[0][1]($dst)	# R[0][1] = C[1] ^ (~C[2] | C[3])
	xgr	@T[0],@T[1]		#	    C[4] ^ ( C[1] & C[0])
	 ogr	@T[1],@C[0]
	stg	@T[0],$A[0][4]($dst)	# R[0][4] = C[4] ^ ( C[1] & C[0])
	 xgr	@T[1],@C[3]		#	    C[3] ^ ( C[4] | C[0])
	 stg	@T[1],$A[0][3]($dst)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	lg	@C[0],$A[0][3]($src)
	lg	@C[4],$A[4][2]($src)
	lg	@C[3],$A[3][1]($src)
	lg	@C[1],$A[1][4]($src)
	lg	@C[2],$A[2][0]($src)

	xgr	@C[0],@D[3]
	xgr	@C[4],@D[2]
	 rllg	@C[0],@C[0],$rhotates[0][3]
	xgr	@C[3],@D[1]
	 rllg	@C[4],@C[4],$rhotates[4][2]
	xgr	@C[1],@D[4]
	 rllg	@C[3],@C[3],$rhotates[3][1]
	xgr	@C[2],@D[0]

	lgr	@T[0],@C[0]
	ogr	@C[0],@C[4]
	 rllg	@C[1],@C[1],$rhotates[1][4]
	xgr	@C[0],@C[3]		#	    C[3] ^ (C[0] |  C[4])
	 rllg	@C[2],@C[2],$rhotates[2][0]
	stg	@C[0],$A[1][3]($dst)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	lgr	@T[1],@C[1]
	ngr	@C[1],@T[0]
	 lghi	@C[0],-1		# no 'not' instruction :-(
	xgr	@C[1],@C[4]		#	    C[4] ^ (C[1] &  C[0])
	 xgr	@C[4],@C[0]		# not	@C[4]
	stg	@C[1],$A[1][4]($dst)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	 ogr	@C[4],@C[3]
	 xgr	@C[4],@C[2]		#	    C[2] ^ (~C[4] | C[3])

	ngr	@C[3],@C[2]
	 stg	@C[4],$A[1][2]($dst)	# R[1][2] = C[2] ^ (~C[4] | C[3])
	xgr	@C[3],@T[1]		#	    C[1] ^ (C[3] &  C[2])
	 ogr	@T[1],@C[2]
	stg	@C[3],$A[1][1]($dst)	# R[1][1] = C[1] ^ (C[3] &  C[2])
	 xgr	@T[1],@T[0]		#	    C[0] ^ (C[1] |  C[2])
	 stg	@T[1],$A[1][0]($dst)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	lg	@C[2],$A[2][3]($src)
	lg	@C[3],$A[3][4]($src)
	lg	@C[1],$A[1][2]($src)
	lg	@C[4],$A[4][0]($src)
	lg	@C[0],$A[0][1]($src)

	xgr	@C[2],@D[3]
	xgr	@C[3],@D[4]
	 rllg	@C[2],@C[2],$rhotates[2][3]
	xgr	@C[1],@D[2]
	 rllg	@C[3],@C[3],$rhotates[3][4]
	xgr	@C[4],@D[0]
	 rllg	@C[1],@C[1],$rhotates[1][2]
	xgr	@C[0],@D[1]

	lgr	@T[0],@C[2]
	ngr	@C[2],@C[3]
	 rllg	@C[4],@C[4],$rhotates[4][0]
	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] & C[3])
	lghi	@T[1],-1		# no 'not' instruction :-(
	stg	@C[2],$A[2][1]($dst)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	xgr	@C[3],@T[1]		# not	@C[3]
	lgr	@T[1],@C[4]
	ngr	@C[4],@C[3]
	 rllg	@C[0],@C[0],$rhotates[0][1]
	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] & ~C[3])
	 ogr	@T[0],@C[1]
	stg	@C[4],$A[2][2]($dst)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])
	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] | C[1])

	ngr	@C[1],@C[0]
	 stg	@T[0],$A[2][0]($dst)	# R[2][0] =  C[0] ^ ( C[2] | C[1])
	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] & C[0])
	 ogr	@C[0],@T[1]
	stg	@C[1],$A[2][4]($dst)	# R[2][4] =  C[4] ^ ( C[1] & C[0])
	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] | C[4])
	 stg	@C[0],$A[2][3]($dst)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	lg	@C[2],$A[2][1]($src)
	lg	@C[3],$A[3][2]($src)
	lg	@C[1],$A[1][0]($src)
	lg	@C[4],$A[4][3]($src)
	lg	@C[0],$A[0][4]($src)

	xgr	@C[2],@D[1]
	xgr	@C[3],@D[2]
	 rllg	@C[2],@C[2],$rhotates[2][1]
	xgr	@C[1],@D[0]
	 rllg	@C[3],@C[3],$rhotates[3][2]
	xgr	@C[4],@D[3]
	 rllg	@C[1],@C[1],$rhotates[1][0]
	xgr	@C[0],@D[4]
	 rllg	@C[4],@C[4],$rhotates[4][3]

	lgr	@T[0],@C[2]
	ogr	@C[2],@C[3]
	lghi	@T[1],-1		# no 'not' instruction :-(
	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] | C[3])
	xgr	@C[3],@T[1]		# not	@C[3]
	stg	@C[2],$A[3][1]($dst)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	lgr	@T[1],@C[4]
	ogr	@C[4],@C[3]
	 rllg	@C[0],@C[0],$rhotates[0][4]
	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] | ~C[3])
	 ngr	@T[0],@C[1]
	stg	@C[4],$A[3][2]($dst)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])
	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] & C[1])

	ogr	@C[1],@C[0]
	 stg	@T[0],$A[3][0]($dst)	# R[3][0] =  C[0] ^ ( C[2] & C[1])
	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] | C[0])
	 ngr	@C[0],@T[1]
	stg	@C[1],$A[3][4]($dst)	# R[3][4] =  C[4] ^ ( C[1] | C[0])
	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] & C[4])
	 stg	@C[0],$A[3][3]($dst)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xg	@D[2],$A[0][2]($src)
	xg	@D[3],$A[1][3]($src)
	xg	@D[1],$A[4][1]($src)
	xg	@D[4],$A[2][4]($src)
	xgr	$dst,$src		# xchg	$dst,$src
	 rllg	@D[2],@D[2],$rhotates[0][2]
	xg	@D[0],$A[3][0]($src)
	 rllg	@D[3],@D[3],$rhotates[1][3]
	xgr	$src,$dst
	 rllg	@D[1],@D[1],$rhotates[4][1]
	xgr	$dst,$src
	 rllg	@D[4],@D[4],$rhotates[2][4]
___
	# For the last row the D registers were consumed in place (each was
	# XORed with its lane and rotated above), so from here on they are
	# addressed as C[0..4].  $src and $dst have already been exchanged,
	# which is why the stores below go through $src.
	@C = @D[2..4,0,1];
$code.=<<___;
	lgr	@T[0],@C[0]
	ngr	@C[0],@C[1]
	lghi	@T[1],-1		# no 'not' instruction :-(
	xgr	@C[0],@C[4]		#	     C[4] ^ ( C[0] & C[1])
	xgr	@C[1],@T[1]		# not	@C[1]
	stg	@C[0],$A[4][4]($src)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	lgr	@T[1],@C[2]
	ngr	@C[2],@C[1]
	 rllg	@D[0],@D[0],$rhotates[3][0]
	xgr	@C[2],@T[0]		#	     C[0] ^ ( C[2] & ~C[1])
	 ogr	@T[0],@C[4]
	stg	@C[2],$A[4][0]($src)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])
	 xgr	@T[0],@C[3]		#	     C[3] ^ ( C[0] | C[4])

	ngr	@C[4],@C[3]
	 stg	@T[0],$A[4][3]($src)	# R[4][3] =  C[3] ^ ( C[0] | C[4])
	xgr	@C[4],@T[1]		#	     C[2] ^ ( C[4] & C[3])
	 ogr	@C[3],@T[1]
	stg	@C[4],$A[4][2]($src)	# R[4][2] =  C[2] ^ ( C[4] & C[3])
	 xgr	@C[3],@C[1]		#	    ~C[1] ^ ( C[2] | C[3])

	lgr	@C[1],@C[0]		# harmonize with the loop top
	lgr	@C[0],@T[0]
	 stg	@C[3],$A[4][1]($src)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	tmll	$iotas,255
	jnz	.Loop

	l${g}	%r14,$SIZE_T*14($sp)
	br	%r14
.size	__KeccakF1600,.-__KeccakF1600
___
}
# KeccakF1600: stand-alone permutation entry point.  The state pointer is
# expected in $src (%r2).  The routine allocates $frame bytes of stack,
# complements lanes A[0][1], A[0][2], A[1][3], A[2][2], A[3][2] and A[4][0]
# in place, runs __KeccakF1600 with $dst pointing at the scratch area at
# $stdframe($sp), and complements the same six lanes again on the way out.
# The local label .LKeccakF1600 is what SHA3_squeeze branches to.
{
$code.=<<___;
.type	KeccakF1600,\@function
.align	32
KeccakF1600:
.LKeccakF1600:
	lghi	%r1,-$frame
	stm${g}	%r6,%r15,$SIZE_T*6($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	lghi	@D[0],-1		# no 'not' instruction :-(
	lghi	@D[1],-1
	lghi	@D[2],-1
	lghi	@D[3],-1
	lghi	@D[4],-1
	lghi	@T[0],-1
	xg	@D[0],$A[0][1]($src)
	xg	@D[1],$A[0][2]($src)
	xg	@D[2],$A[1][3]($src)
	xg	@D[3],$A[2][2]($src)
	xg	@D[4],$A[3][2]($src)
	xg	@T[0],$A[4][0]($src)
	stmg	@D[0],@D[1],$A[0][1]($src)
	stg	@D[2],$A[1][3]($src)
	stg	@D[3],$A[2][2]($src)
	stg	@D[4],$A[3][2]($src)
	stg	@T[0],$A[4][0]($src)

	la	$dst,$stdframe($sp)

	bras	%r14,__KeccakF1600

	lghi	@D[0],-1		# no 'not' instruction :-(
	lghi	@D[1],-1
	lghi	@D[2],-1
	lghi	@D[3],-1
	lghi	@D[4],-1
	lghi	@T[0],-1
	xg	@D[0],$A[0][1]($src)
	xg	@D[1],$A[0][2]($src)
	xg	@D[2],$A[1][3]($src)
	xg	@D[3],$A[2][2]($src)
	xg	@D[4],$A[3][2]($src)
	xg	@T[0],$A[4][0]($src)
	stmg	@D[0],@D[1],$A[0][1]($src)
	stg	@D[2],$A[1][3]($src)
	stg	@D[3],$A[2][2]($src)
	stg	@D[4],$A[3][2]($src)
	stg	@T[0],$A[4][0]($src)

	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
	br	%r14
.size	KeccakF1600,.-KeccakF1600
___
}
# SHA3_absorb: arguments arrive in %r2..%r5 as (state, input, length,
# block size).  While at least one full block remains ($len >= $bsz) the
# routine XORs $bsz/8 byte-reversed 8-byte words from the input into the
# start of the state, decrementing $len by 8 per word, and then runs
# __KeccakF1600 using the stack scratch area as $dst.  The six complemented
# lanes are flipped on entry and flipped back before returning.  The
# unprocessed remainder of $len is returned in %r2.
{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	lghi	%r1,-$frame
	stm${g}	%r5,%r15,$SIZE_T*5($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	lghi	@D[0],-1		# no 'not' instruction :-(
	lghi	@D[1],-1
	lghi	@D[2],-1
	lghi	@D[3],-1
	lghi	@D[4],-1
	lghi	@T[0],-1
	xg	@D[0],$A[0][1]($src)
	xg	@D[1],$A[0][2]($src)
	xg	@D[2],$A[1][3]($src)
	xg	@D[3],$A[2][2]($src)
	xg	@D[4],$A[3][2]($src)
	xg	@T[0],$A[4][0]($src)
	stmg	@D[0],@D[1],$A[0][1]($src)
	stg	@D[2],$A[1][3]($src)
	stg	@D[3],$A[2][2]($src)
	stg	@D[4],$A[3][2]($src)
	stg	@T[0],$A[4][0]($src)

.Loop_absorb:
	cl${g}r	$len,$bsz
	jl	.Ldone_absorb

	srl${g}	$bsz,3
	la	%r1,0($A_flat)

.Lblock_absorb:
	lrvg	%r0,0($inp)
	la	$inp,8($inp)
	xg	%r0,0(%r1)
	a${g}hi	$len,-8
	stg	%r0,0(%r1)
	la	%r1,8(%r1)
	brct	$bsz,.Lblock_absorb

	stm${g}	$inp,$len,$frame+3*$SIZE_T($sp)
	la	$dst,$stdframe($sp)
	bras	%r14,__KeccakF1600
	lm${g}	$inp,$bsz,$frame+3*$SIZE_T($sp)
	j	.Loop_absorb

.align	16
.Ldone_absorb:
	lghi	@D[0],-1		# no 'not' instruction :-(
	lghi	@D[1],-1
	lghi	@D[2],-1
	lghi	@D[3],-1
	lghi	@D[4],-1
	lghi	@T[0],-1
	xg	@D[0],$A[0][1]($src)
	xg	@D[1],$A[0][2]($src)
	xg	@D[2],$A[1][3]($src)
	xg	@D[3],$A[2][2]($src)
	xg	@D[4],$A[3][2]($src)
	xg	@T[0],$A[4][0]($src)
	stmg	@D[0],@D[1],$A[0][1]($src)
	stg	@D[2],$A[1][3]($src)
	stg	@D[3],$A[2][2]($src)
	stg	@D[4],$A[3][2]($src)
	stg	@T[0],$A[4][0]($src)

	lgr	%r2,$len		# return value

	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
	br	%r14
.size	SHA3_absorb,.-SHA3_absorb
___
}
# SHA3_squeeze: arguments arrive in %r2..%r5 as (state, output, length,
# block size).  The block size is converted to a count of 8-byte words and
# parked in the caller's frame so it can be reloaded after each
# permutation.  Full words are copied out byte-reversed; once a block's
# worth has been emitted, .LKeccakF1600 is called and copying restarts from
# the beginning of the state.  A final partial word (fewer than 8 bytes) is
# stored one byte at a time, low-order byte first.
#
# NOTE(review): the tail loop is entered whenever $len < 8 and counts down
# with brct, so it appears to assume $len is non-zero on entry -- confirm
# that callers never request zero bytes.
{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	srl${g}	$bsz,3
	st${g}	%r14,2*$SIZE_T($sp)
	lghi	%r14,8
	st${g}	$bsz,5*$SIZE_T($sp)
	la	%r1,0($A_flat)

	j	.Loop_squeeze

.align	16
.Loop_squeeze:
	cl${g}r $len,%r14
	jl	.Ltail_squeeze

	lrvg	%r0,0(%r1)
	la	%r1,8(%r1)
	stg	%r0,0($out)
	la	$out,8($out)
	a${g}hi	$len,-8			# len -= 8
	jz	.Ldone_squeeze

	brct	$bsz,.Loop_squeeze	# bsz--

	stm${g}	$out,$len,3*$SIZE_T($sp)
	bras	%r14,.LKeccakF1600
	lm${g}	$out,$bsz,3*$SIZE_T($sp)
	lghi	%r14,8
	la	%r1,0($A_flat)
	j	.Loop_squeeze

.Ltail_squeeze:
	lg	%r0,0(%r1)
.Loop_tail_squeeze:
	stc	%r0,0($out)
	la	$out,1($out)
	srlg	%r0,8
	brct	$len,.Loop_tail_squeeze

.Ldone_squeeze:
	l${g}	%r14,2*$SIZE_T($sp)
	br	%r14
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
# Round-constant table.  The table is deliberately positioned: after a
# 256-byte alignment come eight zero quads (64 bytes) and then the 24
# constants (192 bytes), so the table ends exactly on a 256-byte boundary.
# The round loop uses that to detect completion by testing the low 8 bits
# of its table pointer instead of keeping a separate counter.
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated assembly and write it out.

# expand_srlg($asm): return $asm with every "srlg %rX,N" rewritten as
# "srlg %rX,%rX,N".  The templates above write the 64-bit shift with two
# operands for symmetry with the 32-bit form, but, unlike 32-bit shift,
# 64-bit one takes three arguments.
sub expand_srlg {
	my ($asm) = @_;
	$asm =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
	return $asm;
}

$code = expand_srlg($code);

print $code;
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";