1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for s390x.
17#
18# June 2017.
19#
20# Below code is [lane complementing] KECCAK_2X implementation (see
21# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22# instead of actually unrolling the loop pair-wise I simply flip
23# pointers to T[][] and A[][] at the end of round. Since number of
24# rounds is even, last round writes to A[][] and everything works out.
25# In the nutshell it's transliteration of x86_64 module, because both
26# architectures have similar capabilities/limitations. Performance
27# measurement is problematic as I don't have access to an idle system.
28# It looks like z13 processes one byte [out of long message] in ~14
29# cycles. At least the result is consistent with estimate based on
30# amount of instruction and assumed instruction issue rate. It's ~2.5x
31# faster than compiler-generated code.
32
33# $output is the last argument if it looks like a file (it has an extension)
34# $flavour is the first argument if it doesn't look like a file
35$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
36$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
37
38if ($flavour =~ /3[12]/) {
39	$SIZE_T=4;
40	$g="";
41} else {
42	$SIZE_T=8;
43	$g="g";
44}
45
46$output and open STDOUT,">$output";
47
48my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
49
50my @C = map("%r$_",(0,1,5..7));
51my @D = map("%r$_",(8..12));
52my @T = map("%r$_",(13..14));
53my ($src,$dst,$iotas) = map("%r$_",(2..4));
54my $sp = "%r15";
55
56$stdframe=16*$SIZE_T+4*8;
57$frame=$stdframe+25*8;
58
59my @rhotates = ([  0,  1, 62, 28, 27 ],
60                [ 36, 44,  6, 55, 20 ],
61                [  3, 10, 43, 25, 39 ],
62                [ 41, 45, 15, 21,  8 ],
63                [ 18,  2, 61, 56, 14 ]);
64
65{ my @C = @C;	# copy, because we mess them up...
66  my @D = @D;
67
68$code.=<<___;
69.text
70
71.type	__KeccakF1600,\@function
72.align	32
73__KeccakF1600:
74	st${g}	%r14,$SIZE_T*14($sp)
75	lg	@C[0],$A[4][0]($src)
76	lg	@C[1],$A[4][1]($src)
77	lg	@C[2],$A[4][2]($src)
78	lg	@C[3],$A[4][3]($src)
79	lg	@C[4],$A[4][4]($src)
80	larl	$iotas,iotas
81	j	.Loop
82
83.align	16
84.Loop:
85	lg	@D[0],$A[0][0]($src)
86	lg	@D[1],$A[1][1]($src)
87	lg	@D[2],$A[2][2]($src)
88	lg	@D[3],$A[3][3]($src)
89
90	xgr	@C[0],@D[0]
91	xg	@C[1],$A[0][1]($src)
92	xg	@C[2],$A[0][2]($src)
93	xg	@C[3],$A[0][3]($src)
94	lgr	@D[4],@C[4]
95	xg	@C[4],$A[0][4]($src)
96
97	xg	@C[0],$A[1][0]($src)
98	xgr	@C[1],@D[1]
99	xg	@C[2],$A[1][2]($src)
100	xg	@C[3],$A[1][3]($src)
101	xg	@C[4],$A[1][4]($src)
102
103	xg	@C[0],$A[2][0]($src)
104	xg	@C[1],$A[2][1]($src)
105	xgr	@C[2],@D[2]
106	xg	@C[3],$A[2][3]($src)
107	xg	@C[4],$A[2][4]($src)
108
109	xg	@C[0],$A[3][0]($src)
110	xg	@C[1],$A[3][1]($src)
111	xg	@C[2],$A[3][2]($src)
112	xgr	@C[3],@D[3]
113	xg	@C[4],$A[3][4]($src)
114
115	lgr	@T[0],@C[2]
116	rllg	@C[2],@C[2],1
117	xgr	@C[2],@C[0]		# D[1] = ROL64(C[2], 1) ^ C[0]
118
119	rllg	@C[0],@C[0],1
120	xgr	@C[0],@C[3]		# D[4] = ROL64(C[0], 1) ^ C[3]
121
122	rllg	@C[3],@C[3],1
123	xgr	@C[3],@C[1]		# D[2] = ROL64(C[3], 1) ^ C[1]
124
125	rllg	@C[1],@C[1],1
126	xgr	@C[1],@C[4]		# D[0] = ROL64(C[1], 1) ^ C[4]
127
128	rllg	@C[4],@C[4],1
129	xgr	@C[4],@T[0]		# D[3] = ROL64(C[4], 1) ^ C[2]
130___
131	(@D[0..4], @C) = (@C[1..4,0], @D);
132$code.=<<___;
133	xgr	@C[1],@D[1]
134	xgr	@C[2],@D[2]
135	xgr	@C[3],@D[3]
136	 rllg	@C[1],@C[1],$rhotates[1][1]
137	xgr	@C[4],@D[4]
138	 rllg	@C[2],@C[2],$rhotates[2][2]
139	xgr	@C[0],@D[0]
140
141	lgr	@T[0],@C[1]
142	ogr	@C[1],@C[2]
143	 rllg	@C[3],@C[3],$rhotates[3][3]
144	xgr	@C[1],@C[0]		#	    C[0] ^ ( C[1] | C[2])
145	 rllg	@C[4],@C[4],$rhotates[4][4]
146	xg	@C[1],0($iotas)
147	la	$iotas,8($iotas)
148	stg	@C[1],$A[0][0]($dst)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
149
150	lgr	@T[1],@C[4]
151	ngr	@C[4],@C[3]
152	 lghi	@C[1],-1		# no 'not' instruction :-(
153	xgr	@C[4],@C[2]		#	    C[2] ^ ( C[4] & C[3])
154	 xgr	@C[2],@C[1]		# not	@C[2]
155	stg	@C[4],$A[0][2]($dst)	# R[0][2] = C[2] ^ ( C[4] & C[3])
156	 ogr	@C[2],@C[3]
157	 xgr	@C[2],@T[0]		#	    C[1] ^ (~C[2] | C[3])
158
159	ngr	@T[0],@C[0]
160	 stg	@C[2],$A[0][1]($dst)	# R[0][1] = C[1] ^ (~C[2] | C[3])
161	xgr	@T[0],@T[1]		#	    C[4] ^ ( C[1] & C[0])
162	 ogr	@T[1],@C[0]
163	stg	@T[0],$A[0][4]($dst)	# R[0][4] = C[4] ^ ( C[1] & C[0])
164	 xgr	@T[1],@C[3]		#	    C[3] ^ ( C[4] | C[0])
165	 stg	@T[1],$A[0][3]($dst)	# R[0][3] = C[3] ^ ( C[4] | C[0])
166
167
168	lg	@C[0],$A[0][3]($src)
169	lg	@C[4],$A[4][2]($src)
170	lg	@C[3],$A[3][1]($src)
171	lg	@C[1],$A[1][4]($src)
172	lg	@C[2],$A[2][0]($src)
173
174	xgr	@C[0],@D[3]
175	xgr	@C[4],@D[2]
176	 rllg	@C[0],@C[0],$rhotates[0][3]
177	xgr	@C[3],@D[1]
178	 rllg	@C[4],@C[4],$rhotates[4][2]
179	xgr	@C[1],@D[4]
180	 rllg	@C[3],@C[3],$rhotates[3][1]
181	xgr	@C[2],@D[0]
182
183	lgr	@T[0],@C[0]
184	ogr	@C[0],@C[4]
185	 rllg	@C[1],@C[1],$rhotates[1][4]
186	xgr	@C[0],@C[3]		#	    C[3] ^ (C[0] |  C[4])
187	 rllg	@C[2],@C[2],$rhotates[2][0]
188	stg	@C[0],$A[1][3]($dst)	# R[1][3] = C[3] ^ (C[0] |  C[4])
189
190	lgr	@T[1],@C[1]
191	ngr	@C[1],@T[0]
192	 lghi	@C[0],-1		# no 'not' instruction :-(
193	xgr	@C[1],@C[4]		#	    C[4] ^ (C[1] &  C[0])
194	 xgr	@C[4],@C[0]		# not	@C[4]
195	stg	@C[1],$A[1][4]($dst)	# R[1][4] = C[4] ^ (C[1] &  C[0])
196
197	 ogr	@C[4],@C[3]
198	 xgr	@C[4],@C[2]		#	    C[2] ^ (~C[4] | C[3])
199
200	ngr	@C[3],@C[2]
201	 stg	@C[4],$A[1][2]($dst)	# R[1][2] = C[2] ^ (~C[4] | C[3])
202	xgr	@C[3],@T[1]		#	    C[1] ^ (C[3] &  C[2])
203	 ogr	@T[1],@C[2]
204	stg	@C[3],$A[1][1]($dst)	# R[1][1] = C[1] ^ (C[3] &  C[2])
205	 xgr	@T[1],@T[0]		#	    C[0] ^ (C[1] |  C[2])
206	 stg	@T[1],$A[1][0]($dst)	# R[1][0] = C[0] ^ (C[1] |  C[2])
207
208
209	lg	@C[2],$A[2][3]($src)
210	lg	@C[3],$A[3][4]($src)
211	lg	@C[1],$A[1][2]($src)
212	lg	@C[4],$A[4][0]($src)
213	lg	@C[0],$A[0][1]($src)
214
215	xgr	@C[2],@D[3]
216	xgr	@C[3],@D[4]
217	 rllg	@C[2],@C[2],$rhotates[2][3]
218	xgr	@C[1],@D[2]
219	 rllg	@C[3],@C[3],$rhotates[3][4]
220	xgr	@C[4],@D[0]
221	 rllg	@C[1],@C[1],$rhotates[1][2]
222	xgr	@C[0],@D[1]
223
224	lgr	@T[0],@C[2]
225	ngr	@C[2],@C[3]
226	 rllg	@C[4],@C[4],$rhotates[4][0]
227	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] & C[3])
228	lghi	@T[1],-1		# no 'not' instruction :-(
229	stg	@C[2],$A[2][1]($dst)	# R[2][1] =  C[1] ^ ( C[2] & C[3])
230
231	xgr	@C[3],@T[1]		# not	@C[3]
232	lgr	@T[1],@C[4]
233	ngr	@C[4],@C[3]
234	 rllg	@C[0],@C[0],$rhotates[0][1]
235	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] & ~C[3])
236	 ogr	@T[0],@C[1]
237	stg	@C[4],$A[2][2]($dst)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])
238	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] | C[1])
239
240	ngr	@C[1],@C[0]
241	 stg	@T[0],$A[2][0]($dst)	# R[2][0] =  C[0] ^ ( C[2] | C[1])
242	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] & C[0])
243	 ogr	@C[0],@T[1]
244	stg	@C[1],$A[2][4]($dst)	# R[2][4] =  C[4] ^ ( C[1] & C[0])
245	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] | C[4])
246	 stg	@C[0],$A[2][3]($dst)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
247
248
249	lg	@C[2],$A[2][1]($src)
250	lg	@C[3],$A[3][2]($src)
251	lg	@C[1],$A[1][0]($src)
252	lg	@C[4],$A[4][3]($src)
253	lg	@C[0],$A[0][4]($src)
254
255	xgr	@C[2],@D[1]
256	xgr	@C[3],@D[2]
257	 rllg	@C[2],@C[2],$rhotates[2][1]
258	xgr	@C[1],@D[0]
259	 rllg	@C[3],@C[3],$rhotates[3][2]
260	xgr	@C[4],@D[3]
261	 rllg	@C[1],@C[1],$rhotates[1][0]
262	xgr	@C[0],@D[4]
263	 rllg	@C[4],@C[4],$rhotates[4][3]
264
265	lgr	@T[0],@C[2]
266	ogr	@C[2],@C[3]
267	lghi	@T[1],-1		# no 'not' instruction :-(
268	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] | C[3])
269	xgr	@C[3],@T[1]		# not	@C[3]
270	stg	@C[2],$A[3][1]($dst)	# R[3][1] =  C[1] ^ ( C[2] | C[3])
271
272	lgr	@T[1],@C[4]
273	ogr	@C[4],@C[3]
274	 rllg	@C[0],@C[0],$rhotates[0][4]
275	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] | ~C[3])
276	 ngr	@T[0],@C[1]
277	stg	@C[4],$A[3][2]($dst)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])
278	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] & C[1])
279
280	ogr	@C[1],@C[0]
281	 stg	@T[0],$A[3][0]($dst)	# R[3][0] =  C[0] ^ ( C[2] & C[1])
282	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] | C[0])
283	 ngr	@C[0],@T[1]
284	stg	@C[1],$A[3][4]($dst)	# R[3][4] =  C[4] ^ ( C[1] | C[0])
285	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] & C[4])
286	 stg	@C[0],$A[3][3]($dst)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
287
288
289	xg	@D[2],$A[0][2]($src)
290	xg	@D[3],$A[1][3]($src)
291	xg	@D[1],$A[4][1]($src)
292	xg	@D[4],$A[2][4]($src)
293	xgr	$dst,$src		# xchg	$dst,$src
294	 rllg	@D[2],@D[2],$rhotates[0][2]
295	xg	@D[0],$A[3][0]($src)
296	 rllg	@D[3],@D[3],$rhotates[1][3]
297	xgr	$src,$dst
298	 rllg	@D[1],@D[1],$rhotates[4][1]
299	xgr	$dst,$src
300	 rllg	@D[4],@D[4],$rhotates[2][4]
301___
302	@C = @D[2..4,0,1];
303$code.=<<___;
304	lgr	@T[0],@C[0]
305	ngr	@C[0],@C[1]
306	lghi	@T[1],-1		# no 'not' instruction :-(
307	xgr	@C[0],@C[4]		#	     C[4] ^ ( C[0] & C[1])
308	xgr	@C[1],@T[1]		# not	@C[1]
309	stg	@C[0],$A[4][4]($src)	# R[4][4] =  C[4] ^ ( C[0] & C[1])
310
311	lgr	@T[1],@C[2]
312	ngr	@C[2],@C[1]
313	 rllg	@D[0],@D[0],$rhotates[3][0]
314	xgr	@C[2],@T[0]		#	     C[0] ^ ( C[2] & ~C[1])
315	 ogr	@T[0],@C[4]
316	stg	@C[2],$A[4][0]($src)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])
317	 xgr	@T[0],@C[3]		#	     C[3] ^ ( C[0] | C[4])
318
319	ngr	@C[4],@C[3]
320	 stg	@T[0],$A[4][3]($src)	# R[4][3] =  C[3] ^ ( C[0] | C[4])
321	xgr	@C[4],@T[1]		#	     C[2] ^ ( C[4] & C[3])
322	 ogr	@C[3],@T[1]
323	stg	@C[4],$A[4][2]($src)	# R[4][2] =  C[2] ^ ( C[4] & C[3])
324	 xgr	@C[3],@C[1]		#	    ~C[1] ^ ( C[2] | C[3])
325
326	lgr	@C[1],@C[0]		# harmonize with the loop top
327	lgr	@C[0],@T[0]
328	 stg	@C[3],$A[4][1]($src)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
329
330	tmll	$iotas,255
331	jnz	.Loop
332
333	l${g}	%r14,$SIZE_T*14($sp)
334	br	%r14
335.size	__KeccakF1600,.-__KeccakF1600
336___
337}
338{
339$code.=<<___;
340.type	KeccakF1600,\@function
341.align	32
342KeccakF1600:
343.LKeccakF1600:
344	lghi	%r1,-$frame
345	stm${g}	%r6,%r15,$SIZE_T*6($sp)
346	lgr	%r0,$sp
347	la	$sp,0(%r1,$sp)
348	st${g}	%r0,0($sp)
349
350	lghi	@D[0],-1		# no 'not' instruction :-(
351	lghi	@D[1],-1
352	lghi	@D[2],-1
353	lghi	@D[3],-1
354	lghi	@D[4],-1
355	lghi	@T[0],-1
356	xg	@D[0],$A[0][1]($src)
357	xg	@D[1],$A[0][2]($src)
358	xg	@D[2],$A[1][3]($src)
359	xg	@D[3],$A[2][2]($src)
360	xg	@D[4],$A[3][2]($src)
361	xg	@T[0],$A[4][0]($src)
362	stmg	@D[0],@D[1],$A[0][1]($src)
363	stg	@D[2],$A[1][3]($src)
364	stg	@D[3],$A[2][2]($src)
365	stg	@D[4],$A[3][2]($src)
366	stg	@T[0],$A[4][0]($src)
367
368	la	$dst,$stdframe($sp)
369
370	bras	%r14,__KeccakF1600
371
372	lghi	@D[0],-1		# no 'not' instruction :-(
373	lghi	@D[1],-1
374	lghi	@D[2],-1
375	lghi	@D[3],-1
376	lghi	@D[4],-1
377	lghi	@T[0],-1
378	xg	@D[0],$A[0][1]($src)
379	xg	@D[1],$A[0][2]($src)
380	xg	@D[2],$A[1][3]($src)
381	xg	@D[3],$A[2][2]($src)
382	xg	@D[4],$A[3][2]($src)
383	xg	@T[0],$A[4][0]($src)
384	stmg	@D[0],@D[1],$A[0][1]($src)
385	stg	@D[2],$A[1][3]($src)
386	stg	@D[3],$A[2][2]($src)
387	stg	@D[4],$A[3][2]($src)
388	stg	@T[0],$A[4][0]($src)
389
390	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
391	br	%r14
392.size	KeccakF1600,.-KeccakF1600
393___
394}
395{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
396
397$code.=<<___;
398.globl	SHA3_absorb
399.type	SHA3_absorb,\@function
400.align	32
401SHA3_absorb:
402	lghi	%r1,-$frame
403	stm${g}	%r5,%r15,$SIZE_T*5($sp)
404	lgr	%r0,$sp
405	la	$sp,0(%r1,$sp)
406	st${g}	%r0,0($sp)
407
408	lghi	@D[0],-1		# no 'not' instruction :-(
409	lghi	@D[1],-1
410	lghi	@D[2],-1
411	lghi	@D[3],-1
412	lghi	@D[4],-1
413	lghi	@T[0],-1
414	xg	@D[0],$A[0][1]($src)
415	xg	@D[1],$A[0][2]($src)
416	xg	@D[2],$A[1][3]($src)
417	xg	@D[3],$A[2][2]($src)
418	xg	@D[4],$A[3][2]($src)
419	xg	@T[0],$A[4][0]($src)
420	stmg	@D[0],@D[1],$A[0][1]($src)
421	stg	@D[2],$A[1][3]($src)
422	stg	@D[3],$A[2][2]($src)
423	stg	@D[4],$A[3][2]($src)
424	stg	@T[0],$A[4][0]($src)
425
426.Loop_absorb:
427	cl${g}r	$len,$bsz
428	jl	.Ldone_absorb
429
430	srl${g}	$bsz,3
431	la	%r1,0($A_flat)
432
433.Lblock_absorb:
434	lrvg	%r0,0($inp)
435	la	$inp,8($inp)
436	xg	%r0,0(%r1)
437	a${g}hi	$len,-8
438	stg	%r0,0(%r1)
439	la	%r1,8(%r1)
440	brct	$bsz,.Lblock_absorb
441
442	stm${g}	$inp,$len,$frame+3*$SIZE_T($sp)
443	la	$dst,$stdframe($sp)
444	bras	%r14,__KeccakF1600
445	lm${g}	$inp,$bsz,$frame+3*$SIZE_T($sp)
446	j	.Loop_absorb
447
448.align	16
449.Ldone_absorb:
450	lghi	@D[0],-1		# no 'not' instruction :-(
451	lghi	@D[1],-1
452	lghi	@D[2],-1
453	lghi	@D[3],-1
454	lghi	@D[4],-1
455	lghi	@T[0],-1
456	xg	@D[0],$A[0][1]($src)
457	xg	@D[1],$A[0][2]($src)
458	xg	@D[2],$A[1][3]($src)
459	xg	@D[3],$A[2][2]($src)
460	xg	@D[4],$A[3][2]($src)
461	xg	@T[0],$A[4][0]($src)
462	stmg	@D[0],@D[1],$A[0][1]($src)
463	stg	@D[2],$A[1][3]($src)
464	stg	@D[3],$A[2][2]($src)
465	stg	@D[4],$A[3][2]($src)
466	stg	@T[0],$A[4][0]($src)
467
468	lgr	%r2,$len		# return value
469
470	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
471	br	%r14
472.size	SHA3_absorb,.-SHA3_absorb
473___
474}
475{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
476
477$code.=<<___;
478.globl	SHA3_squeeze
479.type	SHA3_squeeze,\@function
480.align	32
481SHA3_squeeze:
482	srl${g}	$bsz,3
483	st${g}	%r14,2*$SIZE_T($sp)
484	lghi	%r14,8
485	st${g}	$bsz,5*$SIZE_T($sp)
486	la	%r1,0($A_flat)
487
488	j	.Loop_squeeze
489
490.align	16
491.Loop_squeeze:
492	cl${g}r $len,%r14
493	jl	.Ltail_squeeze
494
495	lrvg	%r0,0(%r1)
496	la	%r1,8(%r1)
497	stg	%r0,0($out)
498	la	$out,8($out)
499	a${g}hi	$len,-8			# len -= 8
500	jz	.Ldone_squeeze
501
502	brct	$bsz,.Loop_squeeze	# bsz--
503
504	stm${g}	$out,$len,3*$SIZE_T($sp)
505	bras	%r14,.LKeccakF1600
506	lm${g}	$out,$bsz,3*$SIZE_T($sp)
507	lghi	%r14,8
508	la	%r1,0($A_flat)
509	j	.Loop_squeeze
510
511.Ltail_squeeze:
512	lg	%r0,0(%r1)
513.Loop_tail_squeeze:
514	stc	%r0,0($out)
515	la	$out,1($out)
516	srlg	%r0,8
517	brct	$len,.Loop_tail_squeeze
518
519.Ldone_squeeze:
520	l${g}	%r14,2*$SIZE_T($sp)
521	br	%r14
522.size	SHA3_squeeze,.-SHA3_squeeze
523___
524}
525$code.=<<___;
526.align	256
527	.quad	0,0,0,0,0,0,0,0
528.type	iotas,\@object
529iotas:
530	.quad	0x0000000000000001
531	.quad	0x0000000000008082
532	.quad	0x800000000000808a
533	.quad	0x8000000080008000
534	.quad	0x000000000000808b
535	.quad	0x0000000080000001
536	.quad	0x8000000080008081
537	.quad	0x8000000000008009
538	.quad	0x000000000000008a
539	.quad	0x0000000000000088
540	.quad	0x0000000080008009
541	.quad	0x000000008000000a
542	.quad	0x000000008000808b
543	.quad	0x800000000000008b
544	.quad	0x8000000000008089
545	.quad	0x8000000000008003
546	.quad	0x8000000000008002
547	.quad	0x8000000000000080
548	.quad	0x000000000000800a
549	.quad	0x800000008000000a
550	.quad	0x8000000080008081
551	.quad	0x8000000000008080
552	.quad	0x0000000080000001
553	.quad	0x8000000080008008
554.size	iotas,.-iotas
555.asciz	"Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
556___
557
558# unlike 32-bit shift 64-bit one takes three arguments
559$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
560
561print $code;
562close STDOUT or die "error closing STDOUT: $!";
563