1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for s390x.
17#
18# June 2017.
19#
20# Below code is [lane complementing] KECCAK_2X implementation (see
21# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22# instead of actually unrolling the loop pair-wise I simply flip
23# pointers to T[][] and A[][] at the end of round. Since number of
24# rounds is even, last round writes to A[][] and everything works out.
25# In the nutshell it's transliteration of x86_64 module, because both
26# architectures have similar capabilities/limitations. Performance
27# measurement is problematic as I don't have access to an idle system.
28# It looks like z13 processes one byte [out of long message] in ~14
29# cycles. At least the result is consistent with estimate based on
30# amount of instruction and assumed instruction issue rate. It's ~2.5x
31# faster than compiler-generated code.
32
33$flavour = shift;
34
35if ($flavour =~ /3[12]/) {
36	$SIZE_T=4;
37	$g="";
38} else {
39	$SIZE_T=8;
40	$g="g";
41}
42
43while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
44open STDOUT,">$output";
45
46my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
47
48my @C = map("%r$_",(0,1,5..7));
49my @D = map("%r$_",(8..12));
50my @T = map("%r$_",(13..14));
51my ($src,$dst,$iotas) = map("%r$_",(2..4));
52my $sp = "%r15";
53
54$stdframe=16*$SIZE_T+4*8;
55$frame=$stdframe+25*8;
56
57my @rhotates = ([  0,  1, 62, 28, 27 ],
58                [ 36, 44,  6, 55, 20 ],
59                [  3, 10, 43, 25, 39 ],
60                [ 41, 45, 15, 21,  8 ],
61                [ 18,  2, 61, 56, 14 ]);
62
63{ my @C = @C;	# copy, because we mess them up...
64  my @D = @D;
65
66$code.=<<___;
67.text
68
69.type	__KeccakF1600,\@function
70.align	32
71__KeccakF1600:
72	st${g}	%r14,$SIZE_T*14($sp)
73	lg	@C[0],$A[4][0]($src)
74	lg	@C[1],$A[4][1]($src)
75	lg	@C[2],$A[4][2]($src)
76	lg	@C[3],$A[4][3]($src)
77	lg	@C[4],$A[4][4]($src)
78	larl	$iotas,iotas
79	j	.Loop
80
81.align	16
82.Loop:
83	lg	@D[0],$A[0][0]($src)
84	lg	@D[1],$A[1][1]($src)
85	lg	@D[2],$A[2][2]($src)
86	lg	@D[3],$A[3][3]($src)
87
88	xgr	@C[0],@D[0]
89	xg	@C[1],$A[0][1]($src)
90	xg	@C[2],$A[0][2]($src)
91	xg	@C[3],$A[0][3]($src)
92	lgr	@D[4],@C[4]
93	xg	@C[4],$A[0][4]($src)
94
95	xg	@C[0],$A[1][0]($src)
96	xgr	@C[1],@D[1]
97	xg	@C[2],$A[1][2]($src)
98	xg	@C[3],$A[1][3]($src)
99	xg	@C[4],$A[1][4]($src)
100
101	xg	@C[0],$A[2][0]($src)
102	xg	@C[1],$A[2][1]($src)
103	xgr	@C[2],@D[2]
104	xg	@C[3],$A[2][3]($src)
105	xg	@C[4],$A[2][4]($src)
106
107	xg	@C[0],$A[3][0]($src)
108	xg	@C[1],$A[3][1]($src)
109	xg	@C[2],$A[3][2]($src)
110	xgr	@C[3],@D[3]
111	xg	@C[4],$A[3][4]($src)
112
113	lgr	@T[0],@C[2]
114	rllg	@C[2],@C[2],1
115	xgr	@C[2],@C[0]		# D[1] = ROL64(C[2], 1) ^ C[0]
116
117	rllg	@C[0],@C[0],1
118	xgr	@C[0],@C[3]		# D[4] = ROL64(C[0], 1) ^ C[3]
119
120	rllg	@C[3],@C[3],1
121	xgr	@C[3],@C[1]		# D[2] = ROL64(C[3], 1) ^ C[1]
122
123	rllg	@C[1],@C[1],1
124	xgr	@C[1],@C[4]		# D[0] = ROL64(C[1], 1) ^ C[4]
125
126	rllg	@C[4],@C[4],1
127	xgr	@C[4],@T[0]		# D[3] = ROL64(C[4], 1) ^ C[2]
128___
129	(@D[0..4], @C) = (@C[1..4,0], @D);
130$code.=<<___;
131	xgr	@C[1],@D[1]
132	xgr	@C[2],@D[2]
133	xgr	@C[3],@D[3]
134	 rllg	@C[1],@C[1],$rhotates[1][1]
135	xgr	@C[4],@D[4]
136	 rllg	@C[2],@C[2],$rhotates[2][2]
137	xgr	@C[0],@D[0]
138
139	lgr	@T[0],@C[1]
140	ogr	@C[1],@C[2]
141	 rllg	@C[3],@C[3],$rhotates[3][3]
142	xgr	@C[1],@C[0]		#	    C[0] ^ ( C[1] | C[2])
143	 rllg	@C[4],@C[4],$rhotates[4][4]
144	xg	@C[1],0($iotas)
145	la	$iotas,8($iotas)
146	stg	@C[1],$A[0][0]($dst)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
147
148	lgr	@T[1],@C[4]
149	ngr	@C[4],@C[3]
150	 lghi	@C[1],-1		# no 'not' instruction :-(
151	xgr	@C[4],@C[2]		#	    C[2] ^ ( C[4] & C[3])
152	 xgr	@C[2],@C[1]		# not	@C[2]
153	stg	@C[4],$A[0][2]($dst)	# R[0][2] = C[2] ^ ( C[4] & C[3])
154	 ogr	@C[2],@C[3]
155	 xgr	@C[2],@T[0]		#	    C[1] ^ (~C[2] | C[3])
156
157	ngr	@T[0],@C[0]
158	 stg	@C[2],$A[0][1]($dst)	# R[0][1] = C[1] ^ (~C[2] | C[3])
159	xgr	@T[0],@T[1]		#	    C[4] ^ ( C[1] & C[0])
160	 ogr	@T[1],@C[0]
161	stg	@T[0],$A[0][4]($dst)	# R[0][4] = C[4] ^ ( C[1] & C[0])
162	 xgr	@T[1],@C[3]		#	    C[3] ^ ( C[4] | C[0])
163	 stg	@T[1],$A[0][3]($dst)	# R[0][3] = C[3] ^ ( C[4] | C[0])
164
165
166	lg	@C[0],$A[0][3]($src)
167	lg	@C[4],$A[4][2]($src)
168	lg	@C[3],$A[3][1]($src)
169	lg	@C[1],$A[1][4]($src)
170	lg	@C[2],$A[2][0]($src)
171
172	xgr	@C[0],@D[3]
173	xgr	@C[4],@D[2]
174	 rllg	@C[0],@C[0],$rhotates[0][3]
175	xgr	@C[3],@D[1]
176	 rllg	@C[4],@C[4],$rhotates[4][2]
177	xgr	@C[1],@D[4]
178	 rllg	@C[3],@C[3],$rhotates[3][1]
179	xgr	@C[2],@D[0]
180
181	lgr	@T[0],@C[0]
182	ogr	@C[0],@C[4]
183	 rllg	@C[1],@C[1],$rhotates[1][4]
184	xgr	@C[0],@C[3]		#	    C[3] ^ (C[0] |  C[4])
185	 rllg	@C[2],@C[2],$rhotates[2][0]
186	stg	@C[0],$A[1][3]($dst)	# R[1][3] = C[3] ^ (C[0] |  C[4])
187
188	lgr	@T[1],@C[1]
189	ngr	@C[1],@T[0]
190	 lghi	@C[0],-1		# no 'not' instruction :-(
191	xgr	@C[1],@C[4]		#	    C[4] ^ (C[1] &  C[0])
192	 xgr	@C[4],@C[0]		# not	@C[4]
193	stg	@C[1],$A[1][4]($dst)	# R[1][4] = C[4] ^ (C[1] &  C[0])
194
195	 ogr	@C[4],@C[3]
196	 xgr	@C[4],@C[2]		#	    C[2] ^ (~C[4] | C[3])
197
198	ngr	@C[3],@C[2]
199	 stg	@C[4],$A[1][2]($dst)	# R[1][2] = C[2] ^ (~C[4] | C[3])
200	xgr	@C[3],@T[1]		#	    C[1] ^ (C[3] &  C[2])
201	 ogr	@T[1],@C[2]
202	stg	@C[3],$A[1][1]($dst)	# R[1][1] = C[1] ^ (C[3] &  C[2])
203	 xgr	@T[1],@T[0]		#	    C[0] ^ (C[1] |  C[2])
204	 stg	@T[1],$A[1][0]($dst)	# R[1][0] = C[0] ^ (C[1] |  C[2])
205
206
207	lg	@C[2],$A[2][3]($src)
208	lg	@C[3],$A[3][4]($src)
209	lg	@C[1],$A[1][2]($src)
210	lg	@C[4],$A[4][0]($src)
211	lg	@C[0],$A[0][1]($src)
212
213	xgr	@C[2],@D[3]
214	xgr	@C[3],@D[4]
215	 rllg	@C[2],@C[2],$rhotates[2][3]
216	xgr	@C[1],@D[2]
217	 rllg	@C[3],@C[3],$rhotates[3][4]
218	xgr	@C[4],@D[0]
219	 rllg	@C[1],@C[1],$rhotates[1][2]
220	xgr	@C[0],@D[1]
221
222	lgr	@T[0],@C[2]
223	ngr	@C[2],@C[3]
224	 rllg	@C[4],@C[4],$rhotates[4][0]
225	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] & C[3])
226	lghi	@T[1],-1		# no 'not' instruction :-(
227	stg	@C[2],$A[2][1]($dst)	# R[2][1] =  C[1] ^ ( C[2] & C[3])
228
229	xgr	@C[3],@T[1]		# not	@C[3]
230	lgr	@T[1],@C[4]
231	ngr	@C[4],@C[3]
232	 rllg	@C[0],@C[0],$rhotates[0][1]
233	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] & ~C[3])
234	 ogr	@T[0],@C[1]
235	stg	@C[4],$A[2][2]($dst)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])
236	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] | C[1])
237
238	ngr	@C[1],@C[0]
239	 stg	@T[0],$A[2][0]($dst)	# R[2][0] =  C[0] ^ ( C[2] | C[1])
240	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] & C[0])
241	 ogr	@C[0],@T[1]
242	stg	@C[1],$A[2][4]($dst)	# R[2][4] =  C[4] ^ ( C[1] & C[0])
243	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] | C[4])
244	 stg	@C[0],$A[2][3]($dst)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
245
246
247	lg	@C[2],$A[2][1]($src)
248	lg	@C[3],$A[3][2]($src)
249	lg	@C[1],$A[1][0]($src)
250	lg	@C[4],$A[4][3]($src)
251	lg	@C[0],$A[0][4]($src)
252
253	xgr	@C[2],@D[1]
254	xgr	@C[3],@D[2]
255	 rllg	@C[2],@C[2],$rhotates[2][1]
256	xgr	@C[1],@D[0]
257	 rllg	@C[3],@C[3],$rhotates[3][2]
258	xgr	@C[4],@D[3]
259	 rllg	@C[1],@C[1],$rhotates[1][0]
260	xgr	@C[0],@D[4]
261	 rllg	@C[4],@C[4],$rhotates[4][3]
262
263	lgr	@T[0],@C[2]
264	ogr	@C[2],@C[3]
265	lghi	@T[1],-1		# no 'not' instruction :-(
266	xgr	@C[2],@C[1]		#	     C[1] ^ ( C[2] | C[3])
267	xgr	@C[3],@T[1]		# not	@C[3]
268	stg	@C[2],$A[3][1]($dst)	# R[3][1] =  C[1] ^ ( C[2] | C[3])
269
270	lgr	@T[1],@C[4]
271	ogr	@C[4],@C[3]
272	 rllg	@C[0],@C[0],$rhotates[0][4]
273	xgr	@C[4],@T[0]		#	     C[2] ^ ( C[4] | ~C[3])
274	 ngr	@T[0],@C[1]
275	stg	@C[4],$A[3][2]($dst)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])
276	 xgr	@T[0],@C[0]		#	     C[0] ^ ( C[2] & C[1])
277
278	ogr	@C[1],@C[0]
279	 stg	@T[0],$A[3][0]($dst)	# R[3][0] =  C[0] ^ ( C[2] & C[1])
280	xgr	@C[1],@T[1]		#	     C[4] ^ ( C[1] | C[0])
281	 ngr	@C[0],@T[1]
282	stg	@C[1],$A[3][4]($dst)	# R[3][4] =  C[4] ^ ( C[1] | C[0])
283	 xgr	@C[0],@C[3]		#	    ~C[3] ^ ( C[0] & C[4])
284	 stg	@C[0],$A[3][3]($dst)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
285
286
287	xg	@D[2],$A[0][2]($src)
288	xg	@D[3],$A[1][3]($src)
289	xg	@D[1],$A[4][1]($src)
290	xg	@D[4],$A[2][4]($src)
291	xgr	$dst,$src		# xchg	$dst,$src
292	 rllg	@D[2],@D[2],$rhotates[0][2]
293	xg	@D[0],$A[3][0]($src)
294	 rllg	@D[3],@D[3],$rhotates[1][3]
295	xgr	$src,$dst
296	 rllg	@D[1],@D[1],$rhotates[4][1]
297	xgr	$dst,$src
298	 rllg	@D[4],@D[4],$rhotates[2][4]
299___
300	@C = @D[2..4,0,1];
301$code.=<<___;
302	lgr	@T[0],@C[0]
303	ngr	@C[0],@C[1]
304	lghi	@T[1],-1		# no 'not' instruction :-(
305	xgr	@C[0],@C[4]		#	     C[4] ^ ( C[0] & C[1])
306	xgr	@C[1],@T[1]		# not	@C[1]
307	stg	@C[0],$A[4][4]($src)	# R[4][4] =  C[4] ^ ( C[0] & C[1])
308
309	lgr	@T[1],@C[2]
310	ngr	@C[2],@C[1]
311	 rllg	@D[0],@D[0],$rhotates[3][0]
312	xgr	@C[2],@T[0]		#	     C[0] ^ ( C[2] & ~C[1])
313	 ogr	@T[0],@C[4]
314	stg	@C[2],$A[4][0]($src)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])
315	 xgr	@T[0],@C[3]		#	     C[3] ^ ( C[0] | C[4])
316
317	ngr	@C[4],@C[3]
318	 stg	@T[0],$A[4][3]($src)	# R[4][3] =  C[3] ^ ( C[0] | C[4])
319	xgr	@C[4],@T[1]		#	     C[2] ^ ( C[4] & C[3])
320	 ogr	@C[3],@T[1]
321	stg	@C[4],$A[4][2]($src)	# R[4][2] =  C[2] ^ ( C[4] & C[3])
322	 xgr	@C[3],@C[1]		#	    ~C[1] ^ ( C[2] | C[3])
323
324	lgr	@C[1],@C[0]		# harmonize with the loop top
325	lgr	@C[0],@T[0]
326	 stg	@C[3],$A[4][1]($src)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
327
328	tmll	$iotas,255
329	jnz	.Loop
330
331	l${g}	%r14,$SIZE_T*14($sp)
332	br	%r14
333.size	__KeccakF1600,.-__KeccakF1600
334___
335}
336{
337$code.=<<___;
338.type	KeccakF1600,\@function
339.align	32
340KeccakF1600:
341.LKeccakF1600:
342	lghi	%r1,-$frame
343	stm${g}	%r6,%r15,$SIZE_T*6($sp)
344	lgr	%r0,$sp
345	la	$sp,0(%r1,$sp)
346	st${g}	%r0,0($sp)
347
348	lghi	@D[0],-1		# no 'not' instruction :-(
349	lghi	@D[1],-1
350	lghi	@D[2],-1
351	lghi	@D[3],-1
352	lghi	@D[4],-1
353	lghi	@T[0],-1
354	xg	@D[0],$A[0][1]($src)
355	xg	@D[1],$A[0][2]($src)
356	xg	@D[2],$A[1][3]($src)
357	xg	@D[3],$A[2][2]($src)
358	xg	@D[4],$A[3][2]($src)
359	xg	@T[0],$A[4][0]($src)
360	stmg	@D[0],@D[1],$A[0][1]($src)
361	stg	@D[2],$A[1][3]($src)
362	stg	@D[3],$A[2][2]($src)
363	stg	@D[4],$A[3][2]($src)
364	stg	@T[0],$A[4][0]($src)
365
366	la	$dst,$stdframe($sp)
367
368	bras	%r14,__KeccakF1600
369
370	lghi	@D[0],-1		# no 'not' instruction :-(
371	lghi	@D[1],-1
372	lghi	@D[2],-1
373	lghi	@D[3],-1
374	lghi	@D[4],-1
375	lghi	@T[0],-1
376	xg	@D[0],$A[0][1]($src)
377	xg	@D[1],$A[0][2]($src)
378	xg	@D[2],$A[1][3]($src)
379	xg	@D[3],$A[2][2]($src)
380	xg	@D[4],$A[3][2]($src)
381	xg	@T[0],$A[4][0]($src)
382	stmg	@D[0],@D[1],$A[0][1]($src)
383	stg	@D[2],$A[1][3]($src)
384	stg	@D[3],$A[2][2]($src)
385	stg	@D[4],$A[3][2]($src)
386	stg	@T[0],$A[4][0]($src)
387
388	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
389	br	%r14
390.size	KeccakF1600,.-KeccakF1600
391___
392}
393{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
394
395$code.=<<___;
396.globl	SHA3_absorb
397.type	SHA3_absorb,\@function
398.align	32
399SHA3_absorb:
400	lghi	%r1,-$frame
401	stm${g}	%r5,%r15,$SIZE_T*5($sp)
402	lgr	%r0,$sp
403	la	$sp,0(%r1,$sp)
404	st${g}	%r0,0($sp)
405
406	lghi	@D[0],-1		# no 'not' instruction :-(
407	lghi	@D[1],-1
408	lghi	@D[2],-1
409	lghi	@D[3],-1
410	lghi	@D[4],-1
411	lghi	@T[0],-1
412	xg	@D[0],$A[0][1]($src)
413	xg	@D[1],$A[0][2]($src)
414	xg	@D[2],$A[1][3]($src)
415	xg	@D[3],$A[2][2]($src)
416	xg	@D[4],$A[3][2]($src)
417	xg	@T[0],$A[4][0]($src)
418	stmg	@D[0],@D[1],$A[0][1]($src)
419	stg	@D[2],$A[1][3]($src)
420	stg	@D[3],$A[2][2]($src)
421	stg	@D[4],$A[3][2]($src)
422	stg	@T[0],$A[4][0]($src)
423
424.Loop_absorb:
425	cl${g}r	$len,$bsz
426	jl	.Ldone_absorb
427
428	srl${g}	$bsz,3
429	la	%r1,0($A_flat)
430
431.Lblock_absorb:
432	lrvg	%r0,0($inp)
433	la	$inp,8($inp)
434	xg	%r0,0(%r1)
435	a${g}hi	$len,-8
436	stg	%r0,0(%r1)
437	la	%r1,8(%r1)
438	brct	$bsz,.Lblock_absorb
439
440	stm${g}	$inp,$len,$frame+3*$SIZE_T($sp)
441	la	$dst,$stdframe($sp)
442	bras	%r14,__KeccakF1600
443	lm${g}	$inp,$bsz,$frame+3*$SIZE_T($sp)
444	j	.Loop_absorb
445
446.align	16
447.Ldone_absorb:
448	lghi	@D[0],-1		# no 'not' instruction :-(
449	lghi	@D[1],-1
450	lghi	@D[2],-1
451	lghi	@D[3],-1
452	lghi	@D[4],-1
453	lghi	@T[0],-1
454	xg	@D[0],$A[0][1]($src)
455	xg	@D[1],$A[0][2]($src)
456	xg	@D[2],$A[1][3]($src)
457	xg	@D[3],$A[2][2]($src)
458	xg	@D[4],$A[3][2]($src)
459	xg	@T[0],$A[4][0]($src)
460	stmg	@D[0],@D[1],$A[0][1]($src)
461	stg	@D[2],$A[1][3]($src)
462	stg	@D[3],$A[2][2]($src)
463	stg	@D[4],$A[3][2]($src)
464	stg	@T[0],$A[4][0]($src)
465
466	lgr	%r2,$len		# return value
467
468	lm${g}	%r6,%r15,$frame+6*$SIZE_T($sp)
469	br	%r14
470.size	SHA3_absorb,.-SHA3_absorb
471___
472}
473{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
474
475$code.=<<___;
476.globl	SHA3_squeeze
477.type	SHA3_squeeze,\@function
478.align	32
479SHA3_squeeze:
480	srl${g}	$bsz,3
481	st${g}	%r14,2*$SIZE_T($sp)
482	lghi	%r14,8
483	st${g}	$bsz,5*$SIZE_T($sp)
484	la	%r1,0($A_flat)
485
486	j	.Loop_squeeze
487
488.align	16
489.Loop_squeeze:
490	cl${g}r $len,%r14
491	jl	.Ltail_squeeze
492
493	lrvg	%r0,0(%r1)
494	la	%r1,8(%r1)
495	stg	%r0,0($out)
496	la	$out,8($out)
497	a${g}hi	$len,-8			# len -= 8
498	jz	.Ldone_squeeze
499
500	brct	$bsz,.Loop_squeeze	# bsz--
501
502	stm${g}	$out,$len,3*$SIZE_T($sp)
503	bras	%r14,.LKeccakF1600
504	lm${g}	$out,$bsz,3*$SIZE_T($sp)
505	lghi	%r14,8
506	la	%r1,0($A_flat)
507	j	.Loop_squeeze
508
509.Ltail_squeeze:
510	lg	%r0,0(%r1)
511.Loop_tail_squeeze:
512	stc	%r0,0($out)
513	la	$out,1($out)
514	srlg	%r0,8
515	brct	$len,.Loop_tail_squeeze
516
517.Ldone_squeeze:
518	l${g}	%r14,2*$SIZE_T($sp)
519	br	%r14
520.size	SHA3_squeeze,.-SHA3_squeeze
521___
522}
523$code.=<<___;
524.align	256
525	.quad	0,0,0,0,0,0,0,0
526.type	iotas,\@object
527iotas:
528	.quad	0x0000000000000001
529	.quad	0x0000000000008082
530	.quad	0x800000000000808a
531	.quad	0x8000000080008000
532	.quad	0x000000000000808b
533	.quad	0x0000000080000001
534	.quad	0x8000000080008081
535	.quad	0x8000000000008009
536	.quad	0x000000000000008a
537	.quad	0x0000000000000088
538	.quad	0x0000000080008009
539	.quad	0x000000008000000a
540	.quad	0x000000008000808b
541	.quad	0x800000000000008b
542	.quad	0x8000000000008089
543	.quad	0x8000000000008003
544	.quad	0x8000000000008002
545	.quad	0x8000000000000080
546	.quad	0x000000000000800a
547	.quad	0x800000008000000a
548	.quad	0x8000000080008081
549	.quad	0x8000000000008080
550	.quad	0x0000000080000001
551	.quad	0x8000000080008008
552.size	iotas,.-iotas
553.asciz	"Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
554___
555
556# unlike 32-bit shift 64-bit one takes three arguments
557$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
558
559print $code;
560close STDOUT or die "error closing STDOUT: $!";
561