1#!/usr/bin/env perl
2# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason.
22# 64-bit lanes of vector registers can't be addressed as easily as in
23# 32-bit mode. This means that 64-bit NEON is bound to be slower than
24# 32-bit NEON, and this implementation is faster than 32-bit NEON on
25# same processor. Even though it takes more scalar xor's and andn's,
26# it gets compensated by availability of rotate. Not to forget that
27# most processors achieve higher issue rate with scalar instructions.
28#
29# February 2018.
30#
# Add hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that eliminates
# copies to temporary registers. If you look closely you'll
34# notice that it uses only one lane of vector registers. The new
35# instructions effectively facilitate parallel hashing, which we don't
36# support [yet?]. But lowest-level core procedure is prepared for it.
37# The inner round is 67 [vector] instructions, so it's not actually
38# obvious that it will provide performance improvement [in serial
39# hash] as long as vector instructions issue rate is limited to 1 per
40# cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45#		r=1088(*)
46#
47# Cortex-A53	13
48# Cortex-A57	12
49# X-Gene	14
50# Mongoose	10
51# Kryo		12
52# Denver	7.8
53# Apple A7	7.2
54#
55# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
56#	because they vary too much from compiler to compiler. Newer
57#	compiler does much better and improvement varies from 5% on
58#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
59#	compiler this code is at least 2x faster...
60
# Command line: <flavour> <output>; both are handed on verbatim to
# arm-xlate.pl, which does the flavour-specific translation.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
# relative to the script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator, run with the same perl binary ($^X).  Fail loudly if the
# pipe cannot be set up instead of silently producing nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
71
# Per-lane rotation counts in bits.  $rhotates[i][j] is used together
# with lane $A[i][j] by the Rho+Pi code further down, which emits
# "ror/xar ...,#64-n", i.e. a rotate right by 64-n bits.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Emit the table of 24 64-bit round constants under the label "iotas".
# It is preceded by 8 zero quads (64 bytes) after ".align 8"; with the
# 24*8 = 192 bytes of constants the table then ends 256 bytes past the
# aligned address (assuming .align takes a power-of-two exponent here).
# KeccakF1600_int relies on this: it tests the low 8 bits of its
# post-incremented table pointer ("tst ...,#255") to detect the last round.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
								{{{
# Scalar (general-purpose register) implementation.
#
# Register allocation: state lane A[i][j] lives in x(5*i+j), i.e.
# x0..x24, except that A[3][3] (which would be x18) is moved to x25.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

# Four scratch registers.  The last one is x30, the link register, which
# is why KeccakF1600_int saves x30 at [sp,#24] on entry and reloads it
# right before its "ret".
my @C = map("x$_", (26,27,28,30));

# KeccakF1600_int: the round loop, operating on the state held in
# registers.  It uses the 32 bytes at [sp,#0..#31] of the caller's frame:
# [sp,#0] for the offloaded A[0][4]/A[1][4] pair, [sp,#16] for the running
# pointer into iotas and [sp,#24] for the saved x30.
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# Theta needs two more temporaries than @C provides.  Alias C[4] and
	# C[5] onto the registers of A[0][4] and A[1][4], whose values were
	# just stored at [sp,#0] by the "stp" above.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0][4] and
	# A[1][4] by the next chunk, so drop the aliases.
	$C[4]=undef;
	$C[5]=undef;
# One heredoc emitting, in order:
#  - the remainder of the KeccakF1600_int round: the rest of Theta (after
#    reloading the offloaded A[0][4]/A[1][4] pair), Rho+Pi as a chain of
#    "ror" instructions, and Chi+Iota.  The iotas pointer is loaded from
#    [sp,#16], post-incremented by 8 and stored back; "tst ...,#255" on it
#    sets the flags consumed by the closing "bne .Loop";
#  - KeccakF1600: saves x19..x28 and x29/x30, loads the 25 lanes from the
#    buffer at x0, calls KeccakF1600_int and stores the lanes back;
#  - the prologue of SHA3_absorb(A, inp, len, bsz) up to the point where
#    "len - bsz" has been computed and saved at [sp,#48].
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# XOR one block of input into the state, 8 bytes per lane, in flat lane
# order: flat index i maps to $A[i/5][i%5] (Perl truncates the fractional
# array index).  Each generated iteration handles lanes i and i+1:
#  - after lane i, "cmp bsz,#8*(i+2)" + "blo" leaves the chain when bsz is
#    below 8*(i+2) bytes;
#  - after lane i+1, "beq" reuses the flags of that same cmp (the ldr/rev/
#    eor in between do not set flags) and leaves when bsz == 8*(i+2).
# NOTE(review): this presumably relies on bsz being a multiple of 8 --
# confirm against the callers.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
# The 25th lane, A[4][4], is reached only if none of the exits above was
# taken.  Then: save inp, run the permutation, reload inp and the
# (len - bsz, bsz) pair from the stack and loop.  At .Labsorbed the state
# is written back to the buffer whose address was saved at [sp,#32] and
# the remaining length (the value that was found to be below bsz) is
# returned in x0.
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze(A, out, len, bsz).  The four arguments are copied into
# x19..x22, which this function saves and restores, so they are still
# available after the call to KeccakF1600.  x0 is used as the walking
# pointer into the state and x3 as the count of bytes left in the current
# block; both are reset from the copies after each permutation.
#
# Full 8-byte words are stored directly (byte-swapped first on big-endian
# builds).  When fewer than 8 bytes remain, .Lsqueeze_tail stores the word
# one byte at a time starting from its least significant byte, up to 7
# bytes, so no "rev" is applied on that path.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
# Vector-register implementation using the eor3/rax1/xar/bcax
# instructions, which the post-processing loop at the end of this script
# rewrites into raw ".inst" words via unsha3().
#
# Register allocation: state lane A[i][j] lives in v(5*i+j), i.e.
# v0..v24; v25..v31 are temporaries.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));

# KeccakF1600_ce: x9 counts 12 loop iterations and x10 walks the iotas
# table; each iteration contains two unrolled rounds (see the Perl loop
# below), which gives 24 rounds in total.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
# Emit the round body twice.  Instead of moving results back into their
# "home" registers, each round leaves some lanes in temporaries (the lines
# marked "*") and the Perl-level swaps at the bottom of the loop rename
# @A/@C entries accordingly.  Every swap is a plain exchange, so after the
# second round the naming is back to the initial allocation.
for($i=0; $i<2; $i++) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8

	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]

	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]

	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]

	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]

	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
	# Rename: the lanes just written to temporaries become the new
	# A[1][1], A[2][0..1], A[3][0..1], A[4][0..1]; their old registers
	# become the temporaries for the next emitted round.
	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
# Close the .Loop_ce loop of KeccakF1600_ce (x9 counts down to zero), then
# start KeccakF1600_cext: a wrapper that saves d8..d15, loads the 25 lanes
# from the buffer at x0 into d0..d24, runs KeccakF1600_ce and stores the
# lanes back.
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# Lanes 0..23 are loaded in pairs; the loop leaves the global $i at 24,
# which the following heredoc uses for the offset of the odd lane d24.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
# x30 is clobbered by the "bl" and is reloaded from the frame right after
# the call.
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
# Epilogue: only x29 is popped here because x30 was already reloaded
# after the call above.
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___
687
{
# SHA3_absorb_cext(ctx, inp, len, bsz): vector-register counterpart of
# SHA3_absorb.  The arguments stay in x0..x3 for the whole function;
# KeccakF1600_ce only touches x9..x11 and the vector registers.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# Lanes 0..23 are loaded in pairs; the loop leaves the global $i at 24,
# which the following heredoc uses for the offset of the odd lane d24.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# XOR one block of input into the state, one 8-byte lane at a time, via
# v31.  Flat lane index i maps to $A[i/5][i%5].  After the even lane the
# "cmp"/"blo" pair leaves the chain when bsz is below 8*(i+2); after the
# odd lane "beq" reuses the same flags and leaves when bsz == 8*(i+2).
#
# On big-endian builds the loaded lane has to be byte-swapped.  The value
# lives in a vector register, so the swap must be "rev64" (byte reversal
# within each 64-bit element); plain "rev" only exists for general-purpose
# registers and does not assemble with vector operands.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
# 25th lane, reached only if none of the exits above was taken.
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
# The final "subs" went below zero, so adding bsz back yields the number
# of unprocessed bytes, which is the return value.  x30 was clobbered by
# the calls to KeccakF1600_ce and is restored by the closing "ldp".
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
# SHA3_squeeze_cext(ctx, out, len, bsz): counterpart of SHA3_squeeze that
# refreshes the state through KeccakF1600_cext.  The arguments stay in
# x0..x3; x9 is the walking pointer into the state and x10 the count of
# bytes left in the current block.  Both are re-initialised after every
# call, as is x30, which the "bl" clobbers.
#
# In the main loop "beq" after the store still sees the flags of
# "cmp len,#8", i.e. it leaves when exactly 8 bytes were requested.  The
# tail stores the word one byte at a time starting from its least
# significant byte, up to 7 bytes.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
# Identification string embedded in the object file; "\@" keeps Perl from
# interpolating an array inside the heredoc.
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
840
{   my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg) -- hand-assemble one eor3/rax1/xar/bcax
    # instruction so that the output does not depend on assembler support
    # for these mnemonics.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text, e.g. "v25.16b,v6.16b,v30.16b,#64-44"
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <arg>".  The register numbers
    # of the first three operands are OR-ed into the base opcode at bit 0,
    # bit 5 and bit 16; the optional fourth operand (a register number or
    # a "#"-prefixed constant expression such as "64-44") goes in at
    # bit 10.
    #
    # If the mnemonic is unknown or the operands cannot be parsed, the
    # instruction text is returned unchanged so that the problem surfaces
    # in the assembler instead of the instruction silently vanishing from
    # the output.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	exists $opcode{$mnemonic} or return "$mnemonic\t$arg";

	my ($rd,$rn,$rm,$ra) = $arg =~
	    m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or return "$mnemonic\t$arg";

	# Third and fourth operands are optional and contribute 0 if absent.
	$rm = 0 unless defined $rm;
	# The capture is limited to digits and '-', so the string eval can
	# only evaluate integer arithmetic like "64-44".
	my $imm = defined $ra ? eval($ra) : 0;
	$imm = 0 unless defined $imm;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($imm<<10),
			$mnemonic,$arg;
    }
}
855
# Post-process the accumulated assembly line by line and print it to
# STDOUT, which at this point is the pipe into arm-xlate.pl.
foreach(split("\n",$code)) {

	# Expand `...` spans by evaluating their contents as Perl.
	s/\`([^\`]*)\`/eval($1)/ge;

	# "dup" lines get their .16b arrangement rewritten to .2d; only if
	# that did not apply is the line checked for one of the four
	# instructions that unsha3() turns into a raw .inst word.
	m/\bdup\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

# Closing the pipe waits for the translator; a false return means a write
# error or a non-zero exit status, either of which leaves the output
# unusable, so fail instead of exiting successfully.
close STDOUT or die "error closing STDOUT: $! (\$?=$?)";
867