1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
20# This is a straightforward KECCAK_1X_ALT implementation. It makes no
21# sense to attempt a SIMD/NEON implementation for the following reason.
22# 64-bit lanes of vector registers can't be addressed as easily as in
23# 32-bit mode. This means that 64-bit NEON is bound to be slower than
24# 32-bit NEON, and this implementation is faster than 32-bit NEON on
25# same processor. Even though it takes more scalar xor's and andn's,
26# it gets compensated by availability of rotate. Not to forget that
27# most processors achieve higher issue rate with scalar instructions.
28#
29# February 2018.
30#
31# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
32# variant with register permutation/rotation twist that allows to
33# eliminate copies to temporary registers. If you look closely you'll
34# notice that it uses only one lane of vector registers. The new
35# instructions effectively facilitate parallel hashing, which we don't
36# support [yet?]. But lowest-level core procedure is prepared for it.
37# The inner round is 67 [vector] instructions, so it's not actually
38# obvious that it will provide performance improvement [in serial
39# hash] as long as vector instructions issue rate is limited to 1 per
40# cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45#		r=1088(*)
46#
47# Cortex-A53	13
48# Cortex-A57	12
49# X-Gene	14
50# Mongoose	10
51# Kryo		12
52# Denver	7.8
53# Apple A7	7.2
54# ThunderX2	9.7
55#
56# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
57#	because they vary too much from compiler to compiler. Newer
58#	compiler does much better and improvement varies from 5% on
59#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
60#	compiler this code is at least 2x faster...
61
# Command-line handling. Both arguments are optional; whichever is absent
# is left undefined.
62# $output is the last argument if it looks like a file (it has an extension)
63# $flavour is the first argument if it doesn't look like a file
64$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
65$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
66
# Locate arm-xlate.pl: first in the directory this script was started from,
# then in ../../perlasm/ relative to that directory.
67$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
69( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
70die "can't locate arm-xlate.pl";
71
# Everything printed to STDOUT from here on is piped into arm-xlate.pl,
# which is run with the current perl ($^X) and is handed the flavour and
# the output file name.
72open OUT,"| \"$^X\" $xlate $flavour \"$output\""
73    or die "can't call $xlate: $!";
74*STDOUT=*OUT;
75
# Per-lane rotation amounts, indexed the same way as the @A register maps.
# The Rho+Pi code emits them as "#64-$rhotates[i][j]" operands of ror/xar,
# i.e. a right rotation by 64-n stands for a left rotation by n.
76my @rhotates = ([  0,  1, 62, 28, 27 ],
77                [ 36, 44,  6, 55, 20 ],
78                [  3, 10, 43, 25, 39 ],
79                [ 41, 45, 15, 21,  8 ],
80                [ 18,  2, 61, 56, 14 ]);
81
# Emit the table of 24 round constants ("iotas").
#
# The layout matters: 8 zero quads (64 bytes) of padding plus 24 constants
# (192 bytes) add up to 256 bytes, so - assuming ".align 8" requests
# 2^8-byte alignment with the target assembler - the address just past the
# last constant has its low 8 bits clear. KeccakF1600_int relies on this:
# it post-increments the table pointer and stops on "tst ...,#255".
82$code.=<<___;
83.text
84
85.align 8	// strategic alignment and padding that allows to use
86		// address value as loop termination condition...
87	.quad	0,0,0,0,0,0,0,0
88.type	iotas,%object
89iotas:
90	.quad	0x0000000000000001
91	.quad	0x0000000000008082
92	.quad	0x800000000000808a
93	.quad	0x8000000080008000
94	.quad	0x000000000000808b
95	.quad	0x0000000080000001
96	.quad	0x8000000080008081
97	.quad	0x8000000000008009
98	.quad	0x000000000000008a
99	.quad	0x0000000000000088
100	.quad	0x0000000080008009
101	.quad	0x000000008000000a
102	.quad	0x000000008000808b
103	.quad	0x800000000000008b
104	.quad	0x8000000000008089
105	.quad	0x8000000000008003
106	.quad	0x8000000000008002
107	.quad	0x8000000000000080
108	.quad	0x000000000000800a
109	.quad	0x800000008000000a
110	.quad	0x8000000080008081
111	.quad	0x8000000000008080
112	.quad	0x0000000080000001
113	.quad	0x8000000080008008
114.size	iotas,.-iotas
115___
116								{{{
# Scalar (general-purpose register) code path.
#
# @A maps the 5x5 state onto x0..x24, one register per lane, except that
# A[3][3] uses x25 instead of x18. @C names the scratch registers
# x26, x27, x28 and x30; because x30 is used as scratch, KeccakF1600_int
# saves it on the stack on entry and reloads it before returning.
117my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
118            (0, 5, 10, 15, 20));
119   $A[3][3] = "x25"; # x18 is reserved
120
121my @C = map("x$_", (26,27,28,30));
122
# KeccakF1600_int: the permutation proper, operating on the state held in
# the @A registers. It stores the iotas pointer at [sp,#16] and x30 at
# [sp,#24], and uses [sp,#0] as spill space for two lanes, so the caller
# must have reserved that stack area.
123$code.=<<___;
124.type	KeccakF1600_int,%function
125.align	5
126KeccakF1600_int:
127	adr	$C[2],iotas
128	.inst	0xd503233f			// paciasp
129	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
130	b	.Loop
131.align	4
132.Loop:
133	////////////////////////////////////////// Theta
134	eor	$C[0],$A[0][0],$A[1][0]
135	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
136	eor	$C[1],$A[0][1],$A[1][1]
137	eor	$C[2],$A[0][2],$A[1][2]
138	eor	$C[3],$A[0][3],$A[1][3]
139___
# A[0][4] and A[1][4] have just been spilled to [sp,#0], so their registers
# are borrowed as two additional temporaries, C[4] and C[5], for the rest
# of the column-parity computation.
140	$C[4]=$A[0][4];
141	$C[5]=$A[1][4];
142$code.=<<___;
143	eor	$C[4],$A[0][4],$A[1][4]
144	eor	$C[0],$C[0],$A[2][0]
145	eor	$C[1],$C[1],$A[2][1]
146	eor	$C[2],$C[2],$A[2][2]
147	eor	$C[3],$C[3],$A[2][3]
148	eor	$C[4],$C[4],$A[2][4]
149	eor	$C[0],$C[0],$A[3][0]
150	eor	$C[1],$C[1],$A[3][1]
151	eor	$C[2],$C[2],$A[3][2]
152	eor	$C[3],$C[3],$A[3][3]
153	eor	$C[4],$C[4],$A[3][4]
154	eor	$C[0],$C[0],$A[4][0]
155	eor	$C[2],$C[2],$A[4][2]
156	eor	$C[1],$C[1],$A[4][1]
157	eor	$C[3],$C[3],$A[4][3]
158	eor	$C[4],$C[4],$A[4][4]
159
160	eor	$C[5],$C[0],$C[2],ror#63
161
162	eor	$A[0][1],$A[0][1],$C[5]
163	eor	$A[1][1],$A[1][1],$C[5]
164	eor	$A[2][1],$A[2][1],$C[5]
165	eor	$A[3][1],$A[3][1],$C[5]
166	eor	$A[4][1],$A[4][1],$C[5]
167
168	eor	$C[5],$C[1],$C[3],ror#63
169	eor	$C[2],$C[2],$C[4],ror#63
170	eor	$C[3],$C[3],$C[0],ror#63
171	eor	$C[4],$C[4],$C[1],ror#63
172
173	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
174	eor	$A[1][2],$A[1][2],$C[5]
175	eor	$A[2][2],$A[2][2],$C[5]
176	eor	$A[3][2],$A[3][2],$C[5]
177	eor	$A[4][2],$A[4][2],$C[5]
178
179	eor	$A[0][0],$A[0][0],$C[4]
180	eor	$A[1][0],$A[1][0],$C[4]
181	eor	$A[2][0],$A[2][0],$C[4]
182	eor	$A[3][0],$A[3][0],$C[4]
183	eor	$A[4][0],$A[4][0],$C[4]
184___
# The borrowed registers are handed back: the next chunk starts by
# reloading A[0][4]/A[1][4] from the stack, so C[4]/C[5] must not be used
# past this point.
185	$C[4]=undef;
186	$C[5]=undef;
# This heredoc emits, in order:
#  - the rest of the KeccakF1600_int round: reload of the two spilled
#    lanes, the remaining Theta xors, Rho+Pi (ror by 64-n) and Chi+Iota.
#    The iotas pointer is kept at [sp,#16]; it is post-incremented once per
#    round and the loop ends when its low 8 bits become zero;
#  - KeccakF1600: saves x19-x28, loads the 25 lanes from the buffer in x0,
#    calls KeccakF1600_int and stores the lanes back;
#  - the SHA3_absorb prologue: arguments are saved at [sp,#32..#63], the
#    state is loaded, and .Loop_absorb leaves via .Labsorbed as soon as
#    len < bsz, otherwise it records len - bsz at [sp,#48].
187$code.=<<___;
188	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
189	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
190	eor	$A[1][3],$A[1][3],$C[2]
191	eor	$A[2][3],$A[2][3],$C[2]
192	eor	$A[3][3],$A[3][3],$C[2]
193	eor	$A[4][3],$A[4][3],$C[2]
194
195	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
196	eor	$A[1][4],$A[1][4],$C[3]
197	eor	$A[2][4],$A[2][4],$C[3]
198	eor	$A[3][4],$A[3][4],$C[3]
199	eor	$A[4][4],$A[4][4],$C[3]
200
201	////////////////////////////////////////// Rho+Pi
202	mov	$C[3],$A[0][1]
203	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
204	//mov	$C[1],$A[0][2]
205	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
206	//mov	$C[0],$A[0][3]
207	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
208	//mov	$C[2],$A[0][4]
209	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]
210
211	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
212	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
213	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
214	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]
215
216	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
217	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
218	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
219	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]
220
221	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
222	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
223	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
224	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]
225
226	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
227	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
228	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
229	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]
230
231	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
232	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
233	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
234	ror	$A[4][0],$C[1],#64-$rhotates[0][2]
235
236	////////////////////////////////////////// Chi+Iota
237	bic	$C[0],$A[0][2],$A[0][1]
238	bic	$C[1],$A[0][3],$A[0][2]
239	bic	$C[2],$A[0][0],$A[0][4]
240	bic	$C[3],$A[0][1],$A[0][0]
241	eor	$A[0][0],$A[0][0],$C[0]
242	bic	$C[0],$A[0][4],$A[0][3]
243	eor	$A[0][1],$A[0][1],$C[1]
244	 ldr	$C[1],[sp,#16]
245	eor	$A[0][3],$A[0][3],$C[2]
246	eor	$A[0][4],$A[0][4],$C[3]
247	eor	$A[0][2],$A[0][2],$C[0]
248	 ldr	$C[3],[$C[1]],#8		// Iota[i++]
249
250	bic	$C[0],$A[1][2],$A[1][1]
251	 tst	$C[1],#255			// are we done?
252	 str	$C[1],[sp,#16]
253	bic	$C[1],$A[1][3],$A[1][2]
254	bic	$C[2],$A[1][0],$A[1][4]
255	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
256	bic	$C[3],$A[1][1],$A[1][0]
257	eor	$A[1][0],$A[1][0],$C[0]
258	bic	$C[0],$A[1][4],$A[1][3]
259	eor	$A[1][1],$A[1][1],$C[1]
260	eor	$A[1][3],$A[1][3],$C[2]
261	eor	$A[1][4],$A[1][4],$C[3]
262	eor	$A[1][2],$A[1][2],$C[0]
263
264	bic	$C[0],$A[2][2],$A[2][1]
265	bic	$C[1],$A[2][3],$A[2][2]
266	bic	$C[2],$A[2][0],$A[2][4]
267	bic	$C[3],$A[2][1],$A[2][0]
268	eor	$A[2][0],$A[2][0],$C[0]
269	bic	$C[0],$A[2][4],$A[2][3]
270	eor	$A[2][1],$A[2][1],$C[1]
271	eor	$A[2][3],$A[2][3],$C[2]
272	eor	$A[2][4],$A[2][4],$C[3]
273	eor	$A[2][2],$A[2][2],$C[0]
274
275	bic	$C[0],$A[3][2],$A[3][1]
276	bic	$C[1],$A[3][3],$A[3][2]
277	bic	$C[2],$A[3][0],$A[3][4]
278	bic	$C[3],$A[3][1],$A[3][0]
279	eor	$A[3][0],$A[3][0],$C[0]
280	bic	$C[0],$A[3][4],$A[3][3]
281	eor	$A[3][1],$A[3][1],$C[1]
282	eor	$A[3][3],$A[3][3],$C[2]
283	eor	$A[3][4],$A[3][4],$C[3]
284	eor	$A[3][2],$A[3][2],$C[0]
285
286	bic	$C[0],$A[4][2],$A[4][1]
287	bic	$C[1],$A[4][3],$A[4][2]
288	bic	$C[2],$A[4][0],$A[4][4]
289	bic	$C[3],$A[4][1],$A[4][0]
290	eor	$A[4][0],$A[4][0],$C[0]
291	bic	$C[0],$A[4][4],$A[4][3]
292	eor	$A[4][1],$A[4][1],$C[1]
293	eor	$A[4][3],$A[4][3],$C[2]
294	eor	$A[4][4],$A[4][4],$C[3]
295	eor	$A[4][2],$A[4][2],$C[0]
296
297	bne	.Loop
298
299	ldr	x30,[sp,#24]
300	.inst	0xd50323bf			// autiasp
301	ret
302.size	KeccakF1600_int,.-KeccakF1600_int
303
304.type	KeccakF1600,%function
305.align	5
306KeccakF1600:
307	.inst	0xd503233f			// paciasp
308	stp	x29,x30,[sp,#-128]!
309	add	x29,sp,#0
310	stp	x19,x20,[sp,#16]
311	stp	x21,x22,[sp,#32]
312	stp	x23,x24,[sp,#48]
313	stp	x25,x26,[sp,#64]
314	stp	x27,x28,[sp,#80]
315	sub	sp,sp,#48
316
317	str	x0,[sp,#32]			// offload argument
318	mov	$C[0],x0
319	ldp	$A[0][0],$A[0][1],[x0,#16*0]
320	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
321	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
322	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
323	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
324	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
325	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
326	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
327	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
328	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
329	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
330	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
331	ldr	$A[4][4],[$C[0],#16*12]
332
333	bl	KeccakF1600_int
334
335	ldr	$C[0],[sp,#32]
336	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
337	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
338	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
339	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
340	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
341	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
342	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
343	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
344	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
345	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
346	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
347	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
348	str	$A[4][4],[$C[0],#16*12]
349
350	ldp	x19,x20,[x29,#16]
351	add	sp,sp,#48
352	ldp	x21,x22,[x29,#32]
353	ldp	x23,x24,[x29,#48]
354	ldp	x25,x26,[x29,#64]
355	ldp	x27,x28,[x29,#80]
356	ldp	x29,x30,[sp],#128
357	.inst	0xd50323bf			// autiasp
358	ret
359.size	KeccakF1600,.-KeccakF1600
360
361.globl	SHA3_absorb
362.type	SHA3_absorb,%function
363.align	5
364SHA3_absorb:
365	.inst	0xd503233f			// paciasp
366	stp	x29,x30,[sp,#-128]!
367	add	x29,sp,#0
368	stp	x19,x20,[sp,#16]
369	stp	x21,x22,[sp,#32]
370	stp	x23,x24,[sp,#48]
371	stp	x25,x26,[sp,#64]
372	stp	x27,x28,[sp,#80]
373	sub	sp,sp,#64
374
375	stp	x0,x1,[sp,#32]			// offload arguments
376	stp	x2,x3,[sp,#48]
377
378	mov	$C[0],x0			// uint64_t A[5][5]
379	mov	$C[1],x1			// const void *inp
380	mov	$C[2],x2			// size_t len
381	mov	$C[3],x3			// size_t bsz
382	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
383	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
384	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
385	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
386	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
387	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
388	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
389	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
390	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
391	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
392	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
393	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
394	ldr	$A[4][4],[$C[0],#16*12]
395	b	.Loop_absorb
396
397.align	4
398.Loop_absorb:
399	subs	$C[0],$C[2],$C[3]		// len - bsz
400	blo	.Labsorbed
401
402	str	$C[0],[sp,#48]			// save len - bsz
403___
# Absorb one block: xor the input into the state one 64-bit lane at a time,
# two lanes ($i and $j) per generated chunk, for lanes 0..23. A single cmp
# of bsz against 8*($i+2) serves both exits: "blo" leaves after lane $i
# when bsz is smaller than that, "beq" leaves after lane $j when it is
# equal. Only $i is interpolated by Perl; the "8*(...)" arithmetic is left
# in the emitted text for the assembler to evaluate.
404for (my $i=0; $i<24; $i+=2) {
405my $j = $i+1;
406$code.=<<___;
407	ldr	$C[0],[$C[1]],#8		// *inp++
408#ifdef	__AARCH64EB__
409	rev	$C[0],$C[0]
410#endif
411	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
412	cmp	$C[3],#8*($i+2)
413	blo	.Lprocess_block
414	ldr	$C[0],[$C[1]],#8		// *inp++
415#ifdef	__AARCH64EB__
416	rev	$C[0],$C[0]
417#endif
418	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
419	beq	.Lprocess_block
420___
421}
# Lane 24 (A[4][4]) is absorbed when none of the exits above was taken.
# .Lprocess_block saves inp, runs the permutation and reloads inp together
# with the updated len (the len - bsz stored earlier) and bsz.
# .Labsorbed writes the state back through the pointer saved at [sp,#32]
# and returns the number of input bytes left over (the len that was found
# to be smaller than bsz).
422$code.=<<___;
423	ldr	$C[0],[$C[1]],#8		// *inp++
424#ifdef	__AARCH64EB__
425	rev	$C[0],$C[0]
426#endif
427	eor	$A[4][4],$A[4][4],$C[0]
428
429.Lprocess_block:
430	str	$C[1],[sp,#40]			// save inp
431
432	bl	KeccakF1600_int
433
434	ldr	$C[1],[sp,#40]			// restore arguments
435	ldp	$C[2],$C[3],[sp,#48]
436	b	.Loop_absorb
437
438.align	4
439.Labsorbed:
440	ldr	$C[1],[sp,#32]
441	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
442	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
443	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
444	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
445	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
446	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
447	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
448	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
449	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
450	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
451	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
452	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
453	str	$A[4][4],[$C[1],#16*12]
454
455	mov	x0,$C[2]			// return value
456	ldp	x19,x20,[x29,#16]
457	add	sp,sp,#64
458	ldp	x21,x22,[x29,#32]
459	ldp	x23,x24,[x29,#48]
460	ldp	x25,x26,[x29,#64]
461	ldp	x27,x28,[x29,#80]
462	ldp	x29,x30,[sp],#128
463	.inst	0xd50323bf			// autiasp
464	ret
465.size	SHA3_absorb,.-SHA3_absorb
466___
{
# SHA3_squeeze(A, out, len, bsz) - scalar code path.
#
# Copies len bytes of the state to out, 8 bytes at a time, and calls
# KeccakF1600 each time bsz bytes have been produced. A final partial word
# (1..7 bytes) is stored byte by byte in .Lsqueeze_tail. The arguments are
# parked in callee-saved x19-x22 so that they survive the KeccakF1600 call.
#
# len == 0 is handled explicitly: without the early exit the very first
# "cmp len,#8 / blo" would fall into the byte tail, whose subs/beq chain
# never observes zero, and seven bytes would be stored past out.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

	cbz	$len,.Lsqueeze_done		// nothing to squeeze

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
542								{{{
# ARMv8.2 SHA3-extension code path.
#
# @A maps the 25 lanes onto v0..v24, @C names v25..v31, and @D re-labels
# five of the @C registers (v29, v30, v31, v27, v28) as D[0]..D[4], the
# values produced by the rax1 instructions at the end of Theta.
543my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
544                             "v".($_+3).".16b", "v".($_+4).".16b" ],
545            (0, 5, 10, 15, 20));
546
547my @C = map("v$_.16b", (25..31));
548my @D = @C[4,5,6,2,3];
549
# KeccakF1600_ce: 24 rounds counted down in x9, with the round constants
# read through x10 (post-incremented ld1r). The eor3/rax1/xar/bcax lines
# are not passed on as mnemonics: the final output pass rewrites them into
# ".inst" words via unsha3(). The trailing "A[x][y]=A[u][v]" comments in the
# assembly record which register temporarily holds which lane.
#
# The same heredoc also opens KeccakF1600_cext, up to the point where
# d8-d15 have been saved.
550$code.=<<___;
551.type	KeccakF1600_ce,%function
552.align	5
553KeccakF1600_ce:
554	mov	x9,#24
555	adr	x10,iotas
556	b	.Loop_ce
557.align	4
558.Loop_ce:
559	////////////////////////////////////////////////// Theta
560	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
561	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
562	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
563	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
564	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
565	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
566	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
567	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
568	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
569	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]
570
571	rax1	$C[5],$C[0],$C[2]			// D[1]
572	rax1	$C[6],$C[1],$C[3]			// D[2]
573	rax1	$C[2],$C[2],$C[4]			// D[3]
574	rax1	$C[3],$C[3],$C[0]			// D[4]
575	rax1	$C[4],$C[4],$C[1]			// D[0]
576
577	////////////////////////////////////////////////// Theta+Rho+Pi
578	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
579
580	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
581	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
582	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
583	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
584	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
585
586	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
587
588	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
589	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
590	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
591	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
592	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
593
594	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
595
596	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
597	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
598	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
599	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
600	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
601
602	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
603
604	eor	$A[0][0],$A[0][0],$D[0]
605
606	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
607	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
608	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
609	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
610	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
611
612	////////////////////////////////////////////////// Chi+Iota
613	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
614	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
615	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
616	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
617	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]
618
619	ld1r	{$C[1]},[x10],#8
620
621	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
622	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
623	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
624	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
625	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]
626
627	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
628	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
629	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
630	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
631	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]
632
633	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
634	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
635	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
636	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
637	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]
638
639	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
640	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
641	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
642	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
643	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]
644
645	eor	$A[0][0],$A[0][0],$C[1]
646
647	subs	x9,x9,#1
648	bne	.Loop_ce
649
650	ret
651.size	KeccakF1600_ce,.-KeccakF1600_ce
652
653.type	KeccakF1600_cext,%function
654.align	5
655KeccakF1600_cext:
656	.inst	0xd503233f		// paciasp
657	stp	x29,x30,[sp,#-80]!
658	add	x29,sp,#0
659	stp	d8,d9,[sp,#16]		// per ABI requirement
660	stp	d10,d11,[sp,#32]
661	stp	d12,d13,[sp,#48]
662	stp	d14,d15,[sp,#64]
663___
# Body of KeccakF1600_cext: load the 25 lanes from [x0] into d0..d24, run
# KeccakF1600_ce and store them back.
#
# $i is deliberately not a lexical here: the loops emit twelve ldp/stp
# pairs for lanes 0..23 and leave $i == 24, which the ldr/str right after
# each loop interpolates to address the odd 25th lane.
664for($i=0; $i<24; $i+=2) {		# load A[5][5]
665my $j=$i+1;
666$code.=<<___;
667	ldp	d$i,d$j,[x0,#8*$i]
668___
669}
# "bl" overwrites x30, so the saved return address is reloaded from the
# frame immediately after the call.
670$code.=<<___;
671	ldr	d24,[x0,#8*$i]
672	bl	KeccakF1600_ce
673	ldr	x30,[sp,#8]
674___
675for($i=0; $i<24; $i+=2) {		# store A[5][5]
676my $j=$i+1;
677$code.=<<___;
678	stp	d$i,d$j,[x0,#8*$i]
679___
680}
# Epilogue: restore d8-d15; only x29 is popped because x30 has already
# been reloaded above.
681$code.=<<___;
682	str	d24,[x0,#8*$i]
683
684	ldp	d8,d9,[sp,#16]
685	ldp	d10,d11,[sp,#32]
686	ldp	d12,d13,[sp,#48]
687	ldp	d14,d15,[sp,#64]
688	ldr	x29,[sp],#80
689	.inst	0xd50323bf		// autiasp
690	ret
691.size	KeccakF1600_cext,.-KeccakF1600_cext
692___
693
694{
# SHA3_absorb_cext(ctx, inp, len, bsz) - SHA3-extension variant of
# SHA3_absorb. The four arguments stay in x0..x3 for the whole function.
695my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
696
# Prologue: set up the frame and save d8-d15.
697$code.=<<___;
698.globl	SHA3_absorb_cext
699.type	SHA3_absorb_cext,%function
700.align	5
701SHA3_absorb_cext:
702	.inst	0xd503233f		// paciasp
703	stp	x29,x30,[sp,#-80]!
704	add	x29,sp,#0
705	stp	d8,d9,[sp,#16]		// per ABI requirement
706	stp	d10,d11,[sp,#32]
707	stp	d12,d13,[sp,#48]
708	stp	d14,d15,[sp,#64]
709___
# Load lanes 0..23 as pairs; the global $i is left at 24 and addresses the
# last lane in the ldr that follows.
710for($i=0; $i<24; $i+=2) {		# load A[5][5]
711my $j=$i+1;
712$code.=<<___;
713	ldp	d$i,d$j,[x0,#8*$i]
714___
715}
# The loop head subtracts bsz from len up front and leaves through
# .Labsorbed_ce on borrow, i.e. when fewer than bsz bytes remain.
716$code.=<<___;
717	ldr	d24,[x0,#8*$i]
718	b	.Loop_absorb_ce
719
720.align	4
721.Loop_absorb_ce:
722	subs	$len,$len,$bsz		// len - bsz
723	blo	.Labsorbed_ce
724___
# Absorb lanes 0..23, two per generated chunk, using v31 as the temporary.
# One cmp of bsz against 8*($i+2) drives both exits: "blo" after lane $i,
# "beq" after lane $j. This $i is lexical and independent of the global
# one used by the load/store loops.
725for (my $i=0; $i<24; $i+=2) {
726my $j = $i+1;
727$code.=<<___;
728	ldr	d31,[$inp],#8		// *inp++
729#ifdef	__AARCH64EB__
730	rev64	v31.16b,v31.16b
731#endif
732	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
733	cmp	$bsz,#8*($i+2)
734	blo	.Lprocess_block_ce
735	ldr	d31,[$inp],#8		// *inp++
736#ifdef	__AARCH64EB__
737	rev64	v31.16b,v31.16b
738#endif
739	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
740	beq	.Lprocess_block_ce
741___
742}
# Lane 24 is absorbed when no earlier exit was taken; then permute and
# go round again.
743$code.=<<___;
744	ldr	d31,[$inp],#8		// *inp++
745#ifdef	__AARCH64EB__
746	rev64	v31.16b,v31.16b
747#endif
748	eor	$A[4][4],$A[4][4],v31.16b
749
750.Lprocess_block_ce:
751
752	bl	KeccakF1600_ce
753
754	b	.Loop_absorb_ce
755
756.align	4
757.Labsorbed_ce:
758___
# Write the state back to [x0]; as above, the global $i ends at 24.
759for($i=0; $i<24; $i+=2) {		# store A[5][5]
760my $j=$i+1;
761$code.=<<___;
762	stp	d$i,d$j,[x0,#8*$i]
763___
764}
# len has already had bsz subtracted by the failed loop-head check, so bsz
# is added back to form the returned count of unprocessed bytes.
765$code.=<<___;
766	str	d24,[x0,#8*$i]
767	add	x0,$len,$bsz		// return value
768
769	ldp	d8,d9,[sp,#16]
770	ldp	d10,d11,[sp,#32]
771	ldp	d12,d13,[sp,#48]
772	ldp	d14,d15,[sp,#64]
773	ldp	x29,x30,[sp],#80
774	.inst	0xd50323bf		// autiasp
775	ret
776.size	SHA3_absorb_cext,.-SHA3_absorb_cext
777___
778}
{
# SHA3_squeeze_cext(ctx, out, len, bsz) - SHA3-extension variant of
# SHA3_squeeze.
#
# Copies len bytes of the state to out, 8 bytes at a time, reading through
# x9 and counting the current block down in x10; KeccakF1600_cext is called
# each time bsz bytes have been produced, after which x30, x9 and x10 are
# re-established. A final partial word (1..7 bytes) is stored byte by byte
# in .Lsqueeze_tail_ce.
#
# len == 0 is handled explicitly: without the early exit the very first
# "cmp len,#8 / blo" would fall into the byte tail, whose subs/beq chain
# never observes zero, and seven bytes would be stored past out.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

	cbz	$len,.Lsqueeze_done_ce	// nothing to squeeze

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
# Append an identification string to the generated assembly. The "\@" keeps
# Perl from interpolating an array inside the heredoc.
847$code.=<<___;
848.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
849___
850
{   # Base encodings of the ARMv8.2 SHA3 instructions; unsha3() ORs the
    # register numbers (and, for xar, the rotate amount) into them.
    my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg)
    #
    # Encode one eor3/rax1/xar/bcax instruction as a raw ".inst" word, so
    # that the output does not depend on the assembler knowing these
    # mnemonics.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text as written in the source, e.g.
    #              "v25.16b,v1.16b,v30.16b,#64-1"; anything after the
    #              operands (such as a trailing // comment) is carried over
    #              into the comment of the generated line
    #
    # Returns ".inst\t0x%08x\t//<mnemonic> <arg>". Rd goes to bits 0-4, Rn to
    # bits 5-9, Rm to bits 16-20, and the fourth operand (a register number
    # or a "#a-b" immediate) to bits 10 and up.
    #
    # If the mnemonic is unknown or the operands cannot be parsed, the
    # instruction text is returned unchanged instead of a false value: the
    # caller substitutes the result into the output line, so returning
    # nothing would silently drop the instruction.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;
	my $verbatim = "$mnemonic\t$arg";

	return $verbatim unless exists $opcode{$mnemonic};

	my ($rd,$rn,$rm,$ra) = $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or return $verbatim;

	$rm = 0 unless defined $rm;

	# The fourth operand is either a plain number or a difference such as
	# "64-1"; evaluate it by hand rather than through string eval.
	my $imm = 0;
	if (defined $ra) {
	    return $verbatim unless $ra =~ m/^[0-9]+(?:-[0-9]+)*$/;
	    my ($first,@subtrahends) = split /-/, $ra;
	    $imm = $first;
	    $imm -= $_ foreach @subtrahends;
	}

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|($rd+0)|($rn<<5)|($rm<<16)|($imm<<10),
			$mnemonic,$arg;
    }
}
865
# Final pass: walk the accumulated assembly text line by line, apply the
# textual fix-ups and print each line (STDOUT is the pipe into arm-xlate.pl).
for my $line (split("\n",$code)) {

	# Replace any `...` span with the value of the enclosed Perl expression.
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	# ld1r lines get their .16b arrangement rewritten to .2d; every other
	# line (or an ld1r line on which nothing was rewritten) has its SHA3
	# mnemonics encoded through unsha3().
	unless ($line =~ m/\bld1r\b/ && $line =~ s/\.16b/.2d/g) {
		$line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
	}

	print $line,"\n";
}

# Closing the pipe is what surfaces a failure of the downstream translator.
close STDOUT or die "error closing STDOUT: $!";
877