1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason.
22# 64-bit lanes of vector registers can't be addressed as easily as in
23# 32-bit mode. This means that 64-bit NEON is bound to be slower than
24# 32-bit NEON, and this implementation is faster than 32-bit NEON on
25# same processor. Even though it takes more scalar xor's and andn's,
26# it gets compensated by availability of rotate. Not to forget that
27# most processors achieve higher issue rate with scalar instructions.
28#
29# February 2018.
30#
# Add hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that eliminates
# copies to temporary registers. If you look closely you'll
34# notice that it uses only one lane of vector registers. The new
35# instructions effectively facilitate parallel hashing, which we don't
36# support [yet?]. But lowest-level core procedure is prepared for it.
37# The inner round is 67 [vector] instructions, so it's not actually
38# obvious that it will provide performance improvement [in serial
39# hash] as long as vector instructions issue rate is limited to 1 per
40# cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45#		r=1088(*)
46#
47# Cortex-A53	13
48# Cortex-A57	12
49# X-Gene	14
50# Mongoose	10
51# Kryo		12
52# Denver	7.8
53# Apple A7	7.2
54# ThunderX2	9.7
55#
56# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
57#	because they vary too much from compiler to compiler. Newer
58#	compiler does much better and improvement varies from 5% on
59#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
60#	compiler this code is at least 2x faster...
61
62# $output is the last argument if it looks like a file (it has an extension)
63# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Directory part of this script's own path (with trailing separator), or
# the empty string when the script was invoked without any directory
# component.  Only trust $1 when the match actually succeeded.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

# arm-xlate.pl is looked up next to this script first, then in the
# shared perlasm directory two levels up.
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The translator path is quoted just like the interpreter
# and output paths so that a source tree located in a directory with
# spaces in its name still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# Rho rotation amounts, indexed as $rhotates[row][column] in the same
# way as the A[row][column] lanes below.  The code emits them as
# "ror ...,#64-$rhotates[r][c]", i.e. as left rotations.
my @rhotates = (
    [  0,  1, 62, 28, 27 ],
    [ 36, 44,  6, 55, 20 ],
    [  3, 10, 43, 25, 39 ],
    [ 41, 45, 15, 21,  8 ],
    [ 18,  2, 61, 56, 14 ],
);
81
# Round constants, one 64-bit word per round, emitted below as the
# "iotas" table.  Kept as hex strings so that the generator does not
# depend on 64-bit integer support in perl.
my @iota_words = qw(
    0000000000000001 0000000000008082 800000000000808a 8000000080008000
    000000000000808b 0000000080000001 8000000080008081 8000000000008009
    000000000000008a 0000000000000088 0000000080008009 000000008000000a
    000000008000808b 800000000000008b 8000000000008089 8000000000008003
    8000000000008002 8000000000000080 000000000000800a 800000008000000a
    8000000080008081 8000000000008080 0000000080000001 8000000080008008
);

# Layout: 256-byte alignment, 64 bytes of zero padding, then the
# 24*8 = 192 bytes of constants.  The table therefore ends exactly on a
# 256-byte boundary, which lets the scalar round loop detect the last
# round by testing the low 8 bits of its running table pointer.
$code.=<<___;
#include "arm_arch.h"

.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
___
$code.="\t.quad\t0x$_\n" foreach (@iota_words);
$code.=<<___;
.size	iotas,.-iotas
___
118								{{{
# Scalar register allocation for the 5x5 state: row r starts at x(5*r),
# so A[0][0..4] = x0..x4, A[1][0..4] = x5..x9, and so on.
my @A = map { my $first = $_; [ map { "x".($first+$_) } (0..4) ] }
            (0, 5, 10, 15, 20);
   $A[3][3] = "x25"; # x18 is reserved

# Scratch registers used by the round function.
my @C = map { "x$_" } (26,27,28,30);
124
# KeccakF1600_int: register-only core of the scalar implementation.
#
# The whole 5x5 state is expected in the @A registers on entry and is
# left there on return.  The routine uses the 32 bytes at the caller's
# [sp,#0..31] as scratch:
#   [sp,#0]   A[0][4]/A[1][4] spilled during Theta,
#   [sp,#16]  running pointer into the iotas table,
#   [sp,#24]  saved x30 (x30 doubles as scratch register $C[3]).
# The loop has no counter: after each round-constant load the
# post-incremented table pointer is tested with "tst ...,#255", and the
# loop exits once its low 8 bits are zero (see the alignment/padding of
# iotas).
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	AARCH64_SIGN_LINK_REGISTER
	adr	$C[2],iotas
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] have just been stored to the stack, so their
	# registers are borrowed as two additional temporaries for the
	# remainder of Theta.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0][4] and
	# A[1][4]; drop the aliases so they cannot be used by mistake.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_int,.-KeccakF1600_int
305
.type	KeccakF1600,%function
.align	5
KeccakF1600:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
___
# Load the remaining lanes two at a time.  Flat lane n lives in
# A[n/5][n%5]; pair k covers lanes 2k and 2k+1 at byte offset 16*k.
# (Pair 0 is emitted above because it addresses through x0 itself.)
foreach my $pair (1..11) {
    my ($lo,$hi) = (2*$pair, 2*$pair+1);
    $code.="\tldp\t$A[$lo/5][$lo%5],$A[$hi/5][$hi%5],[$C[0],#16*$pair]\n";
}
$code.=<<___;
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
___
# Store the permuted state back in the same pairwise layout.
foreach my $pair (0..11) {
    my ($lo,$hi) = (2*$pair, 2*$pair+1);
    $code.="\tstp\t$A[$lo/5][$lo%5],$A[$hi/5][$hi%5],[$C[0],#16*$pair]\n";
}
$code.=<<___;
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600,.-KeccakF1600
362
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
___
# Load A[5][5], two lanes per ldp.  Flat lane n lives in A[n/5][n%5].
foreach my $pair (0..11) {
    my ($lo,$hi) = (2*$pair, 2*$pair+1);
    $code.="\tldp\t$A[$lo/5][$lo%5],$A[$hi/5][$hi%5],[$C[0],#16*$pair]\n";
}
$code.=<<___;
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Text that XORs the next 8 input bytes into flat lane n of the state.
# The input word is byte-swapped on big-endian builds before the XOR.
my $xor_lane = sub {
    my ($n) = @_;
    return "\tldr\t$C[0],[$C[1]],#8\t\t// *inp++\n"
         . "#ifdef\t__AARCH64EB__\n"
         . "\trev\t$C[0],$C[0]\n"
         . "#endif\n"
         . "\teor\t$A[$n/5][$n%5],$A[$n/5][$n%5],$C[0]\n";
};

# Lanes are absorbed in pairs with a single compare per pair: after the
# even lane, "blo" leaves when bsz ends right there; after the odd lane,
# "beq" (same flags) leaves when bsz ends after it.
foreach my $pair (0..11) {
    my ($even,$odd) = (2*$pair, 2*$pair+1);
    $code.=$xor_lane->($even)
         . "\tcmp\t$C[3],#8*($even+2)\n"
         . "\tblo\t.Lprocess_block\n"
         . $xor_lane->($odd)
         . "\tbeq\t.Lprocess_block\n";
}
$code.=$xor_lane->(24);
$code.=<<___;

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
___
# Store A[5][5] back through the saved state pointer.
foreach my $pair (0..11) {
    my ($lo,$hi) = (2*$pair, 2*$pair+1);
    $code.="\tstp\t$A[$lo/5][$lo%5],$A[$hi/5][$hi%5],[$C[1],#16*$pair]\n";
}
$code.=<<___;
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
469{
# SHA3_squeeze: copy output bytes out of the state, permuting the state
# every time a full block has been consumed.
#
# Arguments as used by the code below:
#   x0  pointer to the state (read as consecutive 64-bit words),
#   x1  output pointer,
#   x2  number of output bytes requested,
#   x3  block size in bytes (counted down in steps of 8).
# The arguments are parked in callee-saved x19..x22 because KeccakF1600
# is called in the middle of the loop.
#
# Full 8-byte words are stored directly (byte-swapped on big-endian
# builds); a final partial word is written byte by byte, least
# significant byte first.
#
# A request for zero bytes returns immediately.  Without that check the
# loop would take the byte-tail path, store a byte, wrap the length
# counter below zero and write seven bytes past the caller's buffer.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

	cmp	$len,#0				// nothing requested?
	beq	.Lsqueeze_done			// then leave out[] untouched

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
543}								}}}
544								{{{
# Vector register allocation for the 5x5 state: row r starts at v(5*r),
# so A[0][0..4] = v0..v4, ..., A[4][0..4] = v20..v24.
my @A = map { my $first = $_; [ map { "v".($first+$_).".16b" } (0..4) ] }
            (0, 5, 10, 15, 20);

# Temporaries v25..v31; @D names five of them in the order D[0..4].
my @C = map { "v$_.16b" } (25..31);
my @D = ($C[4], $C[5], $C[6], $C[2], $C[3]);
551
# KeccakF1600_ce: core of the ARMv8.2 (eor3/rax1/xar/bcax) variant.
#
# The state is expected in v0..v24 (@A) on entry and is left there on
# return; v25..v31 (@C, with @D aliasing five of them) are temporaries.
# x9 counts the 24 rounds down and x10 walks the iotas table, so both
# are clobbered.  The routine sets up no stack frame of its own.
#
# Rho+Pi is done in place: xar XORs in the Theta column value and
# rotates in one step, and several results are parked in a register
# other than their final lane until Chi reads them (see the
# "X=Y" annotations, which give the logical lane a register currently
# holds).  The statement order is therefore significant.
#
# The four SHA3 mnemonics are not handed to the assembler as-is; the
# output loop at the end of this file rewrites them into .inst words,
# and rewrites the ld1r operand arrangement from .16b to .2d.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#24
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	eor	$A[0][0],$A[0][0],$D[0]

	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce
654
.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# Load A[5][5]: lanes 2k and 2k+1 go to d(2k)/d(2k+1).  The loop
# counter is deliberately left at 24 so that the statement after the
# loop addresses the 25th lane.
for($i=0; $i<24; $i+=2) {
    $code.=sprintf("\tldp\td%d,d%d,[x0,#8*%d]\n", $i, $i+1, $i);
}
# x30 is reloaded right after the call because bl overwrote it.
$code.="\tldr\td24,[x0,#8*$i]\n"
     . "\tbl\tKeccakF1600_ce\n"
     . "\tldr\tx30,[sp,#8]\n";
# Store A[5][5] back, same layout as the load above.
for($i=0; $i<24; $i+=2) {
    $code.=sprintf("\tstp\td%d,d%d,[x0,#8*%d]\n", $i, $i+1, $i);
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___
695
696{
# SHA3_absorb_cext: vector-unit counterpart of SHA3_absorb.  The
# arguments stay in x0..x3 for the whole routine (state, input pointer,
# length, block size); the value returned in x0 is the number of input
# bytes left over once fewer than bsz bytes remain.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# Load A[5][5]; the counter is left at 24 for the 25th lane below.
for($i=0; $i<24; $i+=2) {
    $code.=sprintf("\tldp\td%d,d%d,[x0,#8*%d]\n", $i, $i+1, $i);
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# Text that XORs the next 8 input bytes into flat lane n of the state.
# The input word is byte-swapped on big-endian builds before the XOR.
my $absorb_lane = sub {
    my ($n) = @_;
    return "\tldr\td31,[$inp],#8\t\t// *inp++\n"
         . "#ifdef\t__AARCH64EB__\n"
         . "\trev64\tv31.16b,v31.16b\n"
         . "#endif\n"
         . "\teor\t$A[$n/5][$n%5],$A[$n/5][$n%5],v31.16b\n";
};

# Lanes are absorbed in pairs with a single compare per pair: "blo"
# leaves after the even lane, "beq" (same flags) after the odd one.
foreach my $pair (0..11) {
    my ($even,$odd) = (2*$pair, 2*$pair+1);
    $code.=$absorb_lane->($even)
         . "\tcmp\t$bsz,#8*($even+2)\n"
         . "\tblo\t.Lprocess_block_ce\n"
         . $absorb_lane->($odd)
         . "\tbeq\t.Lprocess_block_ce\n";
}
$code.=$absorb_lane->(24);
$code.=<<___;

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
# Store A[5][5] back; the counter is again left at 24.
for($i=0; $i<24; $i+=2) {
    $code.=sprintf("\tstp\td%d,d%d,[x0,#8*%d]\n", $i, $i+1, $i);
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
780}
781{
# SHA3_squeeze_cext: counterpart of SHA3_squeeze that permutes the state
# with KeccakF1600_cext.
#
# Arguments stay in x0..x3 (state, output pointer, number of bytes
# requested, block size).  x9 is the running read pointer into the state
# and x10 the number of bytes left in the current block; both are
# re-initialised after every call to the permutation.
#
# Full 8-byte words are stored directly (byte-swapped on big-endian
# builds); a final partial word is written byte by byte, least
# significant byte first.
#
# A request for zero bytes returns immediately.  Without that check the
# loop would take the byte-tail path, store a byte, wrap the length
# counter below zero and write seven bytes past the caller's buffer.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	cmp	$len,#0			// nothing requested?
	beq	.Lsqueeze_done_ce	// then leave out[] untouched
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
848}								}}}
# Identification string embedded in the generated object.
$code.=qq{.asciz\t"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"\n};
852
{   # Base opcodes of the four SHA3 instructions; the register numbers
    # and the immediate are OR-ed in by unsha3() below.
    my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3(MNEMONIC, OPERANDS)
    #
    # Hand-assemble one SHA3 instruction so that the output also builds
    # with assemblers that do not know these mnemonics.
    #
    # MNEMONIC is one of the keys of %opcode.  OPERANDS is the operand
    # text: two to four comma-separated q/v registers, where the fourth
    # operand may instead be an immediate written as "#" followed by
    # digits and minus signs (e.g. "#64-21").
    #
    # Returns ".inst <word> //<mnemonic> <operands>".  The fields are
    # placed at bits 0 (1st operand), 5 (2nd), 16 (3rd) and 10 (4th).
    # If OPERANDS cannot be parsed the instruction is returned as plain
    # "<mnemonic>\t<operands>" text, so that it reaches the assembler
    # instead of silently vanishing from the output.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
	    my ($rd,$rn,$rm,$ra) = ($1,$2,$3,$4);

	    $rm = 0 unless defined($rm);
	    # The pattern only lets digits and '-' through, so this
	    # string eval can do nothing but integer arithmetic.
	    $ra = defined($ra) ? eval($ra) : 0;

	    return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
867
# Post-process the accumulated assembly line by line and print it (STDOUT
# is the pipe into arm-xlate.pl at this point).
for (split("\n",$code)) {

	# Expand `...` spans by evaluating them as perl expressions.
	s/\`([^\`]*)\`/eval($1)/ge;

	if (m/\bld1r\b/ and s/\.16b/.2d/g) {
	    # ld1r line: operand arrangement rewritten from .16b to .2d,
	    # nothing else to do.
	} else {
	    # Replace SHA3 mnemonics with hand-assembled .inst words.
	    s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
	}

	print "$_\n";
}

# Closing the pipe is where a failure of the translator shows up.
close STDOUT or die "error closing STDOUT: $!";
879