#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...

# Command line: <flavour> <output>.  Both values are passed on verbatim
# to the arm-xlate.pl translator started below.
$flavour = shift;
$output  = shift;

# Look for arm-xlate.pl next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on STDOUT is a pipe into the translator, run with the same
# perl binary that runs this script.  Fail loudly if the pipe cannot be
# set up instead of printing into a dead handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Per-lane rotation counts, indexed the same way as the @A register
# tables below.  The generated code rotates right by 64 minus the entry
# (see the "Rho+Pi" sections).
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Start of the generated assembly: the 24-entry `iotas` round-constant
# table.  The eight zero quads in front of it plus the 24 constants add
# up to 256 bytes, and the block is aligned with ".align 8", which is
# what lets KeccakF1600_int use `tst <ptr>,#255` on the post-incremented
# table pointer as its "last round done" check.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
# Scalar (general-purpose register) implementation.
#
# Appends four routines to $code: the internal KeccakF1600_int and
# KeccakF1600, and the exported (.globl) SHA3_absorb and SHA3_squeeze.
# The 25 state lanes are named $A[row][col] and live in x0..x24, except
# that x25 stands in for x18; x26, x27, x28 and x30 serve as scratch
# ($C[0..3]).
								{{{
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	.inst	0xd503233f			// paciasp
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] have just been stored to the stack, so their
	# registers double as two extra temporaries, C[4] and C[5], until
	# the pair is re-loaded further down.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to get their A[0][4]/A[1][4]
	# contents back; retire the aliases so later text cannot use them.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Unrolled absorb: XOR input lanes into the state two at a time.  After
# each lane pair bsz is compared against the number of bytes consumed so
# far, and control leaves for .Lprocess_block once the whole block has
# been taken in.  Lanes 0..23 are handled here, lane 24 after the loop.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
# SHA3_squeeze keeps its four arguments in x19..x22, which it saves in
# its prologue, so they are still available after each call to
# KeccakF1600.
{
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
# ARMv8.2 SHA3-extension implementation.
#
# Appends four routines to $code: the internal KeccakF1600_ce and
# KeccakF1600_cext, and the exported (.globl) SHA3_absorb_cext and
# SHA3_squeeze_cext.  The 25 state lanes are named $A[row][col] and live
# in v0..v24 (spelled with a .16b suffix); v25..v31 are scratch ($C[0..6]).
# The eor3/rax1/xar/bcax mnemonics written here are turned into .inst
# words by unsha3() when the text is post-processed at the end of the
# script.
								{{{
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));

$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
# The loop body holds two unrolled rounds; x9 counts 12 passes over it.
for($i=0; $i<2; $i++) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8

	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]

	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]

	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]

	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]

	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
	# Several Chi results above were written to C registers rather than
	# to the lanes' home registers.  Instead of emitting moves, swap the
	# Perl-side names so the next unrolled round addresses the values
	# where they now are.  Swapping twice restores the original naming,
	# which is why exactly two rounds are unrolled.
	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# The load/store loops below use the package variable $i on purpose: it
# is left at 24 when a loop ends, and the single ldr/str that follows
# each loop uses that value to address the 25th lane.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# Unrolled absorb, two lanes per step, with the same early exit on bsz
# as in the scalar SHA3_absorb; v31 carries each input lane.  This loop
# has its own lexical $i and does not disturb the package $i.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
# Identification string placed at the end of the generated assembly.
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

# Hand-assembler for the SHA3-extension instructions, so the output does
# not depend on the assembler knowing these mnemonics.
{   my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg)
    #
    # $mnemonic is one of the %opcode keys and $arg its operand list as
    # written in the assembly text, e.g. "v25.16b,v6.16b,v30.16b,#64-44".
    # Returns a ".inst" line holding the encoded instruction, with the
    # original text kept as a trailing comment.
    #
    # Encoding, as established by the shifts below: the first register
    # goes to bits 0-4, the second to bits 5-9, the optional third to
    # bits 16-20, and the optional fourth operand (a register number or
    # an immediate expression such as "64-44") is shifted to bit 10.
    #
    # If the operand list cannot be parsed, the instruction is returned
    # unchanged, so the assembler gets to see it instead of the line
    # being silently replaced by nothing.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/) {
	    my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4);

	    # Absent optional operands contribute zero bits.
	    $rm = 0 unless defined $rm;
	    # The pattern admits only digits and '-' here, so the string
	    # eval can do nothing beyond integer arithmetic.
	    my $extra = defined $last ? eval($last) : 0;
	    $extra = 0 unless defined $extra;

	    return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($extra<<10),
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}

# Post-process the accumulated assembly text one line at a time and
# write it to STDOUT.
foreach(split("\n",$code)) {

	# Replace any `...` span with the result of evaluating it as Perl.
	s/\`([^\`]*)\`/eval($1)/ge;

	# A line containing "dup" only has its .16b suffixes changed to
	# .2d; any other line that uses one of the SHA3-extension
	# mnemonics on vector registers is replaced by the .inst encoding
	# produced by unsha3().
	m/\bdup\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

# Buffered write errors only surface when the handle is closed, so the
# result of close is checked.
close STDOUT or die "error closing STDOUT: $!";
