#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c) with
# bit interleaving. How does it compare to the Keccak Code Package?
# It's as fast, but several times smaller, and is endian- and
# ISA-neutral. ISA neutrality means that the minimum ISA requirement
# is ARMv4, yet the code can be assembled even as Thumb-2. The NEON
# code path is KECCAK_1X_ALT with the register layout taken from the
# Keccak Code Package. It's also as fast, in fact 10-15% faster on
# some processors, and endian-neutral.
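#
# To illustrate what bit interleaving buys: with each 64-bit lane
# split into a word of its even-indexed bits and a word of its
# odd-indexed bits, a 64-bit rotation reduces to a pair of 32-bit
# rotations [an illustrative C sketch, not part of this module;
# ROL32 is assumed to be the usual 32-bit rotate-left]:
#
#	if (n & 1) {	/* odd amounts swap the halves */
#	    new_lo = ROL32(hi, (n+1)/2); new_hi = ROL32(lo, n/2);
#	} else {
#	    new_lo = ROL32(lo, n/2);     new_hi = ROL32(hi, n/2);
#	}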
#
# August 2017.
#
# Switch to the KECCAK_2X variant for non-NEON code and merge almost
# 1/2 of the rotate instructions with logical ones. This resulted in
# ~10% improvement on most processors. Switching to KECCAK_2X
# effectively minimizes re-loads from temporary storage, while the
# merged rotates simply eliminate the corresponding instructions. As
# for the latter: when examining the code you'll notice commented-out
# ror instructions. These are the eliminated ones; trace the
# destination register below to see what's going on. In case you
# wonder why not all rotates are eliminated: the trouble is that some
# operations require both inputs to be rotated, e.g.
# 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in the next
# operation that takes 'a' as input. The catch is that this next
# operation can be in the next round. It's entirely possible to
# "carry" rotate "factors" into the next round, but it makes the code
# more complex, and "almost" is the operative word, i.e. "almost 1/2"
# serves as a complexity cap [for the time being]...
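#
# To spell the merged-rotate transform out [an illustrative sketch
# with symbolic rotation amounts, not lifted from the code below]:
# the desired
#
#	a = (b >>> x) ^ (c >>> y)
#
# is computed as 'eor a,b,c,ror#(y-x)', which leaves 'a' holding the
# desired value rotated left by x; the compensating 'ror#x' is then
# folded into the shifter operand of whatever instruction consumes
# 'a' next, possibly only in the next round.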
#
# Reduce per-round instruction count in Thumb-2 case by 16%. This is
# achieved by folding ldr/str pairs to their double-word counterparts.
# Theoretically this should have improved performance on single-issue
# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
# usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving.
#
#		r=1088(*)   Thumb-2(**) NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,   86,         36
# Cortex-A7	78/+160%,   68,         34
# Cortex-A8	51/+230%,   57,         30
# Cortex-A9	53/+210%,   51,         26
# Cortex-A15	42/+160%,   38,         18
# Snapdragon S4	43/+210%,   38,         24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

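@ The 24 round constants in bit-interleaved form: in each pair the
@ first word collects the even-indexed bits of the canonical 64-bit
@ constant and the second word the odd-indexed bits (see iotas64
@ below for the canonical values).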
.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
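# Round() emits one round of Keccak-f[1600]: it reads the lanes at
# the offsets given in @A and writes the results at the offsets given
# in @R. Invoking it twice with the operand sets swapped (see the
# calls below) ping-pongs the state between A[5][5] and T[5][5] on
# the stack, which is what makes this the KECCAK_2X variant.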
sub Round {
my (@A,@R); (@A[0..4],@R) = @_;

$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	 add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	 add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	 ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	 ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	 str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	 str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	 ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	 adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	 add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	 ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	 ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	 add	@E[3],sp,#$D[3]
	 ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	 ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	 eor	@E[2],@E[2],@C[8]
	 eor	@E[3],@E[3],@C[9]
	 ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	 ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	 add	@E[2],sp,#$D[1]
	 ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	 ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	 ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	 ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	 add	@E[3],sp,#$D[2]
	 ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	 ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	 add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	 add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
	Round(@A,@T);
	Round(@T,@A);
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

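	@ Bit-interleave the just-assembled 64-bit word: gather its
	@ even-indexed bits into one 32-bit half of the lane and its
	@ odd-indexed bits into the other, so that ROL64 in
	@ KeccakF1600_int can be performed as pairs of 32-bit
	@ rotations. Each and/orr step below halves the distance
	@ between the bits being gathered.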
	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

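	@ De-interleave A_flat[i]: scatter the even-indexed bits back
	@ into even bit positions and the odd-indexed bits into odd
	@ positions, reversing the transform performed in .Loop_block
	@ above.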
	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64		{q4},  [r0,:64]		@ offload A[0..1][4]
	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64		{d18}, [r1,:64]		@ offload A[2][4]
	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor		d25, d25, d24		@ C[4]^=A[4][4]

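	@ The vadd/vsri pairs below implement ROL64 by 1: the addition
	@ doubles each 64-bit lane (a left shift by one that drops the
	@ top bit), and the shift-right-and-insert deposits the former
	@ top bit into bit 0.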
	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor		d10, d10, d25		@ A[2][0] ^= C[4]
	veor		d11, d11, d25		@ A[3][0] ^= C[4]
	veor		d20, d20, d25		@ A[4][0] ^= C[4]

	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor		d12, d12, d26		@ A[2][1] ^= D[1]
	veor		d13, d13, d26		@ A[3][1] ^= D[1]
	veor		d21, d21, d26		@ A[4][1] ^= D[1]
	vmov		d26, d27

	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor		d16, d16, d28		@ A[2][3] ^= C[2]
	veor		d17, d17, d28		@ A[3][3] ^= C[2]
	veor		d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64		{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov		d28, d29

	vld1.64		{d18}, [r1,:64]		@ restore A[2][4]
	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor		d22, d22, d27		@ A[4][2]    ^= D[2]

	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor		d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
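	@ Each vshl/vsri pair below composes a 64-bit rotation: vshl
	@ writes the lane shifted left by n, then vsri inserts the
	@ same source shifted right by 64-n, together yielding
	@ ROL64(lane,n). Rotations by a multiple of 8 are done with a
	@ byte-wise vext instead (see the commented-out vshl lines).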
	vmov		d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov		d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov		d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov		d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic		q13, q2,  q1
	vbic		q14, q3,  q2
	vbic		q15, q4,  q3
	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64		{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic		q13, q0,  q4
	vbic		q15, q1,  q0
	vmov		q1,  q14		@ A[0..1][1]
	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic		q13, q7,  q6
	vmov		q0,  q5			@ A[2..3][0]
	vbic		q14, q8,  q7
	vmov		q15, q6			@ A[2..3][1]
	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic		q13, q9,  q8
	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic		q14, q0,  q9
	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic		q13, q15, q0
	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov		q14, q10		@ A[4][0..1]
	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64		d25, [r2,:64]!		@ Iota[i++]
	vbic		d26, d22, d21
	vbic		d27, d23, d22
	vld1.64		{q0}, [r0,:64]		@ restore A[0..1][0]
	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic		d26, d24, d23
	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic		d27, d28, d24
	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic		d26, d29, d28
	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	ret
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
	vld1.32	{d18}, [r0,:64]!		@ A[2][4]

	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
	vld1.32	{d19}, [r0,:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

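	@ Unlike the integer code path, the NEON path keeps whole
	@ 64-bit lanes in D registers, so the input is absorbed
	@ directly, with no bit interleaving; vld1.8 keeps the loads
	@ endian-neutral.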
	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b 	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif
.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

{
    my (%ldr, %str);

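    # An ldr.l/str.l ("low" half) operation is recorded here, and when
    # the matching ldr.h/str.h ("high" half) operation is seen, the
    # two are folded into a single ldrd/strd for Thumb-2, while plain
    # ARM keeps the original pair of 32-bit accesses (this is the
    # ldr/str folding mentioned in the header).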
    sub ldrd {
	my ($mnemonic,$half,$reg,$ea) = @_;
	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

	if ($half eq "l") {
	    $$op{reg} = $reg;
	    $$op{ea}  = $ea;
	    sprintf "#ifndef	__thumb2__\n"	.
		    "	%s\t%s,%s\n"		.
		    "#endif", $mnemonic,$reg,$ea;
	} else {
	    sprintf "#ifndef	__thumb2__\n"	.
		    "	%s\t%s,%s\n"		.
		    "#else\n"			.
		    "	%sd\t%s,%s,%s\n"	.
		    "#endif",	$mnemonic,$reg,$ea,
				$mnemonic,$$op{reg},$reg,$$op{ea};
	}
    }
}

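# Post-process the generated code: expand the ldr.l/str.l/ldr.h/str.h
# pseudo-instructions, rewrite UAL-style 'ror/lsr/lsl rd,rn,#imm' as
# pre-UAL 'mov rd,rn,<shift>#imm', turn 'ret' into 'bx lr', and emit
# pre-existing 'bx lr' as its literal encoding so that the module can
# be assembled even with -march=armv4.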
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
	s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g or
	s/\bret\b/bx	lr/g		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
