1#!/usr/bin/env perl
2# Copyright 2017-2019 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv4.
17#
18# June 2017.
19#
20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21# interleaving. How does it compare to Keccak Code Package? It's as
22# fast, but several times smaller, and is endian- and ISA-neutral. ISA
23# neutrality means that minimum ISA requirement is ARMv4, yet it can
24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25# register layout taken from Keccak Code Package. It's also as fast,
26# in fact faster by 10-15% on some processors, and endian-neutral.
27#
28# August 2017.
29#
30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31# of rotate instructions with logical ones. This resulted in ~10%
32# improvement on most processors. Switch to KECCAK_2X effectively
33# minimizes re-loads from temporary storage, and merged rotates just
# eliminate corresponding instructions. As for the latter: when
# examining the code you'll notice commented-out ror instructions.
# These are the eliminated ones; trace the destination register below
# to see what's going on. In case you wonder why not all rotates are
# eliminated: trouble
38# is that you have operations that require both inputs to be rotated,
39# e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41# that takes 'a' as input. And thing is that this next operation can
42# be in next round. It's totally possible to "carry" rotate "factors"
43# to the next round, but it makes code more complex. And the last word
44# is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
45# time being]...
46#
47# Reduce per-round instruction count in Thumb-2 case by 16%. This is
48# achieved by folding ldr/str pairs to their double-word counterparts.
49# Theoretically this should have improved performance on single-issue
50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51# usual...
52#
53########################################################################
54# Numbers are cycles per processed byte. Non-NEON results account even
55# for input bit interleaving.
56#
57#		r=1088(*)   Thumb-2(**) NEON
58#
59# ARM11xx	82/+150%
60# Cortex-A5	88/+160%,   86,         36
61# Cortex-A7	78/+160%,   68,         34
62# Cortex-A8	51/+230%,   57,         30
63# Cortex-A9	53/+210%,   51,         26
64# Cortex-A15	42/+160%,   38,         18
65# Snapdragon S4	43/+210%,   38,         24
66#
67# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
68#	over compiler-generated KECCAK_2X reference code.
69# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71#	processors are presented mostly for reference purposes.
72
# Parse the command line: the last argument that looks like a filename
# (ends in ".ext") is the output file; anything before it is the
# "flavour" (target ABI/assembler dialect) forwarded to arm-xlate.pl.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the arm-xlate.pl translator next to this script or in the
    # shared perlasm directory, then pipe our output through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Die on failure: an unchecked open here used to let a broken
    # toolchain path produce an empty .S file without any diagnostic.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    open STDOUT,">",$output or die "can't open $output: $!";
}
87
# Scratch register allocation: @C (r0-r9) holds the ten 32-bit halves
# of five bit-interleaved 64-bit lanes being combined; @E holds four
# temporaries (r10-r12 and r14/lr).  sp doubles as the state pointer.
my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

# Byte offsets (from sp) of the A[5][5] state, the D[5] theta values
# and the T[5][5] scratch copy, matching the picture above: A at 0,
# D at 200, T at 240, each lane 8 bytes.
my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @D = map(8*$_, (25..29));
my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
112
# Emit the module header, the iotas32 table and the entry sequence of
# the internal permutation.  iotas32 holds the 24 Keccak round
# constants in bit-interleaved form: the first word of each pair
# collects the even-numbered bits of the 64-bit constant, the second
# the odd-numbered ones.  KeccakF1600_int expects sp to point at the
# A[5][5] stack copy laid out as pictured above; it pre-loads
# A[4][2..4] into @C[4..9], saves lr at sp+440 and zeroes the round
# counter at sp+444 before falling into the two-rounds-per-iteration
# loop.  KeccakF1600_enter is used by callers that set up @C/@E
# themselves (see KeccakF1600 below).
$code.=<<___;
#include "arm_arch.h"

.text

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
########################################################################
# Emit one Keccak-f[1600] round.  The first five array refs in @_ give
# the per-lane stack offsets of the input state, the rest (@R) those of
# the output state; KeccakF1600_int calls this twice per loop
# iteration, ping-ponging between the A and T stack areas.  The
# merged-rotate trick from the file header applies throughout:
# commented-out "@ ror" lines mark rotations that have been folded
# into a shifted operand of a later instruction, so trace the
# destination register forward to follow the data flow.  On entry
# @C[4..9] already hold A[4][2..4] (or their T counterparts) and
# @E[0]/@E[2] point at the [1][0]/[0][0] lanes.
sub Round {
# Unpack input-state offsets into local @A and output offsets into @R.
my (@A,@R); (@A[0..4],@R) = @_;

$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	 add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	 add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	 ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	 ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	 str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	 str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	 ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	 adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	 add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
# First round of the unrolled pair (input offsets differ from T's):
# fetch iotas[i] with a plain ldmia, counter untouched.
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
# Second round of the pair (input is the T area): fetch iotas[i+1] at
# offset 8/12, advance the counter by 16 (two 8-byte constants per
# iteration) and compare against 192 = 12*16, which sets the flags
# consumed by "blo .Lround2x" after this sub returns.
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
# Chi + iota for row 0, then theta/rho/pi/chi for rows 1..4; results
# are stored to the output offsets @R.
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	 ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	 ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	 add	@E[3],sp,#$D[3]
	 ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	 ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	 eor	@E[2],@E[2],@C[8]
	 eor	@E[3],@E[3],@C[9]
	 ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	 ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	 add	@E[2],sp,#$D[1]
	 ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	 ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	 ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	 ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	 add	@E[3],sp,#$D[2]
	 ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	 ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	 add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	 add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
# Emit the two unrolled rounds per loop iteration: A -> T, then
# T -> A.  The second invocation advances the iota counter and sets
# the flags tested by "blo .Lround2x" below, so the loop runs 12
# times for 24 rounds total.
	Round(@A,@T);
	Round(@T,@A);
# KeccakF1600 is the standalone ABI wrapper: it copies the caller's
# A[5][5] (r0) onto the stack frame expected by KeccakF1600_int,
# pre-loads the registers the internal entry point assumes, runs the
# permutation via KeccakF1600_enter, and copies the state back out.
$code.=<<___;
	blo	.Lround2x

	ldr	pc,[sp,#440]
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
	ldmia	sp!,{r4-r11,pc}
.size	KeccakF1600,.-KeccakF1600
___
# Register roles for the absorb routine; note r14/lr holds bsz, so it
# must be re-loaded after any bl.
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

# SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
# size_t bsz): absorbs whole bsz-byte blocks, bit-interleaving each
# 8 input bytes on the fly via the masks spilled at sp+456..468, and
# returns (in r0) the number of unprocessed tail bytes (< bsz).
$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
	ldmia	sp!,{r4-r12,pc}
.size	SHA3_absorb,.-SHA3_absorb
___
}
# Register roles for the squeeze routine.  r14 keeps the original A
# pointer across the loop so KeccakF1600 can be re-run in place.
{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));

# SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len,
# size_t bsz): writes len output bytes, de-interleaving each 64-bit
# lane back to byte order with the spilled masks, and re-permutes the
# state with KeccakF1600 whenever a bsz-byte block is exhausted.
$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
1062
1063$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

@ The 24 Keccak-f[1600] round constants ("iota" values), one 64-bit
@ constant per round, consumed sequentially by KeccakF1600_neon.
.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64
1095
@ Keccak-f[1600] permutation, NEON code path (KECCAK_1X_ALT register
@ layout, see file header).  On entry the 25-lane state is held entirely
@ in d0-d24 (see SHA3_absorb_neon below for the lane-to-register map);
@ r0 points at the state buffer, whose first bytes are reused only as a
@ spill slot.  Runs all 24 rounds before returning.  Clobbers r1-r3 and
@ q13-q15 plus d25.
.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64		{q4},  [r0,:64]		@ offload A[0..1][4]
	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64		{d18}, [r1,:64]		@ offload A[2][4]
	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor		d25, d25, d24		@ C[4]^=A[4][4]

	@ ROL64(x,1): vadd x,x is a 64-bit shift left by one, vsri then
	@ re-inserts the top bit; NEON has no 64-bit rotate of its own.
	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor		d10, d10, d25		@ A[2][0] ^= C[4]
	veor		d11, d11, d25		@ A[3][0] ^= C[4]
	veor		d20, d20, d25		@ A[4][0] ^= C[4]

	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor		d12, d12, d26		@ A[2][1] ^= D[1]
	veor		d13, d13, d26		@ A[3][1] ^= D[1]
	veor		d21, d21, d26		@ A[4][1] ^= D[1]
	vmov		d26, d27

	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor		d16, d16, d28		@ A[2][3] ^= C[2]
	veor		d17, d17, d28		@ A[3][3] ^= C[2]
	veor		d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64		{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov		d28, d29

	vld1.64		{d18}, [r1,:64]		@ restore A[2][4]
	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor		d22, d22, d27		@ A[4][2]    ^= D[2]

	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor		d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
	vmov		d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov		d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov		d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov		d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	@ rotates by a multiple of 8 are done with a byte-wise vext.8
	@ instead of the shift+insert pair (hence the commented vshl above)
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic		q13, q2,  q1
	vbic		q14, q3,  q2
	vbic		q15, q4,  q3
	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64		{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic		q13, q0,  q4
	vbic		q15, q1,  q0
	vmov		q1,  q14		@ A[0..1][1]
	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic		q13, q7,  q6
	vmov		q0,  q5			@ A[2..3][0]
	vbic		q14, q8,  q7
	vmov		q15, q6			@ A[2..3][1]
	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic		q13, q9,  q8
	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic		q14, q0,  q9
	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic		q13, q15, q0
	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov		q14, q10		@ A[4][0..1]
	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64		d25, [r2,:64]!		@ Iota[i++]
	vbic		d26, d22, d21
	vbic		d27, d23, d22
	vld1.64		{q0}, [r0,:64]		@ restore A[0..1][0]
	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic		d26, d24, d23
	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic		d27, d28, d24
	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic		d26, d29, d28
	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon
1270
@ SHA3_absorb_neon(A_flat, inp, len, bsz)
@
@ NOTE(review): C prototype inferred from register usage below
@ (r0 = A_flat state, r1 = inp, r2 = len, r3 = bsz) -- confirm against
@ the C-side declaration.  Loads the 25 lanes into d0-d24, absorbs and
@ permutes full bsz-byte blocks, then writes the state back and returns
@ the number of leftover input bytes (len mod absorbed amount) in r0.
@
@ Register layout (even d = rows 0/2, odd d = rows 1/3):
@ d0..d9 hold rows 0 and 1 (d0=A[0][0], d1=A[1][0], d2=A[0][1], ...),
@ d10..d19 hold rows 2 and 3 likewise, d20..d24 hold row 4;
@ d31 stages each 8-byte input lane.
.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
	vld1.32	{d18}, [r0,:64]!		@ A[2][4]

	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
	vld1.32	{d19}, [r0,:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	@ The cmp/blo/beq ladder below XORs in up to bsz input bytes,
	@ 8 at a time, branching to the permutation as soon as the
	@ rate-block boundary for this bsz is reached.
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b 	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon
1445
@ SHA3_squeeze_neon(A_flat, out, len, bsz)
@
@ NOTE(review): C prototype inferred from register usage below
@ (r0 = A_flat state, r1 = out, r2 = len, r3 = bsz) -- confirm against
@ the C-side declaration.  Emits len output bytes from the state, 8 at
@ a time; whenever a full bsz-byte rate block has been consumed, the
@ state is reloaded into d0-d24, re-permuted via KeccakF1600_neon,
@ stored back, and squeezing continues.  Byte-wise stores keep the
@ output endian-neutral.
.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	@ Rate block exhausted: reload the state, permute, store it back.
	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	@ 1..7 trailing bytes: emit them one at a time from the r2:r3
	@ register pair holding the current lane.
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
1566#endif
1567.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1568.align	2
1569___
1570
1571{
1572    my %ldr, %str;
1573
1574    sub ldrd {
1575	my ($mnemonic,$half,$reg,$ea) = @_;
1576	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
1577
1578	if ($half eq "l") {
1579	    $$op{reg} = $reg;
1580	    $$op{ea}  = $ea;
1581	    sprintf "#ifndef	__thumb2__\n"	.
1582		    "	%s\t%s,%s\n"		.
1583		    "#endif", $mnemonic,$reg,$ea;
1584	} else {
1585	    sprintf "#ifndef	__thumb2__\n"	.
1586		    "	%s\t%s,%s\n"		.
1587		    "#else\n"			.
1588		    "	%sd\t%s,%s,%s\n"	.
1589		    "#endif",	$mnemonic,$reg,$ea,
1590				$mnemonic,$$op{reg},$reg,$$op{ea};
1591	}
1592    }
1593}
1594
# Post-process the generated assembly line by line:
#  - expand backtick-quoted Perl expressions,
#  - turn annotated ldr.l/ldr.h (str.l/str.h) pairs into real code via ldrd(),
#  - rewrite shift mnemonics into their mov-with-shift form,
#  - lower "ret"/"bx lr" so the output assembles even with -march=armv4.
# The or-chain applies at most one instruction rewrite per line.
foreach my $insn (split($/, $code)) {
	$insn =~ s/\`([^\`]*)\`/eval $1/ge;

	$insn =~ s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge
	    or $insn =~ s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g
	    or $insn =~ s/\bret\b/bx	lr/g
	    or $insn =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $insn, "\n";
}
1605
# Enforce flush and surface buffered-write failures (e.g. a full disk
# when output is redirected): an unchecked close would let a truncated
# assembly file slip through the build.
close STDOUT or die "error closing STDOUT: $!";
1607