1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from keccak1600-armv4.pl. */
3#include "arm_arch.h"
4
/* The constant table and the code that follow are assembled into the text section. */
5.text
6
/*
 * Instruction-set selection.  With __thumb2__ defined the file is assembled
 * as Thumb code using unified syntax; otherwise it is assembled as 32-bit
 * ARM code.  The same macro is tested again further down to choose between
 * the ldrd/strd forms and the plain ldr/str/ldmia forms.
 */
7#if defined(__thumb2__)
8.syntax	unified
9.thumb
10#else
11.code	32
12#endif
13
/*
 * iotas32 - round-constant table for the iota step of Keccak-f[1600].
 *
 * Layout: 24 entries of two 32-bit words each (8 bytes per entry, 192 bytes
 * in total), one entry per round.  Every 64-bit round constant is stored
 * bit-interleaved: the first word collects its even-numbered bits and the
 * second word its odd-numbered bits (for example 0x0000000000008082 is
 * stored as 0x00000000, 0x00000089).
 *
 * Usage: KeccakF1600_int adds the byte counter kept at [sp,#444] to the
 * address of this table.  The code under .Lround2x loads the entry at
 * counter+0 and, later, the entry at counter+8, XORs each into R[0][0],
 * then advances the counter by 16 and compares it with 192 (the size of
 * this table).  Entry order and entry width are therefore relied upon by
 * the code and must not change.
 *
 * The table is addressed with "adr" (PC-relative), so it has to stay in the
 * same section as the code that uses it.  On ARM, ".align 5" requests
 * 2^5 = 32-byte alignment.
 */
14.type	iotas32, %object
15.align	5
16iotas32:
17.long	0x00000001, 0x00000000
18.long	0x00000000, 0x00000089
19.long	0x00000000, 0x8000008b
20.long	0x00000000, 0x80008080
21.long	0x00000001, 0x0000008b
22.long	0x00000001, 0x00008000
23.long	0x00000001, 0x80008088
24.long	0x00000001, 0x80000082
25.long	0x00000000, 0x0000000b
26.long	0x00000000, 0x0000000a
27.long	0x00000001, 0x00008082
28.long	0x00000000, 0x00008003
29.long	0x00000001, 0x0000808b
30.long	0x00000001, 0x8000000b
31.long	0x00000001, 0x8000008a
32.long	0x00000001, 0x80000081
33.long	0x00000000, 0x80000081
34.long	0x00000000, 0x80000008
35.long	0x00000000, 0x00000083
36.long	0x00000000, 0x80008003
37.long	0x00000001, 0x80008088
38.long	0x00000000, 0x80000088
39.long	0x00000001, 0x00008000
40.long	0x00000000, 0x80008082
41.size	iotas32,.-iotas32
42
43.type	KeccakF1600_int, %function
44.align	5
45KeccakF1600_int:
46	add	r9,sp,#176
47	add	r12,sp,#0
48	add	r10,sp,#40
49	ldmia	r9,{r4,r5,r6,r7,r8,r9}		@ A[4][2..4]
50KeccakF1600_enter:
51	str	lr,[sp,#440]
52	eor	r11,r11,r11
53	str	r11,[sp,#444]
54	b	.Lround2x
55
56.align	4
57.Lround2x:
58	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
59	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
60#ifdef	__thumb2__
61	eor	r0,r0,r10
62	eor	r1,r1,r11
63	eor	r2,r2,r12
64	ldrd	r10,r11,[sp,#56]
65	eor	r3,r3,r14
66	ldrd	r12,r14,[sp,#64]
67	eor	r4,r4,r10
68	eor	r5,r5,r11
69	eor	r6,r6,r12
70	ldrd	r10,r11,[sp,#72]
71	eor	r7,r7,r14
72	ldrd	r12,r14,[sp,#80]
73	eor	r8,r8,r10
74	eor	r9,r9,r11
75	eor	r0,r0,r12
76	ldrd	r10,r11,[sp,#88]
77	eor	r1,r1,r14
78	ldrd	r12,r14,[sp,#96]
79	eor	r2,r2,r10
80	eor	r3,r3,r11
81	eor	r4,r4,r12
82	ldrd	r10,r11,[sp,#104]
83	eor	r5,r5,r14
84	ldrd	r12,r14,[sp,#112]
85	eor	r6,r6,r10
86	eor	r7,r7,r11
87	eor	r8,r8,r12
88	ldrd	r10,r11,[sp,#120]
89	eor	r9,r9,r14
90	ldrd	r12,r14,[sp,#128]
91	eor	r0,r0,r10
92	eor	r1,r1,r11
93	eor	r2,r2,r12
94	ldrd	r10,r11,[sp,#136]
95	eor	r3,r3,r14
96	ldrd	r12,r14,[sp,#144]
97	eor	r4,r4,r10
98	eor	r5,r5,r11
99	eor	r6,r6,r12
100	ldrd	r10,r11,[sp,#152]
101	eor	r7,r7,r14
102	ldrd	r12,r14,[sp,#160]
103	eor	r8,r8,r10
104	eor	r9,r9,r11
105	eor	r0,r0,r12
106	ldrd	r10,r11,[sp,#168]
107	eor	r1,r1,r14
108	ldrd	r12,r14,[sp,#16]
109	eor	r2,r2,r10
110	eor	r3,r3,r11
111	eor	r4,r4,r12
112	ldrd	r10,r11,[sp,#24]
113	eor	r5,r5,r14
114	ldrd	r12,r14,[sp,#32]
115#else
116	eor	r0,r0,r10
117	add	r10,sp,#56
118	eor	r1,r1,r11
119	eor	r2,r2,r12
120	eor	r3,r3,r14
121	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
122	eor	r4,r4,r10
123	add	r10,sp,#72
124	eor	r5,r5,r11
125	eor	r6,r6,r12
126	eor	r7,r7,r14
127	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
128	eor	r8,r8,r10
129	add	r10,sp,#88
130	eor	r9,r9,r11
131	eor	r0,r0,r12
132	eor	r1,r1,r14
133	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
134	eor	r2,r2,r10
135	add	r10,sp,#104
136	eor	r3,r3,r11
137	eor	r4,r4,r12
138	eor	r5,r5,r14
139	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
140	eor	r6,r6,r10
141	add	r10,sp,#120
142	eor	r7,r7,r11
143	eor	r8,r8,r12
144	eor	r9,r9,r14
145	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
146	eor	r0,r0,r10
147	add	r10,sp,#136
148	eor	r1,r1,r11
149	eor	r2,r2,r12
150	eor	r3,r3,r14
151	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
152	eor	r4,r4,r10
153	add	r10,sp,#152
154	eor	r5,r5,r11
155	eor	r6,r6,r12
156	eor	r7,r7,r14
157	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
158	eor	r8,r8,r10
159	ldr	r10,[sp,#168]		@ A[4][1]
160	eor	r9,r9,r11
161	ldr	r11,[sp,#168+4]
162	eor	r0,r0,r12
163	ldr	r12,[sp,#16]		@ A[0][2]
164	eor	r1,r1,r14
165	ldr	r14,[sp,#16+4]
166	eor	r2,r2,r10
167	add	r10,sp,#24
168	eor	r3,r3,r11
169	eor	r4,r4,r12
170	eor	r5,r5,r14
171	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
172#endif
173	eor	r6,r6,r10
174	eor	r7,r7,r11
175	eor	r8,r8,r12
176	eor	r9,r9,r14
177
178	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
179#ifndef	__thumb2__
180	str	r10,[sp,#208]		@ D[1] = E[0]
181#endif
182	eor	r11,r1,r4
183#ifndef	__thumb2__
184	str	r11,[sp,#208+4]
185#else
186	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
187#endif
188	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
189	eor	r14,r7,r0
190#ifndef	__thumb2__
191	str	r12,[sp,#232]		@ D[4] = E[1]
192#endif
193	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
194#ifndef	__thumb2__
195	str	r14,[sp,#232+4]
196#else
197	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
198#endif
199	eor	r1,r9,r2
200#ifndef	__thumb2__
201	str	r0,[sp,#200]		@ D[0] = C[0]
202#endif
203	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
204#ifndef	__thumb2__
205	ldr	r7,[sp,#144]
206#endif
207	eor	r3,r3,r6
208#ifndef	__thumb2__
209	str	r1,[sp,#200+4]
210#else
211	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
212#endif
213#ifndef	__thumb2__
214	ldr	r6,[sp,#144+4]
215#else
216	ldrd	r7,r6,[sp,#144]
217#endif
218#ifndef	__thumb2__
219	str	r2,[sp,#216]		@ D[2] = C[1]
220#endif
221	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
222#ifndef	__thumb2__
223	str	r3,[sp,#216+4]
224#else
225	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
226#endif
227	eor	r5,r5,r8
228
229#ifndef	__thumb2__
230	ldr	r8,[sp,#192]
231#endif
232#ifndef	__thumb2__
233	ldr	r9,[sp,#192+4]
234#else
235	ldrd	r8,r9,[sp,#192]
236#endif
237#ifndef	__thumb2__
238	str	r4,[sp,#224]		@ D[3] = C[2]
239#endif
240	eor	r7,r7,r4
241#ifndef	__thumb2__
242	str	r5,[sp,#224+4]
243#else
244	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
245#endif
246	eor	r6,r6,r5
247#ifndef	__thumb2__
248	ldr	r4,[sp,#0]
249#endif
250	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
251	@ mov	r6,r6,ror#32-11
252#ifndef	__thumb2__
253	ldr	r5,[sp,#0+4]
254#else
255	ldrd	r4,r5,[sp,#0]
256#endif
257	eor	r8,r8,r12
258	eor	r9,r9,r14
259#ifndef	__thumb2__
260	ldr	r12,[sp,#96]
261#endif
262	eor	r0,r0,r4
263#ifndef	__thumb2__
264	ldr	r14,[sp,#96+4]
265#else
266	ldrd	r12,r14,[sp,#96]
267#endif
268	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
269	@ mov	r9,r9,ror#32-7
270	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
271	eor	r12,r12,r2
272#ifndef	__thumb2__
273	ldr	r2,[sp,#48]
274#endif
275	eor	r14,r14,r3
276#ifndef	__thumb2__
277	ldr	r3,[sp,#48+4]
278#else
279	ldrd	r2,r3,[sp,#48]
280#endif
281	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
282	ldr	r12,[sp,#444]			@ load counter
283	eor	r2,r2,r10
284	adr	r10,iotas32
285	mov	r4,r14,ror#32-22
286	add	r14,r10,r12
287	eor	r3,r3,r11
288	ldmia	r14,{r10,r11}		@ iotas[i]
289	bic	r12,r4,r2,ror#32-22
290	bic	r14,r5,r3,ror#32-22
291	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
292	mov	r3,r3,ror#32-22
293	eor	r12,r12,r0
294	eor	r14,r14,r1
295	eor	r10,r10,r12
296	eor	r11,r11,r14
297#ifndef	__thumb2__
298	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
299#endif
300	bic	r12,r6,r4,ror#11
301#ifndef	__thumb2__
302	str	r11,[sp,#240+4]
303#else
304	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
305#endif
306	bic	r14,r7,r5,ror#10
307	bic	r10,r8,r6,ror#32-(11-7)
308	bic	r11,r9,r7,ror#32-(10-7)
309	eor	r12,r2,r12,ror#32-11
310#ifndef	__thumb2__
311	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
312#endif
313	eor	r14,r3,r14,ror#32-10
314#ifndef	__thumb2__
315	str	r14,[sp,#248+4]
316#else
317	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
318#endif
319	eor	r10,r4,r10,ror#32-7
320	eor	r11,r5,r11,ror#32-7
321#ifndef	__thumb2__
322	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
323#endif
324	bic	r12,r0,r8,ror#32-7
325#ifndef	__thumb2__
326	str	r11,[sp,#256+4]
327#else
328	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
329#endif
330	bic	r14,r1,r9,ror#32-7
331	eor	r12,r12,r6,ror#32-11
332#ifndef	__thumb2__
333	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
334#endif
335	eor	r14,r14,r7,ror#32-10
336#ifndef	__thumb2__
337	str	r14,[sp,#264+4]
338#else
339	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
340#endif
341	bic	r10,r2,r0
342	add	r14,sp,#224
343#ifndef	__thumb2__
344	ldr	r0,[sp,#24]		@ A[0][3]
345#endif
346	bic	r11,r3,r1
347#ifndef	__thumb2__
348	ldr	r1,[sp,#24+4]
349#else
350	ldrd	r0,r1,[sp,#24]		@ A[0][3]
351#endif
352	eor	r10,r10,r8,ror#32-7
353	eor	r11,r11,r9,ror#32-7
354#ifndef	__thumb2__
355	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
356#endif
357	add	r9,sp,#200
358#ifndef	__thumb2__
359	str	r11,[sp,#272+4]
360#else
361	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
362#endif
363
364	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
365	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
366
367#ifndef	__thumb2__
368	ldr	r2,[sp,#72]		@ A[1][4]
369#endif
370	eor	r0,r0,r10
371#ifndef	__thumb2__
372	ldr	r3,[sp,#72+4]
373#else
374	ldrd	r2,r3,[sp,#72]		@ A[1][4]
375#endif
376	eor	r1,r1,r11
377	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
378#ifndef	__thumb2__
379	ldr	r10,[sp,#128]		@ A[3][1]
380#endif
381	@ mov	r1,r1,ror#32-14
382#ifndef	__thumb2__
383	ldr	r11,[sp,#128+4]
384#else
385	ldrd	r10,r11,[sp,#128]		@ A[3][1]
386#endif
387
388	eor	r2,r2,r12
389#ifndef	__thumb2__
390	ldr	r4,[sp,#80]		@ A[2][0]
391#endif
392	eor	r3,r3,r14
393#ifndef	__thumb2__
394	ldr	r5,[sp,#80+4]
395#else
396	ldrd	r4,r5,[sp,#80]		@ A[2][0]
397#endif
398	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
399	@ mov	r3,r3,ror#32-10
400
401	eor	r6,r6,r4
402#ifndef	__thumb2__
403	ldr	r12,[sp,#216]		@ D[2]
404#endif
405	eor	r7,r7,r5
406#ifndef	__thumb2__
407	ldr	r14,[sp,#216+4]
408#else
409	ldrd	r12,r14,[sp,#216]		@ D[2]
410#endif
411	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
412	mov	r4,r7,ror#32-2
413
414	eor	r10,r10,r8
415#ifndef	__thumb2__
416	ldr	r8,[sp,#176]		@ A[4][2]
417#endif
418	eor	r11,r11,r9
419#ifndef	__thumb2__
420	ldr	r9,[sp,#176+4]
421#else
422	ldrd	r8,r9,[sp,#176]		@ A[4][2]
423#endif
424	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
425	mov	r6,r11,ror#32-23
426
427	bic	r10,r4,r2,ror#32-10
428	bic	r11,r5,r3,ror#32-10
429	eor	r12,r12,r8
430	eor	r14,r14,r9
431	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
432	mov	r8,r14,ror#32-31
433	eor	r10,r10,r0,ror#32-14
434	eor	r11,r11,r1,ror#32-14
435#ifndef	__thumb2__
436	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
437#endif
438	bic	r12,r6,r4
439#ifndef	__thumb2__
440	str	r11,[sp,#280+4]
441#else
442	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
443#endif
444	bic	r14,r7,r5
445	eor	r12,r12,r2,ror#32-10
446#ifndef	__thumb2__
447	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
448#endif
449	eor	r14,r14,r3,ror#32-10
450#ifndef	__thumb2__
451	str	r14,[sp,#288+4]
452#else
453	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
454#endif
455	bic	r10,r8,r6
456	bic	r11,r9,r7
457	bic	r12,r0,r8,ror#14
458	bic	r14,r1,r9,ror#14
459	eor	r10,r10,r4
460	eor	r11,r11,r5
461#ifndef	__thumb2__
462	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
463#endif
464	bic	r2,r2,r0,ror#32-(14-10)
465#ifndef	__thumb2__
466	str	r11,[sp,#296+4]
467#else
468	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
469#endif
470	eor	r12,r6,r12,ror#32-14
471	bic	r11,r3,r1,ror#32-(14-10)
472#ifndef	__thumb2__
473	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
474#endif
475	eor	r14,r7,r14,ror#32-14
476#ifndef	__thumb2__
477	str	r14,[sp,#304+4]
478#else
479	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
480#endif
481	add	r12,sp,#208
482#ifndef	__thumb2__
483	ldr	r1,[sp,#8]		@ A[0][1]
484#endif
485	eor	r10,r8,r2,ror#32-10
486#ifndef	__thumb2__
487	ldr	r0,[sp,#8+4]
488#else
489	ldrd	r1,r0,[sp,#8]		@ A[0][1]
490#endif
491	eor	r11,r9,r11,ror#32-10
492#ifndef	__thumb2__
493	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
494#endif
495#ifndef	__thumb2__
496	str	r11,[sp,#312+4]
497#else
498	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
499#endif
500
501	add	r9,sp,#224
502	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
503#ifndef	__thumb2__
504	ldr	r2,[sp,#56]		@ A[1][2]
505#endif
506#ifndef	__thumb2__
507	ldr	r3,[sp,#56+4]
508#else
509	ldrd	r2,r3,[sp,#56]		@ A[1][2]
510#endif
511	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
512
513	eor	r1,r1,r10
514#ifndef	__thumb2__
515	ldr	r4,[sp,#104]		@ A[2][3]
516#endif
517	eor	r0,r0,r11
518#ifndef	__thumb2__
519	ldr	r5,[sp,#104+4]
520#else
521	ldrd	r4,r5,[sp,#104]		@ A[2][3]
522#endif
523	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
524
525	eor	r2,r2,r12
526#ifndef	__thumb2__
527	ldr	r10,[sp,#152]		@ A[3][4]
528#endif
529	eor	r3,r3,r14
530#ifndef	__thumb2__
531	ldr	r11,[sp,#152+4]
532#else
533	ldrd	r10,r11,[sp,#152]		@ A[3][4]
534#endif
535	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
536#ifndef	__thumb2__
537	ldr	r12,[sp,#200]		@ D[0]
538#endif
539	@ mov	r3,r3,ror#32-3
540#ifndef	__thumb2__
541	ldr	r14,[sp,#200+4]
542#else
543	ldrd	r12,r14,[sp,#200]		@ D[0]
544#endif
545
546	eor	r4,r4,r6
547	eor	r5,r5,r7
548	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
549	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
550
551	eor	r10,r10,r8
552#ifndef	__thumb2__
553	ldr	r8,[sp,#160]		@ A[4][0]
554#endif
555	eor	r11,r11,r9
556#ifndef	__thumb2__
557	ldr	r9,[sp,#160+4]
558#else
559	ldrd	r8,r9,[sp,#160]		@ A[4][0]
560#endif
561	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
562	mov	r7,r11,ror#32-4
563
564	eor	r12,r12,r8
565	eor	r14,r14,r9
566	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
567	mov	r9,r14,ror#32-9
568
569	bic	r10,r5,r2,ror#13-3
570	bic	r11,r4,r3,ror#12-3
571	bic	r12,r6,r5,ror#32-13
572	bic	r14,r7,r4,ror#32-12
573	eor	r10,r0,r10,ror#32-13
574	eor	r11,r1,r11,ror#32-12
575#ifndef	__thumb2__
576	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
577#endif
578	eor	r12,r12,r2,ror#32-3
579#ifndef	__thumb2__
580	str	r11,[sp,#320+4]
581#else
582	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
583#endif
584	eor	r14,r14,r3,ror#32-3
585#ifndef	__thumb2__
586	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
587#endif
588	bic	r10,r8,r6
589	bic	r11,r9,r7
590#ifndef	__thumb2__
591	str	r14,[sp,#328+4]
592#else
593	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
594#endif
595	eor	r10,r10,r5,ror#32-13
596	eor	r11,r11,r4,ror#32-12
597#ifndef	__thumb2__
598	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
599#endif
600	bic	r12,r0,r8
601#ifndef	__thumb2__
602	str	r11,[sp,#336+4]
603#else
604	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
605#endif
606	bic	r14,r1,r9
607	eor	r12,r12,r6
608	eor	r14,r14,r7
609#ifndef	__thumb2__
610	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
611#endif
612	bic	r10,r2,r0,ror#3
613#ifndef	__thumb2__
614	str	r14,[sp,#344+4]
615#else
616	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
617#endif
618	bic	r11,r3,r1,ror#3
619#ifndef	__thumb2__
620	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
621#endif
622	eor	r10,r8,r10,ror#32-3
623#ifndef	__thumb2__
624	ldr	r0,[sp,#32+4]
625#else
626	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
627#endif
628	eor	r11,r9,r11,ror#32-3
629#ifndef	__thumb2__
630	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
631#endif
632	add	r9,sp,#208
633#ifndef	__thumb2__
634	str	r11,[sp,#352+4]
635#else
636	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
637#endif
638
639#ifndef	__thumb2__
640	ldr	r10,[sp,#232]		@ D[4]
641#endif
642#ifndef	__thumb2__
643	ldr	r11,[sp,#232+4]
644#else
645	ldrd	r10,r11,[sp,#232]		@ D[4]
646#endif
647#ifndef	__thumb2__
648	ldr	r12,[sp,#200]		@ D[0]
649#endif
650#ifndef	__thumb2__
651	ldr	r14,[sp,#200+4]
652#else
653	ldrd	r12,r14,[sp,#200]		@ D[0]
654#endif
655
656	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
657
658	eor	r1,r1,r10
659#ifndef	__thumb2__
660	ldr	r2,[sp,#40]		@ A[1][0]
661#endif
662	eor	r0,r0,r11
663#ifndef	__thumb2__
664	ldr	r3,[sp,#40+4]
665#else
666	ldrd	r2,r3,[sp,#40]		@ A[1][0]
667#endif
668	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
669#ifndef	__thumb2__
670	ldr	r4,[sp,#88]		@ A[2][1]
671#endif
672	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
673#ifndef	__thumb2__
674	ldr	r5,[sp,#88+4]
675#else
676	ldrd	r4,r5,[sp,#88]		@ A[2][1]
677#endif
678
679	eor	r2,r2,r12
680#ifndef	__thumb2__
681	ldr	r10,[sp,#136]		@ A[3][2]
682#endif
683	eor	r3,r3,r14
684#ifndef	__thumb2__
685	ldr	r11,[sp,#136+4]
686#else
687	ldrd	r10,r11,[sp,#136]		@ A[3][2]
688#endif
689	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
690#ifndef	__thumb2__
691	ldr	r12,[sp,#224]		@ D[3]
692#endif
693	@ mov	r3,r3,ror#32-18
694#ifndef	__thumb2__
695	ldr	r14,[sp,#224+4]
696#else
697	ldrd	r12,r14,[sp,#224]		@ D[3]
698#endif
699
700	eor	r6,r6,r4
701	eor	r7,r7,r5
702	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
703	mov	r5,r7,ror#32-5
704
705	eor	r10,r10,r8
706#ifndef	__thumb2__
707	ldr	r8,[sp,#184]		@ A[4][3]
708#endif
709	eor	r11,r11,r9
710#ifndef	__thumb2__
711	ldr	r9,[sp,#184+4]
712#else
713	ldrd	r8,r9,[sp,#184]		@ A[4][3]
714#endif
715	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
716	mov	r6,r11,ror#32-8
717
718	eor	r12,r12,r8
719	eor	r14,r14,r9
720	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
721	mov	r9,r14,ror#32-28
722
723	bic	r10,r4,r2,ror#32-18
724	bic	r11,r5,r3,ror#32-18
725	eor	r10,r10,r0,ror#32-14
726	eor	r11,r11,r1,ror#32-13
727#ifndef	__thumb2__
728	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
729#endif
730	bic	r12,r6,r4
731#ifndef	__thumb2__
732	str	r11,[sp,#360+4]
733#else
734	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
735#endif
736	bic	r14,r7,r5
737	eor	r12,r12,r2,ror#32-18
738#ifndef	__thumb2__
739	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
740#endif
741	eor	r14,r14,r3,ror#32-18
742#ifndef	__thumb2__
743	str	r14,[sp,#368+4]
744#else
745	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
746#endif
747	bic	r10,r8,r6
748	bic	r11,r9,r7
749	bic	r12,r0,r8,ror#14
750	bic	r14,r1,r9,ror#13
751	eor	r10,r10,r4
752	eor	r11,r11,r5
753#ifndef	__thumb2__
754	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
755#endif
756	bic	r2,r2,r0,ror#18-14
757#ifndef	__thumb2__
758	str	r11,[sp,#376+4]
759#else
760	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
761#endif
762	eor	r12,r6,r12,ror#32-14
763	bic	r11,r3,r1,ror#18-13
764	eor	r14,r7,r14,ror#32-13
765#ifndef	__thumb2__
766	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
767#endif
768#ifndef	__thumb2__
769	str	r14,[sp,#384+4]
770#else
771	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
772#endif
773	add	r14,sp,#216
774#ifndef	__thumb2__
775	ldr	r0,[sp,#16]		@ A[0][2]
776#endif
777	eor	r10,r8,r2,ror#32-18
778#ifndef	__thumb2__
779	ldr	r1,[sp,#16+4]
780#else
781	ldrd	r0,r1,[sp,#16]		@ A[0][2]
782#endif
783	eor	r11,r9,r11,ror#32-18
784#ifndef	__thumb2__
785	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
786#endif
787#ifndef	__thumb2__
788	str	r11,[sp,#392+4]
789#else
790	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
791#endif
792
793	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
794#ifndef	__thumb2__
795	ldr	r2,[sp,#64]		@ A[1][3]
796#endif
797#ifndef	__thumb2__
798	ldr	r3,[sp,#64+4]
799#else
800	ldrd	r2,r3,[sp,#64]		@ A[1][3]
801#endif
802#ifndef	__thumb2__
803	ldr	r6,[sp,#232]		@ D[4]
804#endif
805#ifndef	__thumb2__
806	ldr	r7,[sp,#232+4]
807#else
808	ldrd	r6,r7,[sp,#232]		@ D[4]
809#endif
810
811	eor	r0,r0,r10
812#ifndef	__thumb2__
813	ldr	r4,[sp,#112]		@ A[2][4]
814#endif
815	eor	r1,r1,r11
816#ifndef	__thumb2__
817	ldr	r5,[sp,#112+4]
818#else
819	ldrd	r4,r5,[sp,#112]		@ A[2][4]
820#endif
821	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
822#ifndef	__thumb2__
823	ldr	r8,[sp,#200]		@ D[0]
824#endif
825	@ mov	r1,r1,ror#32-31
826#ifndef	__thumb2__
827	ldr	r9,[sp,#200+4]
828#else
829	ldrd	r8,r9,[sp,#200]		@ D[0]
830#endif
831
832	eor	r12,r12,r2
833#ifndef	__thumb2__
834	ldr	r10,[sp,#120]		@ A[3][0]
835#endif
836	eor	r14,r14,r3
837#ifndef	__thumb2__
838	ldr	r11,[sp,#120+4]
839#else
840	ldrd	r10,r11,[sp,#120]		@ A[3][0]
841#endif
842	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
843#ifndef	__thumb2__
844	ldr	r12,[sp,#208]		@ D[1]
845#endif
846	mov	r2,r14,ror#32-28
847#ifndef	__thumb2__
848	ldr	r14,[sp,#208+4]
849#else
850	ldrd	r12,r14,[sp,#208]		@ D[1]
851#endif
852
853	eor	r6,r6,r4
854	eor	r7,r7,r5
855	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
856	mov	r4,r7,ror#32-20
857
858	eor	r10,r10,r8
859#ifndef	__thumb2__
860	ldr	r8,[sp,#168]		@ A[4][1]
861#endif
862	eor	r11,r11,r9
863#ifndef	__thumb2__
864	ldr	r9,[sp,#168+4]
865#else
866	ldrd	r8,r9,[sp,#168]		@ A[4][1]
867#endif
868	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
869	mov	r6,r11,ror#32-21
870
871	eor	r8,r8,r12
872	eor	r9,r9,r14
873	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
874	@ mov	r9,r3,ror#32-1
875
876	bic	r10,r4,r2
877	bic	r11,r5,r3
878	eor	r10,r10,r0,ror#32-31
879#ifndef	__thumb2__
880	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
881#endif
882	eor	r11,r11,r1,ror#32-31
883#ifndef	__thumb2__
884	str	r11,[sp,#400+4]
885#else
886	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
887#endif
888	bic	r12,r6,r4
889	bic	r14,r7,r5
890	eor	r12,r12,r2
891	eor	r14,r14,r3
892#ifndef	__thumb2__
893	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
894#endif
895	bic	r10,r8,r6,ror#1
896#ifndef	__thumb2__
897	str	r14,[sp,#408+4]
898#else
899	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
900#endif
901	bic	r11,r9,r7,ror#1
902	bic	r12,r0,r8,ror#31-1
903	bic	r14,r1,r9,ror#31-1
904	eor	r4,r4,r10,ror#32-1
905#ifndef	__thumb2__
906	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
907#endif
908	eor	r5,r5,r11,ror#32-1
909#ifndef	__thumb2__
910	str	r5,[sp,#416+4]
911#else
912	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
913#endif
914	eor	r6,r6,r12,ror#32-31
915	eor	r7,r7,r14,ror#32-31
916#ifndef	__thumb2__
917	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
918#endif
919	bic	r10,r2,r0,ror#32-31
920#ifndef	__thumb2__
921	str	r7,[sp,#424+4]
922#else
923	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
924#endif
925	bic	r11,r3,r1,ror#32-31
926	add	r12,sp,#240
927	eor	r8,r10,r8,ror#32-1
928	add	r10,sp,#280
929	eor	r9,r11,r9,ror#32-1
930#ifndef	__thumb2__
931	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
932#endif
933#ifndef	__thumb2__
934	str	r9,[sp,#432+4]
935#else
936	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
937#endif
938	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
939	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
940#ifdef	__thumb2__
941	eor	r0,r0,r10
942	eor	r1,r1,r11
943	eor	r2,r2,r12
944	ldrd	r10,r11,[sp,#296]
945	eor	r3,r3,r14
946	ldrd	r12,r14,[sp,#304]
947	eor	r4,r4,r10
948	eor	r5,r5,r11
949	eor	r6,r6,r12
950	ldrd	r10,r11,[sp,#312]
951	eor	r7,r7,r14
952	ldrd	r12,r14,[sp,#320]
953	eor	r8,r8,r10
954	eor	r9,r9,r11
955	eor	r0,r0,r12
956	ldrd	r10,r11,[sp,#328]
957	eor	r1,r1,r14
958	ldrd	r12,r14,[sp,#336]
959	eor	r2,r2,r10
960	eor	r3,r3,r11
961	eor	r4,r4,r12
962	ldrd	r10,r11,[sp,#344]
963	eor	r5,r5,r14
964	ldrd	r12,r14,[sp,#352]
965	eor	r6,r6,r10
966	eor	r7,r7,r11
967	eor	r8,r8,r12
968	ldrd	r10,r11,[sp,#360]
969	eor	r9,r9,r14
970	ldrd	r12,r14,[sp,#368]
971	eor	r0,r0,r10
972	eor	r1,r1,r11
973	eor	r2,r2,r12
974	ldrd	r10,r11,[sp,#376]
975	eor	r3,r3,r14
976	ldrd	r12,r14,[sp,#384]
977	eor	r4,r4,r10
978	eor	r5,r5,r11
979	eor	r6,r6,r12
980	ldrd	r10,r11,[sp,#392]
981	eor	r7,r7,r14
982	ldrd	r12,r14,[sp,#400]
983	eor	r8,r8,r10
984	eor	r9,r9,r11
985	eor	r0,r0,r12
986	ldrd	r10,r11,[sp,#408]
987	eor	r1,r1,r14
988	ldrd	r12,r14,[sp,#256]
989	eor	r2,r2,r10
990	eor	r3,r3,r11
991	eor	r4,r4,r12
992	ldrd	r10,r11,[sp,#264]
993	eor	r5,r5,r14
994	ldrd	r12,r14,[sp,#272]
995#else
996	eor	r0,r0,r10
997	add	r10,sp,#296
998	eor	r1,r1,r11
999	eor	r2,r2,r12
1000	eor	r3,r3,r14
1001	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
1002	eor	r4,r4,r10
1003	add	r10,sp,#312
1004	eor	r5,r5,r11
1005	eor	r6,r6,r12
1006	eor	r7,r7,r14
1007	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
1008	eor	r8,r8,r10
1009	add	r10,sp,#328
1010	eor	r9,r9,r11
1011	eor	r0,r0,r12
1012	eor	r1,r1,r14
1013	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
1014	eor	r2,r2,r10
1015	add	r10,sp,#344
1016	eor	r3,r3,r11
1017	eor	r4,r4,r12
1018	eor	r5,r5,r14
1019	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
1020	eor	r6,r6,r10
1021	add	r10,sp,#360
1022	eor	r7,r7,r11
1023	eor	r8,r8,r12
1024	eor	r9,r9,r14
1025	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
1026	eor	r0,r0,r10
1027	add	r10,sp,#376
1028	eor	r1,r1,r11
1029	eor	r2,r2,r12
1030	eor	r3,r3,r14
1031	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
1032	eor	r4,r4,r10
1033	add	r10,sp,#392
1034	eor	r5,r5,r11
1035	eor	r6,r6,r12
1036	eor	r7,r7,r14
1037	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
1038	eor	r8,r8,r10
1039	ldr	r10,[sp,#408]		@ A[4][1]
1040	eor	r9,r9,r11
1041	ldr	r11,[sp,#408+4]
1042	eor	r0,r0,r12
1043	ldr	r12,[sp,#256]		@ A[0][2]
1044	eor	r1,r1,r14
1045	ldr	r14,[sp,#256+4]
1046	eor	r2,r2,r10
1047	add	r10,sp,#264
1048	eor	r3,r3,r11
1049	eor	r4,r4,r12
1050	eor	r5,r5,r14
1051	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
1052#endif
1053	eor	r6,r6,r10
1054	eor	r7,r7,r11
1055	eor	r8,r8,r12
1056	eor	r9,r9,r14
1057
1058	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
1059#ifndef	__thumb2__
1060	str	r10,[sp,#208]		@ D[1] = E[0]
1061#endif
1062	eor	r11,r1,r4
1063#ifndef	__thumb2__
1064	str	r11,[sp,#208+4]
1065#else
1066	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
1067#endif
1068	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
1069	eor	r14,r7,r0
1070#ifndef	__thumb2__
1071	str	r12,[sp,#232]		@ D[4] = E[1]
1072#endif
1073	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
1074#ifndef	__thumb2__
1075	str	r14,[sp,#232+4]
1076#else
1077	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
1078#endif
1079	eor	r1,r9,r2
1080#ifndef	__thumb2__
1081	str	r0,[sp,#200]		@ D[0] = C[0]
1082#endif
1083	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
1084#ifndef	__thumb2__
1085	ldr	r7,[sp,#384]
1086#endif
1087	eor	r3,r3,r6
1088#ifndef	__thumb2__
1089	str	r1,[sp,#200+4]
1090#else
1091	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
1092#endif
1093#ifndef	__thumb2__
1094	ldr	r6,[sp,#384+4]
1095#else
1096	ldrd	r7,r6,[sp,#384]
1097#endif
1098#ifndef	__thumb2__
1099	str	r2,[sp,#216]		@ D[2] = C[1]
1100#endif
1101	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
1102#ifndef	__thumb2__
1103	str	r3,[sp,#216+4]
1104#else
1105	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
1106#endif
1107	eor	r5,r5,r8
1108
1109#ifndef	__thumb2__
1110	ldr	r8,[sp,#432]
1111#endif
1112#ifndef	__thumb2__
1113	ldr	r9,[sp,#432+4]
1114#else
1115	ldrd	r8,r9,[sp,#432]
1116#endif
1117#ifndef	__thumb2__
1118	str	r4,[sp,#224]		@ D[3] = C[2]
1119#endif
1120	eor	r7,r7,r4
1121#ifndef	__thumb2__
1122	str	r5,[sp,#224+4]
1123#else
1124	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
1125#endif
1126	eor	r6,r6,r5
1127#ifndef	__thumb2__
1128	ldr	r4,[sp,#240]
1129#endif
1130	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
1131	@ mov	r6,r6,ror#32-11
1132#ifndef	__thumb2__
1133	ldr	r5,[sp,#240+4]
1134#else
1135	ldrd	r4,r5,[sp,#240]
1136#endif
1137	eor	r8,r8,r12
1138	eor	r9,r9,r14
1139#ifndef	__thumb2__
1140	ldr	r12,[sp,#336]
1141#endif
1142	eor	r0,r0,r4
1143#ifndef	__thumb2__
1144	ldr	r14,[sp,#336+4]
1145#else
1146	ldrd	r12,r14,[sp,#336]
1147#endif
1148	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
1149	@ mov	r9,r9,ror#32-7
1150	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
1151	eor	r12,r12,r2
1152#ifndef	__thumb2__
1153	ldr	r2,[sp,#288]
1154#endif
1155	eor	r14,r14,r3
1156#ifndef	__thumb2__
1157	ldr	r3,[sp,#288+4]
1158#else
1159	ldrd	r2,r3,[sp,#288]
1160#endif
1161	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
1162	ldr	r12,[sp,#444]			@ load counter
1163	eor	r2,r2,r10
1164	adr	r10,iotas32
1165	mov	r4,r14,ror#32-22
1166	add	r14,r10,r12
1167	eor	r3,r3,r11
1168#ifndef	__thumb2__
1169	ldr	r10,[r14,#8]		@ iotas[i].lo
1170#endif
1171	add	r12,r12,#16
1172#ifndef	__thumb2__
1173	ldr	r11,[r14,#12]		@ iotas[i].hi
1174#else
1175	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
1176#endif
1177	cmp	r12,#192
1178	str	r12,[sp,#444]			@ store counter
1179	bic	r12,r4,r2,ror#32-22
1180	bic	r14,r5,r3,ror#32-22
1181	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
1182	mov	r3,r3,ror#32-22
1183	eor	r12,r12,r0
1184	eor	r14,r14,r1
1185	eor	r10,r10,r12
1186	eor	r11,r11,r14
1187#ifndef	__thumb2__
1188	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1189#endif
1190	bic	r12,r6,r4,ror#11
1191#ifndef	__thumb2__
1192	str	r11,[sp,#0+4]
1193#else
1194	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1195#endif
1196	bic	r14,r7,r5,ror#10
1197	bic	r10,r8,r6,ror#32-(11-7)
1198	bic	r11,r9,r7,ror#32-(10-7)
1199	eor	r12,r2,r12,ror#32-11
1200#ifndef	__thumb2__
1201	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1202#endif
1203	eor	r14,r3,r14,ror#32-10
1204#ifndef	__thumb2__
1205	str	r14,[sp,#8+4]
1206#else
1207	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1208#endif
1209	eor	r10,r4,r10,ror#32-7
1210	eor	r11,r5,r11,ror#32-7
1211#ifndef	__thumb2__
1212	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1213#endif
1214	bic	r12,r0,r8,ror#32-7
1215#ifndef	__thumb2__
1216	str	r11,[sp,#16+4]
1217#else
1218	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1219#endif
1220	bic	r14,r1,r9,ror#32-7
1221	eor	r12,r12,r6,ror#32-11
1222#ifndef	__thumb2__
1223	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1224#endif
1225	eor	r14,r14,r7,ror#32-10
1226#ifndef	__thumb2__
1227	str	r14,[sp,#24+4]
1228#else
1229	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1230#endif
1231	bic	r10,r2,r0
1232	add	r14,sp,#224
1233#ifndef	__thumb2__
1234	ldr	r0,[sp,#264]		@ A[0][3]
1235#endif
1236	bic	r11,r3,r1
1237#ifndef	__thumb2__
1238	ldr	r1,[sp,#264+4]
1239#else
1240	ldrd	r0,r1,[sp,#264]		@ A[0][3]
1241#endif
1242	eor	r10,r10,r8,ror#32-7
1243	eor	r11,r11,r9,ror#32-7
1244#ifndef	__thumb2__
1245	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1246#endif
1247	add	r9,sp,#200
1248#ifndef	__thumb2__
1249	str	r11,[sp,#32+4]
1250#else
1251	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1252#endif
1253
1254	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
1255	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
1256
1257#ifndef	__thumb2__
1258	ldr	r2,[sp,#312]		@ A[1][4]
1259#endif
1260	eor	r0,r0,r10
1261#ifndef	__thumb2__
1262	ldr	r3,[sp,#312+4]
1263#else
1264	ldrd	r2,r3,[sp,#312]		@ A[1][4]
1265#endif
1266	eor	r1,r1,r11
1267	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
1268#ifndef	__thumb2__
1269	ldr	r10,[sp,#368]		@ A[3][1]
1270#endif
1271	@ mov	r1,r1,ror#32-14
1272#ifndef	__thumb2__
1273	ldr	r11,[sp,#368+4]
1274#else
1275	ldrd	r10,r11,[sp,#368]		@ A[3][1]
1276#endif
1277
1278	eor	r2,r2,r12
1279#ifndef	__thumb2__
1280	ldr	r4,[sp,#320]		@ A[2][0]
1281#endif
1282	eor	r3,r3,r14
1283#ifndef	__thumb2__
1284	ldr	r5,[sp,#320+4]
1285#else
1286	ldrd	r4,r5,[sp,#320]		@ A[2][0]
1287#endif
1288	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
1289	@ mov	r3,r3,ror#32-10
1290
1291	eor	r6,r6,r4
1292#ifndef	__thumb2__
1293	ldr	r12,[sp,#216]		@ D[2]
1294#endif
1295	eor	r7,r7,r5
1296#ifndef	__thumb2__
1297	ldr	r14,[sp,#216+4]
1298#else
1299	ldrd	r12,r14,[sp,#216]		@ D[2]
1300#endif
1301	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
1302	mov	r4,r7,ror#32-2
1303
1304	eor	r10,r10,r8
1305#ifndef	__thumb2__
1306	ldr	r8,[sp,#416]		@ A[4][2]
1307#endif
1308	eor	r11,r11,r9
1309#ifndef	__thumb2__
1310	ldr	r9,[sp,#416+4]
1311#else
1312	ldrd	r8,r9,[sp,#416]		@ A[4][2]
1313#endif
1314	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
1315	mov	r6,r11,ror#32-23
1316
1317	bic	r10,r4,r2,ror#32-10
1318	bic	r11,r5,r3,ror#32-10
1319	eor	r12,r12,r8
1320	eor	r14,r14,r9
1321	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
1322	mov	r8,r14,ror#32-31
1323	eor	r10,r10,r0,ror#32-14
1324	eor	r11,r11,r1,ror#32-14
1325#ifndef	__thumb2__
1326	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1327#endif
1328	bic	r12,r6,r4
1329#ifndef	__thumb2__
1330	str	r11,[sp,#40+4]
1331#else
1332	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1333#endif
1334	bic	r14,r7,r5
1335	eor	r12,r12,r2,ror#32-10
1336#ifndef	__thumb2__
1337	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1338#endif
1339	eor	r14,r14,r3,ror#32-10
1340#ifndef	__thumb2__
1341	str	r14,[sp,#48+4]
1342#else
1343	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1344#endif
1345	bic	r10,r8,r6
1346	bic	r11,r9,r7
1347	bic	r12,r0,r8,ror#14
1348	bic	r14,r1,r9,ror#14
1349	eor	r10,r10,r4
1350	eor	r11,r11,r5
1351#ifndef	__thumb2__
1352	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1353#endif
1354	bic	r2,r2,r0,ror#32-(14-10)
1355#ifndef	__thumb2__
1356	str	r11,[sp,#56+4]
1357#else
1358	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1359#endif
1360	eor	r12,r6,r12,ror#32-14
1361	bic	r11,r3,r1,ror#32-(14-10)
1362#ifndef	__thumb2__
1363	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1364#endif
1365	eor	r14,r7,r14,ror#32-14
1366#ifndef	__thumb2__
1367	str	r14,[sp,#64+4]
1368#else
1369	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1370#endif
1371	add	r12,sp,#208
1372#ifndef	__thumb2__
1373	ldr	r1,[sp,#248]		@ A[0][1]
1374#endif
1375	eor	r10,r8,r2,ror#32-10
1376#ifndef	__thumb2__
1377	ldr	r0,[sp,#248+4]
1378#else
1379	ldrd	r1,r0,[sp,#248]		@ A[0][1]
1380#endif
1381	eor	r11,r9,r11,ror#32-10
1382#ifndef	__thumb2__
1383	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1384#endif
1385#ifndef	__thumb2__
1386	str	r11,[sp,#72+4]
1387#else
1388	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1389#endif
1390
1391	add	r9,sp,#224
1392	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
1393#ifndef	__thumb2__
1394	ldr	r2,[sp,#296]		@ A[1][2]
1395#endif
1396#ifndef	__thumb2__
1397	ldr	r3,[sp,#296+4]
1398#else
1399	ldrd	r2,r3,[sp,#296]		@ A[1][2]
1400#endif
1401	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
1402
1403	eor	r1,r1,r10
1404#ifndef	__thumb2__
1405	ldr	r4,[sp,#344]		@ A[2][3]
1406#endif
1407	eor	r0,r0,r11
1408#ifndef	__thumb2__
1409	ldr	r5,[sp,#344+4]
1410#else
1411	ldrd	r4,r5,[sp,#344]		@ A[2][3]
1412#endif
1413	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
1414
1415	eor	r2,r2,r12
1416#ifndef	__thumb2__
1417	ldr	r10,[sp,#392]		@ A[3][4]
1418#endif
1419	eor	r3,r3,r14
1420#ifndef	__thumb2__
1421	ldr	r11,[sp,#392+4]
1422#else
1423	ldrd	r10,r11,[sp,#392]		@ A[3][4]
1424#endif
1425	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
1426#ifndef	__thumb2__
1427	ldr	r12,[sp,#200]		@ D[0]
1428#endif
1429	@ mov	r3,r3,ror#32-3
1430#ifndef	__thumb2__
1431	ldr	r14,[sp,#200+4]
1432#else
1433	ldrd	r12,r14,[sp,#200]		@ D[0]
1434#endif
1435
1436	eor	r4,r4,r6
1437	eor	r5,r5,r7
1438	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
1439	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
1440
1441	eor	r10,r10,r8
1442#ifndef	__thumb2__
1443	ldr	r8,[sp,#400]		@ A[4][0]
1444#endif
1445	eor	r11,r11,r9
1446#ifndef	__thumb2__
1447	ldr	r9,[sp,#400+4]
1448#else
1449	ldrd	r8,r9,[sp,#400]		@ A[4][0]
1450#endif
1451	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
1452	mov	r7,r11,ror#32-4
1453
1454	eor	r12,r12,r8
1455	eor	r14,r14,r9
1456	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
1457	mov	r9,r14,ror#32-9
1458
1459	bic	r10,r5,r2,ror#13-3
1460	bic	r11,r4,r3,ror#12-3
1461	bic	r12,r6,r5,ror#32-13
1462	bic	r14,r7,r4,ror#32-12
1463	eor	r10,r0,r10,ror#32-13
1464	eor	r11,r1,r11,ror#32-12
1465#ifndef	__thumb2__
1466	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1467#endif
1468	eor	r12,r12,r2,ror#32-3
1469#ifndef	__thumb2__
1470	str	r11,[sp,#80+4]
1471#else
1472	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1473#endif
1474	eor	r14,r14,r3,ror#32-3
1475#ifndef	__thumb2__
1476	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1477#endif
1478	bic	r10,r8,r6
1479	bic	r11,r9,r7
1480#ifndef	__thumb2__
1481	str	r14,[sp,#88+4]
1482#else
1483	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1484#endif
1485	eor	r10,r10,r5,ror#32-13
1486	eor	r11,r11,r4,ror#32-12
1487#ifndef	__thumb2__
1488	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1489#endif
1490	bic	r12,r0,r8
1491#ifndef	__thumb2__
1492	str	r11,[sp,#96+4]
1493#else
1494	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1495#endif
1496	bic	r14,r1,r9
1497	eor	r12,r12,r6
1498	eor	r14,r14,r7
1499#ifndef	__thumb2__
1500	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1501#endif
1502	bic	r10,r2,r0,ror#3
1503#ifndef	__thumb2__
1504	str	r14,[sp,#104+4]
1505#else
1506	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1507#endif
1508	bic	r11,r3,r1,ror#3
1509#ifndef	__thumb2__
1510	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
1511#endif
1512	eor	r10,r8,r10,ror#32-3
1513#ifndef	__thumb2__
1514	ldr	r0,[sp,#272+4]
1515#else
1516	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
1517#endif
1518	eor	r11,r9,r11,ror#32-3
1519#ifndef	__thumb2__
1520	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1521#endif
1522	add	r9,sp,#208
1523#ifndef	__thumb2__
1524	str	r11,[sp,#112+4]
1525#else
1526	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1527#endif
1528
1529#ifndef	__thumb2__
1530	ldr	r10,[sp,#232]		@ D[4]
1531#endif
1532#ifndef	__thumb2__
1533	ldr	r11,[sp,#232+4]
1534#else
1535	ldrd	r10,r11,[sp,#232]		@ D[4]
1536#endif
1537#ifndef	__thumb2__
1538	ldr	r12,[sp,#200]		@ D[0]
1539#endif
1540#ifndef	__thumb2__
1541	ldr	r14,[sp,#200+4]
1542#else
1543	ldrd	r12,r14,[sp,#200]		@ D[0]
1544#endif
1545
1546	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
1547
1548	eor	r1,r1,r10
1549#ifndef	__thumb2__
1550	ldr	r2,[sp,#280]		@ A[1][0]
1551#endif
1552	eor	r0,r0,r11
1553#ifndef	__thumb2__
1554	ldr	r3,[sp,#280+4]
1555#else
1556	ldrd	r2,r3,[sp,#280]		@ A[1][0]
1557#endif
1558	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
1559#ifndef	__thumb2__
1560	ldr	r4,[sp,#328]		@ A[2][1]
1561#endif
1562	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
1563#ifndef	__thumb2__
1564	ldr	r5,[sp,#328+4]
1565#else
1566	ldrd	r4,r5,[sp,#328]		@ A[2][1]
1567#endif
1568
1569	eor	r2,r2,r12
1570#ifndef	__thumb2__
1571	ldr	r10,[sp,#376]		@ A[3][2]
1572#endif
1573	eor	r3,r3,r14
1574#ifndef	__thumb2__
1575	ldr	r11,[sp,#376+4]
1576#else
1577	ldrd	r10,r11,[sp,#376]		@ A[3][2]
1578#endif
1579	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
1580#ifndef	__thumb2__
1581	ldr	r12,[sp,#224]		@ D[3]
1582#endif
1583	@ mov	r3,r3,ror#32-18
1584#ifndef	__thumb2__
1585	ldr	r14,[sp,#224+4]
1586#else
1587	ldrd	r12,r14,[sp,#224]		@ D[3]
1588#endif
1589
1590	eor	r6,r6,r4
1591	eor	r7,r7,r5
1592	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
1593	mov	r5,r7,ror#32-5
1594
1595	eor	r10,r10,r8
1596#ifndef	__thumb2__
1597	ldr	r8,[sp,#424]		@ A[4][3]
1598#endif
1599	eor	r11,r11,r9
1600#ifndef	__thumb2__
1601	ldr	r9,[sp,#424+4]
1602#else
1603	ldrd	r8,r9,[sp,#424]		@ A[4][3]
1604#endif
1605	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
1606	mov	r6,r11,ror#32-8
1607
1608	eor	r12,r12,r8
1609	eor	r14,r14,r9
1610	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
1611	mov	r9,r14,ror#32-28
1612
1613	bic	r10,r4,r2,ror#32-18
1614	bic	r11,r5,r3,ror#32-18
1615	eor	r10,r10,r0,ror#32-14
1616	eor	r11,r11,r1,ror#32-13
1617#ifndef	__thumb2__
1618	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1619#endif
1620	bic	r12,r6,r4
1621#ifndef	__thumb2__
1622	str	r11,[sp,#120+4]
1623#else
1624	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1625#endif
1626	bic	r14,r7,r5
1627	eor	r12,r12,r2,ror#32-18
1628#ifndef	__thumb2__
1629	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1630#endif
1631	eor	r14,r14,r3,ror#32-18
1632#ifndef	__thumb2__
1633	str	r14,[sp,#128+4]
1634#else
1635	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1636#endif
1637	bic	r10,r8,r6
1638	bic	r11,r9,r7
1639	bic	r12,r0,r8,ror#14
1640	bic	r14,r1,r9,ror#13
1641	eor	r10,r10,r4
1642	eor	r11,r11,r5
1643#ifndef	__thumb2__
1644	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1645#endif
1646	bic	r2,r2,r0,ror#18-14
1647#ifndef	__thumb2__
1648	str	r11,[sp,#136+4]
1649#else
1650	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1651#endif
1652	eor	r12,r6,r12,ror#32-14
1653	bic	r11,r3,r1,ror#18-13
1654	eor	r14,r7,r14,ror#32-13
1655#ifndef	__thumb2__
1656	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1657#endif
1658#ifndef	__thumb2__
1659	str	r14,[sp,#144+4]
1660#else
1661	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1662#endif
1663	add	r14,sp,#216
1664#ifndef	__thumb2__
1665	ldr	r0,[sp,#256]		@ A[0][2]
1666#endif
1667	eor	r10,r8,r2,ror#32-18
1668#ifndef	__thumb2__
1669	ldr	r1,[sp,#256+4]
1670#else
1671	ldrd	r0,r1,[sp,#256]		@ A[0][2]
1672#endif
1673	eor	r11,r9,r11,ror#32-18
1674#ifndef	__thumb2__
1675	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1676#endif
1677#ifndef	__thumb2__
1678	str	r11,[sp,#152+4]
1679#else
1680	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1681#endif
1682
1683	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
1684#ifndef	__thumb2__
1685	ldr	r2,[sp,#304]		@ A[1][3]
1686#endif
1687#ifndef	__thumb2__
1688	ldr	r3,[sp,#304+4]
1689#else
1690	ldrd	r2,r3,[sp,#304]		@ A[1][3]
1691#endif
1692#ifndef	__thumb2__
1693	ldr	r6,[sp,#232]		@ D[4]
1694#endif
1695#ifndef	__thumb2__
1696	ldr	r7,[sp,#232+4]
1697#else
1698	ldrd	r6,r7,[sp,#232]		@ D[4]
1699#endif
1700
1701	eor	r0,r0,r10
1702#ifndef	__thumb2__
1703	ldr	r4,[sp,#352]		@ A[2][4]
1704#endif
1705	eor	r1,r1,r11
1706#ifndef	__thumb2__
1707	ldr	r5,[sp,#352+4]
1708#else
1709	ldrd	r4,r5,[sp,#352]		@ A[2][4]
1710#endif
1711	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
1712#ifndef	__thumb2__
1713	ldr	r8,[sp,#200]		@ D[0]
1714#endif
1715	@ mov	r1,r1,ror#32-31
1716#ifndef	__thumb2__
1717	ldr	r9,[sp,#200+4]
1718#else
1719	ldrd	r8,r9,[sp,#200]		@ D[0]
1720#endif
1721
1722	eor	r12,r12,r2
1723#ifndef	__thumb2__
1724	ldr	r10,[sp,#360]		@ A[3][0]
1725#endif
1726	eor	r14,r14,r3
1727#ifndef	__thumb2__
1728	ldr	r11,[sp,#360+4]
1729#else
1730	ldrd	r10,r11,[sp,#360]		@ A[3][0]
1731#endif
1732	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
1733#ifndef	__thumb2__
1734	ldr	r12,[sp,#208]		@ D[1]
1735#endif
1736	mov	r2,r14,ror#32-28
1737#ifndef	__thumb2__
1738	ldr	r14,[sp,#208+4]
1739#else
1740	ldrd	r12,r14,[sp,#208]		@ D[1]
1741#endif
1742
1743	eor	r6,r6,r4
1744	eor	r7,r7,r5
1745	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
1746	mov	r4,r7,ror#32-20
1747
1748	eor	r10,r10,r8
1749#ifndef	__thumb2__
1750	ldr	r8,[sp,#408]		@ A[4][1]
1751#endif
1752	eor	r11,r11,r9
1753#ifndef	__thumb2__
1754	ldr	r9,[sp,#408+4]
1755#else
1756	ldrd	r8,r9,[sp,#408]		@ A[4][1]
1757#endif
1758	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
1759	mov	r6,r11,ror#32-21
1760
1761	eor	r8,r8,r12
1762	eor	r9,r9,r14
1763	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
1764	@ mov	r9,r3,ror#32-1
1765
1766	bic	r10,r4,r2
1767	bic	r11,r5,r3
1768	eor	r10,r10,r0,ror#32-31
1769#ifndef	__thumb2__
1770	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1771#endif
1772	eor	r11,r11,r1,ror#32-31
1773#ifndef	__thumb2__
1774	str	r11,[sp,#160+4]
1775#else
1776	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1777#endif
1778	bic	r12,r6,r4
1779	bic	r14,r7,r5
1780	eor	r12,r12,r2
1781	eor	r14,r14,r3
1782#ifndef	__thumb2__
1783	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1784#endif
1785	bic	r10,r8,r6,ror#1
1786#ifndef	__thumb2__
1787	str	r14,[sp,#168+4]
1788#else
1789	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1790#endif
1791	bic	r11,r9,r7,ror#1
1792	bic	r12,r0,r8,ror#31-1
1793	bic	r14,r1,r9,ror#31-1
1794	eor	r4,r4,r10,ror#32-1
1795#ifndef	__thumb2__
1796	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1797#endif
1798	eor	r5,r5,r11,ror#32-1
1799#ifndef	__thumb2__
1800	str	r5,[sp,#176+4]
1801#else
1802	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1803#endif
1804	eor	r6,r6,r12,ror#32-31
1805	eor	r7,r7,r14,ror#32-31
1806#ifndef	__thumb2__
1807	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1808#endif
1809	bic	r10,r2,r0,ror#32-31
1810#ifndef	__thumb2__
1811	str	r7,[sp,#184+4]
1812#else
1813	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1814#endif
1815	bic	r11,r3,r1,ror#32-31
1816	add	r12,sp,#0
1817	eor	r8,r10,r8,ror#32-1
1818	add	r10,sp,#40
1819	eor	r9,r11,r9,ror#32-1
1820#ifndef	__thumb2__
1821	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1822#endif
1823#ifndef	__thumb2__
1824	str	r9,[sp,#192+4]
1825#else
1826	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1827#endif
1828	blo	.Lround2x
1829
1830	ldr	pc,[sp,#440]
1831.size	KeccakF1600_int,.-KeccakF1600_int
1832
@ KeccakF1600 -- apply the permutation to a 200-byte state held in memory.
@
@ In:	r0 = pointer to the state A[5][5] (50 words), updated in place
@ Out:	the state at the original r0 holds the permuted value
@ r4-r11 are saved and restored; r0-r3 and r12 are overwritten.
@
@ The state is copied to the bottom of a 440+16 byte stack frame, the
@ round loop is entered through KeccakF1600_enter (which keeps its
@ return address at sp+440), and the result is copied back.
.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}		@ r0 is saved so A can be found again
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	@ Copy A[5][5] to the stack, one 40-byte row per ldmia/stmia pair.
	add	r10,r0,#40			@ source cursor, starts at A[1][0]
	add	r11,sp,#40			@ destination cursor, row 1 on the stack
	ldmia	r0,{r0-r9}			@ row 0
	stmia	sp,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 1
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 2
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 3
	stmia	r11!,{r0-r9}
	ldmia	r10,{r0-r9}			@ row 4, leaves r4-r9 = A[4][2..4]
	stmia	r11,{r0-r9}

	@ Same register state KeccakF1600_int builds before KeccakF1600_enter.
	add	r12,sp,#0			@ r12 = row 0 of the stack copy
	add	r10,sp,#40			@ r10 = row 1 of the stack copy

	bl	KeccakF1600_enter

	@ Copy the result back; r10 is sp+40 again when the round loop returns.
	ldr	r11,[sp,#440+16]		@ restore pointer to A
	ldmia	sp,{r0-r9}			@ row 0
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 1
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 2
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 3
	stmia	r11!,{r0-r9}
	ldmia	r10,{r0-r9}			@ row 4
	stmia	r11,{r0-r9}

	add	sp,sp,#440+20			@ drop the frame and the saved r0
	ldmia	sp!,{r4-r11,pc}
.size	KeccakF1600,.-KeccakF1600
@ SHA3_absorb -- XOR whole input blocks into the state, permuting after each.
@
@ In:	r0 = pointer to the state A[5][5]
@	r1 = input pointer
@	r2 = input length in bytes
@	r3 = block size in bytes
@ Out:	r0 = number of input bytes not consumed
@ r4-r12 are saved and restored.
@
@ Frame, as offsets from sp once the prologue has run:
@	  0..199	working copy of A[5][5]
@	440..447	written by KeccakF1600_enter (return address, zeroed word)
@	456..471	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	472..487	saved r0-r3 = A, inp, len, bsz (476 and 480 updated per block)
@	488..		saved r4-r12 and lr
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	r10,r0,#40			@ source cursor for rows 1..4
	@ mov	r11,r1
	mov	r12,r2				@ r12 = len
	mov	r14,r3				@ r14 = bsz
	cmp	r2,r3
	blo	.Labsorb_abort			@ less than one block: return len untouched

	@ Copy A[5][5] to the stack, one 40-byte row per ldmia/stmia pair.
	add	r11,sp,#0
	ldmia	r0,{r0-r9}			@ row 0
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 1
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 2
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 3
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}			@ row 4
	stmia	r11,{r0-r9}

	ldr	r11,[sp,#476]			@ r11 = inp again (saved r1)
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11			@ build the masks from byte immediates
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16			@ 0x11111111
	orr	r9,r9,r9,lsl#16			@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16			@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1			@ 0x33333333
	orr	r6,r6,r6,lsl#2			@ 0x55555555
#endif
	@ Park the masks in the frame; they are reloaded after every permutation.
	str	r6,[sp,#456]
	str	r7,[sp,#460]
	str	r8,[sp,#464]
	str	r9,[sp,#468]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,r12,r14			@ len - bsz
	blo	.Labsorbed			@ no full block left
	add	r10,sp,#0			@ r10 walks the stack copy of A
	str	r0,[sp,#480]			@ save len - bsz

.align	4
.Loop_block:
	@ Assemble eight input bytes into a little-endian lo/hi word pair.
	ldrb	r0,[r11],#1
	ldrb	r1,[r11],#1
	ldrb	r2,[r11],#1
	ldrb	r3,[r11],#1
	ldrb	r4,[r11],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[r11],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[r11],#1
	orr	r0,r0,r3,lsl#24			@ lo word
	ldrb	r3,[r11],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24			@ hi word

	@ Bit-interleave: r2/r3 gather the bits selected by the 0x5555...
	@ mask of lo/hi, r0/r1 gather the bits selected by 0xaaaa...
	and	r2,r0,r6			@ lo & 0x55555555
	and	r0,r0,r6,lsl#1			@ lo & 0xaaaaaaaa
	and	r3,r1,r6			@ hi & 0x55555555
	and	r1,r1,r6,lsl#1			@ hi & 0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7			@ mask 0x33333333
	and	r0,r0,r7,lsl#2			@ mask 0xcccccccc
	and	r3,r3,r7			@ mask 0x33333333
	and	r1,r1,r7,lsl#2			@ mask 0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8			@ mask 0x0f0f0f0f
	and	r0,r0,r8,lsl#4			@ mask 0xf0f0f0f0
	and	r3,r3,r8			@ mask 0x0f0f0f0f
	and	r1,r1,r8,lsl#4			@ mask 0xf0f0f0f0
	ldmia	r10,{r4,r5}			@ current A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9			@ mask 0x00ff00ff
	and	r0,r0,r9,lsl#8			@ mask 0xff00ff00
	and	r3,r3,r9			@ mask 0x00ff00ff
	and	r1,r1,r9,lsl#8			@ mask 0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	@ Merge the four half-words into the two state words.
	mov	r2,r2,lsl#16
	mov	r1,r1,lsr#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	r10!,{r4,r5}			@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	r14,r14,#8			@ eight bytes of the block consumed
	bhi	.Loop_block

	str	r11,[sp,#476]			@ save the advanced input pointer

	bl	KeccakF1600_int

	@ Reload masks (r6-r9), A pointer (r10), inp (r11), len (r12), bsz (r14).
	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}
	b	.Loop_absorb

.align	4
.Labsorbed:
	@ Copy the stack state back to the caller; r10 holds the saved A pointer.
	add	r11,sp,#40
	ldmia	sp,{r0-r9}			@ row 0
	stmia	r10!,{r0-r9}
	ldmia	r11!,{r0-r9}			@ row 1
	stmia	r10!,{r0-r9}
	ldmia	r11!,{r0-r9}			@ row 2
	stmia	r10!,{r0-r9}
	ldmia	r11!,{r0-r9}			@ row 3
	stmia	r10!,{r0-r9}
	ldmia	r11,{r0-r9}			@ row 4
	stmia	r10,{r0-r9}

.Labsorb_abort:
	add	sp,sp,#456+32			@ drop the frame and the saved r0-r3
	mov	r0,r12				@ return value: bytes left unprocessed
	ldmia	sp!,{r4-r12,pc}
.size	SHA3_absorb,.-SHA3_absorb
@ SHA3_squeeze -- write output bytes from the state, permuting between blocks.
@
@ In:	r0 = pointer to the state A[5][5] (stored bit-interleaved)
@	r1 = output pointer
@	r2 = number of bytes to produce
@	r3 = block size in bytes
@ r4-r10 are saved and restored.
@
@ A zero length returns at once without touching the output buffer.
@ Without that check the first pass would fall into .Lsqueeze_tail,
@ store a byte, wrap the counter below zero and write seven bytes.
@
@ Stack after both pushes, as offsets from sp:
@	 0..15	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	16	saved r0 (state pointer)
@	20	saved r3 (block size)
@	24..	saved r4-r10 and lr
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	r10,r0				@ r10 = read cursor in the state
	mov	r4,r1				@ r4  = output cursor
	mov	r5,r2				@ r5  = bytes still to produce
	mov	r12,r3				@ r12 = bytes left in this block

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11			@ build the masks from byte immediates
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16			@ 0x11111111
	orr	r9,r9,r9,lsl#16			@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16			@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1			@ 0x33333333
	orr	r6,r6,r6,lsl#2			@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}			@ masks are reloaded after KeccakF1600

	@ Nothing requested: leave through the common epilogue, which
	@ expects exactly the stack layout built above.
	cmp	r5,#0
	beq	.Lsqueeze_done

	mov	r14,r10				@ r14 keeps the state pointer
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	r10!,{r0,r1}			@ A_flat[i++]

	@ Split the two state words into half-words ...
	mov	r2,r0,lsl#16
	mov	r3,r1,lsl#16			@ r3 = r1 << 16
	mov	r2,r2,lsr#16			@ r2 = r0 & 0x0000ffff
	mov	r1,r1,lsr#16
	mov	r0,r0,lsr#16			@ r0 = r0 >> 16
	mov	r1,r1,lsl#16			@ r1 = r1 & 0xffff0000

	@ ... and spread the bits back out (inverse of the absorb interleave).
	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9			@ mask 0x00ff00ff
	and	r3,r3,r9,lsl#8			@ mask 0xff00ff00
	and	r0,r0,r9			@ mask 0x00ff00ff
	and	r1,r1,r9,lsl#8			@ mask 0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8			@ mask 0x0f0f0f0f
	and	r3,r3,r8,lsl#4			@ mask 0xf0f0f0f0
	and	r0,r0,r8			@ mask 0x0f0f0f0f
	and	r1,r1,r8,lsl#4			@ mask 0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7			@ mask 0x33333333
	and	r3,r3,r7,lsl#2			@ mask 0xcccccccc
	and	r0,r0,r7			@ mask 0x33333333
	and	r1,r1,r7,lsl#2			@ mask 0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6			@ mask 0x55555555
	and	r3,r3,r6,lsl#1			@ mask 0xaaaaaaaa
	and	r0,r0,r6			@ mask 0x55555555
	and	r1,r1,r6,lsl#1			@ mask 0xaaaaaaaa

	orr	r2,r2,r3			@ r2 = output bytes 0..3
	orr	r0,r0,r1			@ r0 = output bytes 4..7

	cmp	r5,#8
	blo	.Lsqueeze_tail			@ fewer than eight bytes wanted

	@ Full lane: store all eight bytes, low byte first.
	mov	r1,r2,lsr#8
	strb	r2,[r4],#1
	mov	r3,r2,lsr#16
	strb	r1,[r4],#1
	mov	r2,r2,lsr#24
	strb	r3,[r4],#1
	strb	r2,[r4],#1

	mov	r1,r0,lsr#8
	strb	r0,[r4],#1
	mov	r3,r0,lsr#16
	strb	r1,[r4],#1
	mov	r0,r0,lsr#24
	strb	r3,[r4],#1
	strb	r0,[r4],#1
	subs	r5,r5,#8
	beq	.Lsqueeze_done

	subs	r12,r12,#8			@ bsz -= 8
	bhi	.Loop_squeeze

	@ Block exhausted: permute the state and start over from its beginning.
	mov	r0,r14				@ state pointer

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}			@ masks, state pointer, block size
	mov	r14,r10
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	@ One to seven bytes remain; stop as soon as the count reaches zero.
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	subs	r5,r5,#1
	beq	.Lsqueeze_done

	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24			@ skip the masks and the saved r0, r3
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
2162#if __ARM_MAX_ARCH__>=7
2163.fpu	neon
2164
@ iotas64 -- the 24 64-bit iota round constants, one per round.
@ KeccakF1600_neon takes the table address with adr and reads one
@ entry per round with a post-incremented 64-bit load.
.type	iotas64, %object
.align	5
iotas64:
.quad	0x0000000000000001, 0x0000000000008082	@ rounds  0,  1
.quad	0x800000000000808a, 0x8000000080008000	@ rounds  2,  3
.quad	0x000000000000808b, 0x0000000080000001	@ rounds  4,  5
.quad	0x8000000080008081, 0x8000000000008009	@ rounds  6,  7
.quad	0x000000000000008a, 0x0000000000000088	@ rounds  8,  9
.quad	0x0000000080008009, 0x000000008000000a	@ rounds 10, 11
.quad	0x000000008000808b, 0x800000000000008b	@ rounds 12, 13
.quad	0x8000000000008089, 0x8000000000008003	@ rounds 14, 15
.quad	0x8000000000008002, 0x8000000000000080	@ rounds 16, 17
.quad	0x000000000000800a, 0x800000008000000a	@ rounds 18, 19
.quad	0x8000000080008081, 0x8000000000008080	@ rounds 20, 21
.quad	0x0000000080000001, 0x8000000080008008	@ rounds 22, 23
.size	iotas64,.-iotas64
2193
@ KeccakF1600_neon -- 24 rounds of the permutation on a register-resident state.
@
@ In:	d0-d24 = state, laid out the way SHA3_absorb_neon loads it:
@		A[0][0..4] = d0,  d2,  d4,  d6,  d8
@		A[1][0..4] = d1,  d3,  d5,  d7,  d9
@		A[2][0..4] = d10, d12, d14, d16, d18
@		A[3][0..4] = d11, d13, d15, d17, d19
@		A[4][0..4] = d20, d21, d22, d23, d24
@	r0 = 8-byte aligned memory, 24 bytes of which are used as spill
@	     space (SHA3_absorb_neon passes the state buffer itself)
@ Out:	d0-d24 = permuted state
@ Overwrites r1-r3, d25-d31, the flags and the 24 bytes at r0.
.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16			@ second spill slot
	adr	r2, iotas64			@ round-constant cursor
	mov	r3, #24				@ rounds remaining
	b	.Loop_neon

.align	4
.Loop_neon:
	@ ---- Theta: column parities C[0..4], then D[0..4] ----
	vst1.64	{q4},  [r0,:64]			@ spill A[0..1][4]
	veor	q13, q0,  q5			@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]			@ spill A[2][4]
	veor	q14, q1,  q6			@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7			@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27			@ C[0] over rows 0..3
	veor	d27, d28, d29			@ C[1] over rows 0..3
	veor	q14, q3,  q8			@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9			@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31			@ C[2] over rows 0..3
	veor	d31, d28, d29			@ C[3] over rows 0..3
	veor	d25, d8,  d9			@ C[4] over rows 0..3
	veor	q13, q13, q10			@ C[0..1] ^= A[4][0..1]
	veor	q14, q15, q11			@ C[2..3] ^= A[4][2..3]
	veor	d25, d25, d24			@ C[4]    ^= A[4][4]

	@ Rotate left by one: x+x supplies the shift, vsri inserts the top bit.
	vadd.i64	q4,  q13, q13		@ C[0..1]<<1
	vadd.i64	q15, q14, q14		@ C[2..3]<<1
	vadd.i64	d18, d25, d25		@ C[4]<<1
	vsri.64	q4,  q13, #63			@ ROL64(C[0..1],1)
	vsri.64	q15, q14, #63			@ ROL64(C[2..3],1)
	vsri.64	d18, d25, #63			@ ROL64(C[4],1)
	veor	d25, d25, d9			@ D[0]    = C[4]    ^ ROL64(C[1],1)
	veor	q13, q13, q15			@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18			@ D[3]    = C[2]    ^ ROL64(C[4],1)
	veor	d29, d29, d8			@ D[4]    = C[3]    ^ ROL64(C[0],1)

	veor	d0,  d0,  d25			@ A[0][0] ^= D[0]
	veor	d1,  d1,  d25			@ A[1][0] ^= D[0]
	veor	d10, d10, d25			@ A[2][0] ^= D[0]
	veor	d11, d11, d25			@ A[3][0] ^= D[0]
	veor	d20, d20, d25			@ A[4][0] ^= D[0]

	veor	d2,  d2,  d26			@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26			@ A[1][1] ^= D[1]
	veor	d12, d12, d26			@ A[2][1] ^= D[1]
	veor	d13, d13, d26			@ A[3][1] ^= D[1]
	veor	d21, d21, d26			@ A[4][1] ^= D[1]
	vmov	d26, d27			@ q13 = D[2] in both halves

	veor	d6,  d6,  d28			@ A[0][3] ^= D[3]
	veor	d7,  d7,  d28			@ A[1][3] ^= D[3]
	veor	d16, d16, d28			@ A[2][3] ^= D[3]
	veor	d17, d17, d28			@ A[3][3] ^= D[3]
	veor	d23, d23, d28			@ A[4][3] ^= D[3]
	vld1.64	{q4},  [r0,:64]			@ reload A[0..1][4]
	vmov	d28, d29			@ q14 = D[4] in both halves

	vld1.64	{d18}, [r1,:64]			@ reload A[2][4]
	veor	q2,  q2,  q13			@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13			@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27			@ A[4][2]    ^= D[2]

	veor	q4,  q4,  q14			@ A[0..1][4] ^= D[4]
	veor	q9,  q9,  q14			@ A[2..3][4] ^= D[4]
	veor	d24, d24, d29			@ A[4][4]    ^= D[4]

	@ ---- Rho + Pi: each lane is rotated into its new position ----
	@ Every rotation is a vshl by n followed by a vsri by 64-n.
	vmov	d26, d2				@ keep old A[0][1]
	vshl.i64	d2,  d3,  #44
	vmov	d27, d4				@ keep old A[0][2]
	vshl.i64	d4,  d14, #43
	vmov	d28, d6				@ keep old A[0][3]
	vshl.i64	d6,  d17, #21
	vmov	d29, d8				@ keep old A[0][4]
	vshl.i64	d8,  d24, #14
	vsri.64	d2,  d3,  #64-44		@ A[0][1] = ROL64(A[1][1], 44)
	vsri.64	d4,  d14, #64-43		@ A[0][2] = ROL64(A[2][2], 43)
	vsri.64	d6,  d17, #64-21		@ A[0][3] = ROL64(A[3][3], 21)
	vsri.64	d8,  d24, #64-14		@ A[0][4] = ROL64(A[4][4], 14)

	vshl.i64	d3,  d9,  #20
	vshl.i64	d14, d16, #25
	vshl.i64	d17, d15, #15
	vshl.i64	d24, d21, #2
	vsri.64	d3,  d9,  #64-20		@ A[1][1] = ROL64(A[1][4], 20)
	vsri.64	d14, d16, #64-25		@ A[2][2] = ROL64(A[2][3], 25)
	vsri.64	d17, d15, #64-15		@ A[3][3] = ROL64(A[3][2], 15)
	vsri.64	d24, d21, #64-2			@ A[4][4] = ROL64(A[4][1], 2)

	vshl.i64	d9,  d22, #61
	@ vshl.i64	d16, d19, #8
	vshl.i64	d15, d12, #10
	vshl.i64	d21, d7,  #55
	vsri.64	d9,  d22, #64-61		@ A[1][4] = ROL64(A[4][2], 61)
	vext.8	d16, d19, d19, #8-1		@ A[2][3] = ROL64(A[3][4], 8), as a byte rotate
	vsri.64	d15, d12, #64-10		@ A[3][2] = ROL64(A[2][1], 10)
	vsri.64	d21, d7,  #64-55		@ A[4][1] = ROL64(A[1][3], 55)

	vshl.i64	d22, d18, #39
	@ vshl.i64	d19, d23, #56
	vshl.i64	d12, d5,  #6
	vshl.i64	d7,  d13, #45
	vsri.64	d22, d18, #64-39		@ A[4][2] = ROL64(A[2][4], 39)
	vext.8	d19, d23, d23, #8-7		@ A[3][4] = ROL64(A[4][3], 56), as a byte rotate
	vsri.64	d12, d5,  #64-6			@ A[2][1] = ROL64(A[1][2], 6)
	vsri.64	d7,  d13, #64-45		@ A[1][3] = ROL64(A[3][1], 45)

	vshl.i64	d18, d20, #18
	vshl.i64	d23, d11, #41
	vshl.i64	d5,  d10, #3
	vshl.i64	d13, d1,  #36
	vsri.64	d18, d20, #64-18		@ A[2][4] = ROL64(A[4][0], 18)
	vsri.64	d23, d11, #64-41		@ A[4][3] = ROL64(A[3][0], 41)
	vsri.64	d5,  d10, #64-3			@ A[1][2] = ROL64(A[2][0], 3)
	vsri.64	d13, d1,  #64-36		@ A[3][1] = ROL64(A[1][0], 36)

	vshl.i64	d1,  d28, #28
	vshl.i64	d10, d26, #1
	vshl.i64	d11, d29, #27
	vshl.i64	d20, d27, #62
	vsri.64	d1,  d28, #64-28		@ A[1][0] = ROL64(old A[0][3], 28)
	vsri.64	d10, d26, #64-1			@ A[2][0] = ROL64(old A[0][1], 1)
	vsri.64	d11, d29, #64-27		@ A[3][0] = ROL64(old A[0][4], 27)
	vsri.64	d20, d27, #64-62		@ A[4][0] = ROL64(old A[0][2], 62)

	@ ---- Chi + Iota ----
	@ Rows 0 and 1 together; the new column 0 is spilled until row 4 is done.
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0			@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1			@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15			@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]			@ spill new A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14			@ new A[0..1][1]
	veor	q3,  q3,  q13			@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15			@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	@ Rows 2 and 3 together; q0 and q15 hold the old columns 0 and 1.
	vbic	q13, q7,  q6
	vmov	q0,  q5				@ old A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6				@ old A[2..3][1]
	veor	q5,  q5,  q13			@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14			@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13			@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14			@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10			@ old A[4][0..1]
	veor	q9,  q9,  q13			@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	@ Row 4, interleaved with the iota fetch and the column 0 reload.
	vld1.64	{d25}, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]			@ reload new A[0..1][0]
	veor	d20, d20, d26			@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27			@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26			@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27			@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25			@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26			@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

.word	0xe12fff1e				@ ARM encoding of bx lr
.size	KeccakF1600_neon,.-KeccakF1600_neon
2368
2369.globl	SHA3_absorb_neon
2370.type	SHA3_absorb_neon, %function
2371.align	5
2372SHA3_absorb_neon:
2373	stmdb	sp!, {r4,r5,r6,lr}
2374	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2375
2376	mov	r4, r1			@ inp
2377	mov	r5, r2			@ len
2378	mov	r6, r3			@ bsz
2379
2380	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
2381	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
2382	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
2383	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
2384	vld1.32	{d8}, [r0,:64]!		@ A[0][4]
2385
2386	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
2387	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
2388	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
2389	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
2390	vld1.32	{d9}, [r0,:64]!		@ A[1][4]
2391
2392	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
2393	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
2394	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
2395	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
2396	vld1.32	{d18}, [r0,:64]!		@ A[2][4]
2397
2398	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
2399	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
2400	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
2401	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
2402	vld1.32	{d19}, [r0,:64]!		@ A[3][4]
2403
2404	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..3]
2405	vld1.32	{d24}, [r0,:64]		@ A[4][4]
2406	sub	r0, r0, #24*8		@ rewind
2407	b	.Loop_absorb_neon
2408
2409.align	4
2410.Loop_absorb_neon:
2411	subs	r12, r5, r6		@ len - bsz
2412	blo	.Labsorbed_neon
2413	mov	r5, r12
2414
2415	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
2416	cmp	r6, #8*2
2417	veor	d0, d0, d31		@ A[0][0] ^= *inp++
2418	blo	.Lprocess_neon
2419	vld1.8	{d31}, [r4]!
2420	veor	d2, d2, d31		@ A[0][1] ^= *inp++
2421	beq	.Lprocess_neon
2422	vld1.8	{d31}, [r4]!
2423	cmp	r6, #8*4
2424	veor	d4, d4, d31		@ A[0][2] ^= *inp++
2425	blo	.Lprocess_neon
2426	vld1.8	{d31}, [r4]!
2427	veor	d6, d6, d31		@ A[0][3] ^= *inp++
2428	beq	.Lprocess_neon
2429	vld1.8	{d31},[r4]!
2430	cmp	r6, #8*6
2431	veor	d8, d8, d31		@ A[0][4] ^= *inp++
2432	blo	.Lprocess_neon
2433
2434	vld1.8	{d31}, [r4]!
2435	veor	d1, d1, d31		@ A[1][0] ^= *inp++
2436	beq	.Lprocess_neon
2437	vld1.8	{d31}, [r4]!
2438	cmp	r6, #8*8
2439	veor	d3, d3, d31		@ A[1][1] ^= *inp++
2440	blo	.Lprocess_neon
2441	vld1.8	{d31}, [r4]!
2442	veor	d5, d5, d31		@ A[1][2] ^= *inp++
2443	beq	.Lprocess_neon
2444	vld1.8	{d31}, [r4]!
2445	cmp	r6, #8*10
2446	veor	d7, d7, d31		@ A[1][3] ^= *inp++
2447	blo	.Lprocess_neon
2448	vld1.8	{d31}, [r4]!
2449	veor	d9, d9, d31		@ A[1][4] ^= *inp++
2450	beq	.Lprocess_neon
2451
2452	vld1.8	{d31}, [r4]!
2453	cmp	r6, #8*12
2454	veor	d10, d10, d31		@ A[2][0] ^= *inp++
2455	blo	.Lprocess_neon
2456	vld1.8	{d31}, [r4]!
2457	veor	d12, d12, d31		@ A[2][1] ^= *inp++
2458	beq	.Lprocess_neon
2459	vld1.8	{d31}, [r4]!
2460	cmp	r6, #8*14
2461	veor	d14, d14, d31		@ A[2][2] ^= *inp++
2462	blo	.Lprocess_neon
2463	vld1.8	{d31}, [r4]!
2464	veor	d16, d16, d31		@ A[2][3] ^= *inp++
2465	beq	.Lprocess_neon
2466	vld1.8	{d31}, [r4]!
2467	cmp	r6, #8*16
2468	veor	d18, d18, d31		@ A[2][4] ^= *inp++
2469	blo	.Lprocess_neon
2470
2471	vld1.8	{d31}, [r4]!
2472	veor	d11, d11, d31		@ A[3][0] ^= *inp++
2473	beq	.Lprocess_neon
2474	vld1.8	{d31}, [r4]!
2475	cmp	r6, #8*18
2476	veor	d13, d13, d31		@ A[3][1] ^= *inp++
2477	blo	.Lprocess_neon
2478	vld1.8	{d31}, [r4]!
2479	veor	d15, d15, d31		@ A[3][2] ^= *inp++
2480	beq	.Lprocess_neon
2481	vld1.8	{d31}, [r4]!
2482	cmp	r6, #8*20
2483	veor	d17, d17, d31		@ A[3][3] ^= *inp++
2484	blo	.Lprocess_neon
2485	vld1.8	{d31}, [r4]!
2486	veor	d19, d19, d31		@ A[3][4] ^= *inp++
2487	beq	.Lprocess_neon
2488
2489	vld1.8	{d31}, [r4]!
2490	cmp	r6, #8*22
2491	veor	d20, d20, d31		@ A[4][0] ^= *inp++
2492	blo	.Lprocess_neon
2493	vld1.8	{d31}, [r4]!
2494	veor	d21, d21, d31		@ A[4][1] ^= *inp++
2495	beq	.Lprocess_neon
2496	vld1.8	{d31}, [r4]!
2497	cmp	r6, #8*24
2498	veor	d22, d22, d31		@ A[4][2] ^= *inp++
2499	blo	.Lprocess_neon
2500	vld1.8	{d31}, [r4]!
2501	veor	d23, d23, d31		@ A[4][3] ^= *inp++
2502	beq	.Lprocess_neon
2503	vld1.8	{d31}, [r4]!
2504	veor	d24, d24, d31		@ A[4][4] ^= *inp++
2505
2506.Lprocess_neon:
2507	bl	KeccakF1600_neon
2508	b	.Loop_absorb_neon
2509
2510.align	4
2511.Labsorbed_neon:
2512	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2513	vst1.32	{d2}, [r0,:64]!
2514	vst1.32	{d4}, [r0,:64]!
2515	vst1.32	{d6}, [r0,:64]!
2516	vst1.32	{d8}, [r0,:64]!
2517
2518	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2519	vst1.32	{d3}, [r0,:64]!
2520	vst1.32	{d5}, [r0,:64]!
2521	vst1.32	{d7}, [r0,:64]!
2522	vst1.32	{d9}, [r0,:64]!
2523
2524	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2525	vst1.32	{d12}, [r0,:64]!
2526	vst1.32	{d14}, [r0,:64]!
2527	vst1.32	{d16}, [r0,:64]!
2528	vst1.32	{d18}, [r0,:64]!
2529
2530	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2531	vst1.32	{d13}, [r0,:64]!
2532	vst1.32	{d15}, [r0,:64]!
2533	vst1.32	{d17}, [r0,:64]!
2534	vst1.32	{d19}, [r0,:64]!
2535
2536	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2537	vst1.32	{d24}, [r0,:64]
2538
2539	mov	r0, r5			@ return value
2540	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2541	ldmia	sp!, {r4,r5,r6,pc}
2542.size	SHA3_absorb_neon,.-SHA3_absorb_neon
2543
@-----------------------------------------------------------------------
@ SHA3_squeeze_neon -- copy Keccak state bytes to an output buffer,
@ re-permuting the state each time a full block has been emitted.
@
@ In:    r0 = A_flat (state, read with 64-bit alignment hints)
@        r1 = out
@        r2 = len    (bytes to produce)
@        r3 = bsz    (bytes emitted per permutation)
@ Out:   no return value is set up by this routine
@ Saves: r4-r6 and lr on entry; d8-d15 only around the permutation call
@ Uses:  r4 = out cursor, r5 = bytes still to emit, r6 = bsz (kept),
@        r12 = read cursor into the state, r14 = bytes left in the
@        current block, r2/r3 = scratch in the tail
@ Note:  len == 0 returns immediately without touching out.
@-----------------------------------------------------------------------
.globl	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4,r5,r6,lr}

	cmp	r2, #0			@ len == 0?
	beq	.Lsqueeze_neon_done	@ nothing to emit; the tail below would
					@ otherwise store one byte unconditionally

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail	@ fewer than 8 bytes left (1..7 here)
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon	@ block not exhausted yet

	@ Block exhausted with output still pending: load the whole state
	@ into NEON registers, permute, and write it back.
	vstmdb	sp!,  {d8,d9,d10,d11,d12,d13,d14,d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind (24 lanes were stepped over)

	bl	KeccakF1600_neon	@ overwrites lr (r14); reloaded below

	mov	r12, r0			@ A_flat (r0 is expected to still point at the state)
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz (fresh block counter)
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
	b	.Loop_squeeze_neon

	@ Tail: 1..7 bytes remain. Load the next lane as two 32-bit words and
	@ emit bytes lowest-first. Each cmp serves two exits: blo right after
	@ the next store and beq one store later (mov/strb leave flags alone).
.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	mov	r2, r2, lsr#8
	blo	.Lsqueeze_neon_done	@ len == 1
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	beq	.Lsqueeze_neon_done	@ len == 2
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done	@ len == 3
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done	@ len == 4

	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done	@ len == 5
	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	beq	.Lsqueeze_neon_done	@ len == 6
	strb	r3, [r4], #1		@ len == 7

.Lsqueeze_neon_done:
	ldmia	sp!, {r4,r5,r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
2664#endif
@ NUL-terminated ASCII identification string, emitted as raw byte values.
@ Decoded text:
@   Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>
.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@ Re-align after the odd-length string. On ARM the .align operand is a
@ power-of-two exponent, so this pads to a 4-byte boundary; the second
@ directive repeats the first and adds no further padding.
.align	2
.align	2
2668