1/* Do not modify. This file is auto-generated from keccak1600-armv4.pl. */
2#include "arm_arch.h"
3
4.text
5
6#if defined(__thumb2__)
7.syntax	unified
8.thumb
9#else
10.code	32
11#endif
12
/*
 * iotas32: read-only table of 24 entries, each entry being two 32-bit
 * words (8 bytes), 192 bytes in total.
 *
 * The round code indexes the table with a byte counter kept at
 * [sp,#444].  The first round of each pair loads the two words at
 * iotas32+counter, the second round loads the two words at
 * iotas32+counter+8; the counter is then advanced by 16 and compared
 * against 192 (= 24 entries of 8 bytes).  The first word of an entry is
 * XORed into the word of R[0][0] stored at the lower address, the second
 * word into the word stored 4 bytes above it.
 *
 * NOTE(review): the values look like the standard Keccak-f[1600] iota
 * round constants in bit-interleaved form (first word = even-numbered
 * bits, second word = odd-numbered bits of the 64-bit constant); for
 * example 0x0000000000008082 maps to {0x00000000, 0x00000089}.  Confirm
 * against the generator script before relying on this.
 */
13.type	iotas32, %object
14.align	5	/* on ARM, .align takes a power of two: 32-byte boundary */
15iotas32:
16.long	0x00000001, 0x00000000	/* iotas[0] */
17.long	0x00000000, 0x00000089	/* iotas[1] */
18.long	0x00000000, 0x8000008b	/* iotas[2] */
19.long	0x00000000, 0x80008080	/* iotas[3] */
20.long	0x00000001, 0x0000008b	/* iotas[4] */
21.long	0x00000001, 0x00008000	/* iotas[5] */
22.long	0x00000001, 0x80008088	/* iotas[6] */
23.long	0x00000001, 0x80000082	/* iotas[7] */
24.long	0x00000000, 0x0000000b	/* iotas[8] */
25.long	0x00000000, 0x0000000a	/* iotas[9] */
26.long	0x00000001, 0x00008082	/* iotas[10] */
27.long	0x00000000, 0x00008003	/* iotas[11] */
28.long	0x00000001, 0x0000808b	/* iotas[12] */
29.long	0x00000001, 0x8000000b	/* iotas[13] */
30.long	0x00000001, 0x8000008a	/* iotas[14] */
31.long	0x00000001, 0x80000081	/* iotas[15] */
32.long	0x00000000, 0x80000081	/* iotas[16] */
33.long	0x00000000, 0x80000008	/* iotas[17] */
34.long	0x00000000, 0x00000083	/* iotas[18] */
35.long	0x00000000, 0x80008003	/* iotas[19] */
36.long	0x00000001, 0x80008088	/* iotas[20] */
37.long	0x00000000, 0x80000088	/* iotas[21] */
38.long	0x00000001, 0x00008000	/* iotas[22] */
39.long	0x00000000, 0x80008082	/* iotas[23] */
40.size	iotas32,.-iotas32	/* object size = bytes emitted since the label */
41
/*
 * KeccakF1600_int: entry point of the permutation core.  The state is
 * addressed relative to sp; apart from sp and lr no incoming register
 * value is used by the instructions below.
 *
 * Register setup performed before falling through to KeccakF1600_enter:
 *   r12     = sp+0    base of the A[0][0..1] load at .Lround2x
 *   r10     = sp+40   base of the A[1][0..1] load at .Lround2x
 *   r4-r9   = six words loaded from sp+176 (A[4][2..4]); they are the
 *             starting values that the XOR chain at .Lround2x
 *             accumulates into.  r9 is both the base and the last
 *             register in the load list, so the address is consumed
 *             before r9 is overwritten.
 *
 * KeccakF1600_enter: second entry label.  It stores lr at [sp,#440],
 * clears the table byte counter at [sp,#444] and branches to the round
 * loop.  NOTE(review): presumably other code in this file branches here
 * with r4-r10 and r12 already prepared; confirm against the callers.
 */
42.type	KeccakF1600_int, %function
43.align	5
44KeccakF1600_int:
45	add	r9,sp,#176	/* r9 = sp+176, address of A[4][2] */
46	add	r12,sp,#0	/* r12 = sp+0, address of A[0][0] */
47	add	r10,sp,#40	/* r10 = sp+40, address of A[1][0] */
48	ldmia	r9,{r4,r5,r6,r7,r8,r9}		@ A[4][2..4]
49KeccakF1600_enter:
50	str	lr,[sp,#440]	/* keep the return address at sp+440 */
51	eor	r11,r11,r11	/* r11 = 0 */
52	str	r11,[sp,#444]	/* counter = 0 (byte offset into iotas32) */
53	b	.Lround2x
54
55.align	4
56.Lround2x:
57	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
58	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
59#ifdef	__thumb2__
60	eor	r0,r0,r10
61	eor	r1,r1,r11
62	eor	r2,r2,r12
63	ldrd	r10,r11,[sp,#56]
64	eor	r3,r3,r14
65	ldrd	r12,r14,[sp,#64]
66	eor	r4,r4,r10
67	eor	r5,r5,r11
68	eor	r6,r6,r12
69	ldrd	r10,r11,[sp,#72]
70	eor	r7,r7,r14
71	ldrd	r12,r14,[sp,#80]
72	eor	r8,r8,r10
73	eor	r9,r9,r11
74	eor	r0,r0,r12
75	ldrd	r10,r11,[sp,#88]
76	eor	r1,r1,r14
77	ldrd	r12,r14,[sp,#96]
78	eor	r2,r2,r10
79	eor	r3,r3,r11
80	eor	r4,r4,r12
81	ldrd	r10,r11,[sp,#104]
82	eor	r5,r5,r14
83	ldrd	r12,r14,[sp,#112]
84	eor	r6,r6,r10
85	eor	r7,r7,r11
86	eor	r8,r8,r12
87	ldrd	r10,r11,[sp,#120]
88	eor	r9,r9,r14
89	ldrd	r12,r14,[sp,#128]
90	eor	r0,r0,r10
91	eor	r1,r1,r11
92	eor	r2,r2,r12
93	ldrd	r10,r11,[sp,#136]
94	eor	r3,r3,r14
95	ldrd	r12,r14,[sp,#144]
96	eor	r4,r4,r10
97	eor	r5,r5,r11
98	eor	r6,r6,r12
99	ldrd	r10,r11,[sp,#152]
100	eor	r7,r7,r14
101	ldrd	r12,r14,[sp,#160]
102	eor	r8,r8,r10
103	eor	r9,r9,r11
104	eor	r0,r0,r12
105	ldrd	r10,r11,[sp,#168]
106	eor	r1,r1,r14
107	ldrd	r12,r14,[sp,#16]
108	eor	r2,r2,r10
109	eor	r3,r3,r11
110	eor	r4,r4,r12
111	ldrd	r10,r11,[sp,#24]
112	eor	r5,r5,r14
113	ldrd	r12,r14,[sp,#32]
114#else
115	eor	r0,r0,r10
116	add	r10,sp,#56
117	eor	r1,r1,r11
118	eor	r2,r2,r12
119	eor	r3,r3,r14
120	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
121	eor	r4,r4,r10
122	add	r10,sp,#72
123	eor	r5,r5,r11
124	eor	r6,r6,r12
125	eor	r7,r7,r14
126	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
127	eor	r8,r8,r10
128	add	r10,sp,#88
129	eor	r9,r9,r11
130	eor	r0,r0,r12
131	eor	r1,r1,r14
132	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
133	eor	r2,r2,r10
134	add	r10,sp,#104
135	eor	r3,r3,r11
136	eor	r4,r4,r12
137	eor	r5,r5,r14
138	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
139	eor	r6,r6,r10
140	add	r10,sp,#120
141	eor	r7,r7,r11
142	eor	r8,r8,r12
143	eor	r9,r9,r14
144	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
145	eor	r0,r0,r10
146	add	r10,sp,#136
147	eor	r1,r1,r11
148	eor	r2,r2,r12
149	eor	r3,r3,r14
150	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
151	eor	r4,r4,r10
152	add	r10,sp,#152
153	eor	r5,r5,r11
154	eor	r6,r6,r12
155	eor	r7,r7,r14
156	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
157	eor	r8,r8,r10
158	ldr	r10,[sp,#168]		@ A[4][1]
159	eor	r9,r9,r11
160	ldr	r11,[sp,#168+4]
161	eor	r0,r0,r12
162	ldr	r12,[sp,#16]		@ A[0][2]
163	eor	r1,r1,r14
164	ldr	r14,[sp,#16+4]
165	eor	r2,r2,r10
166	add	r10,sp,#24
167	eor	r3,r3,r11
168	eor	r4,r4,r12
169	eor	r5,r5,r14
170	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
171#endif
172	eor	r6,r6,r10
173	eor	r7,r7,r11
174	eor	r8,r8,r12
175	eor	r9,r9,r14
176
177	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
178#ifndef	__thumb2__
179	str	r10,[sp,#208]		@ D[1] = E[0]
180#endif
181	eor	r11,r1,r4
182#ifndef	__thumb2__
183	str	r11,[sp,#208+4]
184#else
185	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
186#endif
187	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
188	eor	r14,r7,r0
189#ifndef	__thumb2__
190	str	r12,[sp,#232]		@ D[4] = E[1]
191#endif
192	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
193#ifndef	__thumb2__
194	str	r14,[sp,#232+4]
195#else
196	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
197#endif
198	eor	r1,r9,r2
199#ifndef	__thumb2__
200	str	r0,[sp,#200]		@ D[0] = C[0]
201#endif
202	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
203#ifndef	__thumb2__
204	ldr	r7,[sp,#144]
205#endif
206	eor	r3,r3,r6
207#ifndef	__thumb2__
208	str	r1,[sp,#200+4]
209#else
210	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
211#endif
212#ifndef	__thumb2__
213	ldr	r6,[sp,#144+4]
214#else
215	ldrd	r7,r6,[sp,#144]
216#endif
217#ifndef	__thumb2__
218	str	r2,[sp,#216]		@ D[2] = C[1]
219#endif
220	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
221#ifndef	__thumb2__
222	str	r3,[sp,#216+4]
223#else
224	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
225#endif
226	eor	r5,r5,r8
227
228#ifndef	__thumb2__
229	ldr	r8,[sp,#192]
230#endif
231#ifndef	__thumb2__
232	ldr	r9,[sp,#192+4]
233#else
234	ldrd	r8,r9,[sp,#192]
235#endif
236#ifndef	__thumb2__
237	str	r4,[sp,#224]		@ D[3] = C[2]
238#endif
239	eor	r7,r7,r4
240#ifndef	__thumb2__
241	str	r5,[sp,#224+4]
242#else
243	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
244#endif
245	eor	r6,r6,r5
246#ifndef	__thumb2__
247	ldr	r4,[sp,#0]
248#endif
249	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
250	@ mov	r6,r6,ror#32-11
251#ifndef	__thumb2__
252	ldr	r5,[sp,#0+4]
253#else
254	ldrd	r4,r5,[sp,#0]
255#endif
256	eor	r8,r8,r12
257	eor	r9,r9,r14
258#ifndef	__thumb2__
259	ldr	r12,[sp,#96]
260#endif
261	eor	r0,r0,r4
262#ifndef	__thumb2__
263	ldr	r14,[sp,#96+4]
264#else
265	ldrd	r12,r14,[sp,#96]
266#endif
267	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
268	@ mov	r9,r9,ror#32-7
269	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
270	eor	r12,r12,r2
271#ifndef	__thumb2__
272	ldr	r2,[sp,#48]
273#endif
274	eor	r14,r14,r3
275#ifndef	__thumb2__
276	ldr	r3,[sp,#48+4]
277#else
278	ldrd	r2,r3,[sp,#48]
279#endif
280	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
281	ldr	r12,[sp,#444]			@ load counter
282	eor	r2,r2,r10
283	adr	r10,iotas32
284	mov	r4,r14,ror#32-22
285	add	r14,r10,r12
286	eor	r3,r3,r11
287	ldmia	r14,{r10,r11}		@ iotas[i]
288	bic	r12,r4,r2,ror#32-22
289	bic	r14,r5,r3,ror#32-22
290	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
291	mov	r3,r3,ror#32-22
292	eor	r12,r12,r0
293	eor	r14,r14,r1
294	eor	r10,r10,r12
295	eor	r11,r11,r14
296#ifndef	__thumb2__
297	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
298#endif
299	bic	r12,r6,r4,ror#11
300#ifndef	__thumb2__
301	str	r11,[sp,#240+4]
302#else
303	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
304#endif
305	bic	r14,r7,r5,ror#10
306	bic	r10,r8,r6,ror#32-(11-7)
307	bic	r11,r9,r7,ror#32-(10-7)
308	eor	r12,r2,r12,ror#32-11
309#ifndef	__thumb2__
310	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
311#endif
312	eor	r14,r3,r14,ror#32-10
313#ifndef	__thumb2__
314	str	r14,[sp,#248+4]
315#else
316	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
317#endif
318	eor	r10,r4,r10,ror#32-7
319	eor	r11,r5,r11,ror#32-7
320#ifndef	__thumb2__
321	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
322#endif
323	bic	r12,r0,r8,ror#32-7
324#ifndef	__thumb2__
325	str	r11,[sp,#256+4]
326#else
327	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
328#endif
329	bic	r14,r1,r9,ror#32-7
330	eor	r12,r12,r6,ror#32-11
331#ifndef	__thumb2__
332	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
333#endif
334	eor	r14,r14,r7,ror#32-10
335#ifndef	__thumb2__
336	str	r14,[sp,#264+4]
337#else
338	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
339#endif
340	bic	r10,r2,r0
341	add	r14,sp,#224
342#ifndef	__thumb2__
343	ldr	r0,[sp,#24]		@ A[0][3]
344#endif
345	bic	r11,r3,r1
346#ifndef	__thumb2__
347	ldr	r1,[sp,#24+4]
348#else
349	ldrd	r0,r1,[sp,#24]		@ A[0][3]
350#endif
351	eor	r10,r10,r8,ror#32-7
352	eor	r11,r11,r9,ror#32-7
353#ifndef	__thumb2__
354	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
355#endif
356	add	r9,sp,#200
357#ifndef	__thumb2__
358	str	r11,[sp,#272+4]
359#else
360	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
361#endif
362
363	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
364	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
365
366#ifndef	__thumb2__
367	ldr	r2,[sp,#72]		@ A[1][4]
368#endif
369	eor	r0,r0,r10
370#ifndef	__thumb2__
371	ldr	r3,[sp,#72+4]
372#else
373	ldrd	r2,r3,[sp,#72]		@ A[1][4]
374#endif
375	eor	r1,r1,r11
376	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
377#ifndef	__thumb2__
378	ldr	r10,[sp,#128]		@ A[3][1]
379#endif
380	@ mov	r1,r1,ror#32-14
381#ifndef	__thumb2__
382	ldr	r11,[sp,#128+4]
383#else
384	ldrd	r10,r11,[sp,#128]		@ A[3][1]
385#endif
386
387	eor	r2,r2,r12
388#ifndef	__thumb2__
389	ldr	r4,[sp,#80]		@ A[2][0]
390#endif
391	eor	r3,r3,r14
392#ifndef	__thumb2__
393	ldr	r5,[sp,#80+4]
394#else
395	ldrd	r4,r5,[sp,#80]		@ A[2][0]
396#endif
397	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
398	@ mov	r3,r3,ror#32-10
399
400	eor	r6,r6,r4
401#ifndef	__thumb2__
402	ldr	r12,[sp,#216]		@ D[2]
403#endif
404	eor	r7,r7,r5
405#ifndef	__thumb2__
406	ldr	r14,[sp,#216+4]
407#else
408	ldrd	r12,r14,[sp,#216]		@ D[2]
409#endif
410	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
411	mov	r4,r7,ror#32-2
412
413	eor	r10,r10,r8
414#ifndef	__thumb2__
415	ldr	r8,[sp,#176]		@ A[4][2]
416#endif
417	eor	r11,r11,r9
418#ifndef	__thumb2__
419	ldr	r9,[sp,#176+4]
420#else
421	ldrd	r8,r9,[sp,#176]		@ A[4][2]
422#endif
423	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
424	mov	r6,r11,ror#32-23
425
426	bic	r10,r4,r2,ror#32-10
427	bic	r11,r5,r3,ror#32-10
428	eor	r12,r12,r8
429	eor	r14,r14,r9
430	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
431	mov	r8,r14,ror#32-31
432	eor	r10,r10,r0,ror#32-14
433	eor	r11,r11,r1,ror#32-14
434#ifndef	__thumb2__
435	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
436#endif
437	bic	r12,r6,r4
438#ifndef	__thumb2__
439	str	r11,[sp,#280+4]
440#else
441	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
442#endif
443	bic	r14,r7,r5
444	eor	r12,r12,r2,ror#32-10
445#ifndef	__thumb2__
446	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
447#endif
448	eor	r14,r14,r3,ror#32-10
449#ifndef	__thumb2__
450	str	r14,[sp,#288+4]
451#else
452	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
453#endif
454	bic	r10,r8,r6
455	bic	r11,r9,r7
456	bic	r12,r0,r8,ror#14
457	bic	r14,r1,r9,ror#14
458	eor	r10,r10,r4
459	eor	r11,r11,r5
460#ifndef	__thumb2__
461	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
462#endif
463	bic	r2,r2,r0,ror#32-(14-10)
464#ifndef	__thumb2__
465	str	r11,[sp,#296+4]
466#else
467	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
468#endif
469	eor	r12,r6,r12,ror#32-14
470	bic	r11,r3,r1,ror#32-(14-10)
471#ifndef	__thumb2__
472	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
473#endif
474	eor	r14,r7,r14,ror#32-14
475#ifndef	__thumb2__
476	str	r14,[sp,#304+4]
477#else
478	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
479#endif
480	add	r12,sp,#208
481#ifndef	__thumb2__
482	ldr	r1,[sp,#8]		@ A[0][1]
483#endif
484	eor	r10,r8,r2,ror#32-10
485#ifndef	__thumb2__
486	ldr	r0,[sp,#8+4]
487#else
488	ldrd	r1,r0,[sp,#8]		@ A[0][1]
489#endif
490	eor	r11,r9,r11,ror#32-10
491#ifndef	__thumb2__
492	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
493#endif
494#ifndef	__thumb2__
495	str	r11,[sp,#312+4]
496#else
497	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
498#endif
499
500	add	r9,sp,#224
501	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
502#ifndef	__thumb2__
503	ldr	r2,[sp,#56]		@ A[1][2]
504#endif
505#ifndef	__thumb2__
506	ldr	r3,[sp,#56+4]
507#else
508	ldrd	r2,r3,[sp,#56]		@ A[1][2]
509#endif
510	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
511
512	eor	r1,r1,r10
513#ifndef	__thumb2__
514	ldr	r4,[sp,#104]		@ A[2][3]
515#endif
516	eor	r0,r0,r11
517#ifndef	__thumb2__
518	ldr	r5,[sp,#104+4]
519#else
520	ldrd	r4,r5,[sp,#104]		@ A[2][3]
521#endif
522	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
523
524	eor	r2,r2,r12
525#ifndef	__thumb2__
526	ldr	r10,[sp,#152]		@ A[3][4]
527#endif
528	eor	r3,r3,r14
529#ifndef	__thumb2__
530	ldr	r11,[sp,#152+4]
531#else
532	ldrd	r10,r11,[sp,#152]		@ A[3][4]
533#endif
534	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
535#ifndef	__thumb2__
536	ldr	r12,[sp,#200]		@ D[0]
537#endif
538	@ mov	r3,r3,ror#32-3
539#ifndef	__thumb2__
540	ldr	r14,[sp,#200+4]
541#else
542	ldrd	r12,r14,[sp,#200]		@ D[0]
543#endif
544
545	eor	r4,r4,r6
546	eor	r5,r5,r7
547	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
548	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
549
550	eor	r10,r10,r8
551#ifndef	__thumb2__
552	ldr	r8,[sp,#160]		@ A[4][0]
553#endif
554	eor	r11,r11,r9
555#ifndef	__thumb2__
556	ldr	r9,[sp,#160+4]
557#else
558	ldrd	r8,r9,[sp,#160]		@ A[4][0]
559#endif
560	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
561	mov	r7,r11,ror#32-4
562
563	eor	r12,r12,r8
564	eor	r14,r14,r9
565	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
566	mov	r9,r14,ror#32-9
567
568	bic	r10,r5,r2,ror#13-3
569	bic	r11,r4,r3,ror#12-3
570	bic	r12,r6,r5,ror#32-13
571	bic	r14,r7,r4,ror#32-12
572	eor	r10,r0,r10,ror#32-13
573	eor	r11,r1,r11,ror#32-12
574#ifndef	__thumb2__
575	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
576#endif
577	eor	r12,r12,r2,ror#32-3
578#ifndef	__thumb2__
579	str	r11,[sp,#320+4]
580#else
581	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
582#endif
583	eor	r14,r14,r3,ror#32-3
584#ifndef	__thumb2__
585	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
586#endif
587	bic	r10,r8,r6
588	bic	r11,r9,r7
589#ifndef	__thumb2__
590	str	r14,[sp,#328+4]
591#else
592	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
593#endif
594	eor	r10,r10,r5,ror#32-13
595	eor	r11,r11,r4,ror#32-12
596#ifndef	__thumb2__
597	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
598#endif
599	bic	r12,r0,r8
600#ifndef	__thumb2__
601	str	r11,[sp,#336+4]
602#else
603	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
604#endif
605	bic	r14,r1,r9
606	eor	r12,r12,r6
607	eor	r14,r14,r7
608#ifndef	__thumb2__
609	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
610#endif
611	bic	r10,r2,r0,ror#3
612#ifndef	__thumb2__
613	str	r14,[sp,#344+4]
614#else
615	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
616#endif
617	bic	r11,r3,r1,ror#3
618#ifndef	__thumb2__
619	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
620#endif
621	eor	r10,r8,r10,ror#32-3
622#ifndef	__thumb2__
623	ldr	r0,[sp,#32+4]
624#else
625	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
626#endif
627	eor	r11,r9,r11,ror#32-3
628#ifndef	__thumb2__
629	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
630#endif
631	add	r9,sp,#208
632#ifndef	__thumb2__
633	str	r11,[sp,#352+4]
634#else
635	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
636#endif
637
638#ifndef	__thumb2__
639	ldr	r10,[sp,#232]		@ D[4]
640#endif
641#ifndef	__thumb2__
642	ldr	r11,[sp,#232+4]
643#else
644	ldrd	r10,r11,[sp,#232]		@ D[4]
645#endif
646#ifndef	__thumb2__
647	ldr	r12,[sp,#200]		@ D[0]
648#endif
649#ifndef	__thumb2__
650	ldr	r14,[sp,#200+4]
651#else
652	ldrd	r12,r14,[sp,#200]		@ D[0]
653#endif
654
655	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
656
657	eor	r1,r1,r10
658#ifndef	__thumb2__
659	ldr	r2,[sp,#40]		@ A[1][0]
660#endif
661	eor	r0,r0,r11
662#ifndef	__thumb2__
663	ldr	r3,[sp,#40+4]
664#else
665	ldrd	r2,r3,[sp,#40]		@ A[1][0]
666#endif
667	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
668#ifndef	__thumb2__
669	ldr	r4,[sp,#88]		@ A[2][1]
670#endif
671	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
672#ifndef	__thumb2__
673	ldr	r5,[sp,#88+4]
674#else
675	ldrd	r4,r5,[sp,#88]		@ A[2][1]
676#endif
677
678	eor	r2,r2,r12
679#ifndef	__thumb2__
680	ldr	r10,[sp,#136]		@ A[3][2]
681#endif
682	eor	r3,r3,r14
683#ifndef	__thumb2__
684	ldr	r11,[sp,#136+4]
685#else
686	ldrd	r10,r11,[sp,#136]		@ A[3][2]
687#endif
688	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
689#ifndef	__thumb2__
690	ldr	r12,[sp,#224]		@ D[3]
691#endif
692	@ mov	r3,r3,ror#32-18
693#ifndef	__thumb2__
694	ldr	r14,[sp,#224+4]
695#else
696	ldrd	r12,r14,[sp,#224]		@ D[3]
697#endif
698
699	eor	r6,r6,r4
700	eor	r7,r7,r5
701	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
702	mov	r5,r7,ror#32-5
703
704	eor	r10,r10,r8
705#ifndef	__thumb2__
706	ldr	r8,[sp,#184]		@ A[4][3]
707#endif
708	eor	r11,r11,r9
709#ifndef	__thumb2__
710	ldr	r9,[sp,#184+4]
711#else
712	ldrd	r8,r9,[sp,#184]		@ A[4][3]
713#endif
714	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
715	mov	r6,r11,ror#32-8
716
717	eor	r12,r12,r8
718	eor	r14,r14,r9
719	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
720	mov	r9,r14,ror#32-28
721
722	bic	r10,r4,r2,ror#32-18
723	bic	r11,r5,r3,ror#32-18
724	eor	r10,r10,r0,ror#32-14
725	eor	r11,r11,r1,ror#32-13
726#ifndef	__thumb2__
727	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
728#endif
729	bic	r12,r6,r4
730#ifndef	__thumb2__
731	str	r11,[sp,#360+4]
732#else
733	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
734#endif
735	bic	r14,r7,r5
736	eor	r12,r12,r2,ror#32-18
737#ifndef	__thumb2__
738	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
739#endif
740	eor	r14,r14,r3,ror#32-18
741#ifndef	__thumb2__
742	str	r14,[sp,#368+4]
743#else
744	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
745#endif
746	bic	r10,r8,r6
747	bic	r11,r9,r7
748	bic	r12,r0,r8,ror#14
749	bic	r14,r1,r9,ror#13
750	eor	r10,r10,r4
751	eor	r11,r11,r5
752#ifndef	__thumb2__
753	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
754#endif
755	bic	r2,r2,r0,ror#18-14
756#ifndef	__thumb2__
757	str	r11,[sp,#376+4]
758#else
759	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
760#endif
761	eor	r12,r6,r12,ror#32-14
762	bic	r11,r3,r1,ror#18-13
763	eor	r14,r7,r14,ror#32-13
764#ifndef	__thumb2__
765	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
766#endif
767#ifndef	__thumb2__
768	str	r14,[sp,#384+4]
769#else
770	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
771#endif
772	add	r14,sp,#216
773#ifndef	__thumb2__
774	ldr	r0,[sp,#16]		@ A[0][2]
775#endif
776	eor	r10,r8,r2,ror#32-18
777#ifndef	__thumb2__
778	ldr	r1,[sp,#16+4]
779#else
780	ldrd	r0,r1,[sp,#16]		@ A[0][2]
781#endif
782	eor	r11,r9,r11,ror#32-18
783#ifndef	__thumb2__
784	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
785#endif
786#ifndef	__thumb2__
787	str	r11,[sp,#392+4]
788#else
789	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
790#endif
791
792	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
793#ifndef	__thumb2__
794	ldr	r2,[sp,#64]		@ A[1][3]
795#endif
796#ifndef	__thumb2__
797	ldr	r3,[sp,#64+4]
798#else
799	ldrd	r2,r3,[sp,#64]		@ A[1][3]
800#endif
801#ifndef	__thumb2__
802	ldr	r6,[sp,#232]		@ D[4]
803#endif
804#ifndef	__thumb2__
805	ldr	r7,[sp,#232+4]
806#else
807	ldrd	r6,r7,[sp,#232]		@ D[4]
808#endif
809
810	eor	r0,r0,r10
811#ifndef	__thumb2__
812	ldr	r4,[sp,#112]		@ A[2][4]
813#endif
814	eor	r1,r1,r11
815#ifndef	__thumb2__
816	ldr	r5,[sp,#112+4]
817#else
818	ldrd	r4,r5,[sp,#112]		@ A[2][4]
819#endif
820	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
821#ifndef	__thumb2__
822	ldr	r8,[sp,#200]		@ D[0]
823#endif
824	@ mov	r1,r1,ror#32-31
825#ifndef	__thumb2__
826	ldr	r9,[sp,#200+4]
827#else
828	ldrd	r8,r9,[sp,#200]		@ D[0]
829#endif
830
831	eor	r12,r12,r2
832#ifndef	__thumb2__
833	ldr	r10,[sp,#120]		@ A[3][0]
834#endif
835	eor	r14,r14,r3
836#ifndef	__thumb2__
837	ldr	r11,[sp,#120+4]
838#else
839	ldrd	r10,r11,[sp,#120]		@ A[3][0]
840#endif
841	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
842#ifndef	__thumb2__
843	ldr	r12,[sp,#208]		@ D[1]
844#endif
845	mov	r2,r14,ror#32-28
846#ifndef	__thumb2__
847	ldr	r14,[sp,#208+4]
848#else
849	ldrd	r12,r14,[sp,#208]		@ D[1]
850#endif
851
852	eor	r6,r6,r4
853	eor	r7,r7,r5
854	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
855	mov	r4,r7,ror#32-20
856
857	eor	r10,r10,r8
858#ifndef	__thumb2__
859	ldr	r8,[sp,#168]		@ A[4][1]
860#endif
861	eor	r11,r11,r9
862#ifndef	__thumb2__
863	ldr	r9,[sp,#168+4]
864#else
865	ldrd	r8,r9,[sp,#168]		@ A[4][1]
866#endif
867	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
868	mov	r6,r11,ror#32-21
869
870	eor	r8,r8,r12
871	eor	r9,r9,r14
872	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
873	@ mov	r9,r3,ror#32-1
874
875	bic	r10,r4,r2
876	bic	r11,r5,r3
877	eor	r10,r10,r0,ror#32-31
878#ifndef	__thumb2__
879	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
880#endif
881	eor	r11,r11,r1,ror#32-31
882#ifndef	__thumb2__
883	str	r11,[sp,#400+4]
884#else
885	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
886#endif
887	bic	r12,r6,r4
888	bic	r14,r7,r5
889	eor	r12,r12,r2
890	eor	r14,r14,r3
891#ifndef	__thumb2__
892	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
893#endif
894	bic	r10,r8,r6,ror#1
895#ifndef	__thumb2__
896	str	r14,[sp,#408+4]
897#else
898	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
899#endif
900	bic	r11,r9,r7,ror#1
901	bic	r12,r0,r8,ror#31-1
902	bic	r14,r1,r9,ror#31-1
903	eor	r4,r4,r10,ror#32-1
904#ifndef	__thumb2__
905	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
906#endif
907	eor	r5,r5,r11,ror#32-1
908#ifndef	__thumb2__
909	str	r5,[sp,#416+4]
910#else
911	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
912#endif
913	eor	r6,r6,r12,ror#32-31
914	eor	r7,r7,r14,ror#32-31
915#ifndef	__thumb2__
916	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
917#endif
918	bic	r10,r2,r0,ror#32-31
919#ifndef	__thumb2__
920	str	r7,[sp,#424+4]
921#else
922	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
923#endif
924	bic	r11,r3,r1,ror#32-31
925	add	r12,sp,#240
926	eor	r8,r10,r8,ror#32-1
927	add	r10,sp,#280
928	eor	r9,r11,r9,ror#32-1
929#ifndef	__thumb2__
930	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
931#endif
932#ifndef	__thumb2__
933	str	r9,[sp,#432+4]
934#else
935	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
936#endif
937	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
938	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
939#ifdef	__thumb2__
940	eor	r0,r0,r10
941	eor	r1,r1,r11
942	eor	r2,r2,r12
943	ldrd	r10,r11,[sp,#296]
944	eor	r3,r3,r14
945	ldrd	r12,r14,[sp,#304]
946	eor	r4,r4,r10
947	eor	r5,r5,r11
948	eor	r6,r6,r12
949	ldrd	r10,r11,[sp,#312]
950	eor	r7,r7,r14
951	ldrd	r12,r14,[sp,#320]
952	eor	r8,r8,r10
953	eor	r9,r9,r11
954	eor	r0,r0,r12
955	ldrd	r10,r11,[sp,#328]
956	eor	r1,r1,r14
957	ldrd	r12,r14,[sp,#336]
958	eor	r2,r2,r10
959	eor	r3,r3,r11
960	eor	r4,r4,r12
961	ldrd	r10,r11,[sp,#344]
962	eor	r5,r5,r14
963	ldrd	r12,r14,[sp,#352]
964	eor	r6,r6,r10
965	eor	r7,r7,r11
966	eor	r8,r8,r12
967	ldrd	r10,r11,[sp,#360]
968	eor	r9,r9,r14
969	ldrd	r12,r14,[sp,#368]
970	eor	r0,r0,r10
971	eor	r1,r1,r11
972	eor	r2,r2,r12
973	ldrd	r10,r11,[sp,#376]
974	eor	r3,r3,r14
975	ldrd	r12,r14,[sp,#384]
976	eor	r4,r4,r10
977	eor	r5,r5,r11
978	eor	r6,r6,r12
979	ldrd	r10,r11,[sp,#392]
980	eor	r7,r7,r14
981	ldrd	r12,r14,[sp,#400]
982	eor	r8,r8,r10
983	eor	r9,r9,r11
984	eor	r0,r0,r12
985	ldrd	r10,r11,[sp,#408]
986	eor	r1,r1,r14
987	ldrd	r12,r14,[sp,#256]
988	eor	r2,r2,r10
989	eor	r3,r3,r11
990	eor	r4,r4,r12
991	ldrd	r10,r11,[sp,#264]
992	eor	r5,r5,r14
993	ldrd	r12,r14,[sp,#272]
994#else
995	eor	r0,r0,r10
996	add	r10,sp,#296
997	eor	r1,r1,r11
998	eor	r2,r2,r12
999	eor	r3,r3,r14
1000	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
1001	eor	r4,r4,r10
1002	add	r10,sp,#312
1003	eor	r5,r5,r11
1004	eor	r6,r6,r12
1005	eor	r7,r7,r14
1006	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
1007	eor	r8,r8,r10
1008	add	r10,sp,#328
1009	eor	r9,r9,r11
1010	eor	r0,r0,r12
1011	eor	r1,r1,r14
1012	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
1013	eor	r2,r2,r10
1014	add	r10,sp,#344
1015	eor	r3,r3,r11
1016	eor	r4,r4,r12
1017	eor	r5,r5,r14
1018	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
1019	eor	r6,r6,r10
1020	add	r10,sp,#360
1021	eor	r7,r7,r11
1022	eor	r8,r8,r12
1023	eor	r9,r9,r14
1024	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
1025	eor	r0,r0,r10
1026	add	r10,sp,#376
1027	eor	r1,r1,r11
1028	eor	r2,r2,r12
1029	eor	r3,r3,r14
1030	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
1031	eor	r4,r4,r10
1032	add	r10,sp,#392
1033	eor	r5,r5,r11
1034	eor	r6,r6,r12
1035	eor	r7,r7,r14
1036	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
1037	eor	r8,r8,r10
1038	ldr	r10,[sp,#408]		@ A[4][1]
1039	eor	r9,r9,r11
1040	ldr	r11,[sp,#408+4]
1041	eor	r0,r0,r12
1042	ldr	r12,[sp,#256]		@ A[0][2]
1043	eor	r1,r1,r14
1044	ldr	r14,[sp,#256+4]
1045	eor	r2,r2,r10
1046	add	r10,sp,#264
1047	eor	r3,r3,r11
1048	eor	r4,r4,r12
1049	eor	r5,r5,r14
1050	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
1051#endif
1052	eor	r6,r6,r10
1053	eor	r7,r7,r11
1054	eor	r8,r8,r12
1055	eor	r9,r9,r14
1056
1057	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
1058#ifndef	__thumb2__
1059	str	r10,[sp,#208]		@ D[1] = E[0]
1060#endif
1061	eor	r11,r1,r4
1062#ifndef	__thumb2__
1063	str	r11,[sp,#208+4]
1064#else
1065	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
1066#endif
1067	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
1068	eor	r14,r7,r0
1069#ifndef	__thumb2__
1070	str	r12,[sp,#232]		@ D[4] = E[1]
1071#endif
1072	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
1073#ifndef	__thumb2__
1074	str	r14,[sp,#232+4]
1075#else
1076	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
1077#endif
1078	eor	r1,r9,r2
1079#ifndef	__thumb2__
1080	str	r0,[sp,#200]		@ D[0] = C[0]
1081#endif
1082	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
1083#ifndef	__thumb2__
1084	ldr	r7,[sp,#384]
1085#endif
1086	eor	r3,r3,r6
1087#ifndef	__thumb2__
1088	str	r1,[sp,#200+4]
1089#else
1090	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
1091#endif
1092#ifndef	__thumb2__
1093	ldr	r6,[sp,#384+4]
1094#else
1095	ldrd	r7,r6,[sp,#384]
1096#endif
1097#ifndef	__thumb2__
1098	str	r2,[sp,#216]		@ D[2] = C[1]
1099#endif
1100	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
1101#ifndef	__thumb2__
1102	str	r3,[sp,#216+4]
1103#else
1104	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
1105#endif
1106	eor	r5,r5,r8
1107
1108#ifndef	__thumb2__
1109	ldr	r8,[sp,#432]
1110#endif
1111#ifndef	__thumb2__
1112	ldr	r9,[sp,#432+4]
1113#else
1114	ldrd	r8,r9,[sp,#432]
1115#endif
1116#ifndef	__thumb2__
1117	str	r4,[sp,#224]		@ D[3] = C[2]
1118#endif
1119	eor	r7,r7,r4
1120#ifndef	__thumb2__
1121	str	r5,[sp,#224+4]
1122#else
1123	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
1124#endif
1125	eor	r6,r6,r5
1126#ifndef	__thumb2__
1127	ldr	r4,[sp,#240]
1128#endif
1129	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
1130	@ mov	r6,r6,ror#32-11
1131#ifndef	__thumb2__
1132	ldr	r5,[sp,#240+4]
1133#else
1134	ldrd	r4,r5,[sp,#240]
1135#endif
1136	eor	r8,r8,r12
1137	eor	r9,r9,r14
1138#ifndef	__thumb2__
1139	ldr	r12,[sp,#336]
1140#endif
1141	eor	r0,r0,r4
1142#ifndef	__thumb2__
1143	ldr	r14,[sp,#336+4]
1144#else
1145	ldrd	r12,r14,[sp,#336]
1146#endif
1147	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
1148	@ mov	r9,r9,ror#32-7
1149	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
1150	eor	r12,r12,r2
1151#ifndef	__thumb2__
1152	ldr	r2,[sp,#288]
1153#endif
1154	eor	r14,r14,r3
1155#ifndef	__thumb2__
1156	ldr	r3,[sp,#288+4]
1157#else
1158	ldrd	r2,r3,[sp,#288]
1159#endif
1160	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
1161	ldr	r12,[sp,#444]			@ load counter
1162	eor	r2,r2,r10
1163	adr	r10,iotas32
1164	mov	r4,r14,ror#32-22
1165	add	r14,r10,r12
1166	eor	r3,r3,r11
1167#ifndef	__thumb2__
1168	ldr	r10,[r14,#8]		@ iotas[i].lo
1169#endif
1170	add	r12,r12,#16
1171#ifndef	__thumb2__
1172	ldr	r11,[r14,#12]		@ iotas[i].hi
1173#else
1174	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
1175#endif
1176	cmp	r12,#192
1177	str	r12,[sp,#444]			@ store counter
1178	bic	r12,r4,r2,ror#32-22
1179	bic	r14,r5,r3,ror#32-22
1180	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
1181	mov	r3,r3,ror#32-22
1182	eor	r12,r12,r0
1183	eor	r14,r14,r1
1184	eor	r10,r10,r12
1185	eor	r11,r11,r14
1186#ifndef	__thumb2__
1187	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1188#endif
1189	bic	r12,r6,r4,ror#11
1190#ifndef	__thumb2__
1191	str	r11,[sp,#0+4]
1192#else
1193	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1194#endif
1195	bic	r14,r7,r5,ror#10
1196	bic	r10,r8,r6,ror#32-(11-7)
1197	bic	r11,r9,r7,ror#32-(10-7)
1198	eor	r12,r2,r12,ror#32-11
1199#ifndef	__thumb2__
1200	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1201#endif
1202	eor	r14,r3,r14,ror#32-10
1203#ifndef	__thumb2__
1204	str	r14,[sp,#8+4]
1205#else
1206	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1207#endif
1208	eor	r10,r4,r10,ror#32-7
1209	eor	r11,r5,r11,ror#32-7
1210#ifndef	__thumb2__
1211	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1212#endif
1213	bic	r12,r0,r8,ror#32-7
1214#ifndef	__thumb2__
1215	str	r11,[sp,#16+4]
1216#else
1217	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1218#endif
1219	bic	r14,r1,r9,ror#32-7
1220	eor	r12,r12,r6,ror#32-11
1221#ifndef	__thumb2__
1222	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1223#endif
1224	eor	r14,r14,r7,ror#32-10
1225#ifndef	__thumb2__
1226	str	r14,[sp,#24+4]
1227#else
1228	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1229#endif
1230	bic	r10,r2,r0
1231	add	r14,sp,#224
1232#ifndef	__thumb2__
1233	ldr	r0,[sp,#264]		@ A[0][3]
1234#endif
1235	bic	r11,r3,r1
1236#ifndef	__thumb2__
1237	ldr	r1,[sp,#264+4]
1238#else
1239	ldrd	r0,r1,[sp,#264]		@ A[0][3]
1240#endif
1241	eor	r10,r10,r8,ror#32-7
1242	eor	r11,r11,r9,ror#32-7
1243#ifndef	__thumb2__
1244	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1245#endif
1246	add	r9,sp,#200
1247#ifndef	__thumb2__
1248	str	r11,[sp,#32+4]
1249#else
1250	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1251#endif
1252
1253	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
1254	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
1255
1256#ifndef	__thumb2__
1257	ldr	r2,[sp,#312]		@ A[1][4]
1258#endif
1259	eor	r0,r0,r10
1260#ifndef	__thumb2__
1261	ldr	r3,[sp,#312+4]
1262#else
1263	ldrd	r2,r3,[sp,#312]		@ A[1][4]
1264#endif
1265	eor	r1,r1,r11
1266	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
1267#ifndef	__thumb2__
1268	ldr	r10,[sp,#368]		@ A[3][1]
1269#endif
1270	@ mov	r1,r1,ror#32-14
1271#ifndef	__thumb2__
1272	ldr	r11,[sp,#368+4]
1273#else
1274	ldrd	r10,r11,[sp,#368]		@ A[3][1]
1275#endif
1276
1277	eor	r2,r2,r12
1278#ifndef	__thumb2__
1279	ldr	r4,[sp,#320]		@ A[2][0]
1280#endif
1281	eor	r3,r3,r14
1282#ifndef	__thumb2__
1283	ldr	r5,[sp,#320+4]
1284#else
1285	ldrd	r4,r5,[sp,#320]		@ A[2][0]
1286#endif
1287	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
1288	@ mov	r3,r3,ror#32-10
1289
1290	eor	r6,r6,r4
1291#ifndef	__thumb2__
1292	ldr	r12,[sp,#216]		@ D[2]
1293#endif
1294	eor	r7,r7,r5
1295#ifndef	__thumb2__
1296	ldr	r14,[sp,#216+4]
1297#else
1298	ldrd	r12,r14,[sp,#216]		@ D[2]
1299#endif
1300	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
1301	mov	r4,r7,ror#32-2
1302
1303	eor	r10,r10,r8
1304#ifndef	__thumb2__
1305	ldr	r8,[sp,#416]		@ A[4][2]
1306#endif
1307	eor	r11,r11,r9
1308#ifndef	__thumb2__
1309	ldr	r9,[sp,#416+4]
1310#else
1311	ldrd	r8,r9,[sp,#416]		@ A[4][2]
1312#endif
1313	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
1314	mov	r6,r11,ror#32-23
1315
1316	bic	r10,r4,r2,ror#32-10
1317	bic	r11,r5,r3,ror#32-10
1318	eor	r12,r12,r8
1319	eor	r14,r14,r9
1320	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
1321	mov	r8,r14,ror#32-31
1322	eor	r10,r10,r0,ror#32-14
1323	eor	r11,r11,r1,ror#32-14
1324#ifndef	__thumb2__
1325	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1326#endif
1327	bic	r12,r6,r4
1328#ifndef	__thumb2__
1329	str	r11,[sp,#40+4]
1330#else
1331	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1332#endif
1333	bic	r14,r7,r5
1334	eor	r12,r12,r2,ror#32-10
1335#ifndef	__thumb2__
1336	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1337#endif
1338	eor	r14,r14,r3,ror#32-10
1339#ifndef	__thumb2__
1340	str	r14,[sp,#48+4]
1341#else
1342	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1343#endif
1344	bic	r10,r8,r6
1345	bic	r11,r9,r7
1346	bic	r12,r0,r8,ror#14
1347	bic	r14,r1,r9,ror#14
1348	eor	r10,r10,r4
1349	eor	r11,r11,r5
1350#ifndef	__thumb2__
1351	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1352#endif
1353	bic	r2,r2,r0,ror#32-(14-10)
1354#ifndef	__thumb2__
1355	str	r11,[sp,#56+4]
1356#else
1357	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1358#endif
1359	eor	r12,r6,r12,ror#32-14
1360	bic	r11,r3,r1,ror#32-(14-10)
1361#ifndef	__thumb2__
1362	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1363#endif
1364	eor	r14,r7,r14,ror#32-14
1365#ifndef	__thumb2__
1366	str	r14,[sp,#64+4]
1367#else
1368	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1369#endif
1370	add	r12,sp,#208
1371#ifndef	__thumb2__
1372	ldr	r1,[sp,#248]		@ A[0][1]
1373#endif
1374	eor	r10,r8,r2,ror#32-10
1375#ifndef	__thumb2__
1376	ldr	r0,[sp,#248+4]
1377#else
1378	ldrd	r1,r0,[sp,#248]		@ A[0][1]
1379#endif
1380	eor	r11,r9,r11,ror#32-10
1381#ifndef	__thumb2__
1382	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1383#endif
1384#ifndef	__thumb2__
1385	str	r11,[sp,#72+4]
1386#else
1387	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1388#endif
1389
1390	add	r9,sp,#224
1391	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
1392#ifndef	__thumb2__
1393	ldr	r2,[sp,#296]		@ A[1][2]
1394#endif
1395#ifndef	__thumb2__
1396	ldr	r3,[sp,#296+4]
1397#else
1398	ldrd	r2,r3,[sp,#296]		@ A[1][2]
1399#endif
1400	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
1401
1402	eor	r1,r1,r10
1403#ifndef	__thumb2__
1404	ldr	r4,[sp,#344]		@ A[2][3]
1405#endif
1406	eor	r0,r0,r11
1407#ifndef	__thumb2__
1408	ldr	r5,[sp,#344+4]
1409#else
1410	ldrd	r4,r5,[sp,#344]		@ A[2][3]
1411#endif
1412	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
1413
1414	eor	r2,r2,r12
1415#ifndef	__thumb2__
1416	ldr	r10,[sp,#392]		@ A[3][4]
1417#endif
1418	eor	r3,r3,r14
1419#ifndef	__thumb2__
1420	ldr	r11,[sp,#392+4]
1421#else
1422	ldrd	r10,r11,[sp,#392]		@ A[3][4]
1423#endif
1424	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
1425#ifndef	__thumb2__
1426	ldr	r12,[sp,#200]		@ D[0]
1427#endif
1428	@ mov	r3,r3,ror#32-3
1429#ifndef	__thumb2__
1430	ldr	r14,[sp,#200+4]
1431#else
1432	ldrd	r12,r14,[sp,#200]		@ D[0]
1433#endif
1434
1435	eor	r4,r4,r6
1436	eor	r5,r5,r7
1437	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
1438	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
1439
1440	eor	r10,r10,r8
1441#ifndef	__thumb2__
1442	ldr	r8,[sp,#400]		@ A[4][0]
1443#endif
1444	eor	r11,r11,r9
1445#ifndef	__thumb2__
1446	ldr	r9,[sp,#400+4]
1447#else
1448	ldrd	r8,r9,[sp,#400]		@ A[4][0]
1449#endif
1450	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
1451	mov	r7,r11,ror#32-4
1452
1453	eor	r12,r12,r8
1454	eor	r14,r14,r9
1455	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
1456	mov	r9,r14,ror#32-9
1457
1458	bic	r10,r5,r2,ror#13-3
1459	bic	r11,r4,r3,ror#12-3
1460	bic	r12,r6,r5,ror#32-13
1461	bic	r14,r7,r4,ror#32-12
1462	eor	r10,r0,r10,ror#32-13
1463	eor	r11,r1,r11,ror#32-12
1464#ifndef	__thumb2__
1465	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1466#endif
1467	eor	r12,r12,r2,ror#32-3
1468#ifndef	__thumb2__
1469	str	r11,[sp,#80+4]
1470#else
1471	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1472#endif
1473	eor	r14,r14,r3,ror#32-3
1474#ifndef	__thumb2__
1475	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1476#endif
1477	bic	r10,r8,r6
1478	bic	r11,r9,r7
1479#ifndef	__thumb2__
1480	str	r14,[sp,#88+4]
1481#else
1482	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1483#endif
1484	eor	r10,r10,r5,ror#32-13
1485	eor	r11,r11,r4,ror#32-12
1486#ifndef	__thumb2__
1487	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1488#endif
1489	bic	r12,r0,r8
1490#ifndef	__thumb2__
1491	str	r11,[sp,#96+4]
1492#else
1493	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1494#endif
1495	bic	r14,r1,r9
1496	eor	r12,r12,r6
1497	eor	r14,r14,r7
1498#ifndef	__thumb2__
1499	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1500#endif
1501	bic	r10,r2,r0,ror#3
1502#ifndef	__thumb2__
1503	str	r14,[sp,#104+4]
1504#else
1505	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1506#endif
1507	bic	r11,r3,r1,ror#3
1508#ifndef	__thumb2__
1509	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
1510#endif
1511	eor	r10,r8,r10,ror#32-3
1512#ifndef	__thumb2__
1513	ldr	r0,[sp,#272+4]
1514#else
1515	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
1516#endif
1517	eor	r11,r9,r11,ror#32-3
1518#ifndef	__thumb2__
1519	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1520#endif
1521	add	r9,sp,#208
1522#ifndef	__thumb2__
1523	str	r11,[sp,#112+4]
1524#else
1525	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1526#endif
1527
1528#ifndef	__thumb2__
1529	ldr	r10,[sp,#232]		@ D[4]
1530#endif
1531#ifndef	__thumb2__
1532	ldr	r11,[sp,#232+4]
1533#else
1534	ldrd	r10,r11,[sp,#232]		@ D[4]
1535#endif
1536#ifndef	__thumb2__
1537	ldr	r12,[sp,#200]		@ D[0]
1538#endif
1539#ifndef	__thumb2__
1540	ldr	r14,[sp,#200+4]
1541#else
1542	ldrd	r12,r14,[sp,#200]		@ D[0]
1543#endif
1544
1545	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
1546
1547	eor	r1,r1,r10
1548#ifndef	__thumb2__
1549	ldr	r2,[sp,#280]		@ A[1][0]
1550#endif
1551	eor	r0,r0,r11
1552#ifndef	__thumb2__
1553	ldr	r3,[sp,#280+4]
1554#else
1555	ldrd	r2,r3,[sp,#280]		@ A[1][0]
1556#endif
1557	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
1558#ifndef	__thumb2__
1559	ldr	r4,[sp,#328]		@ A[2][1]
1560#endif
1561	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
1562#ifndef	__thumb2__
1563	ldr	r5,[sp,#328+4]
1564#else
1565	ldrd	r4,r5,[sp,#328]		@ A[2][1]
1566#endif
1567
1568	eor	r2,r2,r12
1569#ifndef	__thumb2__
1570	ldr	r10,[sp,#376]		@ A[3][2]
1571#endif
1572	eor	r3,r3,r14
1573#ifndef	__thumb2__
1574	ldr	r11,[sp,#376+4]
1575#else
1576	ldrd	r10,r11,[sp,#376]		@ A[3][2]
1577#endif
1578	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
1579#ifndef	__thumb2__
1580	ldr	r12,[sp,#224]		@ D[3]
1581#endif
1582	@ mov	r3,r3,ror#32-18
1583#ifndef	__thumb2__
1584	ldr	r14,[sp,#224+4]
1585#else
1586	ldrd	r12,r14,[sp,#224]		@ D[3]
1587#endif
1588
1589	eor	r6,r6,r4
1590	eor	r7,r7,r5
1591	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
1592	mov	r5,r7,ror#32-5
1593
1594	eor	r10,r10,r8
1595#ifndef	__thumb2__
1596	ldr	r8,[sp,#424]		@ A[4][3]
1597#endif
1598	eor	r11,r11,r9
1599#ifndef	__thumb2__
1600	ldr	r9,[sp,#424+4]
1601#else
1602	ldrd	r8,r9,[sp,#424]		@ A[4][3]
1603#endif
1604	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
1605	mov	r6,r11,ror#32-8
1606
1607	eor	r12,r12,r8
1608	eor	r14,r14,r9
1609	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
1610	mov	r9,r14,ror#32-28
1611
1612	bic	r10,r4,r2,ror#32-18
1613	bic	r11,r5,r3,ror#32-18
1614	eor	r10,r10,r0,ror#32-14
1615	eor	r11,r11,r1,ror#32-13
1616#ifndef	__thumb2__
1617	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1618#endif
1619	bic	r12,r6,r4
1620#ifndef	__thumb2__
1621	str	r11,[sp,#120+4]
1622#else
1623	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1624#endif
1625	bic	r14,r7,r5
1626	eor	r12,r12,r2,ror#32-18
1627#ifndef	__thumb2__
1628	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1629#endif
1630	eor	r14,r14,r3,ror#32-18
1631#ifndef	__thumb2__
1632	str	r14,[sp,#128+4]
1633#else
1634	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1635#endif
1636	bic	r10,r8,r6
1637	bic	r11,r9,r7
1638	bic	r12,r0,r8,ror#14
1639	bic	r14,r1,r9,ror#13
1640	eor	r10,r10,r4
1641	eor	r11,r11,r5
1642#ifndef	__thumb2__
1643	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1644#endif
1645	bic	r2,r2,r0,ror#18-14
1646#ifndef	__thumb2__
1647	str	r11,[sp,#136+4]
1648#else
1649	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1650#endif
1651	eor	r12,r6,r12,ror#32-14
1652	bic	r11,r3,r1,ror#18-13
1653	eor	r14,r7,r14,ror#32-13
1654#ifndef	__thumb2__
1655	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1656#endif
1657#ifndef	__thumb2__
1658	str	r14,[sp,#144+4]
1659#else
1660	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1661#endif
1662	add	r14,sp,#216
1663#ifndef	__thumb2__
1664	ldr	r0,[sp,#256]		@ A[0][2]
1665#endif
1666	eor	r10,r8,r2,ror#32-18
1667#ifndef	__thumb2__
1668	ldr	r1,[sp,#256+4]
1669#else
1670	ldrd	r0,r1,[sp,#256]		@ A[0][2]
1671#endif
1672	eor	r11,r9,r11,ror#32-18
1673#ifndef	__thumb2__
1674	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1675#endif
1676#ifndef	__thumb2__
1677	str	r11,[sp,#152+4]
1678#else
1679	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1680#endif
1681
1682	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
1683#ifndef	__thumb2__
1684	ldr	r2,[sp,#304]		@ A[1][3]
1685#endif
1686#ifndef	__thumb2__
1687	ldr	r3,[sp,#304+4]
1688#else
1689	ldrd	r2,r3,[sp,#304]		@ A[1][3]
1690#endif
1691#ifndef	__thumb2__
1692	ldr	r6,[sp,#232]		@ D[4]
1693#endif
1694#ifndef	__thumb2__
1695	ldr	r7,[sp,#232+4]
1696#else
1697	ldrd	r6,r7,[sp,#232]		@ D[4]
1698#endif
1699
1700	eor	r0,r0,r10
1701#ifndef	__thumb2__
1702	ldr	r4,[sp,#352]		@ A[2][4]
1703#endif
1704	eor	r1,r1,r11
1705#ifndef	__thumb2__
1706	ldr	r5,[sp,#352+4]
1707#else
1708	ldrd	r4,r5,[sp,#352]		@ A[2][4]
1709#endif
1710	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
1711#ifndef	__thumb2__
1712	ldr	r8,[sp,#200]		@ D[0]
1713#endif
1714	@ mov	r1,r1,ror#32-31
1715#ifndef	__thumb2__
1716	ldr	r9,[sp,#200+4]
1717#else
1718	ldrd	r8,r9,[sp,#200]		@ D[0]
1719#endif
1720
1721	eor	r12,r12,r2
1722#ifndef	__thumb2__
1723	ldr	r10,[sp,#360]		@ A[3][0]
1724#endif
1725	eor	r14,r14,r3
1726#ifndef	__thumb2__
1727	ldr	r11,[sp,#360+4]
1728#else
1729	ldrd	r10,r11,[sp,#360]		@ A[3][0]
1730#endif
1731	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
1732#ifndef	__thumb2__
1733	ldr	r12,[sp,#208]		@ D[1]
1734#endif
1735	mov	r2,r14,ror#32-28
1736#ifndef	__thumb2__
1737	ldr	r14,[sp,#208+4]
1738#else
1739	ldrd	r12,r14,[sp,#208]		@ D[1]
1740#endif
1741
1742	eor	r6,r6,r4
1743	eor	r7,r7,r5
1744	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
1745	mov	r4,r7,ror#32-20
1746
1747	eor	r10,r10,r8
1748#ifndef	__thumb2__
1749	ldr	r8,[sp,#408]		@ A[4][1]
1750#endif
1751	eor	r11,r11,r9
1752#ifndef	__thumb2__
1753	ldr	r9,[sp,#408+4]
1754#else
1755	ldrd	r8,r9,[sp,#408]		@ A[4][1]
1756#endif
1757	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
1758	mov	r6,r11,ror#32-21
1759
1760	eor	r8,r8,r12
1761	eor	r9,r9,r14
1762	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
1763	@ mov	r9,r3,ror#32-1
1764
1765	bic	r10,r4,r2
1766	bic	r11,r5,r3
1767	eor	r10,r10,r0,ror#32-31
1768#ifndef	__thumb2__
1769	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1770#endif
1771	eor	r11,r11,r1,ror#32-31
1772#ifndef	__thumb2__
1773	str	r11,[sp,#160+4]
1774#else
1775	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1776#endif
1777	bic	r12,r6,r4
1778	bic	r14,r7,r5
1779	eor	r12,r12,r2
1780	eor	r14,r14,r3
1781#ifndef	__thumb2__
1782	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1783#endif
1784	bic	r10,r8,r6,ror#1
1785#ifndef	__thumb2__
1786	str	r14,[sp,#168+4]
1787#else
1788	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1789#endif
1790	bic	r11,r9,r7,ror#1
1791	bic	r12,r0,r8,ror#31-1
1792	bic	r14,r1,r9,ror#31-1
1793	eor	r4,r4,r10,ror#32-1
1794#ifndef	__thumb2__
1795	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1796#endif
1797	eor	r5,r5,r11,ror#32-1
1798#ifndef	__thumb2__
1799	str	r5,[sp,#176+4]
1800#else
1801	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1802#endif
1803	eor	r6,r6,r12,ror#32-31
1804	eor	r7,r7,r14,ror#32-31
1805#ifndef	__thumb2__
1806	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1807#endif
1808	bic	r10,r2,r0,ror#32-31
1809#ifndef	__thumb2__
1810	str	r7,[sp,#184+4]
1811#else
1812	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1813#endif
1814	bic	r11,r3,r1,ror#32-31
1815	add	r12,sp,#0
1816	eor	r8,r10,r8,ror#32-1
1817	add	r10,sp,#40
1818	eor	r9,r11,r9,ror#32-1
1819#ifndef	__thumb2__
1820	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1821#endif
1822#ifndef	__thumb2__
1823	str	r9,[sp,#192+4]
1824#else
1825	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1826#endif
1827	blo	.Lround2x
1828
1829	ldr	pc,[sp,#440]
1830.size	KeccakF1600_int,.-KeccakF1600_int
1831
.type	KeccakF1600, %function
.align	5
@ KeccakF1600: permute the 25-lane (200-byte) state addressed by r0.
@ The state is copied into a stack frame, KeccakF1600_enter runs on that
@ copy, and the result is written back through the r0 value saved on entry.
@ r4-r11 are saved and restored; r0-r3 and r12 are overwritten.
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	r11,sp,#40			@ r11 = write cursor, second row of the stack copy
	add	r10,r0,#40			@ r10 = read cursor, second row of the input state
	ldmia	r0,  {r0-r9}			@ first 40 bytes: input state -> stack
	stmia	sp,  {r0-r9}
	ldmia	r10!,{r0-r9}			@ remaining four 40-byte rows
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10, {r0-r9}
	stmia	r11, {r0-r9}
	add	r12,sp,#0			@ same r12/r10 set-up that KeccakF1600_int
	add	r10,sp,#40			@ performs before reaching KeccakF1600_enter

	bl	KeccakF1600_enter

	ldr	r11, [sp,#440+16]		@ r0 pushed on entry = pointer to A
	ldmia	sp,  {r0-r9}			@ first row comes straight from sp
	stmia	r11!,{r0-r9}			@ write A[5][5] back
	ldmia	r10!,{r0-r9}			@ r10 is sp+40 again here: the round code
	stmia	r11!,{r0-r9}			@ finishes with add r10,sp,#40
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10, {r0-r9}
	stmia	r11, {r0-r9}

	add	sp,sp,#440+20			@ drop the frame and the saved r0
	ldmia	sp!,{r4-r11,pc}
.size	KeccakF1600,.-KeccakF1600
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
@ SHA3_absorb: absorb whole blocks of input into the state.
@ In:  r0 = A (200-byte state), r1 = inp, r2 = len, r3 = bsz
@ Out: r0 = number of trailing input bytes that were not absorbed (< bsz)
@ For every full block the input is read 8 bytes at a time, bit-interleaved,
@ XORed into a stack copy of the state, and KeccakF1600_int is run on it.
@ If len < bsz on entry nothing is touched and len is returned.
@ NOTE(review): the block loop steps by 8, so bsz is presumably a non-zero
@ multiple of 8 - confirm against callers.
@ Frame (after the sub below):
@   sp+0   .. sp+199  stack copy of A
@   sp+456 .. sp+468  masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@   sp+472           saved r0 (A)
@   sp+476           saved r1 (inp, rewritten after every block)
@   sp+480           saved r2 slot, reused for len - bsz
@   sp+484           saved r3 (bsz)
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	r10,r0,#40
	@ mov	r11,r1
	mov	r12,r2			@ r12 = len
	mov	r14,r3			@ r14 = bsz
	cmp	r2,r3
	blo	.Labsorb_abort		@ less than one block: return len untouched

	add	r11,sp,#0		@ r11 is a scratch cursor during the copy
	ldmia	r0,  {r0-r9}		@ A[5][5] -> stack, 40 bytes per step
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11!,{r0-r9}
	ldmia	r10!,{r0-r9}
	stmia	r11, {r0-r9}

	ldr	r11,[sp,#476]		@ r11 = inp again (saved r1)
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ build the four masks from byte seeds
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ r6 = 0x11111111
	orr	r9,r9,r9,lsl#16		@ r9 = 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ r8 = 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ r7 = 0x33333333
	orr	r6,r6,r6,lsl#2		@ r6 = 0x55555555
#endif
	str	r6,[sp,#456]		@ park the masks where the reload after
	str	r7,[sp,#460]		@ KeccakF1600_int expects them
	str	r8,[sp,#464]
	str	r9,[sp,#468]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,r12,r14		@ r0 = len - bsz
	blo	.Labsorbed		@ no full block left
	add	r10,sp,#0		@ r10 walks the stack copy of A
	str	r0,[sp,#480]		@ becomes len after this block

.align	4
.Loop_block:
	ldrb	r0,[r11],#1		@ gather 8 input bytes, little-endian
	ldrb	r1,[r11],#1
	ldrb	r2,[r11],#1
	ldrb	r3,[r11],#1
	ldrb	r4,[r11],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[r11],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[r11],#1
	orr	r0,r0,r3,lsl#24		@ r0 = low word
	ldrb	r3,[r11],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ r1 = high word

	and	r2,r0,r6		@ mask 0x55555555
	and	r0,r0,r6,lsl#1		@ mask 0xaaaaaaaa
	and	r3,r1,r6		@ mask 0x55555555
	and	r1,r1,r6,lsl#1		@ mask 0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ mask 0x33333333
	and	r0,r0,r7,lsl#2		@ mask 0xcccccccc
	and	r3,r3,r7		@ mask 0x33333333
	and	r1,r1,r7,lsl#2		@ mask 0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ mask 0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ mask 0xf0f0f0f0
	and	r3,r3,r8		@ mask 0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ mask 0xf0f0f0f0
	ldmia	r10,{r4-r5}		@ current lane of the stack state
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ mask 0x00ff00ff
	and	r0,r0,r9,lsl#8		@ mask 0xff00ff00
	and	r3,r3,r9		@ mask 0x00ff00ff
	and	r1,r1,r9,lsl#8		@ mask 0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	mov	r2,r2,lsl#16
	mov	r1,r1,lsr#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	r10!,{r4-r5}		@ lane ^= BitInterleave(inp[0..7]); next lane

	subs	r14,r14,#8		@ 8 bytes of the block consumed
	bhi	.Loop_block

	str	r11,[sp,#476]		@ remember how far inp has advanced

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ masks, r10=A, r11=inp, r12=len, r14=bsz
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	r11,sp,#40		@ inp is no longer needed; reuse r11 as cursor
	ldmia	sp,  {r0-r9}
	stmia	r10!,{r0-r9}		@ stack copy -> A[5][5]
	ldmia	r11!,{r0-r9}
	stmia	r10!,{r0-r9}
	ldmia	r11!,{r0-r9}
	stmia	r10!,{r0-r9}
	ldmia	r11!,{r0-r9}
	stmia	r10!,{r0-r9}
	ldmia	r11, {r0-r9}
	stmia	r10, {r0-r9}

.Labsorb_abort:
	add	sp,sp,#456+32		@ drop the frame and the saved r0-r3
	mov	r0,r12			@ return the unabsorbed length
	ldmia	sp!,{r4-r12,pc}
.size	SHA3_absorb,.-SHA3_absorb
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
@ SHA3_squeeze: write len output bytes from the bit-interleaved state.
@ In:  r0 = A (200-byte state), r1 = out, r2 = len, r3 = bsz
@ Each 64-bit lane is de-interleaved and stored byte by byte, low byte
@ first. After bsz bytes have been produced and more output is still
@ wanted, KeccakF1600 is run on A and reading restarts at the first lane.
@ A request for len == 0 returns without writing to out.
@ Frame once the masks are pushed:
@   sp+0 .. sp+12  masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@   sp+16          saved r0 (A)
@   sp+20          saved r3 (bsz)
@   sp+24 ..       saved r4-r10, lr
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	r10,r0			@ r10 = lane cursor
	mov	r4,r1			@ r4  = out
	mov	r5,r2			@ r5  = bytes still to produce
	mov	r12,r3			@ r12 = bytes left in the current block

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6,r7,r8,r9}

	@ Without this guard a zero length falls into .Lsqueeze_tail, where the
	@ first decrement wraps to 0xffffffff and seven bytes get stored to out.
	@ The frame is complete at this point, so the shared epilogue is valid.
	cmp	r5,#0
	beq	.Lsqueeze_done

	mov	r14,r10			@ r14 = A, kept for the KeccakF1600 call
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	r10!,{r0,r1}	@ A_flat[i++]

	mov	r2,r0,lsl#16
	mov	r3,r1,lsl#16		@ r3 = r1 << 16
	mov	r2,r2,lsr#16		@ r2 = r0 & 0x0000ffff
	mov	r1,r1,lsr#16
	mov	r0,r0,lsr#16		@ r0 = r0 >> 16
	mov	r1,r1,lsl#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3		@ r2 = first four output bytes
	orr	r0,r0,r1		@ r0 = last four output bytes

	cmp	r5,#8
	blo	.Lsqueeze_tail		@ fewer than 8 bytes wanted: partial store
	mov	r1,r2,lsr#8
	strb	r2,[r4],#1
	mov	r3,r2,lsr#16
	strb	r1,[r4],#1
	mov	r2,r2,lsr#24
	strb	r3,[r4],#1
	strb	r2,[r4],#1

	mov	r1,r0,lsr#8
	strb	r0,[r4],#1
	mov	r3,r0,lsr#16
	strb	r1,[r4],#1
	mov	r0,r0,lsr#24
	strb	r3,[r4],#1
	strb	r0,[r4],#1
	subs	r5,r5,#8
	beq	.Lsqueeze_done

	subs	r12,r12,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original r10

	bl	KeccakF1600

	ldmia	sp,{r6,r7,r8,r9,r10,r12}		@ restore constants and variables
	mov	r14,r10
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	@ 1..7 bytes remain: store one byte at a time and stop when r5 hits zero.
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	subs	r5,r5,#1
	beq	.Lsqueeze_done

	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24		@ drop the four masks plus saved r0 and r3
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
2161#if __ARM_MAX_ARCH__>=7
2162.fpu	neon
2163
@ iotas64: the 24 Iota round constants as plain 64-bit values, one per round.
@ KeccakF1600_neon takes the table address with adr and reads one entry per
@ round with a post-incremented 64-bit load.
.type	iotas64, %object
.align	5
iotas64:
.quad	0x0000000000000001
.quad	0x0000000000008082
.quad	0x800000000000808a
.quad	0x8000000080008000
.quad	0x000000000000808b
.quad	0x0000000080000001
.quad	0x8000000080008081
.quad	0x8000000000008009
.quad	0x000000000000008a
.quad	0x0000000000000088
.quad	0x0000000080008009
.quad	0x000000008000000a
.quad	0x000000008000808b
.quad	0x800000000000008b
.quad	0x8000000000008089
.quad	0x8000000000008003
.quad	0x8000000000008002
.quad	0x8000000000000080
.quad	0x000000000000800a
.quad	0x800000008000000a
.quad	0x8000000080008081
.quad	0x8000000000008080
.quad	0x0000000080000001
.quad	0x8000000080008008
.size	iotas64,.-iotas64
2192
.type	KeccakF1600_neon, %function
.align	5
@ KeccakF1600_neon: 24 rounds of the permutation on a state held in NEON
@ registers.
@ In:  r0 = 64-bit aligned memory, at least 24 bytes, used as spill space
@      (SHA3_absorb_neon passes the state array itself)
@      d0,d2,d4,d6,d8      = A[0][0..4]    d1,d3,d5,d7,d9      = A[1][0..4]
@      d10,d12,d14,d16,d18 = A[2][0..4]    d11,d13,d15,d17,d19 = A[3][0..4]
@      d20-d24             = A[4][0..4]
@ Out: the same registers hold the permuted state.
@ Overwrites r1-r3, d25, q13-q15, the flags and the 24 bytes at r0.
KeccakF1600_neon:
	add	r1, r0, #16		@ second spill slot
	adr	r2, iotas64		@ round-constant cursor
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= D[0]
	veor	d1,  d1,  d25		@ A[1][0] ^= D[0]
	veor	d10, d10, d25		@ A[2][0] ^= D[0]
	veor	d11, d11, d25		@ A[3][0] ^= D[0]
	veor	d20, d20, d25		@ A[4][0] ^= D[0]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27		@ q13 = D[2] in both halves

	veor	d6,  d6,  d28		@ A[0][3] ^= D[3]
	veor	d7,  d7,  d28		@ A[1][3] ^= D[3]
	veor	d16, d16, d28		@ A[2][3] ^= D[3]
	veor	d17, d17, d28		@ A[3][3] ^= D[3]
	veor	d23, d23, d28		@ A[4][3] ^= D[3]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29		@ q14 = D[4] in both halves

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2]    ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= D[4]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= D[4]
	veor	d24, d24, d29		@ A[4][4]    ^= D[4]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	@ Return. 0xe12fff1e is the A32 encoding of bx lr, spelled as data so the
	@ file still assembles for targets whose assembler rejects the mnemonic.
	@ That word is not a valid return inside a Thumb instruction stream, so a
	@ Thumb-2 build gets the real instruction instead.
#if defined(__thumb2__)
	bx	lr
#else
.word	0xe12fff1e
#endif
.size	KeccakF1600_neon,.-KeccakF1600_neon
2367
2368.globl	SHA3_absorb_neon
2369.type	SHA3_absorb_neon, %function
2370.align	5
2371SHA3_absorb_neon:
2372	stmdb	sp!, {r4,r5,r6,lr}
2373	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2374
2375	mov	r4, r1			@ inp
2376	mov	r5, r2			@ len
2377	mov	r6, r3			@ bsz
2378
2379	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
2380	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
2381	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
2382	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
2383	vld1.32	{d8}, [r0,:64]!		@ A[0][4]
2384
2385	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
2386	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
2387	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
2388	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
2389	vld1.32	{d9}, [r0,:64]!		@ A[1][4]
2390
2391	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
2392	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
2393	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
2394	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
2395	vld1.32	{d18}, [r0,:64]!		@ A[2][4]
2396
2397	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
2398	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
2399	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
2400	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
2401	vld1.32	{d19}, [r0,:64]!		@ A[3][4]
2402
2403	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..3]
2404	vld1.32	{d24}, [r0,:64]		@ A[4][4]
2405	sub	r0, r0, #24*8		@ rewind
2406	b	.Loop_absorb_neon
2407
2408.align	4
2409.Loop_absorb_neon:
2410	subs	r12, r5, r6		@ len - bsz
2411	blo	.Labsorbed_neon
2412	mov	r5, r12
2413
2414	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
2415	cmp	r6, #8*2
2416	veor	d0, d0, d31		@ A[0][0] ^= *inp++
2417	blo	.Lprocess_neon
2418	vld1.8	{d31}, [r4]!
2419	veor	d2, d2, d31		@ A[0][1] ^= *inp++
2420	beq	.Lprocess_neon
2421	vld1.8	{d31}, [r4]!
2422	cmp	r6, #8*4
2423	veor	d4, d4, d31		@ A[0][2] ^= *inp++
2424	blo	.Lprocess_neon
2425	vld1.8	{d31}, [r4]!
2426	veor	d6, d6, d31		@ A[0][3] ^= *inp++
2427	beq	.Lprocess_neon
2428	vld1.8	{d31},[r4]!
2429	cmp	r6, #8*6
2430	veor	d8, d8, d31		@ A[0][4] ^= *inp++
2431	blo	.Lprocess_neon
2432
2433	vld1.8	{d31}, [r4]!
2434	veor	d1, d1, d31		@ A[1][0] ^= *inp++
2435	beq	.Lprocess_neon
2436	vld1.8	{d31}, [r4]!
2437	cmp	r6, #8*8
2438	veor	d3, d3, d31		@ A[1][1] ^= *inp++
2439	blo	.Lprocess_neon
2440	vld1.8	{d31}, [r4]!
2441	veor	d5, d5, d31		@ A[1][2] ^= *inp++
2442	beq	.Lprocess_neon
2443	vld1.8	{d31}, [r4]!
2444	cmp	r6, #8*10
2445	veor	d7, d7, d31		@ A[1][3] ^= *inp++
2446	blo	.Lprocess_neon
2447	vld1.8	{d31}, [r4]!
2448	veor	d9, d9, d31		@ A[1][4] ^= *inp++
2449	beq	.Lprocess_neon
2450
2451	vld1.8	{d31}, [r4]!
2452	cmp	r6, #8*12
2453	veor	d10, d10, d31		@ A[2][0] ^= *inp++
2454	blo	.Lprocess_neon
2455	vld1.8	{d31}, [r4]!
2456	veor	d12, d12, d31		@ A[2][1] ^= *inp++
2457	beq	.Lprocess_neon
2458	vld1.8	{d31}, [r4]!
2459	cmp	r6, #8*14
2460	veor	d14, d14, d31		@ A[2][2] ^= *inp++
2461	blo	.Lprocess_neon
2462	vld1.8	{d31}, [r4]!
2463	veor	d16, d16, d31		@ A[2][3] ^= *inp++
2464	beq	.Lprocess_neon
2465	vld1.8	{d31}, [r4]!
2466	cmp	r6, #8*16
2467	veor	d18, d18, d31		@ A[2][4] ^= *inp++
2468	blo	.Lprocess_neon
2469
2470	vld1.8	{d31}, [r4]!
2471	veor	d11, d11, d31		@ A[3][0] ^= *inp++
2472	beq	.Lprocess_neon
2473	vld1.8	{d31}, [r4]!
2474	cmp	r6, #8*18
2475	veor	d13, d13, d31		@ A[3][1] ^= *inp++
2476	blo	.Lprocess_neon
2477	vld1.8	{d31}, [r4]!
2478	veor	d15, d15, d31		@ A[3][2] ^= *inp++
2479	beq	.Lprocess_neon
2480	vld1.8	{d31}, [r4]!
2481	cmp	r6, #8*20
2482	veor	d17, d17, d31		@ A[3][3] ^= *inp++
2483	blo	.Lprocess_neon
2484	vld1.8	{d31}, [r4]!
2485	veor	d19, d19, d31		@ A[3][4] ^= *inp++
2486	beq	.Lprocess_neon
2487
2488	vld1.8	{d31}, [r4]!
2489	cmp	r6, #8*22
2490	veor	d20, d20, d31		@ A[4][0] ^= *inp++
2491	blo	.Lprocess_neon
2492	vld1.8	{d31}, [r4]!
2493	veor	d21, d21, d31		@ A[4][1] ^= *inp++
2494	beq	.Lprocess_neon
2495	vld1.8	{d31}, [r4]!
2496	cmp	r6, #8*24
2497	veor	d22, d22, d31		@ A[4][2] ^= *inp++
2498	blo	.Lprocess_neon
2499	vld1.8	{d31}, [r4]!
2500	veor	d23, d23, d31		@ A[4][3] ^= *inp++
2501	beq	.Lprocess_neon
2502	vld1.8	{d31}, [r4]!
2503	veor	d24, d24, d31		@ A[4][4] ^= *inp++
2504
2505.Lprocess_neon:
2506	bl	KeccakF1600_neon
2507	b	.Loop_absorb_neon
2508
2509.align	4
2510.Labsorbed_neon:
2511	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2512	vst1.32	{d2}, [r0,:64]!
2513	vst1.32	{d4}, [r0,:64]!
2514	vst1.32	{d6}, [r0,:64]!
2515	vst1.32	{d8}, [r0,:64]!
2516
2517	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2518	vst1.32	{d3}, [r0,:64]!
2519	vst1.32	{d5}, [r0,:64]!
2520	vst1.32	{d7}, [r0,:64]!
2521	vst1.32	{d9}, [r0,:64]!
2522
2523	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2524	vst1.32	{d12}, [r0,:64]!
2525	vst1.32	{d14}, [r0,:64]!
2526	vst1.32	{d16}, [r0,:64]!
2527	vst1.32	{d18}, [r0,:64]!
2528
2529	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2530	vst1.32	{d13}, [r0,:64]!
2531	vst1.32	{d15}, [r0,:64]!
2532	vst1.32	{d17}, [r0,:64]!
2533	vst1.32	{d19}, [r0,:64]!
2534
2535	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2536	vst1.32	{d24}, [r0,:64]
2537
2538	mov	r0, r5			@ return value
2539	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2540	ldmia	sp!, {r4,r5,r6,pc}
2541.size	SHA3_absorb_neon,.-SHA3_absorb_neon
2542
@ SHA3_squeeze_neon -- copy len bytes of sponge output to out, running
@ KeccakF1600_neon on the state each time bsz bytes have been emitted
@ and more output is still wanted.
@
@ In:	r0 = A_flat	state, 25 64-bit lanes, accessed with :64 alignment
@	r1 = out	destination buffer (byte stores, any alignment)
@	r2 = len	number of bytes to produce
@	r3 = bsz	block size (rate) in bytes
@
@ Register roles inside the function:
@	r4  = out cursor		r5  = bytes still to produce
@	r6  = bsz (kept for reload)	r12 = read cursor into the state
@	r14 = bytes left in the current block
@
@ r4-r6 and lr are saved on entry and restored on exit; d8-d15 are
@ saved and restored around each state reload / permutation.
@
@ NOTE(review): HEAD marks this file as generated from
@ keccak1600-armv4.pl, so the len == 0 guard below should be mirrored
@ in the generator, otherwise it is lost on regeneration.
.globl	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4,r5,r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz

	@ Nothing requested: leave without touching out. Without this
	@ check a zero len falls into the tail path, whose first strb is
	@ unconditional and would write one byte past the request.
	cmp	r5, #0
	beq	.Lsqueeze_neon_done
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8			@ fewer than 8 bytes wanted?
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!		@ next lane of the state
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon	@ block not exhausted yet

	@ Block exhausted and more output wanted: load the whole state
	@ into NEON registers, permute it, and write it back.
	vstmdb	sp!,  {d8,d9,d10,d11,d12,d13,d14,d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
	b	.Loop_squeeze_neon

	@ Tail: 1..7 bytes remain (r5). The next lane is fetched as two
	@ 32-bit words and written one byte at a time, lowest byte first.
	@ Each cmp feeds two exits: blo leaves on the odd count below the
	@ compared value, beq leaves on the compared value itself. The
	@ strb and plain mov in between do not alter the flags.
.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	mov	r2, r2, lsr#8
	blo	.Lsqueeze_neon_done	@ len was 1
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	beq	.Lsqueeze_neon_done	@ len was 2
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done	@ len was 3
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done	@ len was 4

	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done	@ len was 5
	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	beq	.Lsqueeze_neon_done	@ len was 6
	strb	r3, [r4], #1		@ len was 7

.Lsqueeze_neon_done:
	ldmia	sp!, {r4,r5,r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
2663#endif
@ Identification string embedded in the object (NUL-terminated ASCII).
.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2
2667