1#include "arm_asm.h"
2#include "arm_arch.h"
3
// Constant pool shared by the ChaCha20 routines in this file. It is placed in
// .text so the code can reach it with pc-relative adr instructions.
4.text
5
6
7.hidden	OPENSSL_armcap_P
8
9.align	5
// First row of the cipher state: the 16 ASCII bytes "expand 32-byte k",
// expressed as two 64-bit words.
10.Lsigma:
11.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// The vector {1,0,0,0}. It sits directly after the 16 bytes of .Lsigma, and the
// NEON code loads it into v31 to step lane 0 of the counter row.
12.Lone:
13.long	1,0,0,0
// Offset from this slot to OPENSSL_armcap_P. ChaCha20_ctr32 adds the address of
// this slot to the stored offset to locate the capability word without an
// absolute relocation. The slot is 32 bits wide under __ILP32__, else 64 bits.
14.LOPENSSL_armcap_P:
15#ifdef	__ILP32__
16.long	OPENSSL_armcap_P-.
17#else
18.quad	OPENSSL_armcap_P-.
19#endif
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
20.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
21.align	2
22
// ChaCha20_ctr32: exported entry point; dispatcher plus the scalar code path.
//
// Register roles, as used by the code below:
//   x0 = output pointer          (written with stp/strb, advanced by 64 per block)
//   x1 = input pointer           (read with ldp/ldrb, advanced by 64 per block)
//   x2 = length in bytes
//   x3 = key pointer             (32 bytes are read)
//   x4 = counter block pointer   (16 bytes are read; x4 is then reused as scratch)
//
// Dispatch: a zero length returns at once. Lengths below 192 always use the
// scalar path at .Lshort. For longer inputs the ARMV7_NEON bit of
// OPENSSL_armcap_P is tested; if it is set control moves to ChaCha20_neon,
// otherwise execution falls through to the scalar path.
//
// Scalar path state layout while a block is being computed:
//   x22,x23         = sigma row, two 32-bit words per register
//   x24..x27        = key rows, two 32-bit words per register
//   x28,x30         = counter row, two 32-bit words per register
//   w5..w17,w19..w21 = the sixteen working words of the current block
// x30 is used as a data register here; the return address is kept in the frame
// and reloaded by the epilogue.
23.globl	ChaCha20_ctr32
24.type	ChaCha20_ctr32,%function
25.align	5
26ChaCha20_ctr32:
27	cbz	x2,.Labort
28	adr	x5,.LOPENSSL_armcap_P
29	cmp	x2,#192
30	b.lo	.Lshort
// x6 = stored offset, x5 = address of the slot; their sum is the address of
// OPENSSL_armcap_P, whose 32-bit value is loaded into w17.
31#ifdef	__ILP32__
32	ldrsw	x6,[x5]
33#else
34	ldr	x6,[x5]
35#endif
36	ldr	w17,[x6,x5]
37	tst	w17,#ARMV7_NEON
38	b.ne	ChaCha20_neon
39
40.Lshort:
41.inst	0xd503233f			// paciasp
// 96-byte frame: x29/x30 at offset 0, then x19..x28 in pairs at offsets 16..80.
// x29 keeps pointing at this frame so the epilogue can restore from it after sp
// has been lowered further.
42	stp	x29,x30,[sp,#-96]!
43	add	x29,sp,#0
44
45	adr	x5,.Lsigma
46	stp	x19,x20,[sp,#16]
47	stp	x21,x22,[sp,#32]
48	stp	x23,x24,[sp,#48]
49	stp	x25,x26,[sp,#64]
50	stp	x27,x28,[sp,#80]
// 64 bytes of scratch below the frame; only the partial-block tail uses them.
51	sub	sp,sp,#64
52
53	ldp	x22,x23,[x5]		// load sigma
54	ldp	x24,x25,[x3]		// load key
55	ldp	x26,x27,[x3,#16]
56	ldp	x28,x30,[x4]		// load counter
// On big-endian builds the two 32-bit halves of each loaded register are
// swapped so that the low half is the first word in memory.
57#ifdef	__ARMEB__
58	ror	x24,x24,#32
59	ror	x25,x25,#32
60	ror	x26,x26,#32
61	ror	x27,x27,#32
62	ror	x28,x28,#32
63	ror	x30,x30,#32
64#endif
65
// One pass of .Loop_outer produces one 64-byte block.
66.Loop_outer:
67	mov	w5,w22			// unpack key block
68	lsr	x6,x22,#32
69	mov	w7,w23
70	lsr	x8,x23,#32
71	mov	w9,w24
72	lsr	x10,x24,#32
73	mov	w11,w25
74	lsr	x12,x25,#32
75	mov	w13,w26
76	lsr	x14,x26,#32
77	mov	w15,w27
78	lsr	x16,x27,#32
79	mov	w17,w28
80	lsr	x19,x28,#32
81	mov	w20,w30
82	lsr	x21,x30,#32
83
// x4 = iteration count: 10 passes of the loop body, each pass being a column
// round followed by a diagonal round.
// The flags set by this subs are consumed much later by b.lo .Ltail and
// b.hi .Loop_outer; no instruction in between writes the flags.
84	mov	x4,#10
85	subs	x2,x2,#64
86.Loop:
87	sub	x4,x4,#1
// Column round: four quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21), executed step by step in parallel.
// ror by 16/20/24/25 is a left rotate by 16/12/8/7.
88	add	w5,w5,w9
89	add	w6,w6,w10
90	add	w7,w7,w11
91	add	w8,w8,w12
92	eor	w17,w17,w5
93	eor	w19,w19,w6
94	eor	w20,w20,w7
95	eor	w21,w21,w8
96	ror	w17,w17,#16
97	ror	w19,w19,#16
98	ror	w20,w20,#16
99	ror	w21,w21,#16
100	add	w13,w13,w17
101	add	w14,w14,w19
102	add	w15,w15,w20
103	add	w16,w16,w21
104	eor	w9,w9,w13
105	eor	w10,w10,w14
106	eor	w11,w11,w15
107	eor	w12,w12,w16
108	ror	w9,w9,#20
109	ror	w10,w10,#20
110	ror	w11,w11,#20
111	ror	w12,w12,#20
112	add	w5,w5,w9
113	add	w6,w6,w10
114	add	w7,w7,w11
115	add	w8,w8,w12
116	eor	w17,w17,w5
117	eor	w19,w19,w6
118	eor	w20,w20,w7
119	eor	w21,w21,w8
120	ror	w17,w17,#24
121	ror	w19,w19,#24
122	ror	w20,w20,#24
123	ror	w21,w21,#24
124	add	w13,w13,w17
125	add	w14,w14,w19
126	add	w15,w15,w20
127	add	w16,w16,w21
128	eor	w9,w9,w13
129	eor	w10,w10,w14
130	eor	w11,w11,w15
131	eor	w12,w12,w16
132	ror	w9,w9,#25
133	ror	w10,w10,#25
134	ror	w11,w11,#25
135	ror	w12,w12,#25
// Diagonal round: the same steps on (w5,w10,w15,w21), (w6,w11,w16,w17),
// (w7,w12,w13,w19), (w8,w9,w14,w20).
136	add	w5,w5,w10
137	add	w6,w6,w11
138	add	w7,w7,w12
139	add	w8,w8,w9
140	eor	w21,w21,w5
141	eor	w17,w17,w6
142	eor	w19,w19,w7
143	eor	w20,w20,w8
144	ror	w21,w21,#16
145	ror	w17,w17,#16
146	ror	w19,w19,#16
147	ror	w20,w20,#16
148	add	w15,w15,w21
149	add	w16,w16,w17
150	add	w13,w13,w19
151	add	w14,w14,w20
152	eor	w10,w10,w15
153	eor	w11,w11,w16
154	eor	w12,w12,w13
155	eor	w9,w9,w14
156	ror	w10,w10,#20
157	ror	w11,w11,#20
158	ror	w12,w12,#20
159	ror	w9,w9,#20
160	add	w5,w5,w10
161	add	w6,w6,w11
162	add	w7,w7,w12
163	add	w8,w8,w9
164	eor	w21,w21,w5
165	eor	w17,w17,w6
166	eor	w19,w19,w7
167	eor	w20,w20,w8
168	ror	w21,w21,#24
169	ror	w17,w17,#24
170	ror	w19,w19,#24
171	ror	w20,w20,#24
172	add	w15,w15,w21
173	add	w16,w16,w17
174	add	w13,w13,w19
175	add	w14,w14,w20
176	eor	w10,w10,w15
177	eor	w11,w11,w16
178	eor	w12,w12,w13
179	eor	w9,w9,w14
180	ror	w10,w10,#25
181	ror	w11,w11,#25
182	ror	w12,w12,#25
183	ror	w9,w9,#25
184	cbnz	x4,.Loop
185
// Add the input state back in. The w-form adds truncate to 32 bits. The x-form
// adds may leave a carry above bit 31; the later lsl#32 in the pack step shifts
// any such carry out of the register.
186	add	w5,w5,w22		// accumulate key block
187	add	x6,x6,x22,lsr#32
188	add	w7,w7,w23
189	add	x8,x8,x23,lsr#32
190	add	w9,w9,w24
191	add	x10,x10,x24,lsr#32
192	add	w11,w11,w25
193	add	x12,x12,x25,lsr#32
194	add	w13,w13,w26
195	add	x14,x14,x26,lsr#32
196	add	w15,w15,w27
197	add	x16,x16,x27,lsr#32
198	add	w17,w17,w28
199	add	x19,x19,x28,lsr#32
200	add	w20,w20,w30
201	add	x21,x21,x30,lsr#32
202
// Flags still come from subs x2,x2,#64: lower means fewer than 64 bytes were
// left, so the block is only partially consumed.
203	b.lo	.Ltail
204
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been freed.
205	add	x5,x5,x6,lsl#32	// pack
206	add	x7,x7,x8,lsl#32
207	ldp	x6,x8,[x1,#0]		// load input
208	add	x9,x9,x10,lsl#32
209	add	x11,x11,x12,lsl#32
210	ldp	x10,x12,[x1,#16]
211	add	x13,x13,x14,lsl#32
212	add	x15,x15,x16,lsl#32
213	ldp	x14,x16,[x1,#32]
214	add	x17,x17,x19,lsl#32
215	add	x20,x20,x21,lsl#32
216	ldp	x19,x21,[x1,#48]
217	add	x1,x1,#64
218#ifdef	__ARMEB__
219	rev	x5,x5
220	rev	x7,x7
221	rev	x9,x9
222	rev	x11,x11
223	rev	x13,x13
224	rev	x15,x15
225	rev	x17,x17
226	rev	x20,x20
227#endif
228	eor	x5,x5,x6
229	eor	x7,x7,x8
230	eor	x9,x9,x10
231	eor	x11,x11,x12
232	eor	x13,x13,x14
233	eor	x15,x15,x16
234	eor	x17,x17,x19
235	eor	x20,x20,x21
236
// NOTE(review): the counter increment below is a 64-bit add on x28, which holds
// two 32-bit state words. A wrap of the low word would therefore carry into the
// upper word; presumably callers never let the 32-bit counter wrap - confirm.
237	stp	x5,x7,[x0,#0]		// store output
238	add	x28,x28,#1			// increment counter
239	stp	x9,x11,[x0,#16]
240	stp	x13,x15,[x0,#32]
241	stp	x17,x20,[x0,#48]
242	add	x0,x0,#64
243
// Flags still come from subs x2,x2,#64: higher means bytes remain.
244	b.hi	.Loop_outer
245
// Epilogue for an input that ended exactly on a block boundary. Registers are
// restored through x29 because sp still includes the 64-byte scratch area.
246	ldp	x19,x20,[x29,#16]
247	add	sp,sp,#64
248	ldp	x21,x22,[x29,#32]
249	ldp	x23,x24,[x29,#48]
250	ldp	x25,x26,[x29,#64]
251	ldp	x27,x28,[x29,#80]
252	ldp	x29,x30,[sp],#96
253.inst	0xd50323bf			// autiasp
// .Labort is also the target of the zero-length check at entry, which is taken
// before any frame has been created.
254.Labort:
255	ret
256
257.align	4
// Partial final block. .Ltail undoes the subs so x2 is again the number of
// remaining bytes (1..63).
258.Ltail:
259	add	x2,x2,#64
// .Less_than_64 is also entered from ChaCha20_neon with the scalar block still
// in w5..w21 and x2 already holding the remaining byte count. That function
// builds the same 96-byte frame and 64-byte scratch area, so the epilogue below
// serves both.
260.Less_than_64:
// Point x1, x0 and x4 at the END of input, output and stack keystream, then
// walk a negative index in x2 up to zero. x0 is biased by -1 because the index
// is incremented between the loads and the store.
261	sub	x0,x0,#1
262	add	x1,x1,x2
263	add	x0,x0,x2
264	add	x4,sp,x2
265	neg	x2,x2
266
267	add	x5,x5,x6,lsl#32	// pack
268	add	x7,x7,x8,lsl#32
269	add	x9,x9,x10,lsl#32
270	add	x11,x11,x12,lsl#32
271	add	x13,x13,x14,lsl#32
272	add	x15,x15,x16,lsl#32
273	add	x17,x17,x19,lsl#32
274	add	x20,x20,x21,lsl#32
275#ifdef	__ARMEB__
276	rev	x5,x5
277	rev	x7,x7
278	rev	x9,x9
279	rev	x11,x11
280	rev	x13,x13
281	rev	x15,x15
282	rev	x17,x17
283	rev	x20,x20
284#endif
// Spill the whole keystream block to the stack scratch area for byte access.
285	stp	x5,x7,[sp,#0]
286	stp	x9,x11,[sp,#16]
287	stp	x13,x15,[sp,#32]
288	stp	x17,x20,[sp,#48]
289
// Byte-wise XOR of the remaining input with the keystream.
290.Loop_tail:
291	ldrb	w10,[x1,x2]
292	ldrb	w11,[x4,x2]
293	add	x2,x2,#1
294	eor	w10,w10,w11
295	strb	w10,[x0,x2]
296	cbnz	x2,.Loop_tail
297
// Overwrite the keystream copy on the stack with zeros before releasing it.
298	stp	xzr,xzr,[sp,#0]
299	stp	xzr,xzr,[sp,#16]
300	stp	xzr,xzr,[sp,#32]
301	stp	xzr,xzr,[sp,#48]
302
303	ldp	x19,x20,[x29,#16]
304	add	sp,sp,#64
305	ldp	x21,x22,[x29,#32]
306	ldp	x23,x24,[x29,#48]
307	ldp	x25,x26,[x29,#64]
308	ldp	x27,x28,[x29,#80]
309	ldp	x29,x30,[sp],#96
310.inst	0xd50323bf			// autiasp
311	ret
312.size	ChaCha20_ctr32,.-ChaCha20_ctr32
313
// ChaCha20_neon: file-local NEON path, reached by branch from ChaCha20_ctr32
// with the same register arguments:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = key pointer, x4 = counter block pointer.
//
// Each pass of .Loop_outer_neon produces 256 bytes: one block in the scalar
// registers (same layout as the scalar path: x22..x28,x30 hold the input state,
// w5..w17,w19..w21 the working words) interleaved with three blocks in vector
// registers:
//   v24 = sigma row, v25/v26 = key rows            (loop invariant)
//   v27, v28, v29 = counter rows for vector blocks 1, 2, 3
//   v31 = counter step vector
//   v0-v3, v4-v7, v16-v19 = working rows a,b,c,d of vector blocks 1, 2, 3
//   v20-v23 = temporaries
// Only v0-v7 and v16-v31 are written here, so no vector register is saved.
//
// Inputs of 512 bytes or more leave this function for .L512_or_more_neon, a
// label inside ChaCha20_512_neon that follows that function's identical
// prologue. A final remainder shorter than 64 bytes is handed to .Less_than_64
// inside ChaCha20_ctr32.
314.type	ChaCha20_neon,%function
315.align	5
316ChaCha20_neon:
317.inst	0xd503233f			// paciasp
// Same 96-byte frame as ChaCha20_ctr32: x29/x30, then x19..x28.
318	stp	x29,x30,[sp,#-96]!
319	add	x29,sp,#0
320
321	adr	x5,.Lsigma
322	stp	x19,x20,[sp,#16]
323	stp	x21,x22,[sp,#32]
324	stp	x23,x24,[sp,#48]
325	stp	x25,x26,[sp,#64]
326	stp	x27,x28,[sp,#80]
327	cmp	x2,#512
328	b.hs	.L512_or_more_neon
329
// 64 bytes of scratch for the partial-block tail.
330	sub	sp,sp,#64
331
// Every piece of input state is loaded twice: once into general registers for
// the scalar block and once into vector registers. The post-increment on x5
// leaves it pointing at .Lone, which is then loaded into v31.
332	ldp	x22,x23,[x5]		// load sigma
333	ld1	{v24.4s},[x5],#16
334	ldp	x24,x25,[x3]		// load key
335	ldp	x26,x27,[x3,#16]
336	ld1	{v25.4s,v26.4s},[x3]
337	ldp	x28,x30,[x4]		// load counter
338	ld1	{v27.4s},[x4]
339	ld1	{v31.4s},[x5]
340#ifdef	__ARMEB__
341	rev64	v24.4s,v24.4s
342	ror	x24,x24,#32
343	ror	x25,x25,#32
344	ror	x26,x26,#32
345	ror	x27,x27,#32
346	ror	x28,x28,#32
347	ror	x30,x30,#32
348#endif
// The scalar block keeps the caller-supplied counter in x28; the three vector
// blocks get counter+1, +2 and +3 in lane 0. v31 then becomes {4,0,0,0}, the
// per-pass step.
349	add	v27.4s,v27.4s,v31.4s		// += 1
350	add	v28.4s,v27.4s,v31.4s
351	add	v29.4s,v28.4s,v31.4s
352	shl	v31.4s,v31.4s,#2			// 1 -> 4
353
// Initialise the working state of all four blocks from the saved input state.
354.Loop_outer_neon:
355	mov	w5,w22			// unpack key block
356	lsr	x6,x22,#32
357	mov	v0.16b,v24.16b
358	mov	w7,w23
359	lsr	x8,x23,#32
360	mov	v4.16b,v24.16b
361	mov	w9,w24
362	lsr	x10,x24,#32
363	mov	v16.16b,v24.16b
364	mov	w11,w25
365	mov	v1.16b,v25.16b
366	lsr	x12,x25,#32
367	mov	v5.16b,v25.16b
368	mov	w13,w26
369	mov	v17.16b,v25.16b
370	lsr	x14,x26,#32
371	mov	v3.16b,v27.16b
372	mov	w15,w27
373	mov	v7.16b,v28.16b
374	lsr	x16,x27,#32
375	mov	v19.16b,v29.16b
376	mov	w17,w28
377	mov	v2.16b,v26.16b
378	lsr	x19,x28,#32
379	mov	v6.16b,v26.16b
380	mov	w20,w30
381	mov	v18.16b,v26.16b
382	lsr	x21,x30,#32
383
// x4 = 10 passes, each a column round plus a diagonal round for all four blocks.
// The flags set by this subs are consumed after the round loop by
// b.lo .Ltail_neon and, after the stores, by b.hi .Loop_outer_neon; nothing in
// between writes the flags.
384	mov	x4,#10
385	subs	x2,x2,#256
// Scalar and vector instructions alternate line by line. Vector rotations:
// rev32 on .8h lanes rotates each word by 16; an ushr #20 / sli #12 pair
// rotates left by 12, ushr #24 / sli #8 by 8, ushr #25 / sli #7 by 7.
386.Loop_neon:
387	sub	x4,x4,#1
388	add	v0.4s,v0.4s,v1.4s
389	add	w5,w5,w9
390	add	v4.4s,v4.4s,v5.4s
391	add	w6,w6,w10
392	add	v16.4s,v16.4s,v17.4s
393	add	w7,w7,w11
394	eor	v3.16b,v3.16b,v0.16b
395	add	w8,w8,w12
396	eor	v7.16b,v7.16b,v4.16b
397	eor	w17,w17,w5
398	eor	v19.16b,v19.16b,v16.16b
399	eor	w19,w19,w6
400	rev32	v3.8h,v3.8h
401	eor	w20,w20,w7
402	rev32	v7.8h,v7.8h
403	eor	w21,w21,w8
404	rev32	v19.8h,v19.8h
405	ror	w17,w17,#16
406	add	v2.4s,v2.4s,v3.4s
407	ror	w19,w19,#16
408	add	v6.4s,v6.4s,v7.4s
409	ror	w20,w20,#16
410	add	v18.4s,v18.4s,v19.4s
411	ror	w21,w21,#16
412	eor	v20.16b,v1.16b,v2.16b
413	add	w13,w13,w17
414	eor	v21.16b,v5.16b,v6.16b
415	add	w14,w14,w19
416	eor	v22.16b,v17.16b,v18.16b
417	add	w15,w15,w20
418	ushr	v1.4s,v20.4s,#20
419	add	w16,w16,w21
420	ushr	v5.4s,v21.4s,#20
421	eor	w9,w9,w13
422	ushr	v17.4s,v22.4s,#20
423	eor	w10,w10,w14
424	sli	v1.4s,v20.4s,#12
425	eor	w11,w11,w15
426	sli	v5.4s,v21.4s,#12
427	eor	w12,w12,w16
428	sli	v17.4s,v22.4s,#12
429	ror	w9,w9,#20
430	add	v0.4s,v0.4s,v1.4s
431	ror	w10,w10,#20
432	add	v4.4s,v4.4s,v5.4s
433	ror	w11,w11,#20
434	add	v16.4s,v16.4s,v17.4s
435	ror	w12,w12,#20
436	eor	v20.16b,v3.16b,v0.16b
437	add	w5,w5,w9
438	eor	v21.16b,v7.16b,v4.16b
439	add	w6,w6,w10
440	eor	v22.16b,v19.16b,v16.16b
441	add	w7,w7,w11
442	ushr	v3.4s,v20.4s,#24
443	add	w8,w8,w12
444	ushr	v7.4s,v21.4s,#24
445	eor	w17,w17,w5
446	ushr	v19.4s,v22.4s,#24
447	eor	w19,w19,w6
448	sli	v3.4s,v20.4s,#8
449	eor	w20,w20,w7
450	sli	v7.4s,v21.4s,#8
451	eor	w21,w21,w8
452	sli	v19.4s,v22.4s,#8
453	ror	w17,w17,#24
454	add	v2.4s,v2.4s,v3.4s
455	ror	w19,w19,#24
456	add	v6.4s,v6.4s,v7.4s
457	ror	w20,w20,#24
458	add	v18.4s,v18.4s,v19.4s
459	ror	w21,w21,#24
460	eor	v20.16b,v1.16b,v2.16b
461	add	w13,w13,w17
462	eor	v21.16b,v5.16b,v6.16b
463	add	w14,w14,w19
464	eor	v22.16b,v17.16b,v18.16b
465	add	w15,w15,w20
466	ushr	v1.4s,v20.4s,#25
467	add	w16,w16,w21
468	ushr	v5.4s,v21.4s,#25
469	eor	w9,w9,w13
470	ushr	v17.4s,v22.4s,#25
471	eor	w10,w10,w14
472	sli	v1.4s,v20.4s,#7
473	eor	w11,w11,w15
474	sli	v5.4s,v21.4s,#7
475	eor	w12,w12,w16
476	sli	v17.4s,v22.4s,#7
477	ror	w9,w9,#25
// Rotate the lanes of rows c, d and b by 8, 12 and 4 bytes so that the same
// column-wise vector code processes the diagonals next.
478	ext	v2.16b,v2.16b,v2.16b,#8
479	ror	w10,w10,#25
480	ext	v6.16b,v6.16b,v6.16b,#8
481	ror	w11,w11,#25
482	ext	v18.16b,v18.16b,v18.16b,#8
483	ror	w12,w12,#25
484	ext	v3.16b,v3.16b,v3.16b,#12
485	ext	v7.16b,v7.16b,v7.16b,#12
486	ext	v19.16b,v19.16b,v19.16b,#12
487	ext	v1.16b,v1.16b,v1.16b,#4
488	ext	v5.16b,v5.16b,v5.16b,#4
489	ext	v17.16b,v17.16b,v17.16b,#4
// Diagonal round. The scalar side switches to the diagonal register groupings;
// the vector side repeats the column code on the rotated rows.
490	add	v0.4s,v0.4s,v1.4s
491	add	w5,w5,w10
492	add	v4.4s,v4.4s,v5.4s
493	add	w6,w6,w11
494	add	v16.4s,v16.4s,v17.4s
495	add	w7,w7,w12
496	eor	v3.16b,v3.16b,v0.16b
497	add	w8,w8,w9
498	eor	v7.16b,v7.16b,v4.16b
499	eor	w21,w21,w5
500	eor	v19.16b,v19.16b,v16.16b
501	eor	w17,w17,w6
502	rev32	v3.8h,v3.8h
503	eor	w19,w19,w7
504	rev32	v7.8h,v7.8h
505	eor	w20,w20,w8
506	rev32	v19.8h,v19.8h
507	ror	w21,w21,#16
508	add	v2.4s,v2.4s,v3.4s
509	ror	w17,w17,#16
510	add	v6.4s,v6.4s,v7.4s
511	ror	w19,w19,#16
512	add	v18.4s,v18.4s,v19.4s
513	ror	w20,w20,#16
514	eor	v20.16b,v1.16b,v2.16b
515	add	w15,w15,w21
516	eor	v21.16b,v5.16b,v6.16b
517	add	w16,w16,w17
518	eor	v22.16b,v17.16b,v18.16b
519	add	w13,w13,w19
520	ushr	v1.4s,v20.4s,#20
521	add	w14,w14,w20
522	ushr	v5.4s,v21.4s,#20
523	eor	w10,w10,w15
524	ushr	v17.4s,v22.4s,#20
525	eor	w11,w11,w16
526	sli	v1.4s,v20.4s,#12
527	eor	w12,w12,w13
528	sli	v5.4s,v21.4s,#12
529	eor	w9,w9,w14
530	sli	v17.4s,v22.4s,#12
531	ror	w10,w10,#20
532	add	v0.4s,v0.4s,v1.4s
533	ror	w11,w11,#20
534	add	v4.4s,v4.4s,v5.4s
535	ror	w12,w12,#20
536	add	v16.4s,v16.4s,v17.4s
537	ror	w9,w9,#20
538	eor	v20.16b,v3.16b,v0.16b
539	add	w5,w5,w10
540	eor	v21.16b,v7.16b,v4.16b
541	add	w6,w6,w11
542	eor	v22.16b,v19.16b,v16.16b
543	add	w7,w7,w12
544	ushr	v3.4s,v20.4s,#24
545	add	w8,w8,w9
546	ushr	v7.4s,v21.4s,#24
547	eor	w21,w21,w5
548	ushr	v19.4s,v22.4s,#24
549	eor	w17,w17,w6
550	sli	v3.4s,v20.4s,#8
551	eor	w19,w19,w7
552	sli	v7.4s,v21.4s,#8
553	eor	w20,w20,w8
554	sli	v19.4s,v22.4s,#8
555	ror	w21,w21,#24
556	add	v2.4s,v2.4s,v3.4s
557	ror	w17,w17,#24
558	add	v6.4s,v6.4s,v7.4s
559	ror	w19,w19,#24
560	add	v18.4s,v18.4s,v19.4s
561	ror	w20,w20,#24
562	eor	v20.16b,v1.16b,v2.16b
563	add	w15,w15,w21
564	eor	v21.16b,v5.16b,v6.16b
565	add	w16,w16,w17
566	eor	v22.16b,v17.16b,v18.16b
567	add	w13,w13,w19
568	ushr	v1.4s,v20.4s,#25
569	add	w14,w14,w20
570	ushr	v5.4s,v21.4s,#25
571	eor	w10,w10,w15
572	ushr	v17.4s,v22.4s,#25
573	eor	w11,w11,w16
574	sli	v1.4s,v20.4s,#7
575	eor	w12,w12,w13
576	sli	v5.4s,v21.4s,#7
577	eor	w9,w9,w14
578	sli	v17.4s,v22.4s,#7
579	ror	w10,w10,#25
// Undo the lane rotation (8, 4 and 12 bytes for rows c, d and b) so the rows
// are column-aligned again for the next pass.
580	ext	v2.16b,v2.16b,v2.16b,#8
581	ror	w11,w11,#25
582	ext	v6.16b,v6.16b,v6.16b,#8
583	ror	w12,w12,#25
584	ext	v18.16b,v18.16b,v18.16b,#8
585	ror	w9,w9,#25
586	ext	v3.16b,v3.16b,v3.16b,#4
587	ext	v7.16b,v7.16b,v7.16b,#4
588	ext	v19.16b,v19.16b,v19.16b,#4
589	ext	v1.16b,v1.16b,v1.16b,#12
590	ext	v5.16b,v5.16b,v5.16b,#12
591	ext	v17.16b,v17.16b,v17.16b,#12
592	cbnz	x4,.Loop_neon
593
// Add the input state to all four blocks. Each vector block gets its own
// counter row (v27, v28, v29) added to its row d.
594	add	w5,w5,w22		// accumulate key block
595	add	v0.4s,v0.4s,v24.4s
596	add	x6,x6,x22,lsr#32
597	add	v4.4s,v4.4s,v24.4s
598	add	w7,w7,w23
599	add	v16.4s,v16.4s,v24.4s
600	add	x8,x8,x23,lsr#32
601	add	v2.4s,v2.4s,v26.4s
602	add	w9,w9,w24
603	add	v6.4s,v6.4s,v26.4s
604	add	x10,x10,x24,lsr#32
605	add	v18.4s,v18.4s,v26.4s
606	add	w11,w11,w25
607	add	v3.4s,v3.4s,v27.4s
608	add	x12,x12,x25,lsr#32
609	add	w13,w13,w26
610	add	v7.4s,v7.4s,v28.4s
611	add	x14,x14,x26,lsr#32
612	add	w15,w15,w27
613	add	v19.4s,v19.4s,v29.4s
614	add	x16,x16,x27,lsr#32
615	add	w17,w17,w28
616	add	v1.4s,v1.4s,v25.4s
617	add	x19,x19,x28,lsr#32
618	add	w20,w20,w30
619	add	v5.4s,v5.4s,v25.4s
620	add	x21,x21,x30,lsr#32
621	add	v17.4s,v17.4s,v25.4s
622
// Flags from subs x2,x2,#256: lower means fewer than 256 bytes were left.
623	b.lo	.Ltail_neon
624
// Full 256-byte pass. The scalar block covers input bytes 0..63, the vector
// blocks the following three 64-byte chunks, in the order v0-v3, v4-v7, v16-v19.
625	add	x5,x5,x6,lsl#32	// pack
626	add	x7,x7,x8,lsl#32
627	ldp	x6,x8,[x1,#0]		// load input
628	add	x9,x9,x10,lsl#32
629	add	x11,x11,x12,lsl#32
630	ldp	x10,x12,[x1,#16]
631	add	x13,x13,x14,lsl#32
632	add	x15,x15,x16,lsl#32
633	ldp	x14,x16,[x1,#32]
634	add	x17,x17,x19,lsl#32
635	add	x20,x20,x21,lsl#32
636	ldp	x19,x21,[x1,#48]
637	add	x1,x1,#64
638#ifdef	__ARMEB__
639	rev	x5,x5
640	rev	x7,x7
641	rev	x9,x9
642	rev	x11,x11
643	rev	x13,x13
644	rev	x15,x15
645	rev	x17,x17
646	rev	x20,x20
647#endif
648	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
649	eor	x5,x5,x6
650	eor	x7,x7,x8
651	eor	x9,x9,x10
652	eor	x11,x11,x12
653	eor	x13,x13,x14
654	eor	v0.16b,v0.16b,v20.16b
655	eor	x15,x15,x16
656	eor	v1.16b,v1.16b,v21.16b
657	eor	x17,x17,x19
658	eor	v2.16b,v2.16b,v22.16b
659	eor	x20,x20,x21
660	eor	v3.16b,v3.16b,v23.16b
// v20-v23 are free again, so the input for the second vector block is loaded
// before anything is stored.
661	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
662
// Advance the scalar counter and all three vector counters by 4 blocks.
663	stp	x5,x7,[x0,#0]		// store output
664	add	x28,x28,#4			// increment counter
665	stp	x9,x11,[x0,#16]
666	add	v27.4s,v27.4s,v31.4s		// += 4
667	stp	x13,x15,[x0,#32]
668	add	v28.4s,v28.4s,v31.4s
669	stp	x17,x20,[x0,#48]
670	add	v29.4s,v29.4s,v31.4s
671	add	x0,x0,#64
672
// v0-v3 are reused as the input buffer for the third vector block once their
// result has been stored.
673	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
674	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
675
676	eor	v4.16b,v4.16b,v20.16b
677	eor	v5.16b,v5.16b,v21.16b
678	eor	v6.16b,v6.16b,v22.16b
679	eor	v7.16b,v7.16b,v23.16b
680	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
681
682	eor	v16.16b,v16.16b,v0.16b
683	eor	v17.16b,v17.16b,v1.16b
684	eor	v18.16b,v18.16b,v2.16b
685	eor	v19.16b,v19.16b,v3.16b
686	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
687
// Flags from subs x2,x2,#256: higher means bytes remain.
688	b.hi	.Loop_outer_neon
689
// Epilogue for an input that ended exactly on a 256-byte boundary.
690	ldp	x19,x20,[x29,#16]
691	add	sp,sp,#64
692	ldp	x21,x22,[x29,#32]
693	ldp	x23,x24,[x29,#48]
694	ldp	x25,x26,[x29,#64]
695	ldp	x27,x28,[x29,#80]
696	ldp	x29,x30,[sp],#96
697.inst	0xd50323bf			// autiasp
698	ret
699
// Fewer than 256 bytes remain. Restore x2 to the remaining byte count and
// consume the four computed blocks one at a time, stopping at the first block
// that is only partially needed.
700.Ltail_neon:
701	add	x2,x2,#256
702	cmp	x2,#64
// Less than one block left: reuse the scalar tail in ChaCha20_ctr32, which packs
// w5..w21 itself and shares this frame layout.
703	b.lo	.Less_than_64
704
705	add	x5,x5,x6,lsl#32	// pack
706	add	x7,x7,x8,lsl#32
707	ldp	x6,x8,[x1,#0]		// load input
708	add	x9,x9,x10,lsl#32
709	add	x11,x11,x12,lsl#32
710	ldp	x10,x12,[x1,#16]
711	add	x13,x13,x14,lsl#32
712	add	x15,x15,x16,lsl#32
713	ldp	x14,x16,[x1,#32]
714	add	x17,x17,x19,lsl#32
715	add	x20,x20,x21,lsl#32
716	ldp	x19,x21,[x1,#48]
717	add	x1,x1,#64
718#ifdef	__ARMEB__
719	rev	x5,x5
720	rev	x7,x7
721	rev	x9,x9
722	rev	x11,x11
723	rev	x13,x13
724	rev	x15,x15
725	rev	x17,x17
726	rev	x20,x20
727#endif
728	eor	x5,x5,x6
729	eor	x7,x7,x8
730	eor	x9,x9,x10
731	eor	x11,x11,x12
732	eor	x13,x13,x14
733	eor	x15,x15,x16
734	eor	x17,x17,x19
735	eor	x20,x20,x21
736
737	stp	x5,x7,[x0,#0]		// store output
738	add	x28,x28,#4			// increment counter
739	stp	x9,x11,[x0,#16]
740	stp	x13,x15,[x0,#32]
741	stp	x17,x20,[x0,#48]
742	add	x0,x0,#64
// Each b.eq below tests the most recent cmp x2,#64 (no flag-writing instruction
// intervenes): equal means the block just written was the last one.
743	b.eq	.Ldone_neon
744	sub	x2,x2,#64
745	cmp	x2,#64
746	b.lo	.Less_than_128
747
748	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
749	eor	v0.16b,v0.16b,v20.16b
750	eor	v1.16b,v1.16b,v21.16b
751	eor	v2.16b,v2.16b,v22.16b
752	eor	v3.16b,v3.16b,v23.16b
753	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
754	b.eq	.Ldone_neon
755	sub	x2,x2,#64
756	cmp	x2,#64
757	b.lo	.Less_than_192
758
759	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
760	eor	v4.16b,v4.16b,v20.16b
761	eor	v5.16b,v5.16b,v21.16b
762	eor	v6.16b,v6.16b,v22.16b
763	eor	v7.16b,v7.16b,v23.16b
764	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
765	b.eq	.Ldone_neon
766	sub	x2,x2,#64
767
// In all three cases the keystream of the first unconsumed vector block is
// spilled to the stack scratch area so the byte loop can index it.
768	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
769	b	.Last_neon
770
771.Less_than_128:
772	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
773	b	.Last_neon
774.Less_than_192:
775	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
776	b	.Last_neon
777
778.align	4
// x2 = remaining bytes (1..63). Same end-pointer / negative-index scheme as the
// scalar tail: x0 is biased by -1 because the index is incremented between the
// loads and the store.
779.Last_neon:
780	sub	x0,x0,#1
781	add	x1,x1,x2
782	add	x0,x0,x2
783	add	x4,sp,x2
784	neg	x2,x2
785
786.Loop_tail_neon:
787	ldrb	w10,[x1,x2]
788	ldrb	w11,[x4,x2]
789	add	x2,x2,#1
790	eor	w10,w10,w11
791	strb	w10,[x0,x2]
792	cbnz	x2,.Loop_tail_neon
793
// Overwrite the keystream copy on the stack with zeros before releasing it.
794	stp	xzr,xzr,[sp,#0]
795	stp	xzr,xzr,[sp,#16]
796	stp	xzr,xzr,[sp,#32]
797	stp	xzr,xzr,[sp,#48]
798
// Common epilogue; also reached directly when the input ended on a 64-byte
// boundary, in which case the stack scratch area was never written.
799.Ldone_neon:
800	ldp	x19,x20,[x29,#16]
801	add	sp,sp,#64
802	ldp	x21,x22,[x29,#32]
803	ldp	x23,x24,[x29,#48]
804	ldp	x25,x26,[x29,#64]
805	ldp	x27,x28,[x29,#80]
806	ldp	x29,x30,[sp],#96
807.inst	0xd50323bf			// autiasp
808	ret
809.size	ChaCha20_neon,.-ChaCha20_neon
810.type	ChaCha20_512_neon,%function
811.align	5
812ChaCha20_512_neon:
813.inst	0xd503233f			// paciasp
814	stp	x29,x30,[sp,#-96]!
815	add	x29,sp,#0
816
817	adr	x5,.Lsigma
818	stp	x19,x20,[sp,#16]
819	stp	x21,x22,[sp,#32]
820	stp	x23,x24,[sp,#48]
821	stp	x25,x26,[sp,#64]
822	stp	x27,x28,[sp,#80]
823
824.L512_or_more_neon:
825	sub	sp,sp,#128+64
826
827	ldp	x22,x23,[x5]		// load sigma
828	ld1	{v24.4s},[x5],#16
829	ldp	x24,x25,[x3]		// load key
830	ldp	x26,x27,[x3,#16]
831	ld1	{v25.4s,v26.4s},[x3]
832	ldp	x28,x30,[x4]		// load counter
833	ld1	{v27.4s},[x4]
834	ld1	{v31.4s},[x5]
835#ifdef	__ARMEB__
836	rev64	v24.4s,v24.4s
837	ror	x24,x24,#32
838	ror	x25,x25,#32
839	ror	x26,x26,#32
840	ror	x27,x27,#32
841	ror	x28,x28,#32
842	ror	x30,x30,#32
843#endif
844	add	v27.4s,v27.4s,v31.4s		// += 1
845	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
846	add	v27.4s,v27.4s,v31.4s		// not typo
847	str	q26,[sp,#32]
848	add	v28.4s,v27.4s,v31.4s
849	add	v29.4s,v28.4s,v31.4s
850	add	v30.4s,v29.4s,v31.4s
851	shl	v31.4s,v31.4s,#2			// 1 -> 4
852
853	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
854	stp	d10,d11,[sp,#128+16]
855	stp	d12,d13,[sp,#128+32]
856	stp	d14,d15,[sp,#128+48]
857
858	sub	x2,x2,#512			// not typo
859
860.Loop_outer_512_neon:
861	mov	v0.16b,v24.16b
862	mov	v4.16b,v24.16b
863	mov	v8.16b,v24.16b
864	mov	v12.16b,v24.16b
865	mov	v16.16b,v24.16b
866	mov	v20.16b,v24.16b
867	mov	v1.16b,v25.16b
868	mov	w5,w22			// unpack key block
869	mov	v5.16b,v25.16b
870	lsr	x6,x22,#32
871	mov	v9.16b,v25.16b
872	mov	w7,w23
873	mov	v13.16b,v25.16b
874	lsr	x8,x23,#32
875	mov	v17.16b,v25.16b
876	mov	w9,w24
877	mov	v21.16b,v25.16b
878	lsr	x10,x24,#32
879	mov	v3.16b,v27.16b
880	mov	w11,w25
881	mov	v7.16b,v28.16b
882	lsr	x12,x25,#32
883	mov	v11.16b,v29.16b
884	mov	w13,w26
885	mov	v15.16b,v30.16b
886	lsr	x14,x26,#32
887	mov	v2.16b,v26.16b
888	mov	w15,w27
889	mov	v6.16b,v26.16b
890	lsr	x16,x27,#32
891	add	v19.4s,v3.4s,v31.4s			// +4
892	mov	w17,w28
893	add	v23.4s,v7.4s,v31.4s			// +4
894	lsr	x19,x28,#32
895	mov	v10.16b,v26.16b
896	mov	w20,w30
897	mov	v14.16b,v26.16b
898	lsr	x21,x30,#32
899	mov	v18.16b,v26.16b
900	stp	q27,q28,[sp,#48]		// off-load key block, variable part
901	mov	v22.16b,v26.16b
902	str	q29,[sp,#80]
903
904	mov	x4,#5
905	subs	x2,x2,#512
906.Loop_upper_neon:
907	sub	x4,x4,#1
908	add	v0.4s,v0.4s,v1.4s
909	add	w5,w5,w9
910	add	v4.4s,v4.4s,v5.4s
911	add	w6,w6,w10
912	add	v8.4s,v8.4s,v9.4s
913	add	w7,w7,w11
914	add	v12.4s,v12.4s,v13.4s
915	add	w8,w8,w12
916	add	v16.4s,v16.4s,v17.4s
917	eor	w17,w17,w5
918	add	v20.4s,v20.4s,v21.4s
919	eor	w19,w19,w6
920	eor	v3.16b,v3.16b,v0.16b
921	eor	w20,w20,w7
922	eor	v7.16b,v7.16b,v4.16b
923	eor	w21,w21,w8
924	eor	v11.16b,v11.16b,v8.16b
925	ror	w17,w17,#16
926	eor	v15.16b,v15.16b,v12.16b
927	ror	w19,w19,#16
928	eor	v19.16b,v19.16b,v16.16b
929	ror	w20,w20,#16
930	eor	v23.16b,v23.16b,v20.16b
931	ror	w21,w21,#16
932	rev32	v3.8h,v3.8h
933	add	w13,w13,w17
934	rev32	v7.8h,v7.8h
935	add	w14,w14,w19
936	rev32	v11.8h,v11.8h
937	add	w15,w15,w20
938	rev32	v15.8h,v15.8h
939	add	w16,w16,w21
940	rev32	v19.8h,v19.8h
941	eor	w9,w9,w13
942	rev32	v23.8h,v23.8h
943	eor	w10,w10,w14
944	add	v2.4s,v2.4s,v3.4s
945	eor	w11,w11,w15
946	add	v6.4s,v6.4s,v7.4s
947	eor	w12,w12,w16
948	add	v10.4s,v10.4s,v11.4s
949	ror	w9,w9,#20
950	add	v14.4s,v14.4s,v15.4s
951	ror	w10,w10,#20
952	add	v18.4s,v18.4s,v19.4s
953	ror	w11,w11,#20
954	add	v22.4s,v22.4s,v23.4s
955	ror	w12,w12,#20
956	eor	v24.16b,v1.16b,v2.16b
957	add	w5,w5,w9
958	eor	v25.16b,v5.16b,v6.16b
959	add	w6,w6,w10
960	eor	v26.16b,v9.16b,v10.16b
961	add	w7,w7,w11
962	eor	v27.16b,v13.16b,v14.16b
963	add	w8,w8,w12
964	eor	v28.16b,v17.16b,v18.16b
965	eor	w17,w17,w5
966	eor	v29.16b,v21.16b,v22.16b
967	eor	w19,w19,w6
968	ushr	v1.4s,v24.4s,#20
969	eor	w20,w20,w7
970	ushr	v5.4s,v25.4s,#20
971	eor	w21,w21,w8
972	ushr	v9.4s,v26.4s,#20
973	ror	w17,w17,#24
974	ushr	v13.4s,v27.4s,#20
975	ror	w19,w19,#24
976	ushr	v17.4s,v28.4s,#20
977	ror	w20,w20,#24
978	ushr	v21.4s,v29.4s,#20
979	ror	w21,w21,#24
980	sli	v1.4s,v24.4s,#12
981	add	w13,w13,w17
982	sli	v5.4s,v25.4s,#12
983	add	w14,w14,w19
984	sli	v9.4s,v26.4s,#12
985	add	w15,w15,w20
986	sli	v13.4s,v27.4s,#12
987	add	w16,w16,w21
988	sli	v17.4s,v28.4s,#12
989	eor	w9,w9,w13
990	sli	v21.4s,v29.4s,#12
991	eor	w10,w10,w14
992	add	v0.4s,v0.4s,v1.4s
993	eor	w11,w11,w15
994	add	v4.4s,v4.4s,v5.4s
995	eor	w12,w12,w16
996	add	v8.4s,v8.4s,v9.4s
997	ror	w9,w9,#25
998	add	v12.4s,v12.4s,v13.4s
999	ror	w10,w10,#25
1000	add	v16.4s,v16.4s,v17.4s
1001	ror	w11,w11,#25
1002	add	v20.4s,v20.4s,v21.4s
1003	ror	w12,w12,#25
1004	eor	v24.16b,v3.16b,v0.16b
1005	add	w5,w5,w10
1006	eor	v25.16b,v7.16b,v4.16b
1007	add	w6,w6,w11
1008	eor	v26.16b,v11.16b,v8.16b
1009	add	w7,w7,w12
1010	eor	v27.16b,v15.16b,v12.16b
1011	add	w8,w8,w9
1012	eor	v28.16b,v19.16b,v16.16b
1013	eor	w21,w21,w5
1014	eor	v29.16b,v23.16b,v20.16b
1015	eor	w17,w17,w6
1016	ushr	v3.4s,v24.4s,#24
1017	eor	w19,w19,w7
1018	ushr	v7.4s,v25.4s,#24
1019	eor	w20,w20,w8
1020	ushr	v11.4s,v26.4s,#24
1021	ror	w21,w21,#16
1022	ushr	v15.4s,v27.4s,#24
1023	ror	w17,w17,#16
1024	ushr	v19.4s,v28.4s,#24
1025	ror	w19,w19,#16
1026	ushr	v23.4s,v29.4s,#24
1027	ror	w20,w20,#16
1028	sli	v3.4s,v24.4s,#8
1029	add	w15,w15,w21
1030	sli	v7.4s,v25.4s,#8
1031	add	w16,w16,w17
1032	sli	v11.4s,v26.4s,#8
1033	add	w13,w13,w19
1034	sli	v15.4s,v27.4s,#8
1035	add	w14,w14,w20
1036	sli	v19.4s,v28.4s,#8
1037	eor	w10,w10,w15
1038	sli	v23.4s,v29.4s,#8
1039	eor	w11,w11,w16
1040	add	v2.4s,v2.4s,v3.4s
1041	eor	w12,w12,w13
1042	add	v6.4s,v6.4s,v7.4s
1043	eor	w9,w9,w14
1044	add	v10.4s,v10.4s,v11.4s
1045	ror	w10,w10,#20
1046	add	v14.4s,v14.4s,v15.4s
1047	ror	w11,w11,#20
1048	add	v18.4s,v18.4s,v19.4s
1049	ror	w12,w12,#20
1050	add	v22.4s,v22.4s,v23.4s
1051	ror	w9,w9,#20
1052	eor	v24.16b,v1.16b,v2.16b
1053	add	w5,w5,w10
1054	eor	v25.16b,v5.16b,v6.16b
1055	add	w6,w6,w11
1056	eor	v26.16b,v9.16b,v10.16b
1057	add	w7,w7,w12
1058	eor	v27.16b,v13.16b,v14.16b
1059	add	w8,w8,w9
1060	eor	v28.16b,v17.16b,v18.16b
1061	eor	w21,w21,w5
1062	eor	v29.16b,v21.16b,v22.16b
1063	eor	w17,w17,w6
1064	ushr	v1.4s,v24.4s,#25
1065	eor	w19,w19,w7
1066	ushr	v5.4s,v25.4s,#25
1067	eor	w20,w20,w8
1068	ushr	v9.4s,v26.4s,#25
1069	ror	w21,w21,#24
1070	ushr	v13.4s,v27.4s,#25
1071	ror	w17,w17,#24
1072	ushr	v17.4s,v28.4s,#25
1073	ror	w19,w19,#24
1074	ushr	v21.4s,v29.4s,#25
1075	ror	w20,w20,#24
1076	sli	v1.4s,v24.4s,#7
1077	add	w15,w15,w21
1078	sli	v5.4s,v25.4s,#7
1079	add	w16,w16,w17
1080	sli	v9.4s,v26.4s,#7
1081	add	w13,w13,w19
1082	sli	v13.4s,v27.4s,#7
1083	add	w14,w14,w20
1084	sli	v17.4s,v28.4s,#7
1085	eor	w10,w10,w15
1086	sli	v21.4s,v29.4s,#7
1087	eor	w11,w11,w16
1088	ext	v2.16b,v2.16b,v2.16b,#8
1089	eor	w12,w12,w13
1090	ext	v6.16b,v6.16b,v6.16b,#8
1091	eor	w9,w9,w14
1092	ext	v10.16b,v10.16b,v10.16b,#8
1093	ror	w10,w10,#25
1094	ext	v14.16b,v14.16b,v14.16b,#8
1095	ror	w11,w11,#25
1096	ext	v18.16b,v18.16b,v18.16b,#8
1097	ror	w12,w12,#25
1098	ext	v22.16b,v22.16b,v22.16b,#8
1099	ror	w9,w9,#25
1100	ext	v3.16b,v3.16b,v3.16b,#12
1101	ext	v7.16b,v7.16b,v7.16b,#12
1102	ext	v11.16b,v11.16b,v11.16b,#12
1103	ext	v15.16b,v15.16b,v15.16b,#12
1104	ext	v19.16b,v19.16b,v19.16b,#12
1105	ext	v23.16b,v23.16b,v23.16b,#12
1106	ext	v1.16b,v1.16b,v1.16b,#4
1107	ext	v5.16b,v5.16b,v5.16b,#4
1108	ext	v9.16b,v9.16b,v9.16b,#4
1109	ext	v13.16b,v13.16b,v13.16b,#4
1110	ext	v17.16b,v17.16b,v17.16b,#4
1111	ext	v21.16b,v21.16b,v21.16b,#4
1112	add	v0.4s,v0.4s,v1.4s
1113	add	w5,w5,w9
1114	add	v4.4s,v4.4s,v5.4s
1115	add	w6,w6,w10
1116	add	v8.4s,v8.4s,v9.4s
1117	add	w7,w7,w11
1118	add	v12.4s,v12.4s,v13.4s
1119	add	w8,w8,w12
1120	add	v16.4s,v16.4s,v17.4s
1121	eor	w17,w17,w5
1122	add	v20.4s,v20.4s,v21.4s
1123	eor	w19,w19,w6
1124	eor	v3.16b,v3.16b,v0.16b
1125	eor	w20,w20,w7
1126	eor	v7.16b,v7.16b,v4.16b
1127	eor	w21,w21,w8
1128	eor	v11.16b,v11.16b,v8.16b
1129	ror	w17,w17,#16
1130	eor	v15.16b,v15.16b,v12.16b
1131	ror	w19,w19,#16
1132	eor	v19.16b,v19.16b,v16.16b
1133	ror	w20,w20,#16
1134	eor	v23.16b,v23.16b,v20.16b
1135	ror	w21,w21,#16
1136	rev32	v3.8h,v3.8h
1137	add	w13,w13,w17
1138	rev32	v7.8h,v7.8h
1139	add	w14,w14,w19
1140	rev32	v11.8h,v11.8h
1141	add	w15,w15,w20
1142	rev32	v15.8h,v15.8h
1143	add	w16,w16,w21
1144	rev32	v19.8h,v19.8h
1145	eor	w9,w9,w13
1146	rev32	v23.8h,v23.8h
1147	eor	w10,w10,w14
1148	add	v2.4s,v2.4s,v3.4s
1149	eor	w11,w11,w15
1150	add	v6.4s,v6.4s,v7.4s
1151	eor	w12,w12,w16
1152	add	v10.4s,v10.4s,v11.4s
1153	ror	w9,w9,#20
1154	add	v14.4s,v14.4s,v15.4s
1155	ror	w10,w10,#20
1156	add	v18.4s,v18.4s,v19.4s
1157	ror	w11,w11,#20
1158	add	v22.4s,v22.4s,v23.4s
1159	ror	w12,w12,#20
1160	eor	v24.16b,v1.16b,v2.16b
1161	add	w5,w5,w9
1162	eor	v25.16b,v5.16b,v6.16b
1163	add	w6,w6,w10
1164	eor	v26.16b,v9.16b,v10.16b
1165	add	w7,w7,w11
1166	eor	v27.16b,v13.16b,v14.16b
1167	add	w8,w8,w12
1168	eor	v28.16b,v17.16b,v18.16b
1169	eor	w17,w17,w5
1170	eor	v29.16b,v21.16b,v22.16b
1171	eor	w19,w19,w6
1172	ushr	v1.4s,v24.4s,#20
1173	eor	w20,w20,w7
1174	ushr	v5.4s,v25.4s,#20
1175	eor	w21,w21,w8
1176	ushr	v9.4s,v26.4s,#20
1177	ror	w17,w17,#24
1178	ushr	v13.4s,v27.4s,#20
1179	ror	w19,w19,#24
1180	ushr	v17.4s,v28.4s,#20
1181	ror	w20,w20,#24
1182	ushr	v21.4s,v29.4s,#20
1183	ror	w21,w21,#24
1184	sli	v1.4s,v24.4s,#12
1185	add	w13,w13,w17
1186	sli	v5.4s,v25.4s,#12
1187	add	w14,w14,w19
1188	sli	v9.4s,v26.4s,#12
1189	add	w15,w15,w20
1190	sli	v13.4s,v27.4s,#12
1191	add	w16,w16,w21
1192	sli	v17.4s,v28.4s,#12
1193	eor	w9,w9,w13
1194	sli	v21.4s,v29.4s,#12
1195	eor	w10,w10,w14
1196	add	v0.4s,v0.4s,v1.4s
1197	eor	w11,w11,w15
1198	add	v4.4s,v4.4s,v5.4s
1199	eor	w12,w12,w16
1200	add	v8.4s,v8.4s,v9.4s
1201	ror	w9,w9,#25
1202	add	v12.4s,v12.4s,v13.4s
1203	ror	w10,w10,#25
1204	add	v16.4s,v16.4s,v17.4s
1205	ror	w11,w11,#25
1206	add	v20.4s,v20.4s,v21.4s
1207	ror	w12,w12,#25
1208	eor	v24.16b,v3.16b,v0.16b
1209	add	w5,w5,w10
1210	eor	v25.16b,v7.16b,v4.16b
1211	add	w6,w6,w11
1212	eor	v26.16b,v11.16b,v8.16b
1213	add	w7,w7,w12
1214	eor	v27.16b,v15.16b,v12.16b
1215	add	w8,w8,w9
1216	eor	v28.16b,v19.16b,v16.16b
1217	eor	w21,w21,w5
1218	eor	v29.16b,v23.16b,v20.16b
1219	eor	w17,w17,w6
1220	ushr	v3.4s,v24.4s,#24
1221	eor	w19,w19,w7
1222	ushr	v7.4s,v25.4s,#24
1223	eor	w20,w20,w8
1224	ushr	v11.4s,v26.4s,#24
1225	ror	w21,w21,#16
1226	ushr	v15.4s,v27.4s,#24
1227	ror	w17,w17,#16
1228	ushr	v19.4s,v28.4s,#24
1229	ror	w19,w19,#16
1230	ushr	v23.4s,v29.4s,#24
1231	ror	w20,w20,#16
1232	sli	v3.4s,v24.4s,#8
1233	add	w15,w15,w21
1234	sli	v7.4s,v25.4s,#8
1235	add	w16,w16,w17
1236	sli	v11.4s,v26.4s,#8
1237	add	w13,w13,w19
1238	sli	v15.4s,v27.4s,#8
1239	add	w14,w14,w20
1240	sli	v19.4s,v28.4s,#8
1241	eor	w10,w10,w15
1242	sli	v23.4s,v29.4s,#8
1243	eor	w11,w11,w16
1244	add	v2.4s,v2.4s,v3.4s
1245	eor	w12,w12,w13
1246	add	v6.4s,v6.4s,v7.4s
1247	eor	w9,w9,w14
1248	add	v10.4s,v10.4s,v11.4s
1249	ror	w10,w10,#20
1250	add	v14.4s,v14.4s,v15.4s
1251	ror	w11,w11,#20
1252	add	v18.4s,v18.4s,v19.4s
1253	ror	w12,w12,#20
1254	add	v22.4s,v22.4s,v23.4s
1255	ror	w9,w9,#20
1256	eor	v24.16b,v1.16b,v2.16b
1257	add	w5,w5,w10
1258	eor	v25.16b,v5.16b,v6.16b
1259	add	w6,w6,w11
1260	eor	v26.16b,v9.16b,v10.16b
1261	add	w7,w7,w12
1262	eor	v27.16b,v13.16b,v14.16b
1263	add	w8,w8,w9
1264	eor	v28.16b,v17.16b,v18.16b
1265	eor	w21,w21,w5
1266	eor	v29.16b,v21.16b,v22.16b
1267	eor	w17,w17,w6
1268	ushr	v1.4s,v24.4s,#25
1269	eor	w19,w19,w7
1270	ushr	v5.4s,v25.4s,#25
1271	eor	w20,w20,w8
1272	ushr	v9.4s,v26.4s,#25
1273	ror	w21,w21,#24
1274	ushr	v13.4s,v27.4s,#25
1275	ror	w17,w17,#24
1276	ushr	v17.4s,v28.4s,#25
1277	ror	w19,w19,#24
1278	ushr	v21.4s,v29.4s,#25
1279	ror	w20,w20,#24
1280	sli	v1.4s,v24.4s,#7
1281	add	w15,w15,w21
1282	sli	v5.4s,v25.4s,#7
1283	add	w16,w16,w17
1284	sli	v9.4s,v26.4s,#7
1285	add	w13,w13,w19
1286	sli	v13.4s,v27.4s,#7
1287	add	w14,w14,w20
1288	sli	v17.4s,v28.4s,#7
1289	eor	w10,w10,w15
1290	sli	v21.4s,v29.4s,#7
1291	eor	w11,w11,w16
1292	ext	v2.16b,v2.16b,v2.16b,#8
1293	eor	w12,w12,w13
1294	ext	v6.16b,v6.16b,v6.16b,#8
1295	eor	w9,w9,w14
1296	ext	v10.16b,v10.16b,v10.16b,#8
1297	ror	w10,w10,#25
1298	ext	v14.16b,v14.16b,v14.16b,#8
1299	ror	w11,w11,#25
1300	ext	v18.16b,v18.16b,v18.16b,#8
1301	ror	w12,w12,#25
1302	ext	v22.16b,v22.16b,v22.16b,#8
1303	ror	w9,w9,#25
1304	ext	v3.16b,v3.16b,v3.16b,#4
1305	ext	v7.16b,v7.16b,v7.16b,#4
1306	ext	v11.16b,v11.16b,v11.16b,#4
1307	ext	v15.16b,v15.16b,v15.16b,#4
1308	ext	v19.16b,v19.16b,v19.16b,#4
1309	ext	v23.16b,v23.16b,v23.16b,#4
1310	ext	v1.16b,v1.16b,v1.16b,#12
1311	ext	v5.16b,v5.16b,v5.16b,#12
1312	ext	v9.16b,v9.16b,v9.16b,#12
1313	ext	v13.16b,v13.16b,v13.16b,#12
1314	ext	v17.16b,v17.16b,v17.16b,#12
1315	ext	v21.16b,v21.16b,v21.16b,#12
1316	cbnz	x4,.Loop_upper_neon
1317
1318	add	w5,w5,w22		// accumulate key block
1319	add	x6,x6,x22,lsr#32
1320	add	w7,w7,w23
1321	add	x8,x8,x23,lsr#32
1322	add	w9,w9,w24
1323	add	x10,x10,x24,lsr#32
1324	add	w11,w11,w25
1325	add	x12,x12,x25,lsr#32
1326	add	w13,w13,w26
1327	add	x14,x14,x26,lsr#32
1328	add	w15,w15,w27
1329	add	x16,x16,x27,lsr#32
1330	add	w17,w17,w28
1331	add	x19,x19,x28,lsr#32
1332	add	w20,w20,w30
1333	add	x21,x21,x30,lsr#32
1334
1335	add	x5,x5,x6,lsl#32	// pack
1336	add	x7,x7,x8,lsl#32
1337	ldp	x6,x8,[x1,#0]		// load input
1338	add	x9,x9,x10,lsl#32
1339	add	x11,x11,x12,lsl#32
1340	ldp	x10,x12,[x1,#16]
1341	add	x13,x13,x14,lsl#32
1342	add	x15,x15,x16,lsl#32
1343	ldp	x14,x16,[x1,#32]
1344	add	x17,x17,x19,lsl#32
1345	add	x20,x20,x21,lsl#32
1346	ldp	x19,x21,[x1,#48]
1347	add	x1,x1,#64
1348#ifdef	__ARMEB__
1349	rev	x5,x5
1350	rev	x7,x7
1351	rev	x9,x9
1352	rev	x11,x11
1353	rev	x13,x13
1354	rev	x15,x15
1355	rev	x17,x17
1356	rev	x20,x20
1357#endif
1358	eor	x5,x5,x6
1359	eor	x7,x7,x8
1360	eor	x9,x9,x10
1361	eor	x11,x11,x12
1362	eor	x13,x13,x14
1363	eor	x15,x15,x16
1364	eor	x17,x17,x19
1365	eor	x20,x20,x21
1366
1367	stp	x5,x7,[x0,#0]		// store output
1368	add	x28,x28,#1			// increment counter
1369	mov	w5,w22			// unpack key block
1370	lsr	x6,x22,#32
1371	stp	x9,x11,[x0,#16]
1372	mov	w7,w23
1373	lsr	x8,x23,#32
1374	stp	x13,x15,[x0,#32]
1375	mov	w9,w24
1376	lsr	x10,x24,#32
1377	stp	x17,x20,[x0,#48]
1378	add	x0,x0,#64
1379	mov	w11,w25
1380	lsr	x12,x25,#32
1381	mov	w13,w26
1382	lsr	x14,x26,#32
1383	mov	w15,w27
1384	lsr	x16,x27,#32
1385	mov	w17,w28
1386	lsr	x19,x28,#32
1387	mov	w20,w30
1388	lsr	x21,x30,#32
1389
1390	mov	x4,#5
// Lower pass of the interleaved round loop (.Loop_upper_neon precedes it).
// x4 counts down from 5; the loop exits when it reaches zero.
// Every iteration advances
//   - the six NEON blocks (rows a/b/c/d in v0-v3, v4-v7, v8-v11, v12-v15,
//     v16-v19, v20-v23) by one double round: column round, lane rotation,
//     diagonal round, inverse lane rotation;
//   - the integer block (w5-w17, w19-w21) by two double rounds, so the
//     column/diagonal pair appears twice per iteration.
// v24-v29 are scratch for the NEON rotates. Rotations are coded as
// right-rotates: ror #16/#20/#24/#25 are left-rotates by 16/12/8/7. On the
// NEON side rev32 on halfwords is the 16-bit rotate and each ushr/sli pair
// forms the 12-, 8- and 7-bit rotates.
// No instruction in this loop writes the condition flags.
1391.Loop_lower_neon:
1392	sub	x4,x4,#1
// NEON: column round starts. Integer: column round of the first double round.
1393	add	v0.4s,v0.4s,v1.4s
1394	add	w5,w5,w9
1395	add	v4.4s,v4.4s,v5.4s
1396	add	w6,w6,w10
1397	add	v8.4s,v8.4s,v9.4s
1398	add	w7,w7,w11
1399	add	v12.4s,v12.4s,v13.4s
1400	add	w8,w8,w12
1401	add	v16.4s,v16.4s,v17.4s
1402	eor	w17,w17,w5
1403	add	v20.4s,v20.4s,v21.4s
1404	eor	w19,w19,w6
1405	eor	v3.16b,v3.16b,v0.16b
1406	eor	w20,w20,w7
1407	eor	v7.16b,v7.16b,v4.16b
1408	eor	w21,w21,w8
1409	eor	v11.16b,v11.16b,v8.16b
1410	ror	w17,w17,#16
1411	eor	v15.16b,v15.16b,v12.16b
1412	ror	w19,w19,#16
1413	eor	v19.16b,v19.16b,v16.16b
1414	ror	w20,w20,#16
1415	eor	v23.16b,v23.16b,v20.16b
1416	ror	w21,w21,#16
1417	rev32	v3.8h,v3.8h
1418	add	w13,w13,w17
1419	rev32	v7.8h,v7.8h
1420	add	w14,w14,w19
1421	rev32	v11.8h,v11.8h
1422	add	w15,w15,w20
1423	rev32	v15.8h,v15.8h
1424	add	w16,w16,w21
1425	rev32	v19.8h,v19.8h
1426	eor	w9,w9,w13
1427	rev32	v23.8h,v23.8h
1428	eor	w10,w10,w14
1429	add	v2.4s,v2.4s,v3.4s
1430	eor	w11,w11,w15
1431	add	v6.4s,v6.4s,v7.4s
1432	eor	w12,w12,w16
1433	add	v10.4s,v10.4s,v11.4s
1434	ror	w9,w9,#20
1435	add	v14.4s,v14.4s,v15.4s
1436	ror	w10,w10,#20
1437	add	v18.4s,v18.4s,v19.4s
1438	ror	w11,w11,#20
1439	add	v22.4s,v22.4s,v23.4s
1440	ror	w12,w12,#20
1441	eor	v24.16b,v1.16b,v2.16b
1442	add	w5,w5,w9
1443	eor	v25.16b,v5.16b,v6.16b
1444	add	w6,w6,w10
1445	eor	v26.16b,v9.16b,v10.16b
1446	add	w7,w7,w11
1447	eor	v27.16b,v13.16b,v14.16b
1448	add	w8,w8,w12
1449	eor	v28.16b,v17.16b,v18.16b
1450	eor	w17,w17,w5
1451	eor	v29.16b,v21.16b,v22.16b
1452	eor	w19,w19,w6
1453	ushr	v1.4s,v24.4s,#20
1454	eor	w20,w20,w7
1455	ushr	v5.4s,v25.4s,#20
1456	eor	w21,w21,w8
1457	ushr	v9.4s,v26.4s,#20
1458	ror	w17,w17,#24
1459	ushr	v13.4s,v27.4s,#20
1460	ror	w19,w19,#24
1461	ushr	v17.4s,v28.4s,#20
1462	ror	w20,w20,#24
1463	ushr	v21.4s,v29.4s,#20
1464	ror	w21,w21,#24
1465	sli	v1.4s,v24.4s,#12
1466	add	w13,w13,w17
1467	sli	v5.4s,v25.4s,#12
1468	add	w14,w14,w19
1469	sli	v9.4s,v26.4s,#12
1470	add	w15,w15,w20
1471	sli	v13.4s,v27.4s,#12
1472	add	w16,w16,w21
1473	sli	v17.4s,v28.4s,#12
1474	eor	w9,w9,w13
1475	sli	v21.4s,v29.4s,#12
1476	eor	w10,w10,w14
1477	add	v0.4s,v0.4s,v1.4s
1478	eor	w11,w11,w15
1479	add	v4.4s,v4.4s,v5.4s
1480	eor	w12,w12,w16
1481	add	v8.4s,v8.4s,v9.4s
1482	ror	w9,w9,#25
1483	add	v12.4s,v12.4s,v13.4s
1484	ror	w10,w10,#25
1485	add	v16.4s,v16.4s,v17.4s
1486	ror	w11,w11,#25
1487	add	v20.4s,v20.4s,v21.4s
1488	ror	w12,w12,#25
1489	eor	v24.16b,v3.16b,v0.16b
// Integer: diagonal round of the first double round (NEON column round
// continues in the interleaved vector instructions).
1490	add	w5,w5,w10
1491	eor	v25.16b,v7.16b,v4.16b
1492	add	w6,w6,w11
1493	eor	v26.16b,v11.16b,v8.16b
1494	add	w7,w7,w12
1495	eor	v27.16b,v15.16b,v12.16b
1496	add	w8,w8,w9
1497	eor	v28.16b,v19.16b,v16.16b
1498	eor	w21,w21,w5
1499	eor	v29.16b,v23.16b,v20.16b
1500	eor	w17,w17,w6
1501	ushr	v3.4s,v24.4s,#24
1502	eor	w19,w19,w7
1503	ushr	v7.4s,v25.4s,#24
1504	eor	w20,w20,w8
1505	ushr	v11.4s,v26.4s,#24
1506	ror	w21,w21,#16
1507	ushr	v15.4s,v27.4s,#24
1508	ror	w17,w17,#16
1509	ushr	v19.4s,v28.4s,#24
1510	ror	w19,w19,#16
1511	ushr	v23.4s,v29.4s,#24
1512	ror	w20,w20,#16
1513	sli	v3.4s,v24.4s,#8
1514	add	w15,w15,w21
1515	sli	v7.4s,v25.4s,#8
1516	add	w16,w16,w17
1517	sli	v11.4s,v26.4s,#8
1518	add	w13,w13,w19
1519	sli	v15.4s,v27.4s,#8
1520	add	w14,w14,w20
1521	sli	v19.4s,v28.4s,#8
1522	eor	w10,w10,w15
1523	sli	v23.4s,v29.4s,#8
1524	eor	w11,w11,w16
1525	add	v2.4s,v2.4s,v3.4s
1526	eor	w12,w12,w13
1527	add	v6.4s,v6.4s,v7.4s
1528	eor	w9,w9,w14
1529	add	v10.4s,v10.4s,v11.4s
1530	ror	w10,w10,#20
1531	add	v14.4s,v14.4s,v15.4s
1532	ror	w11,w11,#20
1533	add	v18.4s,v18.4s,v19.4s
1534	ror	w12,w12,#20
1535	add	v22.4s,v22.4s,v23.4s
1536	ror	w9,w9,#20
1537	eor	v24.16b,v1.16b,v2.16b
1538	add	w5,w5,w10
1539	eor	v25.16b,v5.16b,v6.16b
1540	add	w6,w6,w11
1541	eor	v26.16b,v9.16b,v10.16b
1542	add	w7,w7,w12
1543	eor	v27.16b,v13.16b,v14.16b
1544	add	w8,w8,w9
1545	eor	v28.16b,v17.16b,v18.16b
1546	eor	w21,w21,w5
1547	eor	v29.16b,v21.16b,v22.16b
1548	eor	w17,w17,w6
1549	ushr	v1.4s,v24.4s,#25
1550	eor	w19,w19,w7
1551	ushr	v5.4s,v25.4s,#25
1552	eor	w20,w20,w8
1553	ushr	v9.4s,v26.4s,#25
1554	ror	w21,w21,#24
1555	ushr	v13.4s,v27.4s,#25
1556	ror	w17,w17,#24
1557	ushr	v17.4s,v28.4s,#25
1558	ror	w19,w19,#24
1559	ushr	v21.4s,v29.4s,#25
1560	ror	w20,w20,#24
1561	sli	v1.4s,v24.4s,#7
1562	add	w15,w15,w21
1563	sli	v5.4s,v25.4s,#7
1564	add	w16,w16,w17
1565	sli	v9.4s,v26.4s,#7
1566	add	w13,w13,w19
1567	sli	v13.4s,v27.4s,#7
1568	add	w14,w14,w20
1569	sli	v17.4s,v28.4s,#7
1570	eor	w10,w10,w15
1571	sli	v21.4s,v29.4s,#7
1572	eor	w11,w11,w16
// NEON: rotate row c by two lanes, row d by three lanes (#12) and row b by
// one lane (#4) so the diagonals line up as columns for the next round.
1573	ext	v2.16b,v2.16b,v2.16b,#8
1574	eor	w12,w12,w13
1575	ext	v6.16b,v6.16b,v6.16b,#8
1576	eor	w9,w9,w14
1577	ext	v10.16b,v10.16b,v10.16b,#8
1578	ror	w10,w10,#25
1579	ext	v14.16b,v14.16b,v14.16b,#8
1580	ror	w11,w11,#25
1581	ext	v18.16b,v18.16b,v18.16b,#8
1582	ror	w12,w12,#25
1583	ext	v22.16b,v22.16b,v22.16b,#8
1584	ror	w9,w9,#25
1585	ext	v3.16b,v3.16b,v3.16b,#12
1586	ext	v7.16b,v7.16b,v7.16b,#12
1587	ext	v11.16b,v11.16b,v11.16b,#12
1588	ext	v15.16b,v15.16b,v15.16b,#12
1589	ext	v19.16b,v19.16b,v19.16b,#12
1590	ext	v23.16b,v23.16b,v23.16b,#12
1591	ext	v1.16b,v1.16b,v1.16b,#4
1592	ext	v5.16b,v5.16b,v5.16b,#4
1593	ext	v9.16b,v9.16b,v9.16b,#4
1594	ext	v13.16b,v13.16b,v13.16b,#4
1595	ext	v17.16b,v17.16b,v17.16b,#4
1596	ext	v21.16b,v21.16b,v21.16b,#4
// NEON: diagonal round starts (same instruction pattern on the rotated rows).
// Integer: column round of the second double round.
1597	add	v0.4s,v0.4s,v1.4s
1598	add	w5,w5,w9
1599	add	v4.4s,v4.4s,v5.4s
1600	add	w6,w6,w10
1601	add	v8.4s,v8.4s,v9.4s
1602	add	w7,w7,w11
1603	add	v12.4s,v12.4s,v13.4s
1604	add	w8,w8,w12
1605	add	v16.4s,v16.4s,v17.4s
1606	eor	w17,w17,w5
1607	add	v20.4s,v20.4s,v21.4s
1608	eor	w19,w19,w6
1609	eor	v3.16b,v3.16b,v0.16b
1610	eor	w20,w20,w7
1611	eor	v7.16b,v7.16b,v4.16b
1612	eor	w21,w21,w8
1613	eor	v11.16b,v11.16b,v8.16b
1614	ror	w17,w17,#16
1615	eor	v15.16b,v15.16b,v12.16b
1616	ror	w19,w19,#16
1617	eor	v19.16b,v19.16b,v16.16b
1618	ror	w20,w20,#16
1619	eor	v23.16b,v23.16b,v20.16b
1620	ror	w21,w21,#16
1621	rev32	v3.8h,v3.8h
1622	add	w13,w13,w17
1623	rev32	v7.8h,v7.8h
1624	add	w14,w14,w19
1625	rev32	v11.8h,v11.8h
1626	add	w15,w15,w20
1627	rev32	v15.8h,v15.8h
1628	add	w16,w16,w21
1629	rev32	v19.8h,v19.8h
1630	eor	w9,w9,w13
1631	rev32	v23.8h,v23.8h
1632	eor	w10,w10,w14
1633	add	v2.4s,v2.4s,v3.4s
1634	eor	w11,w11,w15
1635	add	v6.4s,v6.4s,v7.4s
1636	eor	w12,w12,w16
1637	add	v10.4s,v10.4s,v11.4s
1638	ror	w9,w9,#20
1639	add	v14.4s,v14.4s,v15.4s
1640	ror	w10,w10,#20
1641	add	v18.4s,v18.4s,v19.4s
1642	ror	w11,w11,#20
1643	add	v22.4s,v22.4s,v23.4s
1644	ror	w12,w12,#20
1645	eor	v24.16b,v1.16b,v2.16b
1646	add	w5,w5,w9
1647	eor	v25.16b,v5.16b,v6.16b
1648	add	w6,w6,w10
1649	eor	v26.16b,v9.16b,v10.16b
1650	add	w7,w7,w11
1651	eor	v27.16b,v13.16b,v14.16b
1652	add	w8,w8,w12
1653	eor	v28.16b,v17.16b,v18.16b
1654	eor	w17,w17,w5
1655	eor	v29.16b,v21.16b,v22.16b
1656	eor	w19,w19,w6
1657	ushr	v1.4s,v24.4s,#20
1658	eor	w20,w20,w7
1659	ushr	v5.4s,v25.4s,#20
1660	eor	w21,w21,w8
1661	ushr	v9.4s,v26.4s,#20
1662	ror	w17,w17,#24
1663	ushr	v13.4s,v27.4s,#20
1664	ror	w19,w19,#24
1665	ushr	v17.4s,v28.4s,#20
1666	ror	w20,w20,#24
1667	ushr	v21.4s,v29.4s,#20
1668	ror	w21,w21,#24
1669	sli	v1.4s,v24.4s,#12
1670	add	w13,w13,w17
1671	sli	v5.4s,v25.4s,#12
1672	add	w14,w14,w19
1673	sli	v9.4s,v26.4s,#12
1674	add	w15,w15,w20
1675	sli	v13.4s,v27.4s,#12
1676	add	w16,w16,w21
1677	sli	v17.4s,v28.4s,#12
1678	eor	w9,w9,w13
1679	sli	v21.4s,v29.4s,#12
1680	eor	w10,w10,w14
1681	add	v0.4s,v0.4s,v1.4s
1682	eor	w11,w11,w15
1683	add	v4.4s,v4.4s,v5.4s
1684	eor	w12,w12,w16
1685	add	v8.4s,v8.4s,v9.4s
1686	ror	w9,w9,#25
1687	add	v12.4s,v12.4s,v13.4s
1688	ror	w10,w10,#25
1689	add	v16.4s,v16.4s,v17.4s
1690	ror	w11,w11,#25
1691	add	v20.4s,v20.4s,v21.4s
1692	ror	w12,w12,#25
1693	eor	v24.16b,v3.16b,v0.16b
// Integer: diagonal round of the second double round.
1694	add	w5,w5,w10
1695	eor	v25.16b,v7.16b,v4.16b
1696	add	w6,w6,w11
1697	eor	v26.16b,v11.16b,v8.16b
1698	add	w7,w7,w12
1699	eor	v27.16b,v15.16b,v12.16b
1700	add	w8,w8,w9
1701	eor	v28.16b,v19.16b,v16.16b
1702	eor	w21,w21,w5
1703	eor	v29.16b,v23.16b,v20.16b
1704	eor	w17,w17,w6
1705	ushr	v3.4s,v24.4s,#24
1706	eor	w19,w19,w7
1707	ushr	v7.4s,v25.4s,#24
1708	eor	w20,w20,w8
1709	ushr	v11.4s,v26.4s,#24
1710	ror	w21,w21,#16
1711	ushr	v15.4s,v27.4s,#24
1712	ror	w17,w17,#16
1713	ushr	v19.4s,v28.4s,#24
1714	ror	w19,w19,#16
1715	ushr	v23.4s,v29.4s,#24
1716	ror	w20,w20,#16
1717	sli	v3.4s,v24.4s,#8
1718	add	w15,w15,w21
1719	sli	v7.4s,v25.4s,#8
1720	add	w16,w16,w17
1721	sli	v11.4s,v26.4s,#8
1722	add	w13,w13,w19
1723	sli	v15.4s,v27.4s,#8
1724	add	w14,w14,w20
1725	sli	v19.4s,v28.4s,#8
1726	eor	w10,w10,w15
1727	sli	v23.4s,v29.4s,#8
1728	eor	w11,w11,w16
1729	add	v2.4s,v2.4s,v3.4s
1730	eor	w12,w12,w13
1731	add	v6.4s,v6.4s,v7.4s
1732	eor	w9,w9,w14
1733	add	v10.4s,v10.4s,v11.4s
1734	ror	w10,w10,#20
1735	add	v14.4s,v14.4s,v15.4s
1736	ror	w11,w11,#20
1737	add	v18.4s,v18.4s,v19.4s
1738	ror	w12,w12,#20
1739	add	v22.4s,v22.4s,v23.4s
1740	ror	w9,w9,#20
1741	eor	v24.16b,v1.16b,v2.16b
1742	add	w5,w5,w10
1743	eor	v25.16b,v5.16b,v6.16b
1744	add	w6,w6,w11
1745	eor	v26.16b,v9.16b,v10.16b
1746	add	w7,w7,w12
1747	eor	v27.16b,v13.16b,v14.16b
1748	add	w8,w8,w9
1749	eor	v28.16b,v17.16b,v18.16b
1750	eor	w21,w21,w5
1751	eor	v29.16b,v21.16b,v22.16b
1752	eor	w17,w17,w6
1753	ushr	v1.4s,v24.4s,#25
1754	eor	w19,w19,w7
1755	ushr	v5.4s,v25.4s,#25
1756	eor	w20,w20,w8
1757	ushr	v9.4s,v26.4s,#25
1758	ror	w21,w21,#24
1759	ushr	v13.4s,v27.4s,#25
1760	ror	w17,w17,#24
1761	ushr	v17.4s,v28.4s,#25
1762	ror	w19,w19,#24
1763	ushr	v21.4s,v29.4s,#25
1764	ror	w20,w20,#24
1765	sli	v1.4s,v24.4s,#7
1766	add	w15,w15,w21
1767	sli	v5.4s,v25.4s,#7
1768	add	w16,w16,w17
1769	sli	v9.4s,v26.4s,#7
1770	add	w13,w13,w19
1771	sli	v13.4s,v27.4s,#7
1772	add	w14,w14,w20
1773	sli	v17.4s,v28.4s,#7
1774	eor	w10,w10,w15
1775	sli	v21.4s,v29.4s,#7
1776	eor	w11,w11,w16
// NEON: undo the lane rotation (row c by two lanes, row d by #4, row b by
// #12) so the rows are back in column order for the next iteration.
1777	ext	v2.16b,v2.16b,v2.16b,#8
1778	eor	w12,w12,w13
1779	ext	v6.16b,v6.16b,v6.16b,#8
1780	eor	w9,w9,w14
1781	ext	v10.16b,v10.16b,v10.16b,#8
1782	ror	w10,w10,#25
1783	ext	v14.16b,v14.16b,v14.16b,#8
1784	ror	w11,w11,#25
1785	ext	v18.16b,v18.16b,v18.16b,#8
1786	ror	w12,w12,#25
1787	ext	v22.16b,v22.16b,v22.16b,#8
1788	ror	w9,w9,#25
1789	ext	v3.16b,v3.16b,v3.16b,#4
1790	ext	v7.16b,v7.16b,v7.16b,#4
1791	ext	v11.16b,v11.16b,v11.16b,#4
1792	ext	v15.16b,v15.16b,v15.16b,#4
1793	ext	v19.16b,v19.16b,v19.16b,#4
1794	ext	v23.16b,v23.16b,v23.16b,#4
1795	ext	v1.16b,v1.16b,v1.16b,#12
1796	ext	v5.16b,v5.16b,v5.16b,#12
1797	ext	v9.16b,v9.16b,v9.16b,#12
1798	ext	v13.16b,v13.16b,v13.16b,#12
1799	ext	v17.16b,v17.16b,v17.16b,#12
1800	ext	v21.16b,v21.16b,v21.16b,#12
1801	cbnz	x4,.Loop_lower_neon
1802
// Rounds are complete: add the input state back in and emit the remaining
// 7 x 64 bytes of this pass (second integer block plus six NEON blocks).
// q24-q29 are reloaded from the off-load area at sp. v24, v25 and v26 are
// added to rows a, b and c of every NEON block. Row d of blocks 0-3 gets
// v27, v28, v29 and v30; row d of blocks 4 and 5 gets v27+v31 and v28+v31,
// where v31 is the constant annotated +4. The integer block is accumulated
// and packed exactly as after the upper loop, interleaved with the NEON adds.
1803	add	w5,w5,w22		// accumulate key block
1804	ldp	q24,q25,[sp,#0]
1805	add	x6,x6,x22,lsr#32
1806	ldp	q26,q27,[sp,#32]
1807	add	w7,w7,w23
1808	ldp	q28,q29,[sp,#64]
1809	add	x8,x8,x23,lsr#32
1810	add	v0.4s,v0.4s,v24.4s
1811	add	w9,w9,w24
1812	add	v4.4s,v4.4s,v24.4s
1813	add	x10,x10,x24,lsr#32
1814	add	v8.4s,v8.4s,v24.4s
1815	add	w11,w11,w25
1816	add	v12.4s,v12.4s,v24.4s
1817	add	x12,x12,x25,lsr#32
1818	add	v16.4s,v16.4s,v24.4s
1819	add	w13,w13,w26
1820	add	v20.4s,v20.4s,v24.4s
1821	add	x14,x14,x26,lsr#32
1822	add	v2.4s,v2.4s,v26.4s
1823	add	w15,w15,w27
1824	add	v6.4s,v6.4s,v26.4s
1825	add	x16,x16,x27,lsr#32
1826	add	v10.4s,v10.4s,v26.4s
1827	add	w17,w17,w28
1828	add	v14.4s,v14.4s,v26.4s
1829	add	x19,x19,x28,lsr#32
1830	add	v18.4s,v18.4s,v26.4s
1831	add	w20,w20,w30
1832	add	v22.4s,v22.4s,v26.4s
1833	add	x21,x21,x30,lsr#32
1834	add	v19.4s,v19.4s,v31.4s			// +4
1835	add	x5,x5,x6,lsl#32	// pack
1836	add	v23.4s,v23.4s,v31.4s			// +4
1837	add	x7,x7,x8,lsl#32
1838	add	v3.4s,v3.4s,v27.4s
1839	ldp	x6,x8,[x1,#0]		// load input
1840	add	v7.4s,v7.4s,v28.4s
1841	add	x9,x9,x10,lsl#32
1842	add	v11.4s,v11.4s,v29.4s
1843	add	x11,x11,x12,lsl#32
1844	add	v15.4s,v15.4s,v30.4s
1845	ldp	x10,x12,[x1,#16]
1846	add	v19.4s,v19.4s,v27.4s
1847	add	x13,x13,x14,lsl#32
1848	add	v23.4s,v23.4s,v28.4s
1849	add	x15,x15,x16,lsl#32
1850	add	v1.4s,v1.4s,v25.4s
1851	ldp	x14,x16,[x1,#32]
1852	add	v5.4s,v5.4s,v25.4s
1853	add	x17,x17,x19,lsl#32
1854	add	v9.4s,v9.4s,v25.4s
1855	add	x20,x20,x21,lsl#32
1856	add	v13.4s,v13.4s,v25.4s
1857	ldp	x19,x21,[x1,#48]
1858	add	v17.4s,v17.4s,v25.4s
1859	add	x1,x1,#64
1860	add	v21.4s,v21.4s,v25.4s
1861
// Big-endian builds byte-reverse each packed 64-bit register before the XOR.
1862#ifdef	__ARMEB__
1863	rev	x5,x5
1864	rev	x7,x7
1865	rev	x9,x9
1866	rev	x11,x11
1867	rev	x13,x13
1868	rev	x15,x15
1869	rev	x17,x17
1870	rev	x20,x20
1871#endif
// XOR-and-store pipeline. Input is fetched 64 bytes at a time with
// post-incrementing ld1, one block ahead of its use. v24-v27 buffer the input
// for NEON blocks 0 and 1; after that, registers whose block has already
// been stored (v0-v3, v4-v7, v8-v11, v12-v15) are reused as input buffers.
1872	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1873	eor	x5,x5,x6
1874	eor	x7,x7,x8
1875	eor	x9,x9,x10
1876	eor	x11,x11,x12
1877	eor	x13,x13,x14
1878	eor	v0.16b,v0.16b,v24.16b
1879	eor	x15,x15,x16
1880	eor	v1.16b,v1.16b,v25.16b
1881	eor	x17,x17,x19
1882	eor	v2.16b,v2.16b,v26.16b
1883	eor	x20,x20,x21
1884	eor	v3.16b,v3.16b,v27.16b
1885	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1886
// x28 was already stepped by 1 after the first integer block, so together
// with the +7 below it advances by 8 per pass (2 integer + 6 NEON blocks).
1887	stp	x5,x7,[x0,#0]		// store output
1888	add	x28,x28,#7			// increment counter
1889	stp	x9,x11,[x0,#16]
1890	stp	x13,x15,[x0,#32]
1891	stp	x17,x20,[x0,#48]
1892	add	x0,x0,#64
1893	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1894
1895	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1896	eor	v4.16b,v4.16b,v24.16b
1897	eor	v5.16b,v5.16b,v25.16b
1898	eor	v6.16b,v6.16b,v26.16b
1899	eor	v7.16b,v7.16b,v27.16b
1900	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1901
// v24-v27 are no longer needed as input buffers; reload them from the
// off-load area while the remaining blocks are XORed and stored.
1902	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1903	eor	v8.16b,v8.16b,v0.16b
1904	ldp	q24,q25,[sp,#0]
1905	eor	v9.16b,v9.16b,v1.16b
1906	ldp	q26,q27,[sp,#32]
1907	eor	v10.16b,v10.16b,v2.16b
1908	eor	v11.16b,v11.16b,v3.16b
1909	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1910
1911	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1912	eor	v12.16b,v12.16b,v4.16b
1913	eor	v13.16b,v13.16b,v5.16b
1914	eor	v14.16b,v14.16b,v6.16b
1915	eor	v15.16b,v15.16b,v7.16b
1916	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1917
1918	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1919	eor	v16.16b,v16.16b,v8.16b
1920	eor	v17.16b,v17.16b,v9.16b
1921	eor	v18.16b,v18.16b,v10.16b
1922	eor	v19.16b,v19.16b,v11.16b
1923	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1924
// v0 is free again; use it for the per-pass step of the row d inputs.
1925	shl	v0.4s,v31.4s,#1			// 4 -> 8
1926	eor	v20.16b,v20.16b,v12.16b
1927	eor	v21.16b,v21.16b,v13.16b
1928	eor	v22.16b,v22.16b,v14.16b
1929	eor	v23.16b,v23.16b,v15.16b
1930	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1931
// Advance the row d inputs v27-v30 by eight blocks, matching the total of 8
// added to x28 on the integer side.
1932	add	v27.4s,v27.4s,v0.4s			// += 8
1933	add	v28.4s,v28.4s,v0.4s
1934	add	v29.4s,v29.4s,v0.4s
1935	add	v30.4s,v30.4s,v0.4s
1936
// Loop back for another 512-byte pass while the flags say "higher or same".
// NOTE(review): the flags tested here are set outside this chunk, presumably
// by a subs on x2 at the top of .Loop_outer_512_neon; nothing in the visible
// part of the loop body writes them. Confirm against the loop head.
1937	b.hs	.Loop_outer_512_neon
1938
// Leaving the 512-byte loop. Adding 512 back to x2 sets Z when no bytes are
// left; that flag survives the loads and stores below and is consumed by the
// b.eq further down.
1939	adds	x2,x2,#512
1940	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1941
// d8-d15 are callee-saved; restore them from their slots at sp+128..sp+191.
1942	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1943	ldp	d10,d11,[sp,#128+16]
1944	ldp	d12,d13,[sp,#128+32]
1945	ldp	d14,d15,[sp,#128+48]
1946
// Overwrite all 96 bytes of the off-load area (sp+0..sp+95) with the current
// contents of q24 and q31, so the saved state rows no longer sit on the stack.
1947	stp	q24,q31,[sp,#0]		// wipe off-load area
1948	stp	q24,q31,[sp,#32]
1949	stp	q24,q31,[sp,#64]
1950
1951	b.eq	.Ldone_512_neon		// x2 == 0: nothing left, take the epilogue
1952
// Bytes remain. 192 is the same threshold the function entry at the top of
// the file uses to choose between the NEON and the integer-only code.
// The sub and add instructions between cmp and b.hs do not write the flags.
// NOTE(review): the -1 adjustment of v27-v29 (v30 is left alone) presumably
// rebases the counters to the layout .Loop_outer_neon expects; that loop is
// outside this chunk, so confirm there.
1953	cmp	x2,#192
1954	sub	v27.4s,v27.4s,v0.4s			// -= 1
1955	sub	v28.4s,v28.4s,v0.4s
1956	sub	v29.4s,v29.4s,v0.4s
1957	add	sp,sp,#128			// drop the 128 bytes below the d8-d15 slots
1958	b.hs	.Loop_outer_neon
1959
// Fewer than 192 bytes left: clear v25-v30 and finish on the integer-only
// path at .Loop_outer.
1960	eor	v25.16b,v25.16b,v25.16b
1961	eor	v26.16b,v26.16b,v26.16b
1962	eor	v27.16b,v27.16b,v27.16b
1963	eor	v28.16b,v28.16b,v28.16b
1964	eor	v29.16b,v29.16b,v29.16b
1965	eor	v30.16b,v30.16b,v30.16b
1966	b	.Loop_outer
1967
// Epilogue, reached by the b.eq above when the input has been consumed
// exactly. d8-d15 have already been restored before that branch.
// x19-x28 are reloaded from offsets 16..80 of the frame addressed by x29,
// 128+64 bytes of locals are released, and x29/x30 are popped together with
// the 96-byte register save area. The offsets mirror the ChaCha20_ctr32
// prologue at the top of the file; the prologue of this function is outside
// this chunk.
1968.Ldone_512_neon:
1969	ldp	x19,x20,[x29,#16]
1970	add	sp,sp,#128+64
1971	ldp	x21,x22,[x29,#32]
1972	ldp	x23,x24,[x29,#48]
1973	ldp	x25,x26,[x29,#64]
1974	ldp	x27,x28,[x29,#80]
1975	ldp	x29,x30,[sp],#96
1976.inst	0xd50323bf			// autiasp
1977	ret
1978.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1979