1*1dcdf01fSchristos#include "arm_asm.h"
2*1dcdf01fSchristos#include "arm_arch.h"
3*1dcdf01fSchristos
4*1dcdf01fSchristos.text
5*1dcdf01fSchristos
6*1dcdf01fSchristos
7*1dcdf01fSchristos.hidden	OPENSSL_armcap_P
8*1dcdf01fSchristos
// Constant pool. It follows the .text directive above with no other section
// directive in between, and the functions below reach it PC-relatively with adr.
9*1dcdf01fSchristos.align	5
// .Lsigma: 16 bytes of ChaCha constant. Stored little-endian, the two quads
// are the ASCII text "expand 32-byte k".
10*1dcdf01fSchristos.Lsigma:
11*1dcdf01fSchristos.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// .Lone: vector {1,0,0,0}. It sits directly after .Lsigma, so the NEON code
// reaches it by post-incrementing the .Lsigma pointer by 16 and uses it to
// step the first 32-bit word of the counter block.
12*1dcdf01fSchristos.Lone:
13*1dcdf01fSchristos.long	1,0,0,0
// .LOPENSSL_armcap_P: distance from this location to OPENSSL_armcap_P
// (32-bit under __ILP32__, 64-bit otherwise). ChaCha20_ctr32 adds it to the
// address of this label to read the capability word position-independently.
14*1dcdf01fSchristos.LOPENSSL_armcap_P:
15*1dcdf01fSchristos#ifdef	__ILP32__
16*1dcdf01fSchristos.long	OPENSSL_armcap_P-.
17*1dcdf01fSchristos#else
18*1dcdf01fSchristos.quad	OPENSSL_armcap_P-.
19*1dcdf01fSchristos#endif
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
20*1dcdf01fSchristos.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 4 bytes after the odd-length string.
21*1dcdf01fSchristos.align	2
22*1dcdf01fSchristos
// ChaCha20_ctr32 -- XOR the input with ChaCha20 keystream, 64 bytes at a time,
// using integer registers only; large requests are handed to ChaCha20_neon.
//
// Register roles as used by the code below:
//   x0 = output pointer  (written with stp/strb, advanced by 64 per block)
//   x1 = input pointer   (read with ldp/ldrb, advanced by 64 per block)
//   x2 = length in bytes (0 returns immediately)
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block; later reused as round counter
// NOTE(review): this presumably matches the OpenSSL C prototype
//   ChaCha20_ctr32(out, inp, len, key[8], counter[4]) -- confirm in the header.
//
// Saved input state: x22,x23 = sigma, x24-x27 = key, x28,x30 = counter block.
// Working state:     sixteen 32-bit words in w5-w17 and w19-w21 (x18 unused).
// Stack:             96-byte frame holding x29,x30,x19-x28, plus 64 bytes of
//                    scratch that only the final partial block writes to.
23*1dcdf01fSchristos.globl	ChaCha20_ctr32
24*1dcdf01fSchristos.type	ChaCha20_ctr32,%function
25*1dcdf01fSchristos.align	5
26*1dcdf01fSchristosChaCha20_ctr32:
	// Nothing to do for a zero length.
27*1dcdf01fSchristos	cbz	x2,.Labort
28*1dcdf01fSchristos	adr	x5,.LOPENSSL_armcap_P
	// Requests shorter than 192 bytes always take the scalar path and skip
	// the capability check.
29*1dcdf01fSchristos	cmp	x2,#192
30*1dcdf01fSchristos	b.lo	.Lshort
	// x6 = stored offset, x5 = address of the offset, so [x6,x5] is the
	// OPENSSL_armcap_P word itself.
31*1dcdf01fSchristos#ifdef	__ILP32__
32*1dcdf01fSchristos	ldrsw	x6,[x5]
33*1dcdf01fSchristos#else
34*1dcdf01fSchristos	ldr	x6,[x5]
35*1dcdf01fSchristos#endif
36*1dcdf01fSchristos	ldr	w17,[x6,x5]
	// Branch (not call) to the NEON routine when the ARMV7_NEON bit is set;
	// no frame has been built yet, so ChaCha20_neon returns to our caller.
37*1dcdf01fSchristos	tst	w17,#ARMV7_NEON
38*1dcdf01fSchristos	b.ne	ChaCha20_neon
39*1dcdf01fSchristos
40*1dcdf01fSchristos.Lshort:
41*1dcdf01fSchristos.inst	0xd503233f			// paciasp
	// Build the 96-byte frame: x29/x30 at the bottom, x19-x28 above them.
42*1dcdf01fSchristos	stp	x29,x30,[sp,#-96]!
43*1dcdf01fSchristos	add	x29,sp,#0
44*1dcdf01fSchristos
45*1dcdf01fSchristos	adr	x5,.Lsigma
46*1dcdf01fSchristos	stp	x19,x20,[sp,#16]
47*1dcdf01fSchristos	stp	x21,x22,[sp,#32]
48*1dcdf01fSchristos	stp	x23,x24,[sp,#48]
49*1dcdf01fSchristos	stp	x25,x26,[sp,#64]
50*1dcdf01fSchristos	stp	x27,x28,[sp,#80]
	// 64 bytes of scratch for the keystream of a trailing partial block.
51*1dcdf01fSchristos	sub	sp,sp,#64
52*1dcdf01fSchristos
	// Keep the 16-word input state packed two words per 64-bit register.
	// x30 is free to hold data because the link register was saved above.
53*1dcdf01fSchristos	ldp	x22,x23,[x5]		// load sigma
54*1dcdf01fSchristos	ldp	x24,x25,[x3]		// load key
55*1dcdf01fSchristos	ldp	x26,x27,[x3,#16]
56*1dcdf01fSchristos	ldp	x28,x30,[x4]		// load counter
	// Big-endian builds: swap the two 32-bit halves of every loaded register
	// (sigma is excluded; its quads are assembled in register order).
57*1dcdf01fSchristos#ifdef	__ARMEB__
58*1dcdf01fSchristos	ror	x24,x24,#32
59*1dcdf01fSchristos	ror	x25,x25,#32
60*1dcdf01fSchristos	ror	x26,x26,#32
61*1dcdf01fSchristos	ror	x27,x27,#32
62*1dcdf01fSchristos	ror	x28,x28,#32
63*1dcdf01fSchristos	ror	x30,x30,#32
64*1dcdf01fSchristos#endif
65*1dcdf01fSchristos
	// One pass of .Loop_outer produces one 64-byte keystream block.
66*1dcdf01fSchristos.Loop_outer:
	// Low half of each packed register goes to an even-position word via a
	// w-register move, high half to the following word via lsr #32.
67*1dcdf01fSchristos	mov	w5,w22			// unpack key block
68*1dcdf01fSchristos	lsr	x6,x22,#32
69*1dcdf01fSchristos	mov	w7,w23
70*1dcdf01fSchristos	lsr	x8,x23,#32
71*1dcdf01fSchristos	mov	w9,w24
72*1dcdf01fSchristos	lsr	x10,x24,#32
73*1dcdf01fSchristos	mov	w11,w25
74*1dcdf01fSchristos	lsr	x12,x25,#32
75*1dcdf01fSchristos	mov	w13,w26
76*1dcdf01fSchristos	lsr	x14,x26,#32
77*1dcdf01fSchristos	mov	w15,w27
78*1dcdf01fSchristos	lsr	x16,x27,#32
79*1dcdf01fSchristos	mov	w17,w28
80*1dcdf01fSchristos	lsr	x19,x28,#32
81*1dcdf01fSchristos	mov	w20,w30
82*1dcdf01fSchristos	lsr	x21,x30,#32
83*1dcdf01fSchristos
	// x4 = 10 loop passes; each pass below runs two groups of four
	// quarter-rounds.
84*1dcdf01fSchristos	mov	x4,#10
	// The flags set here stay live until b.lo .Ltail and b.hi .Loop_outer
	// further down: nothing in between (add/eor/ror/cbnz/ldp/stp/rev) writes
	// the condition flags. Do not insert flag-setting instructions.
85*1dcdf01fSchristos	subs	x2,x2,#64
86*1dcdf01fSchristos.Loop:
87*1dcdf01fSchristos	sub	x4,x4,#1
	// First group: four quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
	// (w7,w11,w15,w20), (w8,w12,w16,w21), interleaved one step at a time.
	// ror by 16/20/24/25 is a left-rotate by 16/12/8/7.
88*1dcdf01fSchristos	add	w5,w5,w9
89*1dcdf01fSchristos	add	w6,w6,w10
90*1dcdf01fSchristos	add	w7,w7,w11
91*1dcdf01fSchristos	add	w8,w8,w12
92*1dcdf01fSchristos	eor	w17,w17,w5
93*1dcdf01fSchristos	eor	w19,w19,w6
94*1dcdf01fSchristos	eor	w20,w20,w7
95*1dcdf01fSchristos	eor	w21,w21,w8
96*1dcdf01fSchristos	ror	w17,w17,#16
97*1dcdf01fSchristos	ror	w19,w19,#16
98*1dcdf01fSchristos	ror	w20,w20,#16
99*1dcdf01fSchristos	ror	w21,w21,#16
100*1dcdf01fSchristos	add	w13,w13,w17
101*1dcdf01fSchristos	add	w14,w14,w19
102*1dcdf01fSchristos	add	w15,w15,w20
103*1dcdf01fSchristos	add	w16,w16,w21
104*1dcdf01fSchristos	eor	w9,w9,w13
105*1dcdf01fSchristos	eor	w10,w10,w14
106*1dcdf01fSchristos	eor	w11,w11,w15
107*1dcdf01fSchristos	eor	w12,w12,w16
108*1dcdf01fSchristos	ror	w9,w9,#20
109*1dcdf01fSchristos	ror	w10,w10,#20
110*1dcdf01fSchristos	ror	w11,w11,#20
111*1dcdf01fSchristos	ror	w12,w12,#20
112*1dcdf01fSchristos	add	w5,w5,w9
113*1dcdf01fSchristos	add	w6,w6,w10
114*1dcdf01fSchristos	add	w7,w7,w11
115*1dcdf01fSchristos	add	w8,w8,w12
116*1dcdf01fSchristos	eor	w17,w17,w5
117*1dcdf01fSchristos	eor	w19,w19,w6
118*1dcdf01fSchristos	eor	w20,w20,w7
119*1dcdf01fSchristos	eor	w21,w21,w8
120*1dcdf01fSchristos	ror	w17,w17,#24
121*1dcdf01fSchristos	ror	w19,w19,#24
122*1dcdf01fSchristos	ror	w20,w20,#24
123*1dcdf01fSchristos	ror	w21,w21,#24
124*1dcdf01fSchristos	add	w13,w13,w17
125*1dcdf01fSchristos	add	w14,w14,w19
126*1dcdf01fSchristos	add	w15,w15,w20
127*1dcdf01fSchristos	add	w16,w16,w21
128*1dcdf01fSchristos	eor	w9,w9,w13
129*1dcdf01fSchristos	eor	w10,w10,w14
130*1dcdf01fSchristos	eor	w11,w11,w15
131*1dcdf01fSchristos	eor	w12,w12,w16
132*1dcdf01fSchristos	ror	w9,w9,#25
133*1dcdf01fSchristos	ror	w10,w10,#25
134*1dcdf01fSchristos	ror	w11,w11,#25
135*1dcdf01fSchristos	ror	w12,w12,#25
	// Second group: the same quarter-round on the diagonals
	// (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
136*1dcdf01fSchristos	add	w5,w5,w10
137*1dcdf01fSchristos	add	w6,w6,w11
138*1dcdf01fSchristos	add	w7,w7,w12
139*1dcdf01fSchristos	add	w8,w8,w9
140*1dcdf01fSchristos	eor	w21,w21,w5
141*1dcdf01fSchristos	eor	w17,w17,w6
142*1dcdf01fSchristos	eor	w19,w19,w7
143*1dcdf01fSchristos	eor	w20,w20,w8
144*1dcdf01fSchristos	ror	w21,w21,#16
145*1dcdf01fSchristos	ror	w17,w17,#16
146*1dcdf01fSchristos	ror	w19,w19,#16
147*1dcdf01fSchristos	ror	w20,w20,#16
148*1dcdf01fSchristos	add	w15,w15,w21
149*1dcdf01fSchristos	add	w16,w16,w17
150*1dcdf01fSchristos	add	w13,w13,w19
151*1dcdf01fSchristos	add	w14,w14,w20
152*1dcdf01fSchristos	eor	w10,w10,w15
153*1dcdf01fSchristos	eor	w11,w11,w16
154*1dcdf01fSchristos	eor	w12,w12,w13
155*1dcdf01fSchristos	eor	w9,w9,w14
156*1dcdf01fSchristos	ror	w10,w10,#20
157*1dcdf01fSchristos	ror	w11,w11,#20
158*1dcdf01fSchristos	ror	w12,w12,#20
159*1dcdf01fSchristos	ror	w9,w9,#20
160*1dcdf01fSchristos	add	w5,w5,w10
161*1dcdf01fSchristos	add	w6,w6,w11
162*1dcdf01fSchristos	add	w7,w7,w12
163*1dcdf01fSchristos	add	w8,w8,w9
164*1dcdf01fSchristos	eor	w21,w21,w5
165*1dcdf01fSchristos	eor	w17,w17,w6
166*1dcdf01fSchristos	eor	w19,w19,w7
167*1dcdf01fSchristos	eor	w20,w20,w8
168*1dcdf01fSchristos	ror	w21,w21,#24
169*1dcdf01fSchristos	ror	w17,w17,#24
170*1dcdf01fSchristos	ror	w19,w19,#24
171*1dcdf01fSchristos	ror	w20,w20,#24
172*1dcdf01fSchristos	add	w15,w15,w21
173*1dcdf01fSchristos	add	w16,w16,w17
174*1dcdf01fSchristos	add	w13,w13,w19
175*1dcdf01fSchristos	add	w14,w14,w20
176*1dcdf01fSchristos	eor	w10,w10,w15
177*1dcdf01fSchristos	eor	w11,w11,w16
178*1dcdf01fSchristos	eor	w12,w12,w13
179*1dcdf01fSchristos	eor	w9,w9,w14
180*1dcdf01fSchristos	ror	w10,w10,#25
181*1dcdf01fSchristos	ror	w11,w11,#25
182*1dcdf01fSchristos	ror	w12,w12,#25
183*1dcdf01fSchristos	ror	w9,w9,#25
	// cbnz does not touch the flags left by subs above.
184*1dcdf01fSchristos	cbnz	x4,.Loop
185*1dcdf01fSchristos
	// Add the input state back in. Odd-position words use a 64-bit add of
	// the shifted-down high half; a carry into bit 32 is harmless because
	// the later "lsl#32" pack shifts it out.
186*1dcdf01fSchristos	add	w5,w5,w22		// accumulate key block
187*1dcdf01fSchristos	add	x6,x6,x22,lsr#32
188*1dcdf01fSchristos	add	w7,w7,w23
189*1dcdf01fSchristos	add	x8,x8,x23,lsr#32
190*1dcdf01fSchristos	add	w9,w9,w24
191*1dcdf01fSchristos	add	x10,x10,x24,lsr#32
192*1dcdf01fSchristos	add	w11,w11,w25
193*1dcdf01fSchristos	add	x12,x12,x25,lsr#32
194*1dcdf01fSchristos	add	w13,w13,w26
195*1dcdf01fSchristos	add	x14,x14,x26,lsr#32
196*1dcdf01fSchristos	add	w15,w15,w27
197*1dcdf01fSchristos	add	x16,x16,x27,lsr#32
198*1dcdf01fSchristos	add	w17,w17,w28
199*1dcdf01fSchristos	add	x19,x19,x28,lsr#32
200*1dcdf01fSchristos	add	w20,w20,w30
201*1dcdf01fSchristos	add	x21,x21,x30,lsr#32
202*1dcdf01fSchristos
	// Flags from "subs x2,x2,#64": borrow means fewer than 64 bytes were
	// left, so this block is only partially consumed.
203*1dcdf01fSchristos	b.lo	.Ltail
204*1dcdf01fSchristos
	// Full block: re-pack word pairs into 64-bit registers while the 64
	// input bytes are loaded into the registers just freed.
205*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
206*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
207*1dcdf01fSchristos	ldp	x6,x8,[x1,#0]		// load input
208*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
209*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
210*1dcdf01fSchristos	ldp	x10,x12,[x1,#16]
211*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
212*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
213*1dcdf01fSchristos	ldp	x14,x16,[x1,#32]
214*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
215*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
216*1dcdf01fSchristos	ldp	x19,x21,[x1,#48]
217*1dcdf01fSchristos	add	x1,x1,#64
	// Big-endian builds: byte-reverse each packed keystream register before
	// it is XORed with the loaded input.
218*1dcdf01fSchristos#ifdef	__ARMEB__
219*1dcdf01fSchristos	rev	x5,x5
220*1dcdf01fSchristos	rev	x7,x7
221*1dcdf01fSchristos	rev	x9,x9
222*1dcdf01fSchristos	rev	x11,x11
223*1dcdf01fSchristos	rev	x13,x13
224*1dcdf01fSchristos	rev	x15,x15
225*1dcdf01fSchristos	rev	x17,x17
226*1dcdf01fSchristos	rev	x20,x20
227*1dcdf01fSchristos#endif
228*1dcdf01fSchristos	eor	x5,x5,x6
229*1dcdf01fSchristos	eor	x7,x7,x8
230*1dcdf01fSchristos	eor	x9,x9,x10
231*1dcdf01fSchristos	eor	x11,x11,x12
232*1dcdf01fSchristos	eor	x13,x13,x14
233*1dcdf01fSchristos	eor	x15,x15,x16
234*1dcdf01fSchristos	eor	x17,x17,x19
235*1dcdf01fSchristos	eor	x20,x20,x21
236*1dcdf01fSchristos
237*1dcdf01fSchristos	stp	x5,x7,[x0,#0]		// store output
	// The low 32 bits of x28 are the first word of the counter block.
	// NOTE(review): this is a 64-bit add, so a wrap of that word would carry
	// into the second word -- presumably callers never let it wrap; confirm.
238*1dcdf01fSchristos	add	x28,x28,#1			// increment counter
239*1dcdf01fSchristos	stp	x9,x11,[x0,#16]
240*1dcdf01fSchristos	stp	x13,x15,[x0,#32]
241*1dcdf01fSchristos	stp	x17,x20,[x0,#48]
242*1dcdf01fSchristos	add	x0,x0,#64
243*1dcdf01fSchristos
	// Still the flags from "subs x2,x2,#64": loop while bytes remain.
244*1dcdf01fSchristos	b.hi	.Loop_outer
245*1dcdf01fSchristos
	// Length was a multiple of 64: the scratch area was never written, so
	// it is released without wiping. Restore callee-saved registers via x29.
246*1dcdf01fSchristos	ldp	x19,x20,[x29,#16]
247*1dcdf01fSchristos	add	sp,sp,#64
248*1dcdf01fSchristos	ldp	x21,x22,[x29,#32]
249*1dcdf01fSchristos	ldp	x23,x24,[x29,#48]
250*1dcdf01fSchristos	ldp	x25,x26,[x29,#64]
251*1dcdf01fSchristos	ldp	x27,x28,[x29,#80]
252*1dcdf01fSchristos	ldp	x29,x30,[sp],#96
253*1dcdf01fSchristos.inst	0xd50323bf			// autiasp
	// .Labort is also the zero-length exit; it is reached from the entry
	// before any frame or paciasp, so it must stay after autiasp.
254*1dcdf01fSchristos.Labort:
255*1dcdf01fSchristos	ret
256*1dcdf01fSchristos
257*1dcdf01fSchristos.align	4
	// Partial final block. Undo the subtraction so x2 is again the number
	// of bytes left (less than 64).
258*1dcdf01fSchristos.Ltail:
259*1dcdf01fSchristos	add	x2,x2,#64
	// .Less_than_64 is also entered from ChaCha20_neon, which has the same
	// frame layout and leaves its scalar block unpacked in w5-w21.
260*1dcdf01fSchristos.Less_than_64:
	// Point x1/x4 just past the end of input/scratch and count x2 up from
	// -len to 0. x0 is biased by -1 because the store below uses x2 after
	// it has been incremented.
261*1dcdf01fSchristos	sub	x0,x0,#1
262*1dcdf01fSchristos	add	x1,x1,x2
263*1dcdf01fSchristos	add	x0,x0,x2
264*1dcdf01fSchristos	add	x4,sp,x2
265*1dcdf01fSchristos	neg	x2,x2
266*1dcdf01fSchristos
267*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
268*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
269*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
270*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
271*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
272*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
273*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
274*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
275*1dcdf01fSchristos#ifdef	__ARMEB__
276*1dcdf01fSchristos	rev	x5,x5
277*1dcdf01fSchristos	rev	x7,x7
278*1dcdf01fSchristos	rev	x9,x9
279*1dcdf01fSchristos	rev	x11,x11
280*1dcdf01fSchristos	rev	x13,x13
281*1dcdf01fSchristos	rev	x15,x15
282*1dcdf01fSchristos	rev	x17,x17
283*1dcdf01fSchristos	rev	x20,x20
284*1dcdf01fSchristos#endif
	// Spill the whole 64-byte keystream block to the stack scratch area.
285*1dcdf01fSchristos	stp	x5,x7,[sp,#0]
286*1dcdf01fSchristos	stp	x9,x11,[sp,#16]
287*1dcdf01fSchristos	stp	x13,x15,[sp,#32]
288*1dcdf01fSchristos	stp	x17,x20,[sp,#48]
289*1dcdf01fSchristos
	// Byte-wise XOR of the remaining input with the spilled keystream.
290*1dcdf01fSchristos.Loop_tail:
291*1dcdf01fSchristos	ldrb	w10,[x1,x2]
292*1dcdf01fSchristos	ldrb	w11,[x4,x2]
293*1dcdf01fSchristos	add	x2,x2,#1
294*1dcdf01fSchristos	eor	w10,w10,w11
295*1dcdf01fSchristos	strb	w10,[x0,x2]
296*1dcdf01fSchristos	cbnz	x2,.Loop_tail
297*1dcdf01fSchristos
	// Overwrite the spilled keystream with zeros before releasing the stack.
298*1dcdf01fSchristos	stp	xzr,xzr,[sp,#0]
299*1dcdf01fSchristos	stp	xzr,xzr,[sp,#16]
300*1dcdf01fSchristos	stp	xzr,xzr,[sp,#32]
301*1dcdf01fSchristos	stp	xzr,xzr,[sp,#48]
302*1dcdf01fSchristos
303*1dcdf01fSchristos	ldp	x19,x20,[x29,#16]
304*1dcdf01fSchristos	add	sp,sp,#64
305*1dcdf01fSchristos	ldp	x21,x22,[x29,#32]
306*1dcdf01fSchristos	ldp	x23,x24,[x29,#48]
307*1dcdf01fSchristos	ldp	x25,x26,[x29,#64]
308*1dcdf01fSchristos	ldp	x27,x28,[x29,#80]
309*1dcdf01fSchristos	ldp	x29,x30,[sp],#96
310*1dcdf01fSchristos.inst	0xd50323bf			// autiasp
311*1dcdf01fSchristos	ret
312*1dcdf01fSchristos.size	ChaCha20_ctr32,.-ChaCha20_ctr32
313*1dcdf01fSchristos
// ChaCha20_neon -- file-local (no .globl) routine entered by a branch from
// ChaCha20_ctr32 with the argument registers untouched:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = pointer to 32 bytes of key, x4 = pointer to 16-byte counter block.
//
// Each outer pass produces 256 bytes of keystream as four 64-byte blocks:
//   block 0 in integer registers w5-w17,w19-w21 (counter + 0)
//   block 1 in v0-v3                            (counter + 1)
//   block 2 in v4-v7                            (counter + 2)
//   block 3 in v16-v19                          (counter + 3)
// Saved input state: x22-x28,x30 (packed words) and v24 = sigma,
// v25,v26 = key, v27/v28/v29 = counter rows of blocks 1/2/3, v31 = step.
// v20-v23 are temporaries. Only v0-v7 and v16-v31 are used, so no SIMD
// registers are saved. Lengths of 512 bytes or more are diverted to
// .L512_or_more_neon inside ChaCha20_512_neon.
314*1dcdf01fSchristos.type	ChaCha20_neon,%function
315*1dcdf01fSchristos.align	5
316*1dcdf01fSchristosChaCha20_neon:
317*1dcdf01fSchristos.inst	0xd503233f			// paciasp
	// Same 96-byte frame as ChaCha20_ctr32; the shared tail code at
	// .Less_than_64 relies on the two layouts being identical.
318*1dcdf01fSchristos	stp	x29,x30,[sp,#-96]!
319*1dcdf01fSchristos	add	x29,sp,#0
320*1dcdf01fSchristos
321*1dcdf01fSchristos	adr	x5,.Lsigma
322*1dcdf01fSchristos	stp	x19,x20,[sp,#16]
323*1dcdf01fSchristos	stp	x21,x22,[sp,#32]
324*1dcdf01fSchristos	stp	x23,x24,[sp,#48]
325*1dcdf01fSchristos	stp	x25,x26,[sp,#64]
326*1dcdf01fSchristos	stp	x27,x28,[sp,#80]
	// The target label sits right after an identical prologue in
	// ChaCha20_512_neon, so the frame built above is reused there.
327*1dcdf01fSchristos	cmp	x2,#512
328*1dcdf01fSchristos	b.hs	.L512_or_more_neon
329*1dcdf01fSchristos
	// 64 bytes of scratch for the keystream of a trailing partial block.
330*1dcdf01fSchristos	sub	sp,sp,#64
331*1dcdf01fSchristos
	// Load every piece of input state twice: packed into x registers for
	// the scalar block and into v registers for the three NEON blocks.
332*1dcdf01fSchristos	ldp	x22,x23,[x5]		// load sigma
	// Post-increment leaves x5 pointing at .Lone.
333*1dcdf01fSchristos	ld1	{v24.4s},[x5],#16
334*1dcdf01fSchristos	ldp	x24,x25,[x3]		// load key
335*1dcdf01fSchristos	ldp	x26,x27,[x3,#16]
336*1dcdf01fSchristos	ld1	{v25.4s,v26.4s},[x3]
337*1dcdf01fSchristos	ldp	x28,x30,[x4]		// load counter
338*1dcdf01fSchristos	ld1	{v27.4s},[x4]
	// v31 = {1,0,0,0}: adds 1 to the first word of a counter row.
339*1dcdf01fSchristos	ld1	{v31.4s},[x5]
	// Big-endian builds: swap adjacent 32-bit lanes of the sigma vector and
	// the 32-bit halves of the packed key and counter registers.
340*1dcdf01fSchristos#ifdef	__ARMEB__
341*1dcdf01fSchristos	rev64	v24.4s,v24.4s
342*1dcdf01fSchristos	ror	x24,x24,#32
343*1dcdf01fSchristos	ror	x25,x25,#32
344*1dcdf01fSchristos	ror	x26,x26,#32
345*1dcdf01fSchristos	ror	x27,x27,#32
346*1dcdf01fSchristos	ror	x28,x28,#32
347*1dcdf01fSchristos	ror	x30,x30,#32
348*1dcdf01fSchristos#endif
	// v27/v28/v29 = counter row + 1/+2/+3; the scalar block keeps + 0 in x28.
349*1dcdf01fSchristos	add	v27.4s,v27.4s,v31.4s		// += 1
350*1dcdf01fSchristos	add	v28.4s,v27.4s,v31.4s
351*1dcdf01fSchristos	add	v29.4s,v28.4s,v31.4s
	// From here on v31 = {4,0,0,0}, the per-pass counter step.
352*1dcdf01fSchristos	shl	v31.4s,v31.4s,#2			// 1 -> 4
353*1dcdf01fSchristos
354*1dcdf01fSchristos.Loop_outer_neon:
	// Initialise all four working blocks from the saved state; scalar and
	// NEON instructions are interleaved throughout this routine.
355*1dcdf01fSchristos	mov	w5,w22			// unpack key block
356*1dcdf01fSchristos	lsr	x6,x22,#32
357*1dcdf01fSchristos	mov	v0.16b,v24.16b
358*1dcdf01fSchristos	mov	w7,w23
359*1dcdf01fSchristos	lsr	x8,x23,#32
360*1dcdf01fSchristos	mov	v4.16b,v24.16b
361*1dcdf01fSchristos	mov	w9,w24
362*1dcdf01fSchristos	lsr	x10,x24,#32
363*1dcdf01fSchristos	mov	v16.16b,v24.16b
364*1dcdf01fSchristos	mov	w11,w25
365*1dcdf01fSchristos	mov	v1.16b,v25.16b
366*1dcdf01fSchristos	lsr	x12,x25,#32
367*1dcdf01fSchristos	mov	v5.16b,v25.16b
368*1dcdf01fSchristos	mov	w13,w26
369*1dcdf01fSchristos	mov	v17.16b,v25.16b
370*1dcdf01fSchristos	lsr	x14,x26,#32
371*1dcdf01fSchristos	mov	v3.16b,v27.16b
372*1dcdf01fSchristos	mov	w15,w27
373*1dcdf01fSchristos	mov	v7.16b,v28.16b
374*1dcdf01fSchristos	lsr	x16,x27,#32
375*1dcdf01fSchristos	mov	v19.16b,v29.16b
376*1dcdf01fSchristos	mov	w17,w28
377*1dcdf01fSchristos	mov	v2.16b,v26.16b
378*1dcdf01fSchristos	lsr	x19,x28,#32
379*1dcdf01fSchristos	mov	v6.16b,v26.16b
380*1dcdf01fSchristos	mov	w20,w30
381*1dcdf01fSchristos	mov	v18.16b,v26.16b
382*1dcdf01fSchristos	lsr	x21,x30,#32
383*1dcdf01fSchristos
	// x4 = 10 loop passes of two quarter-round groups each.
384*1dcdf01fSchristos	mov	x4,#10
	// The flags set here stay live until b.lo .Ltail_neon and
	// b.hi .Loop_outer_neon: nothing in between writes the condition flags.
385*1dcdf01fSchristos	subs	x2,x2,#256
386*1dcdf01fSchristos.Loop_neon:
387*1dcdf01fSchristos	sub	x4,x4,#1
	// First group. NEON side: each vector is one row of a block, so one
	// vector op performs the same step of four quarter-rounds at once.
	// Rotations: rev32 on .8h lanes = rotate by 16; ushr #20 + sli #12 =
	// left-rotate by 12; ushr #24 + sli #8 = by 8; ushr #25 + sli #7 = by 7.
	// Scalar side: same sequence as .Loop in ChaCha20_ctr32.
388*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
389*1dcdf01fSchristos	add	w5,w5,w9
390*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
391*1dcdf01fSchristos	add	w6,w6,w10
392*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
393*1dcdf01fSchristos	add	w7,w7,w11
394*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
395*1dcdf01fSchristos	add	w8,w8,w12
396*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
397*1dcdf01fSchristos	eor	w17,w17,w5
398*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
399*1dcdf01fSchristos	eor	w19,w19,w6
400*1dcdf01fSchristos	rev32	v3.8h,v3.8h
401*1dcdf01fSchristos	eor	w20,w20,w7
402*1dcdf01fSchristos	rev32	v7.8h,v7.8h
403*1dcdf01fSchristos	eor	w21,w21,w8
404*1dcdf01fSchristos	rev32	v19.8h,v19.8h
405*1dcdf01fSchristos	ror	w17,w17,#16
406*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
407*1dcdf01fSchristos	ror	w19,w19,#16
408*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
409*1dcdf01fSchristos	ror	w20,w20,#16
410*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
411*1dcdf01fSchristos	ror	w21,w21,#16
412*1dcdf01fSchristos	eor	v20.16b,v1.16b,v2.16b
413*1dcdf01fSchristos	add	w13,w13,w17
414*1dcdf01fSchristos	eor	v21.16b,v5.16b,v6.16b
415*1dcdf01fSchristos	add	w14,w14,w19
416*1dcdf01fSchristos	eor	v22.16b,v17.16b,v18.16b
417*1dcdf01fSchristos	add	w15,w15,w20
418*1dcdf01fSchristos	ushr	v1.4s,v20.4s,#20
419*1dcdf01fSchristos	add	w16,w16,w21
420*1dcdf01fSchristos	ushr	v5.4s,v21.4s,#20
421*1dcdf01fSchristos	eor	w9,w9,w13
422*1dcdf01fSchristos	ushr	v17.4s,v22.4s,#20
423*1dcdf01fSchristos	eor	w10,w10,w14
424*1dcdf01fSchristos	sli	v1.4s,v20.4s,#12
425*1dcdf01fSchristos	eor	w11,w11,w15
426*1dcdf01fSchristos	sli	v5.4s,v21.4s,#12
427*1dcdf01fSchristos	eor	w12,w12,w16
428*1dcdf01fSchristos	sli	v17.4s,v22.4s,#12
429*1dcdf01fSchristos	ror	w9,w9,#20
430*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
431*1dcdf01fSchristos	ror	w10,w10,#20
432*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
433*1dcdf01fSchristos	ror	w11,w11,#20
434*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
435*1dcdf01fSchristos	ror	w12,w12,#20
436*1dcdf01fSchristos	eor	v20.16b,v3.16b,v0.16b
437*1dcdf01fSchristos	add	w5,w5,w9
438*1dcdf01fSchristos	eor	v21.16b,v7.16b,v4.16b
439*1dcdf01fSchristos	add	w6,w6,w10
440*1dcdf01fSchristos	eor	v22.16b,v19.16b,v16.16b
441*1dcdf01fSchristos	add	w7,w7,w11
442*1dcdf01fSchristos	ushr	v3.4s,v20.4s,#24
443*1dcdf01fSchristos	add	w8,w8,w12
444*1dcdf01fSchristos	ushr	v7.4s,v21.4s,#24
445*1dcdf01fSchristos	eor	w17,w17,w5
446*1dcdf01fSchristos	ushr	v19.4s,v22.4s,#24
447*1dcdf01fSchristos	eor	w19,w19,w6
448*1dcdf01fSchristos	sli	v3.4s,v20.4s,#8
449*1dcdf01fSchristos	eor	w20,w20,w7
450*1dcdf01fSchristos	sli	v7.4s,v21.4s,#8
451*1dcdf01fSchristos	eor	w21,w21,w8
452*1dcdf01fSchristos	sli	v19.4s,v22.4s,#8
453*1dcdf01fSchristos	ror	w17,w17,#24
454*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
455*1dcdf01fSchristos	ror	w19,w19,#24
456*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
457*1dcdf01fSchristos	ror	w20,w20,#24
458*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
459*1dcdf01fSchristos	ror	w21,w21,#24
460*1dcdf01fSchristos	eor	v20.16b,v1.16b,v2.16b
461*1dcdf01fSchristos	add	w13,w13,w17
462*1dcdf01fSchristos	eor	v21.16b,v5.16b,v6.16b
463*1dcdf01fSchristos	add	w14,w14,w19
464*1dcdf01fSchristos	eor	v22.16b,v17.16b,v18.16b
465*1dcdf01fSchristos	add	w15,w15,w20
466*1dcdf01fSchristos	ushr	v1.4s,v20.4s,#25
467*1dcdf01fSchristos	add	w16,w16,w21
468*1dcdf01fSchristos	ushr	v5.4s,v21.4s,#25
469*1dcdf01fSchristos	eor	w9,w9,w13
470*1dcdf01fSchristos	ushr	v17.4s,v22.4s,#25
471*1dcdf01fSchristos	eor	w10,w10,w14
472*1dcdf01fSchristos	sli	v1.4s,v20.4s,#7
473*1dcdf01fSchristos	eor	w11,w11,w15
474*1dcdf01fSchristos	sli	v5.4s,v21.4s,#7
475*1dcdf01fSchristos	eor	w12,w12,w16
476*1dcdf01fSchristos	sli	v17.4s,v22.4s,#7
	// Rotate the lanes of rows 1, 2, 3 by 4, 8, 12 bytes so that the next
	// group of vector ops works on the diagonals.
477*1dcdf01fSchristos	ror	w9,w9,#25
478*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
479*1dcdf01fSchristos	ror	w10,w10,#25
480*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
481*1dcdf01fSchristos	ror	w11,w11,#25
482*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
483*1dcdf01fSchristos	ror	w12,w12,#25
484*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#12
485*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#12
486*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#12
487*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#4
488*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#4
489*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#4
	// Second group: the scalar side indexes the diagonals directly; the
	// NEON side repeats the same vector steps on the lane-rotated rows.
490*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
491*1dcdf01fSchristos	add	w5,w5,w10
492*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
493*1dcdf01fSchristos	add	w6,w6,w11
494*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
495*1dcdf01fSchristos	add	w7,w7,w12
496*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
497*1dcdf01fSchristos	add	w8,w8,w9
498*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
499*1dcdf01fSchristos	eor	w21,w21,w5
500*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
501*1dcdf01fSchristos	eor	w17,w17,w6
502*1dcdf01fSchristos	rev32	v3.8h,v3.8h
503*1dcdf01fSchristos	eor	w19,w19,w7
504*1dcdf01fSchristos	rev32	v7.8h,v7.8h
505*1dcdf01fSchristos	eor	w20,w20,w8
506*1dcdf01fSchristos	rev32	v19.8h,v19.8h
507*1dcdf01fSchristos	ror	w21,w21,#16
508*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
509*1dcdf01fSchristos	ror	w17,w17,#16
510*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
511*1dcdf01fSchristos	ror	w19,w19,#16
512*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
513*1dcdf01fSchristos	ror	w20,w20,#16
514*1dcdf01fSchristos	eor	v20.16b,v1.16b,v2.16b
515*1dcdf01fSchristos	add	w15,w15,w21
516*1dcdf01fSchristos	eor	v21.16b,v5.16b,v6.16b
517*1dcdf01fSchristos	add	w16,w16,w17
518*1dcdf01fSchristos	eor	v22.16b,v17.16b,v18.16b
519*1dcdf01fSchristos	add	w13,w13,w19
520*1dcdf01fSchristos	ushr	v1.4s,v20.4s,#20
521*1dcdf01fSchristos	add	w14,w14,w20
522*1dcdf01fSchristos	ushr	v5.4s,v21.4s,#20
523*1dcdf01fSchristos	eor	w10,w10,w15
524*1dcdf01fSchristos	ushr	v17.4s,v22.4s,#20
525*1dcdf01fSchristos	eor	w11,w11,w16
526*1dcdf01fSchristos	sli	v1.4s,v20.4s,#12
527*1dcdf01fSchristos	eor	w12,w12,w13
528*1dcdf01fSchristos	sli	v5.4s,v21.4s,#12
529*1dcdf01fSchristos	eor	w9,w9,w14
530*1dcdf01fSchristos	sli	v17.4s,v22.4s,#12
531*1dcdf01fSchristos	ror	w10,w10,#20
532*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
533*1dcdf01fSchristos	ror	w11,w11,#20
534*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
535*1dcdf01fSchristos	ror	w12,w12,#20
536*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
537*1dcdf01fSchristos	ror	w9,w9,#20
538*1dcdf01fSchristos	eor	v20.16b,v3.16b,v0.16b
539*1dcdf01fSchristos	add	w5,w5,w10
540*1dcdf01fSchristos	eor	v21.16b,v7.16b,v4.16b
541*1dcdf01fSchristos	add	w6,w6,w11
542*1dcdf01fSchristos	eor	v22.16b,v19.16b,v16.16b
543*1dcdf01fSchristos	add	w7,w7,w12
544*1dcdf01fSchristos	ushr	v3.4s,v20.4s,#24
545*1dcdf01fSchristos	add	w8,w8,w9
546*1dcdf01fSchristos	ushr	v7.4s,v21.4s,#24
547*1dcdf01fSchristos	eor	w21,w21,w5
548*1dcdf01fSchristos	ushr	v19.4s,v22.4s,#24
549*1dcdf01fSchristos	eor	w17,w17,w6
550*1dcdf01fSchristos	sli	v3.4s,v20.4s,#8
551*1dcdf01fSchristos	eor	w19,w19,w7
552*1dcdf01fSchristos	sli	v7.4s,v21.4s,#8
553*1dcdf01fSchristos	eor	w20,w20,w8
554*1dcdf01fSchristos	sli	v19.4s,v22.4s,#8
555*1dcdf01fSchristos	ror	w21,w21,#24
556*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
557*1dcdf01fSchristos	ror	w17,w17,#24
558*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
559*1dcdf01fSchristos	ror	w19,w19,#24
560*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
561*1dcdf01fSchristos	ror	w20,w20,#24
562*1dcdf01fSchristos	eor	v20.16b,v1.16b,v2.16b
563*1dcdf01fSchristos	add	w15,w15,w21
564*1dcdf01fSchristos	eor	v21.16b,v5.16b,v6.16b
565*1dcdf01fSchristos	add	w16,w16,w17
566*1dcdf01fSchristos	eor	v22.16b,v17.16b,v18.16b
567*1dcdf01fSchristos	add	w13,w13,w19
568*1dcdf01fSchristos	ushr	v1.4s,v20.4s,#25
569*1dcdf01fSchristos	add	w14,w14,w20
570*1dcdf01fSchristos	ushr	v5.4s,v21.4s,#25
571*1dcdf01fSchristos	eor	w10,w10,w15
572*1dcdf01fSchristos	ushr	v17.4s,v22.4s,#25
573*1dcdf01fSchristos	eor	w11,w11,w16
574*1dcdf01fSchristos	sli	v1.4s,v20.4s,#7
575*1dcdf01fSchristos	eor	w12,w12,w13
576*1dcdf01fSchristos	sli	v5.4s,v21.4s,#7
577*1dcdf01fSchristos	eor	w9,w9,w14
578*1dcdf01fSchristos	sli	v17.4s,v22.4s,#7
	// Undo the lane rotation (rows 1 and 3 use the complementary 12 and 4)
	// so the rows are column-aligned again for the next pass.
579*1dcdf01fSchristos	ror	w10,w10,#25
580*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
581*1dcdf01fSchristos	ror	w11,w11,#25
582*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
583*1dcdf01fSchristos	ror	w12,w12,#25
584*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
585*1dcdf01fSchristos	ror	w9,w9,#25
586*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#4
587*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#4
588*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#4
589*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#12
590*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#12
591*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#12
592*1dcdf01fSchristos	cbnz	x4,.Loop_neon
593*1dcdf01fSchristos
	// Add the input state back into all four blocks. Each NEON block gets
	// its own counter row (v27/v28/v29). For the scalar odd-position words
	// a carry into bit 32 is shifted out by the later "lsl#32" pack.
594*1dcdf01fSchristos	add	w5,w5,w22		// accumulate key block
595*1dcdf01fSchristos	add	v0.4s,v0.4s,v24.4s
596*1dcdf01fSchristos	add	x6,x6,x22,lsr#32
597*1dcdf01fSchristos	add	v4.4s,v4.4s,v24.4s
598*1dcdf01fSchristos	add	w7,w7,w23
599*1dcdf01fSchristos	add	v16.4s,v16.4s,v24.4s
600*1dcdf01fSchristos	add	x8,x8,x23,lsr#32
601*1dcdf01fSchristos	add	v2.4s,v2.4s,v26.4s
602*1dcdf01fSchristos	add	w9,w9,w24
603*1dcdf01fSchristos	add	v6.4s,v6.4s,v26.4s
604*1dcdf01fSchristos	add	x10,x10,x24,lsr#32
605*1dcdf01fSchristos	add	v18.4s,v18.4s,v26.4s
606*1dcdf01fSchristos	add	w11,w11,w25
607*1dcdf01fSchristos	add	v3.4s,v3.4s,v27.4s
608*1dcdf01fSchristos	add	x12,x12,x25,lsr#32
609*1dcdf01fSchristos	add	w13,w13,w26
610*1dcdf01fSchristos	add	v7.4s,v7.4s,v28.4s
611*1dcdf01fSchristos	add	x14,x14,x26,lsr#32
612*1dcdf01fSchristos	add	w15,w15,w27
613*1dcdf01fSchristos	add	v19.4s,v19.4s,v29.4s
614*1dcdf01fSchristos	add	x16,x16,x27,lsr#32
615*1dcdf01fSchristos	add	w17,w17,w28
616*1dcdf01fSchristos	add	v1.4s,v1.4s,v25.4s
617*1dcdf01fSchristos	add	x19,x19,x28,lsr#32
618*1dcdf01fSchristos	add	w20,w20,w30
619*1dcdf01fSchristos	add	v5.4s,v5.4s,v25.4s
620*1dcdf01fSchristos	add	x21,x21,x30,lsr#32
621*1dcdf01fSchristos	add	v17.4s,v17.4s,v25.4s
622*1dcdf01fSchristos
	// Flags from "subs x2,x2,#256": borrow means fewer than 256 bytes were
	// left, so the four blocks are consumed one at a time in the tail.
623*1dcdf01fSchristos	b.lo	.Ltail_neon
624*1dcdf01fSchristos
	// Full 256 bytes. Output order: scalar block, then v0-v3, v4-v7,
	// v16-v19.
625*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
626*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
627*1dcdf01fSchristos	ldp	x6,x8,[x1,#0]		// load input
628*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
629*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
630*1dcdf01fSchristos	ldp	x10,x12,[x1,#16]
631*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
632*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
633*1dcdf01fSchristos	ldp	x14,x16,[x1,#32]
634*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
635*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
636*1dcdf01fSchristos	ldp	x19,x21,[x1,#48]
637*1dcdf01fSchristos	add	x1,x1,#64
	// Big-endian builds: byte-reverse the packed scalar keystream registers.
638*1dcdf01fSchristos#ifdef	__ARMEB__
639*1dcdf01fSchristos	rev	x5,x5
640*1dcdf01fSchristos	rev	x7,x7
641*1dcdf01fSchristos	rev	x9,x9
642*1dcdf01fSchristos	rev	x11,x11
643*1dcdf01fSchristos	rev	x13,x13
644*1dcdf01fSchristos	rev	x15,x15
645*1dcdf01fSchristos	rev	x17,x17
646*1dcdf01fSchristos	rev	x20,x20
647*1dcdf01fSchristos#endif
	// v20-v23 = input for NEON block 1.
648*1dcdf01fSchristos	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
649*1dcdf01fSchristos	eor	x5,x5,x6
650*1dcdf01fSchristos	eor	x7,x7,x8
651*1dcdf01fSchristos	eor	x9,x9,x10
652*1dcdf01fSchristos	eor	x11,x11,x12
653*1dcdf01fSchristos	eor	x13,x13,x14
654*1dcdf01fSchristos	eor	v0.16b,v0.16b,v20.16b
655*1dcdf01fSchristos	eor	x15,x15,x16
656*1dcdf01fSchristos	eor	v1.16b,v1.16b,v21.16b
657*1dcdf01fSchristos	eor	x17,x17,x19
658*1dcdf01fSchristos	eor	v2.16b,v2.16b,v22.16b
659*1dcdf01fSchristos	eor	x20,x20,x21
660*1dcdf01fSchristos	eor	v3.16b,v3.16b,v23.16b
	// v20-v23 = input for NEON block 2.
661*1dcdf01fSchristos	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
662*1dcdf01fSchristos
663*1dcdf01fSchristos	stp	x5,x7,[x0,#0]		// store output
	// Advance every copy of the counter by the four blocks just produced.
	// NOTE(review): the scalar copy uses a 64-bit add, so a wrap of the
	// low word would carry into the next word, while the vector copies
	// wrap per 32-bit lane -- presumably callers never let it wrap; confirm.
664*1dcdf01fSchristos	add	x28,x28,#4			// increment counter
665*1dcdf01fSchristos	stp	x9,x11,[x0,#16]
666*1dcdf01fSchristos	add	v27.4s,v27.4s,v31.4s		// += 4
667*1dcdf01fSchristos	stp	x13,x15,[x0,#32]
668*1dcdf01fSchristos	add	v28.4s,v28.4s,v31.4s
669*1dcdf01fSchristos	stp	x17,x20,[x0,#48]
670*1dcdf01fSchristos	add	v29.4s,v29.4s,v31.4s
671*1dcdf01fSchristos	add	x0,x0,#64
672*1dcdf01fSchristos
	// Block 1 is stored, after which v0-v3 are reused as input registers
	// for NEON block 3.
673*1dcdf01fSchristos	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
674*1dcdf01fSchristos	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
675*1dcdf01fSchristos
676*1dcdf01fSchristos	eor	v4.16b,v4.16b,v20.16b
677*1dcdf01fSchristos	eor	v5.16b,v5.16b,v21.16b
678*1dcdf01fSchristos	eor	v6.16b,v6.16b,v22.16b
679*1dcdf01fSchristos	eor	v7.16b,v7.16b,v23.16b
680*1dcdf01fSchristos	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
681*1dcdf01fSchristos
682*1dcdf01fSchristos	eor	v16.16b,v16.16b,v0.16b
683*1dcdf01fSchristos	eor	v17.16b,v17.16b,v1.16b
684*1dcdf01fSchristos	eor	v18.16b,v18.16b,v2.16b
685*1dcdf01fSchristos	eor	v19.16b,v19.16b,v3.16b
686*1dcdf01fSchristos	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
687*1dcdf01fSchristos
	// Still the flags from "subs x2,x2,#256": loop while bytes remain.
688*1dcdf01fSchristos	b.hi	.Loop_outer_neon
689*1dcdf01fSchristos
	// Length was a multiple of 256: the scratch area was never written, so
	// it is released without wiping.
690*1dcdf01fSchristos	ldp	x19,x20,[x29,#16]
691*1dcdf01fSchristos	add	sp,sp,#64
692*1dcdf01fSchristos	ldp	x21,x22,[x29,#32]
693*1dcdf01fSchristos	ldp	x23,x24,[x29,#48]
694*1dcdf01fSchristos	ldp	x25,x26,[x29,#64]
695*1dcdf01fSchristos	ldp	x27,x28,[x29,#80]
696*1dcdf01fSchristos	ldp	x29,x30,[sp],#96
697*1dcdf01fSchristos.inst	0xd50323bf			// autiasp
698*1dcdf01fSchristos	ret
699*1dcdf01fSchristos
	// Fewer than 256 bytes left. Undo the subtraction so x2 is again the
	// number of bytes left, then use the four ready blocks in output order.
700*1dcdf01fSchristos.Ltail_neon:
701*1dcdf01fSchristos	add	x2,x2,#256
702*1dcdf01fSchristos	cmp	x2,#64
	// Under 64 bytes: reuse the scalar tail in ChaCha20_ctr32, which packs
	// w5-w21 itself and returns through its own (identical) epilogue.
703*1dcdf01fSchristos	b.lo	.Less_than_64
704*1dcdf01fSchristos
	// At least 64 bytes: the scalar block is consumed whole.
705*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
706*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
707*1dcdf01fSchristos	ldp	x6,x8,[x1,#0]		// load input
708*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
709*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
710*1dcdf01fSchristos	ldp	x10,x12,[x1,#16]
711*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
712*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
713*1dcdf01fSchristos	ldp	x14,x16,[x1,#32]
714*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
715*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
716*1dcdf01fSchristos	ldp	x19,x21,[x1,#48]
717*1dcdf01fSchristos	add	x1,x1,#64
718*1dcdf01fSchristos#ifdef	__ARMEB__
719*1dcdf01fSchristos	rev	x5,x5
720*1dcdf01fSchristos	rev	x7,x7
721*1dcdf01fSchristos	rev	x9,x9
722*1dcdf01fSchristos	rev	x11,x11
723*1dcdf01fSchristos	rev	x13,x13
724*1dcdf01fSchristos	rev	x15,x15
725*1dcdf01fSchristos	rev	x17,x17
726*1dcdf01fSchristos	rev	x20,x20
727*1dcdf01fSchristos#endif
728*1dcdf01fSchristos	eor	x5,x5,x6
729*1dcdf01fSchristos	eor	x7,x7,x8
730*1dcdf01fSchristos	eor	x9,x9,x10
731*1dcdf01fSchristos	eor	x11,x11,x12
732*1dcdf01fSchristos	eor	x13,x13,x14
733*1dcdf01fSchristos	eor	x15,x15,x16
734*1dcdf01fSchristos	eor	x17,x17,x19
735*1dcdf01fSchristos	eor	x20,x20,x21
736*1dcdf01fSchristos
737*1dcdf01fSchristos	stp	x5,x7,[x0,#0]		// store output
738*1dcdf01fSchristos	add	x28,x28,#4			// increment counter
739*1dcdf01fSchristos	stp	x9,x11,[x0,#16]
740*1dcdf01fSchristos	stp	x13,x15,[x0,#32]
741*1dcdf01fSchristos	stp	x17,x20,[x0,#48]
742*1dcdf01fSchristos	add	x0,x0,#64
	// Flags still come from "cmp x2,#64" above (nothing since has written
	// them): equal means exactly 64 bytes were left and we are finished.
743*1dcdf01fSchristos	b.eq	.Ldone_neon
744*1dcdf01fSchristos	sub	x2,x2,#64
745*1dcdf01fSchristos	cmp	x2,#64
	// 1..63 bytes left: they take their keystream from NEON block 1.
746*1dcdf01fSchristos	b.lo	.Less_than_128
747*1dcdf01fSchristos
748*1dcdf01fSchristos	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
749*1dcdf01fSchristos	eor	v0.16b,v0.16b,v20.16b
750*1dcdf01fSchristos	eor	v1.16b,v1.16b,v21.16b
751*1dcdf01fSchristos	eor	v2.16b,v2.16b,v22.16b
752*1dcdf01fSchristos	eor	v3.16b,v3.16b,v23.16b
753*1dcdf01fSchristos	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	// Flags from the preceding "cmp x2,#64".
754*1dcdf01fSchristos	b.eq	.Ldone_neon
755*1dcdf01fSchristos	sub	x2,x2,#64
756*1dcdf01fSchristos	cmp	x2,#64
	// 1..63 bytes left: they take their keystream from NEON block 2.
757*1dcdf01fSchristos	b.lo	.Less_than_192
758*1dcdf01fSchristos
759*1dcdf01fSchristos	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
760*1dcdf01fSchristos	eor	v4.16b,v4.16b,v20.16b
761*1dcdf01fSchristos	eor	v5.16b,v5.16b,v21.16b
762*1dcdf01fSchristos	eor	v6.16b,v6.16b,v22.16b
763*1dcdf01fSchristos	eor	v7.16b,v7.16b,v23.16b
764*1dcdf01fSchristos	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	// Flags from the preceding "cmp x2,#64".
765*1dcdf01fSchristos	b.eq	.Ldone_neon
766*1dcdf01fSchristos	sub	x2,x2,#64
767*1dcdf01fSchristos
	// The total was below 256, so what is left now is under 64 bytes and
	// takes its keystream from NEON block 3.
768*1dcdf01fSchristos	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
769*1dcdf01fSchristos	b	.Last_neon
770*1dcdf01fSchristos
	// Spill the keystream block that covers the partial tail to the stack
	// scratch area.
771*1dcdf01fSchristos.Less_than_128:
772*1dcdf01fSchristos	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
773*1dcdf01fSchristos	b	.Last_neon
774*1dcdf01fSchristos.Less_than_192:
775*1dcdf01fSchristos	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
776*1dcdf01fSchristos	b	.Last_neon
777*1dcdf01fSchristos
778*1dcdf01fSchristos.align	4
	// x2 = bytes left (1..63). Point x1/x4 just past the end of
	// input/scratch and count x2 up from -len to 0; x0 is biased by -1
	// because the store uses x2 after it has been incremented.
779*1dcdf01fSchristos.Last_neon:
780*1dcdf01fSchristos	sub	x0,x0,#1
781*1dcdf01fSchristos	add	x1,x1,x2
782*1dcdf01fSchristos	add	x0,x0,x2
783*1dcdf01fSchristos	add	x4,sp,x2
784*1dcdf01fSchristos	neg	x2,x2
785*1dcdf01fSchristos
	// Byte-wise XOR of the remaining input with the spilled keystream.
786*1dcdf01fSchristos.Loop_tail_neon:
787*1dcdf01fSchristos	ldrb	w10,[x1,x2]
788*1dcdf01fSchristos	ldrb	w11,[x4,x2]
789*1dcdf01fSchristos	add	x2,x2,#1
790*1dcdf01fSchristos	eor	w10,w10,w11
791*1dcdf01fSchristos	strb	w10,[x0,x2]
792*1dcdf01fSchristos	cbnz	x2,.Loop_tail_neon
793*1dcdf01fSchristos
	// Overwrite the spilled keystream with zeros before releasing the stack.
794*1dcdf01fSchristos	stp	xzr,xzr,[sp,#0]
795*1dcdf01fSchristos	stp	xzr,xzr,[sp,#16]
796*1dcdf01fSchristos	stp	xzr,xzr,[sp,#32]
797*1dcdf01fSchristos	stp	xzr,xzr,[sp,#48]
798*1dcdf01fSchristos
	// Common exit. Paths that branch here directly never wrote the scratch
	// area, so they skip the wipe.
799*1dcdf01fSchristos.Ldone_neon:
800*1dcdf01fSchristos	ldp	x19,x20,[x29,#16]
801*1dcdf01fSchristos	add	sp,sp,#64
802*1dcdf01fSchristos	ldp	x21,x22,[x29,#32]
803*1dcdf01fSchristos	ldp	x23,x24,[x29,#48]
804*1dcdf01fSchristos	ldp	x25,x26,[x29,#64]
805*1dcdf01fSchristos	ldp	x27,x28,[x29,#80]
806*1dcdf01fSchristos	ldp	x29,x30,[sp],#96
807*1dcdf01fSchristos.inst	0xd50323bf			// autiasp
808*1dcdf01fSchristos	ret
809*1dcdf01fSchristos.size	ChaCha20_neon,.-ChaCha20_neon
810*1dcdf01fSchristos.type	ChaCha20_512_neon,%function
811*1dcdf01fSchristos.align	5
812*1dcdf01fSchristosChaCha20_512_neon:
813*1dcdf01fSchristos.inst	0xd503233f			// paciasp
814*1dcdf01fSchristos	stp	x29,x30,[sp,#-96]!
815*1dcdf01fSchristos	add	x29,sp,#0
816*1dcdf01fSchristos
817*1dcdf01fSchristos	adr	x5,.Lsigma
818*1dcdf01fSchristos	stp	x19,x20,[sp,#16]
819*1dcdf01fSchristos	stp	x21,x22,[sp,#32]
820*1dcdf01fSchristos	stp	x23,x24,[sp,#48]
821*1dcdf01fSchristos	stp	x25,x26,[sp,#64]
822*1dcdf01fSchristos	stp	x27,x28,[sp,#80]
823*1dcdf01fSchristos
824*1dcdf01fSchristos.L512_or_more_neon:
825*1dcdf01fSchristos	sub	sp,sp,#128+64
826*1dcdf01fSchristos
827*1dcdf01fSchristos	ldp	x22,x23,[x5]		// load sigma
828*1dcdf01fSchristos	ld1	{v24.4s},[x5],#16
829*1dcdf01fSchristos	ldp	x24,x25,[x3]		// load key
830*1dcdf01fSchristos	ldp	x26,x27,[x3,#16]
831*1dcdf01fSchristos	ld1	{v25.4s,v26.4s},[x3]
832*1dcdf01fSchristos	ldp	x28,x30,[x4]		// load counter
833*1dcdf01fSchristos	ld1	{v27.4s},[x4]
834*1dcdf01fSchristos	ld1	{v31.4s},[x5]
835*1dcdf01fSchristos#ifdef	__ARMEB__
836*1dcdf01fSchristos	rev64	v24.4s,v24.4s
837*1dcdf01fSchristos	ror	x24,x24,#32
838*1dcdf01fSchristos	ror	x25,x25,#32
839*1dcdf01fSchristos	ror	x26,x26,#32
840*1dcdf01fSchristos	ror	x27,x27,#32
841*1dcdf01fSchristos	ror	x28,x28,#32
842*1dcdf01fSchristos	ror	x30,x30,#32
843*1dcdf01fSchristos#endif
844*1dcdf01fSchristos	add	v27.4s,v27.4s,v31.4s		// += 1
845*1dcdf01fSchristos	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
846*1dcdf01fSchristos	add	v27.4s,v27.4s,v31.4s		// not typo
847*1dcdf01fSchristos	str	q26,[sp,#32]
848*1dcdf01fSchristos	add	v28.4s,v27.4s,v31.4s
849*1dcdf01fSchristos	add	v29.4s,v28.4s,v31.4s
850*1dcdf01fSchristos	add	v30.4s,v29.4s,v31.4s
851*1dcdf01fSchristos	shl	v31.4s,v31.4s,#2			// 1 -> 4
852*1dcdf01fSchristos
853*1dcdf01fSchristos	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
854*1dcdf01fSchristos	stp	d10,d11,[sp,#128+16]
855*1dcdf01fSchristos	stp	d12,d13,[sp,#128+32]
856*1dcdf01fSchristos	stp	d14,d15,[sp,#128+48]
857*1dcdf01fSchristos
858*1dcdf01fSchristos	sub	x2,x2,#512			// not typo
859*1dcdf01fSchristos
860*1dcdf01fSchristos.Loop_outer_512_neon:
861*1dcdf01fSchristos	mov	v0.16b,v24.16b
862*1dcdf01fSchristos	mov	v4.16b,v24.16b
863*1dcdf01fSchristos	mov	v8.16b,v24.16b
864*1dcdf01fSchristos	mov	v12.16b,v24.16b
865*1dcdf01fSchristos	mov	v16.16b,v24.16b
866*1dcdf01fSchristos	mov	v20.16b,v24.16b
867*1dcdf01fSchristos	mov	v1.16b,v25.16b
868*1dcdf01fSchristos	mov	w5,w22			// unpack key block
869*1dcdf01fSchristos	mov	v5.16b,v25.16b
870*1dcdf01fSchristos	lsr	x6,x22,#32
871*1dcdf01fSchristos	mov	v9.16b,v25.16b
872*1dcdf01fSchristos	mov	w7,w23
873*1dcdf01fSchristos	mov	v13.16b,v25.16b
874*1dcdf01fSchristos	lsr	x8,x23,#32
875*1dcdf01fSchristos	mov	v17.16b,v25.16b
876*1dcdf01fSchristos	mov	w9,w24
877*1dcdf01fSchristos	mov	v21.16b,v25.16b
878*1dcdf01fSchristos	lsr	x10,x24,#32
879*1dcdf01fSchristos	mov	v3.16b,v27.16b
880*1dcdf01fSchristos	mov	w11,w25
881*1dcdf01fSchristos	mov	v7.16b,v28.16b
882*1dcdf01fSchristos	lsr	x12,x25,#32
883*1dcdf01fSchristos	mov	v11.16b,v29.16b
884*1dcdf01fSchristos	mov	w13,w26
885*1dcdf01fSchristos	mov	v15.16b,v30.16b
886*1dcdf01fSchristos	lsr	x14,x26,#32
887*1dcdf01fSchristos	mov	v2.16b,v26.16b
888*1dcdf01fSchristos	mov	w15,w27
889*1dcdf01fSchristos	mov	v6.16b,v26.16b
890*1dcdf01fSchristos	lsr	x16,x27,#32
891*1dcdf01fSchristos	add	v19.4s,v3.4s,v31.4s			// +4
892*1dcdf01fSchristos	mov	w17,w28
893*1dcdf01fSchristos	add	v23.4s,v7.4s,v31.4s			// +4
894*1dcdf01fSchristos	lsr	x19,x28,#32
895*1dcdf01fSchristos	mov	v10.16b,v26.16b
896*1dcdf01fSchristos	mov	w20,w30
897*1dcdf01fSchristos	mov	v14.16b,v26.16b
898*1dcdf01fSchristos	lsr	x21,x30,#32
899*1dcdf01fSchristos	mov	v18.16b,v26.16b
900*1dcdf01fSchristos	stp	q27,q28,[sp,#48]		// off-load key block, variable part
901*1dcdf01fSchristos	mov	v22.16b,v26.16b
902*1dcdf01fSchristos	str	q29,[sp,#80]
903*1dcdf01fSchristos
904*1dcdf01fSchristos	mov	x4,#5
905*1dcdf01fSchristos	subs	x2,x2,#512
906*1dcdf01fSchristos.Loop_upper_neon:
907*1dcdf01fSchristos	sub	x4,x4,#1
908*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
909*1dcdf01fSchristos	add	w5,w5,w9
910*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
911*1dcdf01fSchristos	add	w6,w6,w10
912*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
913*1dcdf01fSchristos	add	w7,w7,w11
914*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
915*1dcdf01fSchristos	add	w8,w8,w12
916*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
917*1dcdf01fSchristos	eor	w17,w17,w5
918*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
919*1dcdf01fSchristos	eor	w19,w19,w6
920*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
921*1dcdf01fSchristos	eor	w20,w20,w7
922*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
923*1dcdf01fSchristos	eor	w21,w21,w8
924*1dcdf01fSchristos	eor	v11.16b,v11.16b,v8.16b
925*1dcdf01fSchristos	ror	w17,w17,#16
926*1dcdf01fSchristos	eor	v15.16b,v15.16b,v12.16b
927*1dcdf01fSchristos	ror	w19,w19,#16
928*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
929*1dcdf01fSchristos	ror	w20,w20,#16
930*1dcdf01fSchristos	eor	v23.16b,v23.16b,v20.16b
931*1dcdf01fSchristos	ror	w21,w21,#16
932*1dcdf01fSchristos	rev32	v3.8h,v3.8h
933*1dcdf01fSchristos	add	w13,w13,w17
934*1dcdf01fSchristos	rev32	v7.8h,v7.8h
935*1dcdf01fSchristos	add	w14,w14,w19
936*1dcdf01fSchristos	rev32	v11.8h,v11.8h
937*1dcdf01fSchristos	add	w15,w15,w20
938*1dcdf01fSchristos	rev32	v15.8h,v15.8h
939*1dcdf01fSchristos	add	w16,w16,w21
940*1dcdf01fSchristos	rev32	v19.8h,v19.8h
941*1dcdf01fSchristos	eor	w9,w9,w13
942*1dcdf01fSchristos	rev32	v23.8h,v23.8h
943*1dcdf01fSchristos	eor	w10,w10,w14
944*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
945*1dcdf01fSchristos	eor	w11,w11,w15
946*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
947*1dcdf01fSchristos	eor	w12,w12,w16
948*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
949*1dcdf01fSchristos	ror	w9,w9,#20
950*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
951*1dcdf01fSchristos	ror	w10,w10,#20
952*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
953*1dcdf01fSchristos	ror	w11,w11,#20
954*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
955*1dcdf01fSchristos	ror	w12,w12,#20
956*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
957*1dcdf01fSchristos	add	w5,w5,w9
958*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
959*1dcdf01fSchristos	add	w6,w6,w10
960*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
961*1dcdf01fSchristos	add	w7,w7,w11
962*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
963*1dcdf01fSchristos	add	w8,w8,w12
964*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
965*1dcdf01fSchristos	eor	w17,w17,w5
966*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
967*1dcdf01fSchristos	eor	w19,w19,w6
968*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#20
969*1dcdf01fSchristos	eor	w20,w20,w7
970*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#20
971*1dcdf01fSchristos	eor	w21,w21,w8
972*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#20
973*1dcdf01fSchristos	ror	w17,w17,#24
974*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#20
975*1dcdf01fSchristos	ror	w19,w19,#24
976*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#20
977*1dcdf01fSchristos	ror	w20,w20,#24
978*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#20
979*1dcdf01fSchristos	ror	w21,w21,#24
980*1dcdf01fSchristos	sli	v1.4s,v24.4s,#12
981*1dcdf01fSchristos	add	w13,w13,w17
982*1dcdf01fSchristos	sli	v5.4s,v25.4s,#12
983*1dcdf01fSchristos	add	w14,w14,w19
984*1dcdf01fSchristos	sli	v9.4s,v26.4s,#12
985*1dcdf01fSchristos	add	w15,w15,w20
986*1dcdf01fSchristos	sli	v13.4s,v27.4s,#12
987*1dcdf01fSchristos	add	w16,w16,w21
988*1dcdf01fSchristos	sli	v17.4s,v28.4s,#12
989*1dcdf01fSchristos	eor	w9,w9,w13
990*1dcdf01fSchristos	sli	v21.4s,v29.4s,#12
991*1dcdf01fSchristos	eor	w10,w10,w14
992*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
993*1dcdf01fSchristos	eor	w11,w11,w15
994*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
995*1dcdf01fSchristos	eor	w12,w12,w16
996*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
997*1dcdf01fSchristos	ror	w9,w9,#25
998*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
999*1dcdf01fSchristos	ror	w10,w10,#25
1000*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1001*1dcdf01fSchristos	ror	w11,w11,#25
1002*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1003*1dcdf01fSchristos	ror	w12,w12,#25
1004*1dcdf01fSchristos	eor	v24.16b,v3.16b,v0.16b
1005*1dcdf01fSchristos	add	w5,w5,w10
1006*1dcdf01fSchristos	eor	v25.16b,v7.16b,v4.16b
1007*1dcdf01fSchristos	add	w6,w6,w11
1008*1dcdf01fSchristos	eor	v26.16b,v11.16b,v8.16b
1009*1dcdf01fSchristos	add	w7,w7,w12
1010*1dcdf01fSchristos	eor	v27.16b,v15.16b,v12.16b
1011*1dcdf01fSchristos	add	w8,w8,w9
1012*1dcdf01fSchristos	eor	v28.16b,v19.16b,v16.16b
1013*1dcdf01fSchristos	eor	w21,w21,w5
1014*1dcdf01fSchristos	eor	v29.16b,v23.16b,v20.16b
1015*1dcdf01fSchristos	eor	w17,w17,w6
1016*1dcdf01fSchristos	ushr	v3.4s,v24.4s,#24
1017*1dcdf01fSchristos	eor	w19,w19,w7
1018*1dcdf01fSchristos	ushr	v7.4s,v25.4s,#24
1019*1dcdf01fSchristos	eor	w20,w20,w8
1020*1dcdf01fSchristos	ushr	v11.4s,v26.4s,#24
1021*1dcdf01fSchristos	ror	w21,w21,#16
1022*1dcdf01fSchristos	ushr	v15.4s,v27.4s,#24
1023*1dcdf01fSchristos	ror	w17,w17,#16
1024*1dcdf01fSchristos	ushr	v19.4s,v28.4s,#24
1025*1dcdf01fSchristos	ror	w19,w19,#16
1026*1dcdf01fSchristos	ushr	v23.4s,v29.4s,#24
1027*1dcdf01fSchristos	ror	w20,w20,#16
1028*1dcdf01fSchristos	sli	v3.4s,v24.4s,#8
1029*1dcdf01fSchristos	add	w15,w15,w21
1030*1dcdf01fSchristos	sli	v7.4s,v25.4s,#8
1031*1dcdf01fSchristos	add	w16,w16,w17
1032*1dcdf01fSchristos	sli	v11.4s,v26.4s,#8
1033*1dcdf01fSchristos	add	w13,w13,w19
1034*1dcdf01fSchristos	sli	v15.4s,v27.4s,#8
1035*1dcdf01fSchristos	add	w14,w14,w20
1036*1dcdf01fSchristos	sli	v19.4s,v28.4s,#8
1037*1dcdf01fSchristos	eor	w10,w10,w15
1038*1dcdf01fSchristos	sli	v23.4s,v29.4s,#8
1039*1dcdf01fSchristos	eor	w11,w11,w16
1040*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1041*1dcdf01fSchristos	eor	w12,w12,w13
1042*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1043*1dcdf01fSchristos	eor	w9,w9,w14
1044*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1045*1dcdf01fSchristos	ror	w10,w10,#20
1046*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1047*1dcdf01fSchristos	ror	w11,w11,#20
1048*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1049*1dcdf01fSchristos	ror	w12,w12,#20
1050*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1051*1dcdf01fSchristos	ror	w9,w9,#20
1052*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1053*1dcdf01fSchristos	add	w5,w5,w10
1054*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1055*1dcdf01fSchristos	add	w6,w6,w11
1056*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1057*1dcdf01fSchristos	add	w7,w7,w12
1058*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1059*1dcdf01fSchristos	add	w8,w8,w9
1060*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1061*1dcdf01fSchristos	eor	w21,w21,w5
1062*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1063*1dcdf01fSchristos	eor	w17,w17,w6
1064*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#25
1065*1dcdf01fSchristos	eor	w19,w19,w7
1066*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#25
1067*1dcdf01fSchristos	eor	w20,w20,w8
1068*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#25
1069*1dcdf01fSchristos	ror	w21,w21,#24
1070*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#25
1071*1dcdf01fSchristos	ror	w17,w17,#24
1072*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#25
1073*1dcdf01fSchristos	ror	w19,w19,#24
1074*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#25
1075*1dcdf01fSchristos	ror	w20,w20,#24
1076*1dcdf01fSchristos	sli	v1.4s,v24.4s,#7
1077*1dcdf01fSchristos	add	w15,w15,w21
1078*1dcdf01fSchristos	sli	v5.4s,v25.4s,#7
1079*1dcdf01fSchristos	add	w16,w16,w17
1080*1dcdf01fSchristos	sli	v9.4s,v26.4s,#7
1081*1dcdf01fSchristos	add	w13,w13,w19
1082*1dcdf01fSchristos	sli	v13.4s,v27.4s,#7
1083*1dcdf01fSchristos	add	w14,w14,w20
1084*1dcdf01fSchristos	sli	v17.4s,v28.4s,#7
1085*1dcdf01fSchristos	eor	w10,w10,w15
1086*1dcdf01fSchristos	sli	v21.4s,v29.4s,#7
1087*1dcdf01fSchristos	eor	w11,w11,w16
1088*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
1089*1dcdf01fSchristos	eor	w12,w12,w13
1090*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
1091*1dcdf01fSchristos	eor	w9,w9,w14
1092*1dcdf01fSchristos	ext	v10.16b,v10.16b,v10.16b,#8
1093*1dcdf01fSchristos	ror	w10,w10,#25
1094*1dcdf01fSchristos	ext	v14.16b,v14.16b,v14.16b,#8
1095*1dcdf01fSchristos	ror	w11,w11,#25
1096*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
1097*1dcdf01fSchristos	ror	w12,w12,#25
1098*1dcdf01fSchristos	ext	v22.16b,v22.16b,v22.16b,#8
1099*1dcdf01fSchristos	ror	w9,w9,#25
1100*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#12
1101*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#12
1102*1dcdf01fSchristos	ext	v11.16b,v11.16b,v11.16b,#12
1103*1dcdf01fSchristos	ext	v15.16b,v15.16b,v15.16b,#12
1104*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#12
1105*1dcdf01fSchristos	ext	v23.16b,v23.16b,v23.16b,#12
1106*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#4
1107*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#4
1108*1dcdf01fSchristos	ext	v9.16b,v9.16b,v9.16b,#4
1109*1dcdf01fSchristos	ext	v13.16b,v13.16b,v13.16b,#4
1110*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#4
1111*1dcdf01fSchristos	ext	v21.16b,v21.16b,v21.16b,#4
1112*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1113*1dcdf01fSchristos	add	w5,w5,w9
1114*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1115*1dcdf01fSchristos	add	w6,w6,w10
1116*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1117*1dcdf01fSchristos	add	w7,w7,w11
1118*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1119*1dcdf01fSchristos	add	w8,w8,w12
1120*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1121*1dcdf01fSchristos	eor	w17,w17,w5
1122*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1123*1dcdf01fSchristos	eor	w19,w19,w6
1124*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
1125*1dcdf01fSchristos	eor	w20,w20,w7
1126*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
1127*1dcdf01fSchristos	eor	w21,w21,w8
1128*1dcdf01fSchristos	eor	v11.16b,v11.16b,v8.16b
1129*1dcdf01fSchristos	ror	w17,w17,#16
1130*1dcdf01fSchristos	eor	v15.16b,v15.16b,v12.16b
1131*1dcdf01fSchristos	ror	w19,w19,#16
1132*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
1133*1dcdf01fSchristos	ror	w20,w20,#16
1134*1dcdf01fSchristos	eor	v23.16b,v23.16b,v20.16b
1135*1dcdf01fSchristos	ror	w21,w21,#16
1136*1dcdf01fSchristos	rev32	v3.8h,v3.8h
1137*1dcdf01fSchristos	add	w13,w13,w17
1138*1dcdf01fSchristos	rev32	v7.8h,v7.8h
1139*1dcdf01fSchristos	add	w14,w14,w19
1140*1dcdf01fSchristos	rev32	v11.8h,v11.8h
1141*1dcdf01fSchristos	add	w15,w15,w20
1142*1dcdf01fSchristos	rev32	v15.8h,v15.8h
1143*1dcdf01fSchristos	add	w16,w16,w21
1144*1dcdf01fSchristos	rev32	v19.8h,v19.8h
1145*1dcdf01fSchristos	eor	w9,w9,w13
1146*1dcdf01fSchristos	rev32	v23.8h,v23.8h
1147*1dcdf01fSchristos	eor	w10,w10,w14
1148*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1149*1dcdf01fSchristos	eor	w11,w11,w15
1150*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1151*1dcdf01fSchristos	eor	w12,w12,w16
1152*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1153*1dcdf01fSchristos	ror	w9,w9,#20
1154*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1155*1dcdf01fSchristos	ror	w10,w10,#20
1156*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1157*1dcdf01fSchristos	ror	w11,w11,#20
1158*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1159*1dcdf01fSchristos	ror	w12,w12,#20
1160*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1161*1dcdf01fSchristos	add	w5,w5,w9
1162*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1163*1dcdf01fSchristos	add	w6,w6,w10
1164*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1165*1dcdf01fSchristos	add	w7,w7,w11
1166*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1167*1dcdf01fSchristos	add	w8,w8,w12
1168*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1169*1dcdf01fSchristos	eor	w17,w17,w5
1170*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1171*1dcdf01fSchristos	eor	w19,w19,w6
1172*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#20
1173*1dcdf01fSchristos	eor	w20,w20,w7
1174*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#20
1175*1dcdf01fSchristos	eor	w21,w21,w8
1176*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#20
1177*1dcdf01fSchristos	ror	w17,w17,#24
1178*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#20
1179*1dcdf01fSchristos	ror	w19,w19,#24
1180*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#20
1181*1dcdf01fSchristos	ror	w20,w20,#24
1182*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#20
1183*1dcdf01fSchristos	ror	w21,w21,#24
1184*1dcdf01fSchristos	sli	v1.4s,v24.4s,#12
1185*1dcdf01fSchristos	add	w13,w13,w17
1186*1dcdf01fSchristos	sli	v5.4s,v25.4s,#12
1187*1dcdf01fSchristos	add	w14,w14,w19
1188*1dcdf01fSchristos	sli	v9.4s,v26.4s,#12
1189*1dcdf01fSchristos	add	w15,w15,w20
1190*1dcdf01fSchristos	sli	v13.4s,v27.4s,#12
1191*1dcdf01fSchristos	add	w16,w16,w21
1192*1dcdf01fSchristos	sli	v17.4s,v28.4s,#12
1193*1dcdf01fSchristos	eor	w9,w9,w13
1194*1dcdf01fSchristos	sli	v21.4s,v29.4s,#12
1195*1dcdf01fSchristos	eor	w10,w10,w14
1196*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1197*1dcdf01fSchristos	eor	w11,w11,w15
1198*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1199*1dcdf01fSchristos	eor	w12,w12,w16
1200*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1201*1dcdf01fSchristos	ror	w9,w9,#25
1202*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1203*1dcdf01fSchristos	ror	w10,w10,#25
1204*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1205*1dcdf01fSchristos	ror	w11,w11,#25
1206*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1207*1dcdf01fSchristos	ror	w12,w12,#25
1208*1dcdf01fSchristos	eor	v24.16b,v3.16b,v0.16b
1209*1dcdf01fSchristos	add	w5,w5,w10
1210*1dcdf01fSchristos	eor	v25.16b,v7.16b,v4.16b
1211*1dcdf01fSchristos	add	w6,w6,w11
1212*1dcdf01fSchristos	eor	v26.16b,v11.16b,v8.16b
1213*1dcdf01fSchristos	add	w7,w7,w12
1214*1dcdf01fSchristos	eor	v27.16b,v15.16b,v12.16b
1215*1dcdf01fSchristos	add	w8,w8,w9
1216*1dcdf01fSchristos	eor	v28.16b,v19.16b,v16.16b
1217*1dcdf01fSchristos	eor	w21,w21,w5
1218*1dcdf01fSchristos	eor	v29.16b,v23.16b,v20.16b
1219*1dcdf01fSchristos	eor	w17,w17,w6
1220*1dcdf01fSchristos	ushr	v3.4s,v24.4s,#24
1221*1dcdf01fSchristos	eor	w19,w19,w7
1222*1dcdf01fSchristos	ushr	v7.4s,v25.4s,#24
1223*1dcdf01fSchristos	eor	w20,w20,w8
1224*1dcdf01fSchristos	ushr	v11.4s,v26.4s,#24
1225*1dcdf01fSchristos	ror	w21,w21,#16
1226*1dcdf01fSchristos	ushr	v15.4s,v27.4s,#24
1227*1dcdf01fSchristos	ror	w17,w17,#16
1228*1dcdf01fSchristos	ushr	v19.4s,v28.4s,#24
1229*1dcdf01fSchristos	ror	w19,w19,#16
1230*1dcdf01fSchristos	ushr	v23.4s,v29.4s,#24
1231*1dcdf01fSchristos	ror	w20,w20,#16
1232*1dcdf01fSchristos	sli	v3.4s,v24.4s,#8
1233*1dcdf01fSchristos	add	w15,w15,w21
1234*1dcdf01fSchristos	sli	v7.4s,v25.4s,#8
1235*1dcdf01fSchristos	add	w16,w16,w17
1236*1dcdf01fSchristos	sli	v11.4s,v26.4s,#8
1237*1dcdf01fSchristos	add	w13,w13,w19
1238*1dcdf01fSchristos	sli	v15.4s,v27.4s,#8
1239*1dcdf01fSchristos	add	w14,w14,w20
1240*1dcdf01fSchristos	sli	v19.4s,v28.4s,#8
1241*1dcdf01fSchristos	eor	w10,w10,w15
1242*1dcdf01fSchristos	sli	v23.4s,v29.4s,#8
1243*1dcdf01fSchristos	eor	w11,w11,w16
1244*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1245*1dcdf01fSchristos	eor	w12,w12,w13
1246*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1247*1dcdf01fSchristos	eor	w9,w9,w14
1248*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1249*1dcdf01fSchristos	ror	w10,w10,#20
1250*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1251*1dcdf01fSchristos	ror	w11,w11,#20
1252*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1253*1dcdf01fSchristos	ror	w12,w12,#20
1254*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1255*1dcdf01fSchristos	ror	w9,w9,#20
1256*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1257*1dcdf01fSchristos	add	w5,w5,w10
1258*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1259*1dcdf01fSchristos	add	w6,w6,w11
1260*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1261*1dcdf01fSchristos	add	w7,w7,w12
1262*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1263*1dcdf01fSchristos	add	w8,w8,w9
1264*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1265*1dcdf01fSchristos	eor	w21,w21,w5
1266*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1267*1dcdf01fSchristos	eor	w17,w17,w6
1268*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#25
1269*1dcdf01fSchristos	eor	w19,w19,w7
1270*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#25
1271*1dcdf01fSchristos	eor	w20,w20,w8
1272*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#25
1273*1dcdf01fSchristos	ror	w21,w21,#24
1274*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#25
1275*1dcdf01fSchristos	ror	w17,w17,#24
1276*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#25
1277*1dcdf01fSchristos	ror	w19,w19,#24
1278*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#25
1279*1dcdf01fSchristos	ror	w20,w20,#24
1280*1dcdf01fSchristos	sli	v1.4s,v24.4s,#7
1281*1dcdf01fSchristos	add	w15,w15,w21
1282*1dcdf01fSchristos	sli	v5.4s,v25.4s,#7
1283*1dcdf01fSchristos	add	w16,w16,w17
1284*1dcdf01fSchristos	sli	v9.4s,v26.4s,#7
1285*1dcdf01fSchristos	add	w13,w13,w19
1286*1dcdf01fSchristos	sli	v13.4s,v27.4s,#7
1287*1dcdf01fSchristos	add	w14,w14,w20
1288*1dcdf01fSchristos	sli	v17.4s,v28.4s,#7
1289*1dcdf01fSchristos	eor	w10,w10,w15
1290*1dcdf01fSchristos	sli	v21.4s,v29.4s,#7
1291*1dcdf01fSchristos	eor	w11,w11,w16
1292*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
1293*1dcdf01fSchristos	eor	w12,w12,w13
1294*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
1295*1dcdf01fSchristos	eor	w9,w9,w14
1296*1dcdf01fSchristos	ext	v10.16b,v10.16b,v10.16b,#8
1297*1dcdf01fSchristos	ror	w10,w10,#25
1298*1dcdf01fSchristos	ext	v14.16b,v14.16b,v14.16b,#8
1299*1dcdf01fSchristos	ror	w11,w11,#25
1300*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
1301*1dcdf01fSchristos	ror	w12,w12,#25
1302*1dcdf01fSchristos	ext	v22.16b,v22.16b,v22.16b,#8
1303*1dcdf01fSchristos	ror	w9,w9,#25
1304*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#4
1305*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#4
1306*1dcdf01fSchristos	ext	v11.16b,v11.16b,v11.16b,#4
1307*1dcdf01fSchristos	ext	v15.16b,v15.16b,v15.16b,#4
1308*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#4
1309*1dcdf01fSchristos	ext	v23.16b,v23.16b,v23.16b,#4
1310*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#12
1311*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#12
1312*1dcdf01fSchristos	ext	v9.16b,v9.16b,v9.16b,#12
1313*1dcdf01fSchristos	ext	v13.16b,v13.16b,v13.16b,#12
1314*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#12
1315*1dcdf01fSchristos	ext	v21.16b,v21.16b,v21.16b,#12
1316*1dcdf01fSchristos	cbnz	x4,.Loop_upper_neon
1317*1dcdf01fSchristos
1318*1dcdf01fSchristos	add	w5,w5,w22		// accumulate key block
1319*1dcdf01fSchristos	add	x6,x6,x22,lsr#32
1320*1dcdf01fSchristos	add	w7,w7,w23
1321*1dcdf01fSchristos	add	x8,x8,x23,lsr#32
1322*1dcdf01fSchristos	add	w9,w9,w24
1323*1dcdf01fSchristos	add	x10,x10,x24,lsr#32
1324*1dcdf01fSchristos	add	w11,w11,w25
1325*1dcdf01fSchristos	add	x12,x12,x25,lsr#32
1326*1dcdf01fSchristos	add	w13,w13,w26
1327*1dcdf01fSchristos	add	x14,x14,x26,lsr#32
1328*1dcdf01fSchristos	add	w15,w15,w27
1329*1dcdf01fSchristos	add	x16,x16,x27,lsr#32
1330*1dcdf01fSchristos	add	w17,w17,w28
1331*1dcdf01fSchristos	add	x19,x19,x28,lsr#32
1332*1dcdf01fSchristos	add	w20,w20,w30
1333*1dcdf01fSchristos	add	x21,x21,x30,lsr#32
1334*1dcdf01fSchristos
1335*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
1336*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
1337*1dcdf01fSchristos	ldp	x6,x8,[x1,#0]		// load input
1338*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
1339*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
1340*1dcdf01fSchristos	ldp	x10,x12,[x1,#16]
1341*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
1342*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
1343*1dcdf01fSchristos	ldp	x14,x16,[x1,#32]
1344*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
1345*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
1346*1dcdf01fSchristos	ldp	x19,x21,[x1,#48]
1347*1dcdf01fSchristos	add	x1,x1,#64
1348*1dcdf01fSchristos#ifdef	__ARMEB__
1349*1dcdf01fSchristos	rev	x5,x5
1350*1dcdf01fSchristos	rev	x7,x7
1351*1dcdf01fSchristos	rev	x9,x9
1352*1dcdf01fSchristos	rev	x11,x11
1353*1dcdf01fSchristos	rev	x13,x13
1354*1dcdf01fSchristos	rev	x15,x15
1355*1dcdf01fSchristos	rev	x17,x17
1356*1dcdf01fSchristos	rev	x20,x20
1357*1dcdf01fSchristos#endif
1358*1dcdf01fSchristos	eor	x5,x5,x6
1359*1dcdf01fSchristos	eor	x7,x7,x8
1360*1dcdf01fSchristos	eor	x9,x9,x10
1361*1dcdf01fSchristos	eor	x11,x11,x12
1362*1dcdf01fSchristos	eor	x13,x13,x14
1363*1dcdf01fSchristos	eor	x15,x15,x16
1364*1dcdf01fSchristos	eor	x17,x17,x19
1365*1dcdf01fSchristos	eor	x20,x20,x21
1366*1dcdf01fSchristos
1367*1dcdf01fSchristos	stp	x5,x7,[x0,#0]		// store output
1368*1dcdf01fSchristos	add	x28,x28,#1			// increment counter
1369*1dcdf01fSchristos	mov	w5,w22			// unpack key block
1370*1dcdf01fSchristos	lsr	x6,x22,#32
1371*1dcdf01fSchristos	stp	x9,x11,[x0,#16]
1372*1dcdf01fSchristos	mov	w7,w23
1373*1dcdf01fSchristos	lsr	x8,x23,#32
1374*1dcdf01fSchristos	stp	x13,x15,[x0,#32]
1375*1dcdf01fSchristos	mov	w9,w24
1376*1dcdf01fSchristos	lsr	x10,x24,#32
1377*1dcdf01fSchristos	stp	x17,x20,[x0,#48]
1378*1dcdf01fSchristos	add	x0,x0,#64
1379*1dcdf01fSchristos	mov	w11,w25
1380*1dcdf01fSchristos	lsr	x12,x25,#32
1381*1dcdf01fSchristos	mov	w13,w26
1382*1dcdf01fSchristos	lsr	x14,x26,#32
1383*1dcdf01fSchristos	mov	w15,w27
1384*1dcdf01fSchristos	lsr	x16,x27,#32
1385*1dcdf01fSchristos	mov	w17,w28
1386*1dcdf01fSchristos	lsr	x19,x28,#32
1387*1dcdf01fSchristos	mov	w20,w30
1388*1dcdf01fSchristos	lsr	x21,x30,#32
1389*1dcdf01fSchristos
1390*1dcdf01fSchristos	mov	x4,#5
1391*1dcdf01fSchristos.Loop_lower_neon:
1392*1dcdf01fSchristos	sub	x4,x4,#1
1393*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1394*1dcdf01fSchristos	add	w5,w5,w9
1395*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1396*1dcdf01fSchristos	add	w6,w6,w10
1397*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1398*1dcdf01fSchristos	add	w7,w7,w11
1399*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1400*1dcdf01fSchristos	add	w8,w8,w12
1401*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1402*1dcdf01fSchristos	eor	w17,w17,w5
1403*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1404*1dcdf01fSchristos	eor	w19,w19,w6
1405*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
1406*1dcdf01fSchristos	eor	w20,w20,w7
1407*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
1408*1dcdf01fSchristos	eor	w21,w21,w8
1409*1dcdf01fSchristos	eor	v11.16b,v11.16b,v8.16b
1410*1dcdf01fSchristos	ror	w17,w17,#16
1411*1dcdf01fSchristos	eor	v15.16b,v15.16b,v12.16b
1412*1dcdf01fSchristos	ror	w19,w19,#16
1413*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
1414*1dcdf01fSchristos	ror	w20,w20,#16
1415*1dcdf01fSchristos	eor	v23.16b,v23.16b,v20.16b
1416*1dcdf01fSchristos	ror	w21,w21,#16
1417*1dcdf01fSchristos	rev32	v3.8h,v3.8h
1418*1dcdf01fSchristos	add	w13,w13,w17
1419*1dcdf01fSchristos	rev32	v7.8h,v7.8h
1420*1dcdf01fSchristos	add	w14,w14,w19
1421*1dcdf01fSchristos	rev32	v11.8h,v11.8h
1422*1dcdf01fSchristos	add	w15,w15,w20
1423*1dcdf01fSchristos	rev32	v15.8h,v15.8h
1424*1dcdf01fSchristos	add	w16,w16,w21
1425*1dcdf01fSchristos	rev32	v19.8h,v19.8h
1426*1dcdf01fSchristos	eor	w9,w9,w13
1427*1dcdf01fSchristos	rev32	v23.8h,v23.8h
1428*1dcdf01fSchristos	eor	w10,w10,w14
1429*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1430*1dcdf01fSchristos	eor	w11,w11,w15
1431*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1432*1dcdf01fSchristos	eor	w12,w12,w16
1433*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1434*1dcdf01fSchristos	ror	w9,w9,#20
1435*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1436*1dcdf01fSchristos	ror	w10,w10,#20
1437*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1438*1dcdf01fSchristos	ror	w11,w11,#20
1439*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1440*1dcdf01fSchristos	ror	w12,w12,#20
1441*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1442*1dcdf01fSchristos	add	w5,w5,w9
1443*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1444*1dcdf01fSchristos	add	w6,w6,w10
1445*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1446*1dcdf01fSchristos	add	w7,w7,w11
1447*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1448*1dcdf01fSchristos	add	w8,w8,w12
1449*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1450*1dcdf01fSchristos	eor	w17,w17,w5
1451*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1452*1dcdf01fSchristos	eor	w19,w19,w6
1453*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#20
1454*1dcdf01fSchristos	eor	w20,w20,w7
1455*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#20
1456*1dcdf01fSchristos	eor	w21,w21,w8
1457*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#20
1458*1dcdf01fSchristos	ror	w17,w17,#24
1459*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#20
1460*1dcdf01fSchristos	ror	w19,w19,#24
1461*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#20
1462*1dcdf01fSchristos	ror	w20,w20,#24
1463*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#20
1464*1dcdf01fSchristos	ror	w21,w21,#24
1465*1dcdf01fSchristos	sli	v1.4s,v24.4s,#12
1466*1dcdf01fSchristos	add	w13,w13,w17
1467*1dcdf01fSchristos	sli	v5.4s,v25.4s,#12
1468*1dcdf01fSchristos	add	w14,w14,w19
1469*1dcdf01fSchristos	sli	v9.4s,v26.4s,#12
1470*1dcdf01fSchristos	add	w15,w15,w20
1471*1dcdf01fSchristos	sli	v13.4s,v27.4s,#12
1472*1dcdf01fSchristos	add	w16,w16,w21
1473*1dcdf01fSchristos	sli	v17.4s,v28.4s,#12
1474*1dcdf01fSchristos	eor	w9,w9,w13
1475*1dcdf01fSchristos	sli	v21.4s,v29.4s,#12
1476*1dcdf01fSchristos	eor	w10,w10,w14
1477*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1478*1dcdf01fSchristos	eor	w11,w11,w15
1479*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1480*1dcdf01fSchristos	eor	w12,w12,w16
1481*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1482*1dcdf01fSchristos	ror	w9,w9,#25
1483*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1484*1dcdf01fSchristos	ror	w10,w10,#25
1485*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1486*1dcdf01fSchristos	ror	w11,w11,#25
1487*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1488*1dcdf01fSchristos	ror	w12,w12,#25
1489*1dcdf01fSchristos	eor	v24.16b,v3.16b,v0.16b
1490*1dcdf01fSchristos	add	w5,w5,w10
1491*1dcdf01fSchristos	eor	v25.16b,v7.16b,v4.16b
1492*1dcdf01fSchristos	add	w6,w6,w11
1493*1dcdf01fSchristos	eor	v26.16b,v11.16b,v8.16b
1494*1dcdf01fSchristos	add	w7,w7,w12
1495*1dcdf01fSchristos	eor	v27.16b,v15.16b,v12.16b
1496*1dcdf01fSchristos	add	w8,w8,w9
1497*1dcdf01fSchristos	eor	v28.16b,v19.16b,v16.16b
1498*1dcdf01fSchristos	eor	w21,w21,w5
1499*1dcdf01fSchristos	eor	v29.16b,v23.16b,v20.16b
1500*1dcdf01fSchristos	eor	w17,w17,w6
1501*1dcdf01fSchristos	ushr	v3.4s,v24.4s,#24
1502*1dcdf01fSchristos	eor	w19,w19,w7
1503*1dcdf01fSchristos	ushr	v7.4s,v25.4s,#24
1504*1dcdf01fSchristos	eor	w20,w20,w8
1505*1dcdf01fSchristos	ushr	v11.4s,v26.4s,#24
1506*1dcdf01fSchristos	ror	w21,w21,#16
1507*1dcdf01fSchristos	ushr	v15.4s,v27.4s,#24
1508*1dcdf01fSchristos	ror	w17,w17,#16
1509*1dcdf01fSchristos	ushr	v19.4s,v28.4s,#24
1510*1dcdf01fSchristos	ror	w19,w19,#16
1511*1dcdf01fSchristos	ushr	v23.4s,v29.4s,#24
1512*1dcdf01fSchristos	ror	w20,w20,#16
1513*1dcdf01fSchristos	sli	v3.4s,v24.4s,#8
1514*1dcdf01fSchristos	add	w15,w15,w21
1515*1dcdf01fSchristos	sli	v7.4s,v25.4s,#8
1516*1dcdf01fSchristos	add	w16,w16,w17
1517*1dcdf01fSchristos	sli	v11.4s,v26.4s,#8
1518*1dcdf01fSchristos	add	w13,w13,w19
1519*1dcdf01fSchristos	sli	v15.4s,v27.4s,#8
1520*1dcdf01fSchristos	add	w14,w14,w20
1521*1dcdf01fSchristos	sli	v19.4s,v28.4s,#8
1522*1dcdf01fSchristos	eor	w10,w10,w15
1523*1dcdf01fSchristos	sli	v23.4s,v29.4s,#8
1524*1dcdf01fSchristos	eor	w11,w11,w16
1525*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1526*1dcdf01fSchristos	eor	w12,w12,w13
1527*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1528*1dcdf01fSchristos	eor	w9,w9,w14
1529*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1530*1dcdf01fSchristos	ror	w10,w10,#20
1531*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1532*1dcdf01fSchristos	ror	w11,w11,#20
1533*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1534*1dcdf01fSchristos	ror	w12,w12,#20
1535*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1536*1dcdf01fSchristos	ror	w9,w9,#20
1537*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1538*1dcdf01fSchristos	add	w5,w5,w10
1539*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1540*1dcdf01fSchristos	add	w6,w6,w11
1541*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1542*1dcdf01fSchristos	add	w7,w7,w12
1543*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1544*1dcdf01fSchristos	add	w8,w8,w9
1545*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1546*1dcdf01fSchristos	eor	w21,w21,w5
1547*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1548*1dcdf01fSchristos	eor	w17,w17,w6
1549*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#25
1550*1dcdf01fSchristos	eor	w19,w19,w7
1551*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#25
1552*1dcdf01fSchristos	eor	w20,w20,w8
1553*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#25
1554*1dcdf01fSchristos	ror	w21,w21,#24
1555*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#25
1556*1dcdf01fSchristos	ror	w17,w17,#24
1557*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#25
1558*1dcdf01fSchristos	ror	w19,w19,#24
1559*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#25
1560*1dcdf01fSchristos	ror	w20,w20,#24
1561*1dcdf01fSchristos	sli	v1.4s,v24.4s,#7
1562*1dcdf01fSchristos	add	w15,w15,w21
1563*1dcdf01fSchristos	sli	v5.4s,v25.4s,#7
1564*1dcdf01fSchristos	add	w16,w16,w17
1565*1dcdf01fSchristos	sli	v9.4s,v26.4s,#7
1566*1dcdf01fSchristos	add	w13,w13,w19
1567*1dcdf01fSchristos	sli	v13.4s,v27.4s,#7
1568*1dcdf01fSchristos	add	w14,w14,w20
1569*1dcdf01fSchristos	sli	v17.4s,v28.4s,#7
1570*1dcdf01fSchristos	eor	w10,w10,w15
1571*1dcdf01fSchristos	sli	v21.4s,v29.4s,#7
1572*1dcdf01fSchristos	eor	w11,w11,w16
1573*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
1574*1dcdf01fSchristos	eor	w12,w12,w13
1575*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
1576*1dcdf01fSchristos	eor	w9,w9,w14
1577*1dcdf01fSchristos	ext	v10.16b,v10.16b,v10.16b,#8
1578*1dcdf01fSchristos	ror	w10,w10,#25
1579*1dcdf01fSchristos	ext	v14.16b,v14.16b,v14.16b,#8
1580*1dcdf01fSchristos	ror	w11,w11,#25
1581*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
1582*1dcdf01fSchristos	ror	w12,w12,#25
1583*1dcdf01fSchristos	ext	v22.16b,v22.16b,v22.16b,#8
1584*1dcdf01fSchristos	ror	w9,w9,#25
1585*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#12
1586*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#12
1587*1dcdf01fSchristos	ext	v11.16b,v11.16b,v11.16b,#12
1588*1dcdf01fSchristos	ext	v15.16b,v15.16b,v15.16b,#12
1589*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#12
1590*1dcdf01fSchristos	ext	v23.16b,v23.16b,v23.16b,#12
1591*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#4
1592*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#4
1593*1dcdf01fSchristos	ext	v9.16b,v9.16b,v9.16b,#4
1594*1dcdf01fSchristos	ext	v13.16b,v13.16b,v13.16b,#4
1595*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#4
1596*1dcdf01fSchristos	ext	v21.16b,v21.16b,v21.16b,#4
1597*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1598*1dcdf01fSchristos	add	w5,w5,w9
1599*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1600*1dcdf01fSchristos	add	w6,w6,w10
1601*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1602*1dcdf01fSchristos	add	w7,w7,w11
1603*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1604*1dcdf01fSchristos	add	w8,w8,w12
1605*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1606*1dcdf01fSchristos	eor	w17,w17,w5
1607*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1608*1dcdf01fSchristos	eor	w19,w19,w6
1609*1dcdf01fSchristos	eor	v3.16b,v3.16b,v0.16b
1610*1dcdf01fSchristos	eor	w20,w20,w7
1611*1dcdf01fSchristos	eor	v7.16b,v7.16b,v4.16b
1612*1dcdf01fSchristos	eor	w21,w21,w8
1613*1dcdf01fSchristos	eor	v11.16b,v11.16b,v8.16b
1614*1dcdf01fSchristos	ror	w17,w17,#16
1615*1dcdf01fSchristos	eor	v15.16b,v15.16b,v12.16b
1616*1dcdf01fSchristos	ror	w19,w19,#16
1617*1dcdf01fSchristos	eor	v19.16b,v19.16b,v16.16b
1618*1dcdf01fSchristos	ror	w20,w20,#16
1619*1dcdf01fSchristos	eor	v23.16b,v23.16b,v20.16b
1620*1dcdf01fSchristos	ror	w21,w21,#16
1621*1dcdf01fSchristos	rev32	v3.8h,v3.8h
1622*1dcdf01fSchristos	add	w13,w13,w17
1623*1dcdf01fSchristos	rev32	v7.8h,v7.8h
1624*1dcdf01fSchristos	add	w14,w14,w19
1625*1dcdf01fSchristos	rev32	v11.8h,v11.8h
1626*1dcdf01fSchristos	add	w15,w15,w20
1627*1dcdf01fSchristos	rev32	v15.8h,v15.8h
1628*1dcdf01fSchristos	add	w16,w16,w21
1629*1dcdf01fSchristos	rev32	v19.8h,v19.8h
1630*1dcdf01fSchristos	eor	w9,w9,w13
1631*1dcdf01fSchristos	rev32	v23.8h,v23.8h
1632*1dcdf01fSchristos	eor	w10,w10,w14
1633*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1634*1dcdf01fSchristos	eor	w11,w11,w15
1635*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1636*1dcdf01fSchristos	eor	w12,w12,w16
1637*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1638*1dcdf01fSchristos	ror	w9,w9,#20
1639*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1640*1dcdf01fSchristos	ror	w10,w10,#20
1641*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1642*1dcdf01fSchristos	ror	w11,w11,#20
1643*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1644*1dcdf01fSchristos	ror	w12,w12,#20
1645*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1646*1dcdf01fSchristos	add	w5,w5,w9
1647*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1648*1dcdf01fSchristos	add	w6,w6,w10
1649*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1650*1dcdf01fSchristos	add	w7,w7,w11
1651*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1652*1dcdf01fSchristos	add	w8,w8,w12
1653*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1654*1dcdf01fSchristos	eor	w17,w17,w5
1655*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1656*1dcdf01fSchristos	eor	w19,w19,w6
1657*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#20
1658*1dcdf01fSchristos	eor	w20,w20,w7
1659*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#20
1660*1dcdf01fSchristos	eor	w21,w21,w8
1661*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#20
1662*1dcdf01fSchristos	ror	w17,w17,#24
1663*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#20
1664*1dcdf01fSchristos	ror	w19,w19,#24
1665*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#20
1666*1dcdf01fSchristos	ror	w20,w20,#24
1667*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#20
1668*1dcdf01fSchristos	ror	w21,w21,#24
1669*1dcdf01fSchristos	sli	v1.4s,v24.4s,#12
1670*1dcdf01fSchristos	add	w13,w13,w17
1671*1dcdf01fSchristos	sli	v5.4s,v25.4s,#12
1672*1dcdf01fSchristos	add	w14,w14,w19
1673*1dcdf01fSchristos	sli	v9.4s,v26.4s,#12
1674*1dcdf01fSchristos	add	w15,w15,w20
1675*1dcdf01fSchristos	sli	v13.4s,v27.4s,#12
1676*1dcdf01fSchristos	add	w16,w16,w21
1677*1dcdf01fSchristos	sli	v17.4s,v28.4s,#12
1678*1dcdf01fSchristos	eor	w9,w9,w13
1679*1dcdf01fSchristos	sli	v21.4s,v29.4s,#12
1680*1dcdf01fSchristos	eor	w10,w10,w14
1681*1dcdf01fSchristos	add	v0.4s,v0.4s,v1.4s
1682*1dcdf01fSchristos	eor	w11,w11,w15
1683*1dcdf01fSchristos	add	v4.4s,v4.4s,v5.4s
1684*1dcdf01fSchristos	eor	w12,w12,w16
1685*1dcdf01fSchristos	add	v8.4s,v8.4s,v9.4s
1686*1dcdf01fSchristos	ror	w9,w9,#25
1687*1dcdf01fSchristos	add	v12.4s,v12.4s,v13.4s
1688*1dcdf01fSchristos	ror	w10,w10,#25
1689*1dcdf01fSchristos	add	v16.4s,v16.4s,v17.4s
1690*1dcdf01fSchristos	ror	w11,w11,#25
1691*1dcdf01fSchristos	add	v20.4s,v20.4s,v21.4s
1692*1dcdf01fSchristos	ror	w12,w12,#25
1693*1dcdf01fSchristos	eor	v24.16b,v3.16b,v0.16b
1694*1dcdf01fSchristos	add	w5,w5,w10
1695*1dcdf01fSchristos	eor	v25.16b,v7.16b,v4.16b
1696*1dcdf01fSchristos	add	w6,w6,w11
1697*1dcdf01fSchristos	eor	v26.16b,v11.16b,v8.16b
1698*1dcdf01fSchristos	add	w7,w7,w12
1699*1dcdf01fSchristos	eor	v27.16b,v15.16b,v12.16b
1700*1dcdf01fSchristos	add	w8,w8,w9
1701*1dcdf01fSchristos	eor	v28.16b,v19.16b,v16.16b
1702*1dcdf01fSchristos	eor	w21,w21,w5
1703*1dcdf01fSchristos	eor	v29.16b,v23.16b,v20.16b
1704*1dcdf01fSchristos	eor	w17,w17,w6
1705*1dcdf01fSchristos	ushr	v3.4s,v24.4s,#24
1706*1dcdf01fSchristos	eor	w19,w19,w7
1707*1dcdf01fSchristos	ushr	v7.4s,v25.4s,#24
1708*1dcdf01fSchristos	eor	w20,w20,w8
1709*1dcdf01fSchristos	ushr	v11.4s,v26.4s,#24
1710*1dcdf01fSchristos	ror	w21,w21,#16
1711*1dcdf01fSchristos	ushr	v15.4s,v27.4s,#24
1712*1dcdf01fSchristos	ror	w17,w17,#16
1713*1dcdf01fSchristos	ushr	v19.4s,v28.4s,#24
1714*1dcdf01fSchristos	ror	w19,w19,#16
1715*1dcdf01fSchristos	ushr	v23.4s,v29.4s,#24
1716*1dcdf01fSchristos	ror	w20,w20,#16
1717*1dcdf01fSchristos	sli	v3.4s,v24.4s,#8
1718*1dcdf01fSchristos	add	w15,w15,w21
1719*1dcdf01fSchristos	sli	v7.4s,v25.4s,#8
1720*1dcdf01fSchristos	add	w16,w16,w17
1721*1dcdf01fSchristos	sli	v11.4s,v26.4s,#8
1722*1dcdf01fSchristos	add	w13,w13,w19
1723*1dcdf01fSchristos	sli	v15.4s,v27.4s,#8
1724*1dcdf01fSchristos	add	w14,w14,w20
1725*1dcdf01fSchristos	sli	v19.4s,v28.4s,#8
1726*1dcdf01fSchristos	eor	w10,w10,w15
1727*1dcdf01fSchristos	sli	v23.4s,v29.4s,#8
1728*1dcdf01fSchristos	eor	w11,w11,w16
1729*1dcdf01fSchristos	add	v2.4s,v2.4s,v3.4s
1730*1dcdf01fSchristos	eor	w12,w12,w13
1731*1dcdf01fSchristos	add	v6.4s,v6.4s,v7.4s
1732*1dcdf01fSchristos	eor	w9,w9,w14
1733*1dcdf01fSchristos	add	v10.4s,v10.4s,v11.4s
1734*1dcdf01fSchristos	ror	w10,w10,#20
1735*1dcdf01fSchristos	add	v14.4s,v14.4s,v15.4s
1736*1dcdf01fSchristos	ror	w11,w11,#20
1737*1dcdf01fSchristos	add	v18.4s,v18.4s,v19.4s
1738*1dcdf01fSchristos	ror	w12,w12,#20
1739*1dcdf01fSchristos	add	v22.4s,v22.4s,v23.4s
1740*1dcdf01fSchristos	ror	w9,w9,#20
1741*1dcdf01fSchristos	eor	v24.16b,v1.16b,v2.16b
1742*1dcdf01fSchristos	add	w5,w5,w10
1743*1dcdf01fSchristos	eor	v25.16b,v5.16b,v6.16b
1744*1dcdf01fSchristos	add	w6,w6,w11
1745*1dcdf01fSchristos	eor	v26.16b,v9.16b,v10.16b
1746*1dcdf01fSchristos	add	w7,w7,w12
1747*1dcdf01fSchristos	eor	v27.16b,v13.16b,v14.16b
1748*1dcdf01fSchristos	add	w8,w8,w9
1749*1dcdf01fSchristos	eor	v28.16b,v17.16b,v18.16b
1750*1dcdf01fSchristos	eor	w21,w21,w5
1751*1dcdf01fSchristos	eor	v29.16b,v21.16b,v22.16b
1752*1dcdf01fSchristos	eor	w17,w17,w6
1753*1dcdf01fSchristos	ushr	v1.4s,v24.4s,#25
1754*1dcdf01fSchristos	eor	w19,w19,w7
1755*1dcdf01fSchristos	ushr	v5.4s,v25.4s,#25
1756*1dcdf01fSchristos	eor	w20,w20,w8
1757*1dcdf01fSchristos	ushr	v9.4s,v26.4s,#25
1758*1dcdf01fSchristos	ror	w21,w21,#24
1759*1dcdf01fSchristos	ushr	v13.4s,v27.4s,#25
1760*1dcdf01fSchristos	ror	w17,w17,#24
1761*1dcdf01fSchristos	ushr	v17.4s,v28.4s,#25
1762*1dcdf01fSchristos	ror	w19,w19,#24
1763*1dcdf01fSchristos	ushr	v21.4s,v29.4s,#25
1764*1dcdf01fSchristos	ror	w20,w20,#24
1765*1dcdf01fSchristos	sli	v1.4s,v24.4s,#7
1766*1dcdf01fSchristos	add	w15,w15,w21
1767*1dcdf01fSchristos	sli	v5.4s,v25.4s,#7
1768*1dcdf01fSchristos	add	w16,w16,w17
1769*1dcdf01fSchristos	sli	v9.4s,v26.4s,#7
1770*1dcdf01fSchristos	add	w13,w13,w19
1771*1dcdf01fSchristos	sli	v13.4s,v27.4s,#7
1772*1dcdf01fSchristos	add	w14,w14,w20
1773*1dcdf01fSchristos	sli	v17.4s,v28.4s,#7
1774*1dcdf01fSchristos	eor	w10,w10,w15
1775*1dcdf01fSchristos	sli	v21.4s,v29.4s,#7
1776*1dcdf01fSchristos	eor	w11,w11,w16
1777*1dcdf01fSchristos	ext	v2.16b,v2.16b,v2.16b,#8
1778*1dcdf01fSchristos	eor	w12,w12,w13
1779*1dcdf01fSchristos	ext	v6.16b,v6.16b,v6.16b,#8
1780*1dcdf01fSchristos	eor	w9,w9,w14
1781*1dcdf01fSchristos	ext	v10.16b,v10.16b,v10.16b,#8
1782*1dcdf01fSchristos	ror	w10,w10,#25
1783*1dcdf01fSchristos	ext	v14.16b,v14.16b,v14.16b,#8
1784*1dcdf01fSchristos	ror	w11,w11,#25
1785*1dcdf01fSchristos	ext	v18.16b,v18.16b,v18.16b,#8
1786*1dcdf01fSchristos	ror	w12,w12,#25
1787*1dcdf01fSchristos	ext	v22.16b,v22.16b,v22.16b,#8
1788*1dcdf01fSchristos	ror	w9,w9,#25
1789*1dcdf01fSchristos	ext	v3.16b,v3.16b,v3.16b,#4
1790*1dcdf01fSchristos	ext	v7.16b,v7.16b,v7.16b,#4
1791*1dcdf01fSchristos	ext	v11.16b,v11.16b,v11.16b,#4
1792*1dcdf01fSchristos	ext	v15.16b,v15.16b,v15.16b,#4
1793*1dcdf01fSchristos	ext	v19.16b,v19.16b,v19.16b,#4
1794*1dcdf01fSchristos	ext	v23.16b,v23.16b,v23.16b,#4
1795*1dcdf01fSchristos	ext	v1.16b,v1.16b,v1.16b,#12
1796*1dcdf01fSchristos	ext	v5.16b,v5.16b,v5.16b,#12
1797*1dcdf01fSchristos	ext	v9.16b,v9.16b,v9.16b,#12
1798*1dcdf01fSchristos	ext	v13.16b,v13.16b,v13.16b,#12
1799*1dcdf01fSchristos	ext	v17.16b,v17.16b,v17.16b,#12
1800*1dcdf01fSchristos	ext	v21.16b,v21.16b,v21.16b,#12
1801*1dcdf01fSchristos	cbnz	x4,.Loop_lower_neon
1802*1dcdf01fSchristos
1803*1dcdf01fSchristos	add	w5,w5,w22		// accumulate key block
1804*1dcdf01fSchristos	ldp	q24,q25,[sp,#0]
1805*1dcdf01fSchristos	add	x6,x6,x22,lsr#32
1806*1dcdf01fSchristos	ldp	q26,q27,[sp,#32]
1807*1dcdf01fSchristos	add	w7,w7,w23
1808*1dcdf01fSchristos	ldp	q28,q29,[sp,#64]
1809*1dcdf01fSchristos	add	x8,x8,x23,lsr#32
1810*1dcdf01fSchristos	add	v0.4s,v0.4s,v24.4s
1811*1dcdf01fSchristos	add	w9,w9,w24
1812*1dcdf01fSchristos	add	v4.4s,v4.4s,v24.4s
1813*1dcdf01fSchristos	add	x10,x10,x24,lsr#32
1814*1dcdf01fSchristos	add	v8.4s,v8.4s,v24.4s
1815*1dcdf01fSchristos	add	w11,w11,w25
1816*1dcdf01fSchristos	add	v12.4s,v12.4s,v24.4s
1817*1dcdf01fSchristos	add	x12,x12,x25,lsr#32
1818*1dcdf01fSchristos	add	v16.4s,v16.4s,v24.4s
1819*1dcdf01fSchristos	add	w13,w13,w26
1820*1dcdf01fSchristos	add	v20.4s,v20.4s,v24.4s
1821*1dcdf01fSchristos	add	x14,x14,x26,lsr#32
1822*1dcdf01fSchristos	add	v2.4s,v2.4s,v26.4s
1823*1dcdf01fSchristos	add	w15,w15,w27
1824*1dcdf01fSchristos	add	v6.4s,v6.4s,v26.4s
1825*1dcdf01fSchristos	add	x16,x16,x27,lsr#32
1826*1dcdf01fSchristos	add	v10.4s,v10.4s,v26.4s
1827*1dcdf01fSchristos	add	w17,w17,w28
1828*1dcdf01fSchristos	add	v14.4s,v14.4s,v26.4s
1829*1dcdf01fSchristos	add	x19,x19,x28,lsr#32
1830*1dcdf01fSchristos	add	v18.4s,v18.4s,v26.4s
1831*1dcdf01fSchristos	add	w20,w20,w30
1832*1dcdf01fSchristos	add	v22.4s,v22.4s,v26.4s
1833*1dcdf01fSchristos	add	x21,x21,x30,lsr#32
1834*1dcdf01fSchristos	add	v19.4s,v19.4s,v31.4s			// +4
1835*1dcdf01fSchristos	add	x5,x5,x6,lsl#32	// pack
1836*1dcdf01fSchristos	add	v23.4s,v23.4s,v31.4s			// +4
1837*1dcdf01fSchristos	add	x7,x7,x8,lsl#32
1838*1dcdf01fSchristos	add	v3.4s,v3.4s,v27.4s
1839*1dcdf01fSchristos	ldp	x6,x8,[x1,#0]		// load input
1840*1dcdf01fSchristos	add	v7.4s,v7.4s,v28.4s
1841*1dcdf01fSchristos	add	x9,x9,x10,lsl#32
1842*1dcdf01fSchristos	add	v11.4s,v11.4s,v29.4s
1843*1dcdf01fSchristos	add	x11,x11,x12,lsl#32
1844*1dcdf01fSchristos	add	v15.4s,v15.4s,v30.4s
1845*1dcdf01fSchristos	ldp	x10,x12,[x1,#16]
1846*1dcdf01fSchristos	add	v19.4s,v19.4s,v27.4s
1847*1dcdf01fSchristos	add	x13,x13,x14,lsl#32
1848*1dcdf01fSchristos	add	v23.4s,v23.4s,v28.4s
1849*1dcdf01fSchristos	add	x15,x15,x16,lsl#32
1850*1dcdf01fSchristos	add	v1.4s,v1.4s,v25.4s
1851*1dcdf01fSchristos	ldp	x14,x16,[x1,#32]
1852*1dcdf01fSchristos	add	v5.4s,v5.4s,v25.4s
1853*1dcdf01fSchristos	add	x17,x17,x19,lsl#32
1854*1dcdf01fSchristos	add	v9.4s,v9.4s,v25.4s
1855*1dcdf01fSchristos	add	x20,x20,x21,lsl#32
1856*1dcdf01fSchristos	add	v13.4s,v13.4s,v25.4s
1857*1dcdf01fSchristos	ldp	x19,x21,[x1,#48]
1858*1dcdf01fSchristos	add	v17.4s,v17.4s,v25.4s
1859*1dcdf01fSchristos	add	x1,x1,#64
1860*1dcdf01fSchristos	add	v21.4s,v21.4s,v25.4s
1861*1dcdf01fSchristos
1862*1dcdf01fSchristos#ifdef	__ARMEB__
1863*1dcdf01fSchristos	rev	x5,x5
1864*1dcdf01fSchristos	rev	x7,x7
1865*1dcdf01fSchristos	rev	x9,x9
1866*1dcdf01fSchristos	rev	x11,x11
1867*1dcdf01fSchristos	rev	x13,x13
1868*1dcdf01fSchristos	rev	x15,x15
1869*1dcdf01fSchristos	rev	x17,x17
1870*1dcdf01fSchristos	rev	x20,x20
1871*1dcdf01fSchristos#endif
1872*1dcdf01fSchristos	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1873*1dcdf01fSchristos	eor	x5,x5,x6
1874*1dcdf01fSchristos	eor	x7,x7,x8
1875*1dcdf01fSchristos	eor	x9,x9,x10
1876*1dcdf01fSchristos	eor	x11,x11,x12
1877*1dcdf01fSchristos	eor	x13,x13,x14
1878*1dcdf01fSchristos	eor	v0.16b,v0.16b,v24.16b
1879*1dcdf01fSchristos	eor	x15,x15,x16
1880*1dcdf01fSchristos	eor	v1.16b,v1.16b,v25.16b
1881*1dcdf01fSchristos	eor	x17,x17,x19
1882*1dcdf01fSchristos	eor	v2.16b,v2.16b,v26.16b
1883*1dcdf01fSchristos	eor	x20,x20,x21
1884*1dcdf01fSchristos	eor	v3.16b,v3.16b,v27.16b
1885*1dcdf01fSchristos	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1886*1dcdf01fSchristos
1887*1dcdf01fSchristos	stp	x5,x7,[x0,#0]		// store output
1888*1dcdf01fSchristos	add	x28,x28,#7			// increment counter
1889*1dcdf01fSchristos	stp	x9,x11,[x0,#16]
1890*1dcdf01fSchristos	stp	x13,x15,[x0,#32]
1891*1dcdf01fSchristos	stp	x17,x20,[x0,#48]
1892*1dcdf01fSchristos	add	x0,x0,#64
1893*1dcdf01fSchristos	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1894*1dcdf01fSchristos
1895*1dcdf01fSchristos	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1896*1dcdf01fSchristos	eor	v4.16b,v4.16b,v24.16b
1897*1dcdf01fSchristos	eor	v5.16b,v5.16b,v25.16b
1898*1dcdf01fSchristos	eor	v6.16b,v6.16b,v26.16b
1899*1dcdf01fSchristos	eor	v7.16b,v7.16b,v27.16b
1900*1dcdf01fSchristos	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1901*1dcdf01fSchristos
1902*1dcdf01fSchristos	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1903*1dcdf01fSchristos	eor	v8.16b,v8.16b,v0.16b
1904*1dcdf01fSchristos	ldp	q24,q25,[sp,#0]
1905*1dcdf01fSchristos	eor	v9.16b,v9.16b,v1.16b
1906*1dcdf01fSchristos	ldp	q26,q27,[sp,#32]
1907*1dcdf01fSchristos	eor	v10.16b,v10.16b,v2.16b
1908*1dcdf01fSchristos	eor	v11.16b,v11.16b,v3.16b
1909*1dcdf01fSchristos	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1910*1dcdf01fSchristos
1911*1dcdf01fSchristos	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1912*1dcdf01fSchristos	eor	v12.16b,v12.16b,v4.16b
1913*1dcdf01fSchristos	eor	v13.16b,v13.16b,v5.16b
1914*1dcdf01fSchristos	eor	v14.16b,v14.16b,v6.16b
1915*1dcdf01fSchristos	eor	v15.16b,v15.16b,v7.16b
1916*1dcdf01fSchristos	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1917*1dcdf01fSchristos
1918*1dcdf01fSchristos	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1919*1dcdf01fSchristos	eor	v16.16b,v16.16b,v8.16b
1920*1dcdf01fSchristos	eor	v17.16b,v17.16b,v9.16b
1921*1dcdf01fSchristos	eor	v18.16b,v18.16b,v10.16b
1922*1dcdf01fSchristos	eor	v19.16b,v19.16b,v11.16b
1923*1dcdf01fSchristos	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1924*1dcdf01fSchristos
1925*1dcdf01fSchristos	shl	v0.4s,v31.4s,#1			// 4 -> 8
1926*1dcdf01fSchristos	eor	v20.16b,v20.16b,v12.16b
1927*1dcdf01fSchristos	eor	v21.16b,v21.16b,v13.16b
1928*1dcdf01fSchristos	eor	v22.16b,v22.16b,v14.16b
1929*1dcdf01fSchristos	eor	v23.16b,v23.16b,v15.16b
1930*1dcdf01fSchristos	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1931*1dcdf01fSchristos
1932*1dcdf01fSchristos	add	v27.4s,v27.4s,v0.4s			// += 8
1933*1dcdf01fSchristos	add	v28.4s,v28.4s,v0.4s
1934*1dcdf01fSchristos	add	v29.4s,v29.4s,v0.4s
1935*1dcdf01fSchristos	add	v30.4s,v30.4s,v0.4s
1936*1dcdf01fSchristos
1937*1dcdf01fSchristos	b.hs	.Loop_outer_512_neon
1938*1dcdf01fSchristos
1939*1dcdf01fSchristos	adds	x2,x2,#512
1940*1dcdf01fSchristos	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1941*1dcdf01fSchristos
1942*1dcdf01fSchristos	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1943*1dcdf01fSchristos	ldp	d10,d11,[sp,#128+16]
1944*1dcdf01fSchristos	ldp	d12,d13,[sp,#128+32]
1945*1dcdf01fSchristos	ldp	d14,d15,[sp,#128+48]
1946*1dcdf01fSchristos
1947*1dcdf01fSchristos	stp	q24,q31,[sp,#0]		// wipe off-load area
1948*1dcdf01fSchristos	stp	q24,q31,[sp,#32]
1949*1dcdf01fSchristos	stp	q24,q31,[sp,#64]
1950*1dcdf01fSchristos
1951*1dcdf01fSchristos	b.eq	.Ldone_512_neon
1952*1dcdf01fSchristos
1953*1dcdf01fSchristos	cmp	x2,#192
1954*1dcdf01fSchristos	sub	v27.4s,v27.4s,v0.4s			// -= 1
1955*1dcdf01fSchristos	sub	v28.4s,v28.4s,v0.4s
1956*1dcdf01fSchristos	sub	v29.4s,v29.4s,v0.4s
1957*1dcdf01fSchristos	add	sp,sp,#128
1958*1dcdf01fSchristos	b.hs	.Loop_outer_neon
1959*1dcdf01fSchristos
1960*1dcdf01fSchristos	eor	v25.16b,v25.16b,v25.16b
1961*1dcdf01fSchristos	eor	v26.16b,v26.16b,v26.16b
1962*1dcdf01fSchristos	eor	v27.16b,v27.16b,v27.16b
1963*1dcdf01fSchristos	eor	v28.16b,v28.16b,v28.16b
1964*1dcdf01fSchristos	eor	v29.16b,v29.16b,v29.16b
1965*1dcdf01fSchristos	eor	v30.16b,v30.16b,v30.16b
1966*1dcdf01fSchristos	b	.Loop_outer
1967*1dcdf01fSchristos
// .Ldone_512_neon -- exit path of ChaCha20_512_neon.
// Reached through the b.eq that follows "adds x2,x2,#512" above, i.e. when
// that addition leaves x2 equal to zero (x2 is presumably the remaining
// byte count -- confirm against the function entry, which is not shown here).
// By the time control arrives, d8-d15 have already been reloaded from
// sp+128.. and the first 96 bytes at sp have been overwritten by the
// "wipe off-load area" stores; what is left is to restore the general
// purpose callee-saved registers and unwind the stack.
1968*1dcdf01fSchristos.Ldone_512_neon:
// Reload x19-x28 pairwise from the frame addressed by x29 (offsets 16..80).
1969*1dcdf01fSchristos	ldp	x19,x20,[x29,#16]
// Release the 128-byte area at sp plus the 64 bytes above it from which
// d8-d15 were reloaded. After this sp presumably equals x29, since the
// final ldp below reads the x29/x30 pair through sp.
1970*1dcdf01fSchristos	add	sp,sp,#128+64
1971*1dcdf01fSchristos	ldp	x21,x22,[x29,#32]
1972*1dcdf01fSchristos	ldp	x23,x24,[x29,#48]
1973*1dcdf01fSchristos	ldp	x25,x26,[x29,#64]
1974*1dcdf01fSchristos	ldp	x27,x28,[x29,#80]
// Reload x29/x30 last (the loads above still needed x29 as a base), then
// post-increment sp by 96 to drop the register save frame.
1975*1dcdf01fSchristos	ldp	x29,x30,[sp],#96
// Raw encoding of the instruction named in the trailing comment; it is
// presumably the counterpart of a paciasp at function entry (entry is
// outside this excerpt -- confirm).
1976*1dcdf01fSchristos.inst	0xd50323bf			// autiasp
1977*1dcdf01fSchristos	ret
// Record the symbol size as the distance from ChaCha20_512_neon to here.
1978*1dcdf01fSchristos.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1979