1/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/
2
3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <aarch64/asm.h>
30
31RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")
32
33#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
34STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
35STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
36STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
37STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
38STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
39STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
40STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
41STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
42STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
43STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
44STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
45STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
46STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
47STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
48STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
49STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
50STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
51STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
52STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
53STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
54/* end ROUND */
55
56#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
57	f(a0,b0,c0,d0, t0, r);						      \
58	f(a1,b1,c1,d1, t1, r);						      \
59	f(a2,b2,c2,d2, t2, r);						      \
60	f(a3,b3,c3,d3, t3, r);						      \
61	/* end of STEP */
62
63/*
64 * Each step of the ChaCha quarterround, split up so we can interleave
65 * the quarterrounds on independent rows/diagonals to maximize pipeline
66 * efficiency.  Reference:
67 *
68 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
69 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
70 *	https://cr.yp.to/papers.html#chacha
71 *
72 *	a += b; d ^= a; d <<<= 16;
73 *	c += d; b ^= c; b <<<= 12;
74 *	a += b; d ^= a; d <<<= 8;
75 *	c += d; b ^= c; b <<<= 7;
76 *
77 * The rotations are implemented with:
78 *	<<< 16		REV32 Vn.8h for 16,
79 *	<<< 12		SHL/SRI/ORR (shift left, shift right and insert, OR)
80 *	<<< 8		TBL (general permutation; rot8 below stored in r)
81 *	<<< 7		SHL/SRI/ORR
82 */
83#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
84#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
85#if 0
86#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
87#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
88#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
89#else
90#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
91#define	STEP3(a,b,c,d, t, r)	/* nothing */
92#define	STEP4(a,b,c,d, t, r)	/* nothing */
93#endif
94
95#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
96#if 0
97#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
98#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
99#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
100#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
101#else
102#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
103#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
104#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
105#define	STEP9(a,b,c,d, t, r)	/* nothing */
106#endif
107
108#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
109#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
110#if 0
111#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
112#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
113#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
114#else
115#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
116#define	STEP13(a,b,c,d, t, r)	/* nothing */
117#define	STEP14(a,b,c,d, t, r)	/* nothing */
118#endif
119
120#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
121#if 0
122#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
123#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
124#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
125#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
126#else
127#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
128#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
129#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
130#define	STEP19(a,b,c,d, t, r)	/* nothing */
131#endif
132
133#if defined(__AARCH64EB__)
134#define	HTOLE32(x)	rev32	x, x
135#define	LE32TOH(x)	rev32	x, x
136#else
137#define	LE32TOH(x)
138#define	HTOLE32(x)
139#endif
140
141/*
142 * chacha_stream256_neon(uint8_t s[256]@x0,
143 *     uint32_t blkno@w1,
144 *     const uint8_t nonce[12]@x2,
145 *     const uint8_t key[32]@x3,
146 *     const uint8_t const[16]@x4,
147 *     unsigned nr@w5)
148 */
149ENTRY(chacha_stream256_neon)
150	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
151	mov	fp, sp
152
153	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
154	stp	d10, d11, [sp, #0x20]
155	stp	d12, d13, [sp, #0x30]
156	stp	d14, d15, [sp, #0x40]
157
158	adrl	x9, v0123	/* x9 := &v0123 */
159	mov	x10, x4		/* r10 := c */
160	mov	x11, x3		/* r11 := k */
161	add	x12, x3, #16	/* r12 := k+4 */
162	mov	x13, x2		/* r13 := nonce */
163
164	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
165	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
166	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
167	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
168	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
169	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
170	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
171
172	LE32TOH(v0.16b)
173	LE32TOH(v1.16b)
174	LE32TOH(v2.16b)
175	LE32TOH(v3.16b)
176	LE32TOH(v4.16b)
177	LE32TOH(v5.16b)
178	LE32TOH(v6.16b)
179	LE32TOH(v7.16b)
180	LE32TOH(v8.16b)
181	LE32TOH(v9.16b)
182	LE32TOH(v10.16b)
183	LE32TOH(v11.16b)
184	/* LE32TOH(v12.16b) -- blkno, already host order */
185	LE32TOH(v13.16b)
186	LE32TOH(v14.16b)
187	LE32TOH(v15.16b)
188
189	mov	v16.16b, v0.16b
190	mov	v17.16b, v1.16b
191	mov	v18.16b, v2.16b
192	mov	v19.16b, v3.16b
193	mov	v20.16b, v4.16b
194	mov	v21.16b, v5.16b
195	mov	v22.16b, v6.16b
196	mov	v23.16b, v7.16b
197	mov	v24.16b, v8.16b
198	mov	v25.16b, v9.16b
199	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
200	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
201	mov	w9, v11.s[0]
202	mov	w10, v13.s[0]
203	mov	w11, v14.s[0]
204	mov	w12, v15.s[0]
205
206	_ALIGN_TEXT
2071:	subs	w5, w5, #2
208	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
209	    v28,v29,v30,v31, v27)
210	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
211	    v28,v29,v30,v31, v27)
212	b.ne	1b
213
214	dup	v27.4s, w8
215	dup	v28.4s, w9
216	dup	v29.4s, w10
217	dup	v30.4s, w11
218	dup	v31.4s, w12
219
220	add	v0.4s, v0.4s, v16.4s
221	add	v1.4s, v1.4s, v17.4s
222	add	v2.4s, v2.4s, v18.4s
223	add	v3.4s, v3.4s, v19.4s
224	add	v4.4s, v4.4s, v20.4s
225	add	v5.4s, v5.4s, v21.4s
226	add	v6.4s, v6.4s, v22.4s
227	add	v7.4s, v7.4s, v23.4s
228	add	v8.4s, v8.4s, v24.4s
229	add	v9.4s, v9.4s, v25.4s
230	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
231	add	v11.4s, v11.4s, v28.4s
232	add	v12.4s, v12.4s, v26.4s
233	add	v13.4s, v13.4s, v29.4s
234	add	v14.4s, v14.4s, v30.4s
235	add	v15.4s, v15.4s, v31.4s
236
237	HTOLE32(v0.16b)
238	HTOLE32(v1.16b)
239	HTOLE32(v2.16b)
240	HTOLE32(v3.16b)
241	HTOLE32(v4.16b)
242	HTOLE32(v5.16b)
243	HTOLE32(v6.16b)
244	HTOLE32(v7.16b)
245	HTOLE32(v8.16b)
246	HTOLE32(v9.16b)
247	HTOLE32(v10.16b)
248	HTOLE32(v11.16b)
249	HTOLE32(v12.16b)
250	HTOLE32(v13.16b)
251	HTOLE32(v14.16b)
252	HTOLE32(v15.16b)
253
254	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
255	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
256	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
257	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
258	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
259	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
260	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
261	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
262	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
263	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
264	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
265	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
266	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
267	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
268	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
269	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16
270
271	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
272	ldp	d10, d11, [sp, #0x20]
273	ldp	d12, d13, [sp, #0x30]
274	ldp	d14, d15, [sp, #0x40]
275
276	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
277	ret
278END(chacha_stream256_neon)
279
280/*
281 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
282 *     uint32_t blkno@w2,
283 *     const uint8_t nonce[12]@x3,
284 *     const uint8_t key[32]@x4,
285 *     const uint8_t const[16]@x5,
286 *     unsigned nr@w6)
287 */
288ENTRY(chacha_stream_xor256_neon)
289	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
290	mov	fp, sp
291
292	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
293	stp	d10, d11, [sp, #0x20]
294	stp	d12, d13, [sp, #0x30]
295	stp	d14, d15, [sp, #0x40]
296
297	adrl	x9, v0123	/* x9 := &v0123 */
298	mov	x10, x5		/* r10 := c */
299	mov	x11, x4		/* r11 := k */
300	add	x12, x4, #16	/* r12 := k+4 */
301	mov	x13, x3		/* r13 := nonce */
302
303	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
304	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
305	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
306	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
307	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
308	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
309	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */
310
311	LE32TOH(v0.16b)
312	LE32TOH(v1.16b)
313	LE32TOH(v2.16b)
314	LE32TOH(v3.16b)
315	LE32TOH(v4.16b)
316	LE32TOH(v5.16b)
317	LE32TOH(v6.16b)
318	LE32TOH(v7.16b)
319	LE32TOH(v8.16b)
320	LE32TOH(v9.16b)
321	LE32TOH(v10.16b)
322	LE32TOH(v11.16b)
323	/* LE32TOH(v12.16b) -- blkno, already host order */
324	LE32TOH(v13.16b)
325	LE32TOH(v14.16b)
326	LE32TOH(v15.16b)
327
328	mov	v16.16b, v0.16b
329	mov	v17.16b, v1.16b
330	mov	v18.16b, v2.16b
331	mov	v19.16b, v3.16b
332	mov	v20.16b, v4.16b
333	mov	v21.16b, v5.16b
334	mov	v22.16b, v6.16b
335	mov	v23.16b, v7.16b
336	mov	v24.16b, v8.16b
337	mov	v25.16b, v9.16b
338	mov	v26.16b, v12.16b	/* reordered since v12 isn't dup */
339	mov	w8, v10.s[0]		/* v27-31 needed as temporaries */
340	mov	w9, v11.s[0]
341	mov	w10, v13.s[0]
342	mov	w11, v14.s[0]
343	mov	w12, v15.s[0]
344
345        _ALIGN_TEXT
3461:	subs	w6, w6, #2
347	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
348	    v28,v29,v30,v31, v27)
349	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
350	    v28,v29,v30,v31, v27)
351	b.ne	1b
352
353	dup	v27.4s, w8
354	dup	v28.4s, w9
355	dup	v29.4s, w10
356	dup	v30.4s, w11
357	dup	v31.4s, w12
358
359	add	v0.4s, v0.4s, v16.4s
360	add	v1.4s, v1.4s, v17.4s
361	add	v2.4s, v2.4s, v18.4s
362	add	v3.4s, v3.4s, v19.4s
363	add	v4.4s, v4.4s, v20.4s
364	add	v5.4s, v5.4s, v21.4s
365	add	v6.4s, v6.4s, v22.4s
366	add	v7.4s, v7.4s, v23.4s
367	add	v8.4s, v8.4s, v24.4s
368	add	v9.4s, v9.4s, v25.4s
369	add	v10.4s, v10.4s, v27.4s	/* reordered since v12 isn't dup */
370	add	v11.4s, v11.4s, v28.4s
371	add	v12.4s, v12.4s, v26.4s
372	add	v13.4s, v13.4s, v29.4s
373	add	v14.4s, v14.4s, v30.4s
374	add	v15.4s, v15.4s, v31.4s
375
376	/*
377	 * We could do these sixteen LD4-into-lane instructions instead
378	 * by four LD1-into-register instructions, but we would need to
379	 * permute the elements in v0-v15 to put them in the right
380	 * order.  We can do that by a series of ZIP1/ZIP2 on 4s-sized
381	 * elements, and then ZIP1/ZIP2 on 2d-sized elements, but the
382	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
383	 * exceed the savings in cost from four LD1 instructions rather
384	 * than sixteen LD4 instructions, even if we interleave the LD1
385	 * instructions with the ZIPs.
386	 */
387	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
388	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
389	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
390	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
391	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
392	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
393	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
394	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
395	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
396	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
397	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
398	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
399	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
400	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
401	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
402	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16
403
404	HTOLE32(v0.16b)
405	HTOLE32(v1.16b)
406	HTOLE32(v2.16b)
407	HTOLE32(v3.16b)
408	HTOLE32(v4.16b)
409	HTOLE32(v5.16b)
410	HTOLE32(v6.16b)
411	HTOLE32(v7.16b)
412	HTOLE32(v8.16b)
413	HTOLE32(v9.16b)
414	HTOLE32(v10.16b)
415	HTOLE32(v11.16b)
416	HTOLE32(v12.16b)
417	HTOLE32(v13.16b)
418	HTOLE32(v14.16b)
419	HTOLE32(v15.16b)
420
421	eor	v16.16b, v16.16b, v0.16b
422	eor	v17.16b, v17.16b, v1.16b
423	eor	v18.16b, v18.16b, v2.16b
424	eor	v19.16b, v19.16b, v3.16b
425	eor	v20.16b, v20.16b, v4.16b
426	eor	v21.16b, v21.16b, v5.16b
427	eor	v22.16b, v22.16b, v6.16b
428	eor	v23.16b, v23.16b, v7.16b
429	eor	v24.16b, v24.16b, v8.16b
430	eor	v25.16b, v25.16b, v9.16b
431	eor	v26.16b, v26.16b, v10.16b
432	eor	v27.16b, v27.16b, v11.16b
433	eor	v28.16b, v28.16b, v12.16b
434	eor	v29.16b, v29.16b, v13.16b
435	eor	v30.16b, v30.16b, v14.16b
436	eor	v31.16b, v31.16b, v15.16b
437
438	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
439	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
440	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
441	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
442	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
443	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
444	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
445	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
446	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
447	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
448	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
449	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
450	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
451	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
452	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
453	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16
454
455	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
456	ldp	d10, d11, [sp, #0x20]
457	ldp	d12, d13, [sp, #0x30]
458	ldp	d14, d15, [sp, #0x40]
459
460	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
461	ret
462END(chacha_stream_xor256_neon)
463
464	.section .rodata
465	.p2align 4
466
467	.type	v0123,@object
468v0123:
469	.long	0, 1, 2, 3
470END(v0123)
471
472	/*
473	 * Must be immediately after v0123 -- we load them in a single
474	 * ld1 instruction.
475	 */
476	.type	rot8,@object
477rot8:
478	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
479END(rot8)
480