/*	$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: chacha_neon_64.S,v 1.7 2020/09/07 18:05:17 jakllsch Exp $")

#define	ROUND(a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r) \
STEP(STEP0,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP1,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP2,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP3,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP4,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP5,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP6,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP7,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP8,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP9,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);   \
STEP(STEP10,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP11,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP12,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP13,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP14,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP15,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP16,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP17,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP18,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
STEP(STEP19,a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,t0,t1,t2,t3, r);  \
/* end ROUND */

#define	STEP(f,a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3,t0,t1,t2,t3,r) \
	f(a0,b0,c0,d0, t0, r);						      \
	f(a1,b1,c1,d1, t1, r);						      \
	f(a2,b2,c2,d2, t2, r);						      \
	f(a3,b3,c3,d3, t3, r);						      \
	/* end of STEP */

/*
 * Each step of the ChaCha quarterround, split up so we can interleave
 * the quarterrounds on independent rows/diagonals to maximize pipeline
 * efficiency.  Reference:
 *
 *	Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop
 *	Record of the State of the Art in Stream Ciphers -- SASC 2008.
 *	https://cr.yp.to/papers.html#chacha
 *
 *	a += b; d ^= a; d <<<= 16;
 *	c += d; b ^= c; b <<<= 12;
 *	a += b; d ^= a; d <<<= 8;
 *	c += d; b ^= c; b <<<= 7;
 *
 * The rotations are implemented with:
 *	<<< 16		REV32 Vn.8h (swap the 16-bit halves of each word)
 *	<<< 12		SHL + SRI (shift left; shift right and insert)
 *	<<< 8		TBL (general byte permutation; rot8 below, kept in r)
 *	<<< 7		SHL + SRI
 */
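
/*
 * For reference, a minimal C sketch of one quarterround as the STEPn
 * macros below compute it (illustrative only; rotl32 is a hypothetical
 * helper, not part of this file; assumes <stdint.h>):
 *
 *	static inline uint32_t
 *	rotl32(uint32_t x, unsigned n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static inline void
 *	quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *	{
 *		*a += *b; *d ^= *a; *d = rotl32(*d, 16);
 *		*c += *d; *b ^= *c; *b = rotl32(*b, 12);
 *		*a += *b; *d ^= *a; *d = rotl32(*d, 8);
 *		*c += *d; *b ^= *c; *b = rotl32(*b, 7);
 *	}
 */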
#define	STEP0(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP1(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP2(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #16
#define	STEP3(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 16)
#define	STEP4(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP2(a,b,c,d, t, r)	rev32	d##.8h, d##.8h
#define	STEP3(a,b,c,d, t, r)	/* nothing */
#define	STEP4(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP5(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP6(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #12
#define	STEP8(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP6(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP7(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #12
#define	STEP8(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 12)
#define	STEP9(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP10(a,b,c,d, t, r)	add	a##.4s, a##.4s, b##.4s
#define	STEP11(a,b,c,d, t, r)	eor	d##.16b, d##.16b, a##.16b
#if 0
#define	STEP12(a,b,c,d, t, r)	shl	t##.4s, d##.4s, #8
#define	STEP13(a,b,c,d, t, r)	ushr	d##.4s, d##.4s, #(32 - 8)
#define	STEP14(a,b,c,d, t, r)	orr	d##.16b, d##.16b, t##.16b
#else
#define	STEP12(a,b,c,d, t, r)	tbl	d##.16b, {d##.16b}, r##.16b
#define	STEP13(a,b,c,d, t, r)	/* nothing */
#define	STEP14(a,b,c,d, t, r)	/* nothing */
#endif

#define	STEP15(a,b,c,d, t, r)	add	c##.4s, c##.4s, d##.4s
#if 0
#define	STEP16(a,b,c,d, t, r)	eor	b##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	t##.4s, b##.4s, #7
#define	STEP18(a,b,c,d, t, r)	ushr	b##.4s, b##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	orr	b##.16b, b##.16b, t##.16b
#else
#define	STEP16(a,b,c,d, t, r)	eor	t##.16b, b##.16b, c##.16b
#define	STEP17(a,b,c,d, t, r)	shl	b##.4s, t##.4s, #7
#define	STEP18(a,b,c,d, t, r)	sri	b##.4s, t##.4s, #(32 - 7)
#define	STEP19(a,b,c,d, t, r)	/* nothing */
#endif

#if defined(__AARCH64EB__)
#define	HTOLE32(x)	rev32	x, x
#define	LE32TOH(x)	rev32	x, x
#else
#define	LE32TOH(x)
#define	HTOLE32(x)
#endif
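
/*
 * Note: with the .16b arrangement, REV32 reverses the four bytes
 * within each 32-bit element, e.g. 0x00112233 <-> 0x33221100, which
 * is exactly the per-word little-endian/host swap needed on
 * big-endian (__AARCH64EB__) kernels; on little-endian builds these
 * macros expand to nothing.
 */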

/*
 * chacha_stream256_neon(uint8_t s[256]@x0,
 *     uint32_t blkno@w1,
 *     const uint8_t nonce[12]@x2,
 *     const uint8_t key[32]@x3,
 *     const uint8_t const[16]@x4,
 *     unsigned nr@w5)
 */
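/*
 * Illustrative use from C (a sketch, not part of this file; the name
 * chacha_const for the `expand 32-byte k' constant is hypothetical):
 *
 *	uint8_t stream[256];
 *	chacha_stream256_neon(stream, 0, nonce, key, chacha_const, 20);
 *
 * generates four consecutive 64-byte ChaCha20 blocks starting at
 * block number 0.
 */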
ENTRY(chacha_stream256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x4		/* x10 := c */
	mov	x11, x3		/* x11 := k */
	add	x12, x3, #16	/* x12 := k + 16 */
	mov	x13, x2		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w1	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* v12's lanes differ (blkno+i), so
					 * save it whole, out of order */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries, so
					 * save the dup'd words in GPRs */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w5, w5, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* v26 held v12, so dups start at v27 */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

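	/*
	 * The state is kept transposed: lane i of v0-v15 holds the
	 * sixteen 32-bit words of block i.  ST4-into-lane therefore
	 * writes each block's words to memory in sequential order,
	 * 16 bytes at a time.
	 */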
	st4	{ v0.s, v1.s, v2.s, v3.s}[0], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[0], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[0], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[0], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[1], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[1], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[1], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[1], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[2], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[2], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[2], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[2], [x0], #16
	st4	{ v0.s, v1.s, v2.s, v3.s}[3], [x0], #16
	st4	{ v4.s, v5.s, v6.s, v7.s}[3], [x0], #16
	st4	{ v8.s, v9.s,v10.s,v11.s}[3], [x0], #16
	st4	{v12.s,v13.s,v14.s,v15.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream256_neon)

/*
 * chacha_stream_xor256_neon(uint8_t s[256]@x0, const uint8_t p[256]@x1,
 *     uint32_t blkno@w2,
 *     const uint8_t nonce[12]@x3,
 *     const uint8_t key[32]@x4,
 *     const uint8_t const[16]@x5,
 *     unsigned nr@w6)
 */
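/*
 * Note: as structured below, all 256 bytes of input are loaded into
 * v16-v31 before any output is stored, so s and p may alias, i.e.
 * in-place encryption works.
 */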
ENTRY(chacha_stream_xor256_neon)
	stp	fp, lr, [sp, #-0x50]!	/* push stack frame with uint64[8] */
	mov	fp, sp

	stp	d8, d9, [sp, #0x10]	/* save callee-saves vectors */
	stp	d10, d11, [sp, #0x20]
	stp	d12, d13, [sp, #0x30]
	stp	d14, d15, [sp, #0x40]

	adrl	x9, v0123	/* x9 := &v0123 */
	mov	x10, x5		/* x10 := c */
	mov	x11, x4		/* x11 := k */
	add	x12, x4, #16	/* x12 := k + 16 */
	mov	x13, x3		/* x13 := nonce */

	ld1	{v26.4s-v27.4s}, [x9]	/* v26 := v0123, v27 := rot8 */
	dup	v12.4s, w2	/* v12 := (blkno, blkno, blkno, blkno) */
	ld4r	{v0.4s-v3.4s}, [x10]	/* (v0,v1,v2,v3) := constant */
	ld4r	{v4.4s-v7.4s}, [x11]	/* (v4,v5,v6,v7) := key[0:16) */
	ld4r	{v8.4s-v11.4s}, [x12]	/* (v8,v9,v10,v11) := key[16:32) */
	ld3r	{v13.4s-v15.4s}, [x13]	/* (v13,v14,v15) := nonce */
	add	v12.4s, v12.4s, v26.4s	/* v12 := blkno + (0,1,2,3) */

	LE32TOH(v0.16b)
	LE32TOH(v1.16b)
	LE32TOH(v2.16b)
	LE32TOH(v3.16b)
	LE32TOH(v4.16b)
	LE32TOH(v5.16b)
	LE32TOH(v6.16b)
	LE32TOH(v7.16b)
	LE32TOH(v8.16b)
	LE32TOH(v9.16b)
	LE32TOH(v10.16b)
	LE32TOH(v11.16b)
	/* LE32TOH(v12.16b) -- blkno, already host order */
	LE32TOH(v13.16b)
	LE32TOH(v14.16b)
	LE32TOH(v15.16b)

	mov	v16.16b, v0.16b
	mov	v17.16b, v1.16b
	mov	v18.16b, v2.16b
	mov	v19.16b, v3.16b
	mov	v20.16b, v4.16b
	mov	v21.16b, v5.16b
	mov	v22.16b, v6.16b
	mov	v23.16b, v7.16b
	mov	v24.16b, v8.16b
	mov	v25.16b, v9.16b
	mov	v26.16b, v12.16b	/* v12's lanes differ (blkno+i), so
					 * save it whole, out of order */
	mov	w8, v10.s[0]		/* v27-v31 needed as temporaries, so
					 * save the dup'd words in GPRs */
	mov	w9, v11.s[0]
	mov	w10, v13.s[0]
	mov	w11, v14.s[0]
	mov	w12, v15.s[0]

	_ALIGN_TEXT
1:	subs	w6, w6, #2
	ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
	    v28,v29,v30,v31, v27)
	ROUND(v0,v1,v2,v3, v5,v6,v7,v4, v10,v11,v8,v9, v15,v12,v13,v14,
	    v28,v29,v30,v31, v27)
	b.ne	1b

	dup	v27.4s, w8
	dup	v28.4s, w9
	dup	v29.4s, w10
	dup	v30.4s, w11
	dup	v31.4s, w12

	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	add	v4.4s, v4.4s, v20.4s
	add	v5.4s, v5.4s, v21.4s
	add	v6.4s, v6.4s, v22.4s
	add	v7.4s, v7.4s, v23.4s
	add	v8.4s, v8.4s, v24.4s
	add	v9.4s, v9.4s, v25.4s
	add	v10.4s, v10.4s, v27.4s	/* v26 held v12, so dups start at v27 */
	add	v11.4s, v11.4s, v28.4s
	add	v12.4s, v12.4s, v26.4s
	add	v13.4s, v13.4s, v29.4s
	add	v14.4s, v14.4s, v30.4s
	add	v15.4s, v15.4s, v31.4s

	/*
	 * We could replace these sixteen LD4-into-lane instructions
	 * with four LD1 whole-register instructions, but we would then
	 * need to permute the elements of v0-v15 into the right order.
	 * That can be done with a series of ZIP1/ZIP2 on 4s-sized
	 * elements followed by ZIP1/ZIP2 on 2d-sized elements, but the
	 * net cost of the thirty-two ZIP1/ZIP2 instructions seems to
	 * exceed the savings from issuing four LD1 instructions rather
	 * than sixteen LD4 instructions, even if we interleave the LD1
	 * instructions with the ZIPs.
	 */
	ld4	{v16.s,v17.s,v18.s,v19.s}[0], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[0], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[0], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[0], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[1], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[1], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[1], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[1], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[2], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[2], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[2], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[2], [x1], #16
	ld4	{v16.s,v17.s,v18.s,v19.s}[3], [x1], #16
	ld4	{v20.s,v21.s,v22.s,v23.s}[3], [x1], #16
	ld4	{v24.s,v25.s,v26.s,v27.s}[3], [x1], #16
	ld4	{v28.s,v29.s,v30.s,v31.s}[3], [x1], #16

	HTOLE32(v0.16b)
	HTOLE32(v1.16b)
	HTOLE32(v2.16b)
	HTOLE32(v3.16b)
	HTOLE32(v4.16b)
	HTOLE32(v5.16b)
	HTOLE32(v6.16b)
	HTOLE32(v7.16b)
	HTOLE32(v8.16b)
	HTOLE32(v9.16b)
	HTOLE32(v10.16b)
	HTOLE32(v11.16b)
	HTOLE32(v12.16b)
	HTOLE32(v13.16b)
	HTOLE32(v14.16b)
	HTOLE32(v15.16b)

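	/* xor the keystream (v0-v15) into the plaintext (v16-v31) */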
	eor	v16.16b, v16.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	eor	v18.16b, v18.16b, v2.16b
	eor	v19.16b, v19.16b, v3.16b
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	eor	v24.16b, v24.16b, v8.16b
	eor	v25.16b, v25.16b, v9.16b
	eor	v26.16b, v26.16b, v10.16b
	eor	v27.16b, v27.16b, v11.16b
	eor	v28.16b, v28.16b, v12.16b
	eor	v29.16b, v29.16b, v13.16b
	eor	v30.16b, v30.16b, v14.16b
	eor	v31.16b, v31.16b, v15.16b

	st4	{v16.s,v17.s,v18.s,v19.s}[0], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[0], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[0], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[0], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[1], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[1], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[1], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[1], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[2], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[2], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[2], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[2], [x0], #16
	st4	{v16.s,v17.s,v18.s,v19.s}[3], [x0], #16
	st4	{v20.s,v21.s,v22.s,v23.s}[3], [x0], #16
	st4	{v24.s,v25.s,v26.s,v27.s}[3], [x0], #16
	st4	{v28.s,v29.s,v30.s,v31.s}[3], [x0], #16

	ldp	d8, d9, [sp, #0x10]	/* restore callee-saves vectors */
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x30]
	ldp	d14, d15, [sp, #0x40]

	ldp	fp, lr, [sp], #0x50	/* pop stack frame with uint64[8] */
	ret
END(chacha_stream_xor256_neon)

	.section .rodata
	.p2align 4

	.type	v0123,@object
v0123:
	.long	0, 1, 2, 3
END(v0123)

	/*
	 * Must be immediately after v0123 -- we load them in a single
	 * ld1 instruction.
	 */
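	/*
	 * TBL index table implementing <<< 8 on each 32-bit word: the
	 * bytes (b0,b1,b2,b3) of each word map to (b3,b0,b1,b2), so
	 * the per-word source indices are 3,0,1,2 -- 0x02010003 when
	 * stored little-endian -- repeated with offsets +4, +8, +12
	 * for the remaining words.
	 */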
	.type	rot8,@object
rot8:
	.long	0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
END(rot8)