1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build go1.11,gc,!purego
6
7#include "textflag.h"
8
// NUM_ROUNDS is the number of passes through the "chacha" loop below. Each
// pass performs one column round followed by one diagonal round, so 10 passes
// give 20 rounds in total.
#define NUM_ROUNDS 10

// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha keystream and writes the result to dst,
// four 64-byte blocks (256 bytes) per outer-loop iteration.
//
// Only whole 256-byte groups are processed: the byte count is src_len rounded
// down to a multiple of 256, and any remaining src_len%256 bytes are left
// untouched. If src_len < 256 the routine returns without reading or writing
// anything. *counter is advanced by 4 after every 256 bytes.
//
// Data layout during the rounds: vector Vn (n = 0..15) holds state word n of
// four independent blocks, one block per 32-bit lane. Lane i uses block
// counter *counter+i.
//
// Register roles:
//   R1  dst cursor (post-incremented by VST1.P)
//   R2  src cursor (post-incremented by VLD1.P)
//   R3  src_len
//   R4  key pointer (temporarily advanced by the post-indexed key loads)
//   R6  nonce pointer
//   R7  counter pointer
//   R10 address of ·constants
//   R11 address of ·incRotMatrix
//   R12 src address at which processing stops
//   R13 number of bytes to process (multiple of 256)
//   R20 scalar copy of the block counter
//   R21 remaining round-loop iterations
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	// R13 = src_len rounded down to a multiple of 256. The loop below tests
	// its condition at the bottom, so bail out here when there is not even
	// one full 256-byte group; otherwise it would overrun dst and src.
	AND	$~255, R3, R13
	CBZ	R13, done
	ADD	R2, R13, R12 // R12 for block end
loop:
	MOVD	$NUM_ROUNDS, R21
	// V30 = {0, 1, 2, 3} per-lane counter increments,
	// V31 = byte permutation used by VTBL to rotate each word left by 8.
	VLD1	(R11), [V30.S4, V31.S4]

	// load constants, each word replicated across all four lanes
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys, each word replicated across all four lanes.
	// Both loads are post-indexed: each one advances R4 by 16.
	// VLD4R.P 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R.P 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	// undo the two post-increments so R4 points at the key again
	SUB	$32, R4

	// load counter + nonce, replicated across all four lanes
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// give each lane its own block counter: V12 += {0, 1, 2, 3}
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// ---- column round ----
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	// (rotation by 16 is a halfword swap within each word: VREV32 on .H8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8
	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	// (XOR goes to temporaries V16..V19; VSHL writes the high part and
	// VSRI inserts the bits shifted out, completing the rotate)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	// (rotation by 8 is a byte permutation: VTBL with the table in V31)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// ---- diagonal round ----
	// Same quarter-round steps, but the registers are paired along the
	// diagonals: (V0,V5,V10,V15), (V1,V6,V11,V12), (V2,V7,V8,V13),
	// (V3,V4,V9,V14).
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// Feed-forward: add the original input state back onto the round output.
	// The input words are reloaded into V16..V31 instead of being kept live
	// across the rounds.

	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R.P 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	// V30 still holds {0, 1, 2, 3} here; add the per-lane counter offsets
	// before V30 is overwritten by the nonce reload below.
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R.P 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// Transpose from "one state word per register" to "one block per four
	// registers". First interleave 32-bit lanes of adjacent word pairs...
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	// ...then interleave 64-bit halves. Afterwards V0..V3 hold keystream
	// block 0, V4..V7 block 1, V8..V11 block 2 and V12..V15 block 3.
	// Each group of temporaries is reused for a 64-byte src load as soon as
	// it has been consumed.
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]

	// dst = src XOR keystream, 64 bytes at a time
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four blocks consumed. The store to *counter must happen before the
	// next iteration, because the vector counter is reloaded from (R7).
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// Continue while the src cursor is below the end address. These are
	// addresses, so compare them as unsigned values.
	CMP	R2, R12
	BHI	loop

done:
	RET
291
292
// ·constants holds the four ChaCha constant words. Stored little-endian, their
// bytes spell the ASCII string "expand 32-byte k". Only the first 16 bytes of
// the 32-byte symbol are given explicit values here.
DATA	·constants+0(SB)/4, $0x61707865 // "expa"
DATA	·constants+4(SB)/4, $0x3320646e // "nd 3"
DATA	·constants+8(SB)/4, $0x79622d32 // "2-by"
DATA	·constants+12(SB)/4, $0x6b206574 // "te k"
GLOBL	·constants(SB), RODATA|NOPTR, $32
298
// ·incRotMatrix packs two 16-byte vectors that xorKeyStreamVX loads together
// into V30 and V31.
//
// Bytes 0..15: the 32-bit lane values {0, 1, 2, 3}, added to the replicated
// block counter so that each lane works on a different block.
DATA	·incRotMatrix+0(SB)/4, $0
DATA	·incRotMatrix+4(SB)/4, $1
DATA	·incRotMatrix+8(SB)/4, $2
DATA	·incRotMatrix+12(SB)/4, $3
// Bytes 16..31: VTBL byte indices {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14},
// i.e. within every 32-bit word each byte moves up one position and the top
// byte wraps to the bottom: a rotate left by 8 bits.
DATA	·incRotMatrix+16(SB)/4, $0x02010003
DATA	·incRotMatrix+20(SB)/4, $0x06050407
DATA	·incRotMatrix+24(SB)/4, $0x0a09080b
DATA	·incRotMatrix+28(SB)/4, $0x0e0d0c0f
GLOBL	·incRotMatrix(SB), RODATA|NOPTR, $32
308