1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build go1.11 && gc && !purego
6// +build go1.11,gc,!purego
7
8#include "textflag.h"
9
10#define NUM_ROUNDS 10
11
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
// dst, 256 bytes (four 64-byte blocks) per iteration of the outer loop. The
// four blocks are computed in parallel: every vector register holds one
// 32-bit state word, with one lane per block.
//
// The outer loop body runs before its end-of-input test, so 256 bytes are
// always read from src and written to dst, and only src_len rounded down to
// a multiple of 256 is covered. Presumably the caller always passes a
// non-zero multiple of 256 bytes - verify against the Go caller.
//
// *counter is advanced by 4 after every 256 bytes.
//
// Register use:
//   R1      dst write pointer (post-incremented by the stores)
//   R2      src read pointer (post-incremented by the loads)
//   R3      src length in bytes
//   R4      key pointer
//   R6      nonce pointer
//   R7      counter pointer
//   R10     address of ·constants
//   R11     address of ·incRotMatrix
//   R12     src + (src_len &^ 255), the end-of-input pointer
//   R13     scratch
//   R20     current block counter value
//   R21     remaining iterations of the round loop
//   V0-V15  working state, word i of all four blocks in Vi
//   V16-V19 temporaries inside the round loop
//   V30     per-lane counter offsets {0, 1, 2, 3} (inside the round loop)
//   V31     VTBL byte-index table for the rotate by 8 (inside the round loop)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	// Take the addresses of the two read-only tables defined at the end of
	// this file; they are used as base registers by the vector loads below.
	MOVD	$·constants(SB), R10
	MOVD	$·incRotMatrix(SB), R11

	MOVW	(R7), R20

	AND	$~255, R3, R13
	ADD	R2, R13, R12 // R12 for block end
	// R13 = src_len % 256. The value is not read again in this function.
	AND	$255, R3, R13
loop:
	MOVD	$NUM_ROUNDS, R21
	// V30 = first 16 bytes of incRotMatrix, V31 = second 16 bytes.
	// Both are reloaded on every pass because V30/V31 are reused as
	// scratch after the round loop.
	VLD1	(R11), [V30.S4, V31.S4]

	// load constants
	// The VLD4R/VLD3R/VLD1R loads are emitted as raw WORDs. Each one
	// replicates consecutive 32-bit words from memory across all four
	// lanes of consecutive vector registers.
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// Both key loads are the post-index form: each advances R4 by 16
	// bytes, which is why R4 is moved back by 32 afterwards.
	// VLD4R.P 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R.P 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter
	// Add the lane offsets in V30 so each of the four blocks gets its own
	// counter value.
	VADD	V30.S4, V12.S4, V12.S4

	// Each pass through the chacha loop performs one column round followed
	// by one diagonal round; the loop runs NUM_ROUNDS times.
chacha:
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// Swapping the 16-bit halves of every 32-bit lane is the rotate by 16.
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8
	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	// Rotate left by 12: shift left by 12, then insert the value shifted
	// right by 20 into the vacated low bits.
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	// The rotate by 8 is a byte permutation driven by the table in V31.
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// Diagonal round: the same four steps, with the register pairing
	// rotated instead of moving data between registers.
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// Feed-forward: reload the initial state into V16-V31 and add it to
	// the working state in V0-V15.
	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R.P 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	// Add the per-lane counter offsets now: V30 is overwritten by the
	// nonce load below, and the broadcast counter (V28) is added later.
	VADD	V30.S4, V12.S4, V12.S4
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R.P 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// Transpose from "one state word per register, one block per lane" to
	// "four consecutive words of one block per register". This is done
	// with 32-bit zips followed by 64-bit zips, interleaved with the
	// loads of the 256 input bytes into V16-V31.
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
	// XOR the keystream (V0-V15) into the input and store 64 bytes at a
	// time; R1 advances by 64 with every store.
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four blocks were consumed; publish the new counter so that the
	// VLD1R (R7) at the top of the next pass picks it up.
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// Continue while the end pointer R12 is still above the read pointer.
	// NOTE(review): BGT is a signed comparison applied to pointers; BHI
	// would be the unsigned equivalent - confirm before changing.
	CMP	R2, R12
	BGT	loop

	RET
292
293
// ·constants holds the four ChaCha constant words; stored little-endian,
// their bytes spell "expa", "nd 3", "2-by", "te k".
// The symbol is declared 32 bytes long; only the first 16 are initialized
// here.
DATA	·constants+0(SB)/4, $0x61707865
DATA	·constants+4(SB)/4, $0x3320646e
DATA	·constants+8(SB)/4, $0x79622d32
DATA	·constants+12(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32
299
// ·incRotMatrix is two 16-byte vectors that xorKeyStreamVX loads together
// into V30 and V31.
//
// Bytes 0-15: the 32-bit values 0, 1, 2, 3, added lane by lane to the
// broadcast block counter.
DATA	·incRotMatrix+0(SB)/4, $0
DATA	·incRotMatrix+4(SB)/4, $1
DATA	·incRotMatrix+8(SB)/4, $2
DATA	·incRotMatrix+12(SB)/4, $3
// Bytes 16-31: VTBL byte indices (3,0,1,2, 7,4,5,6, ...) that rotate each
// 32-bit lane left by 8 bits.
DATA	·incRotMatrix+16(SB)/4, $0x02010003
DATA	·incRotMatrix+20(SB)/4, $0x06050407
DATA	·incRotMatrix+24(SB)/4, $0x0a09080b
DATA	·incRotMatrix+28(SB)/4, $0x0e0d0c0f
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32
309