1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build gc,!purego
6
7#include "go_asm.h"
8#include "textflag.h"
9
10// This is an implementation of the ChaCha20 encryption algorithm as
11// specified in RFC 7539. It uses vector instructions to compute
12// 4 keystream blocks in parallel (256 bytes) which are then XORed
13// with the bytes in the input slice.
14
// ·constants<> is a 32-byte read-only table that is loaded with a single
// VLM into the BSWAP and J0 vector registers.
//
// Bytes 0-15 (BSWAP): VPERM selector that reverses the byte order inside
// each 4-byte element (byte indices 3,2,1,0, 7,6,5,4, ...).
DATA ·constants<>+0(SB)/4, $0x03020100
DATA ·constants<>+4(SB)/4, $0x07060504
DATA ·constants<>+8(SB)/4, $0x0b0a0908
DATA ·constants<>+12(SB)/4, $0x0f0e0d0c

// Bytes 16-31 (J0): the four ChaCha constant words [j0, j1, j2, j3]
// ("expand 32-byte k" when read as little-endian ASCII).
DATA ·constants<>+16(SB)/4, $0x61707865
DATA ·constants<>+20(SB)/4, $0x3320646e
DATA ·constants<>+24(SB)/4, $0x79622d32
DATA ·constants<>+28(SB)/4, $0x6b206574

GLOBL ·constants<>(SB), RODATA|NOPTR, $32
26
// Number of ChaCha rounds; the round loop runs NUM_ROUNDS/2 times, two
// ROUND4 invocations per pass.
#define NUM_ROUNDS 20

// Values that are loaded once and stay live across the whole function.
#define BSWAP V5  // byte-reversal selector for VPERM (first half of ·constants<>)
#define J0    V6  // ChaCha constant words (second half of ·constants<>)
#define KEY0  V7  // key words 0-3
#define KEY1  V8  // key words 4-7
#define NONCE V9  // nonce words in elements 1-3
#define CTR   V10 // block counters for the four parallel blocks
#define INC   V15 // per-lane counter increment

// Scratch registers: SHUFFLE temporaries and the 64-byte input window
// loaded and stored by XORV. VLM/VSTM address them as the range M0..M3,
// so they must remain consecutive.
#define M0 V11
#define M1 V12
#define M2 V13
#define M3 V14

// Working state: Xn holds state word n of all four blocks, one block per
// 32-bit element.
#define X0  V16
#define X1  V17
#define X2  V18
#define X3  V19
#define X4  V20
#define X5  V21
#define X6  V22
#define X7  V23
#define X8  V24
#define X9  V25
#define X10 V26
#define X11 V27
#define X12 V28
#define X13 V29
#define X14 V30
#define X15 V31
56
// ARX4 performs one add/xor/rotate step on four independent register
// groups. For every group (in, acc, out), element-wise on 32-bit words:
//
//	acc += in
//	out ^= acc
//	out  = out rotated left by rot bits
//
// The same operation is issued for all four groups before moving on to the
// next operation, so consecutive instructions never depend on each other.
#define ARX4(rot, inA, accA, outA, inB, accB, outB, inC, accC, outC, inD, accD, outD) \
	VAF    inA, accA, accA    \
	VAF    inB, accB, accB    \
	VAF    inC, accC, accC    \
	VAF    inD, accD, accD    \
	VX     accA, outA, outA   \
	VX     accB, outB, outB   \
	VX     accC, outC, outC   \
	VX     accD, outD, outD   \
	VERLLF $rot, outA, outA   \
	VERLLF $rot, outB, outB   \
	VERLLF $rot, outC, outC   \
	VERLLF $rot, outD, outD

// ROUND4 runs four quarter rounds side by side, one on each of the register
// groups a, b, c and d. Within a group (x0, x1, x2, x3) it computes:
//
//	x0 += x1; x2 ^= x0; x2 <<<= 16
//	x3 += x2; x1 ^= x3; x1 <<<= 12
//	x0 += x1; x2 ^= x0; x2 <<<= 8
//	x3 += x2; x1 ^= x3; x1 <<<= 7
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	ARX4(16, a1, a0, a2, b1, b0, b2, c1, c0, c2, d1, d0, d2) \
	ARX4(12, a2, a3, a1, b2, b3, b1, c2, c3, c1, d2, d3, d1) \
	ARX4(8, a1, a0, a2, b1, b0, b2, c1, c0, c2, d1, d0, d2)  \
	ARX4(7, a2, a3, a1, b2, b3, b1, c2, c3, c1, d2, d3, d1)
106
// PERMUTE rearranges the bytes of each of the four registers in place,
// using sel as the VPERM byte selector. Both VPERM source operands are the
// register itself, so every result byte comes from that same register.
#define PERMUTE(sel, ra, rb, rc, rd) \
	VPERM ra, ra, sel, ra \
	VPERM rb, rb, sel, rb \
	VPERM rc, rc, sel, rc \
	VPERM rd, rd, sel, rd
112
// ADDV adds the vector addend to each of the four registers in place,
// as 32-bit element-wise additions.
#define ADDV(addend, ra, rb, rc, rd) \
	VAF addend, ra, ra \
	VAF addend, rb, rb \
	VAF addend, rc, rc \
	VAF addend, rd, rd
118
// XORV processes one 64-byte chunk: it loads 64 bytes from disp(in) into
// M0..M3, XORs them with the keystream registers k0..k3 and stores the
// result to disp(out).
//
// Before the XOR each keystream register is permuted in place with the
// BSWAP selector, so k0..k3 are modified. M0..M3 are clobbered.
#define XORV(disp, out, in, k0, k1, k2, k3) \
	VLM   disp(in), M0, M3    \
	VPERM k0, k0, BSWAP, k0   \
	VPERM k1, k1, BSWAP, k1   \
	VPERM k2, k2, BSWAP, k2   \
	VPERM k3, k3, BSWAP, k3   \
	VX    k0, M0, M0          \
	VX    k1, M1, M1          \
	VX    k2, M2, M2          \
	VX    k3, M3, M3          \
	VSTM  M0, M3, disp(out)
127
// SHUFFLE transposes the 4x4 matrix of 32-bit elements held in a, b, c, d
// in place, using t0..t3 as temporaries (clobbered).
//
// First stage, interleave a with c and b with d:
//
//	t0 = {a[0], c[0], a[1], c[1]}
//	t1 = {b[0], d[0], b[1], d[1]}
//	t2 = {a[2], c[2], a[3], c[3]}
//	t3 = {b[2], d[2], b[3], d[3]}
//
// Second stage, interleave the temporaries:
//
//	a = {a[0], b[0], c[0], d[0]}
//	b = {a[1], b[1], c[1], d[1]}
//	c = {a[2], b[2], c[2], d[2]}
//	d = {a[3], b[3], c[3], d[3]}
//
// where the right-hand sides refer to the register values on entry.
#define SHUFFLE(a, b, c, d, t0, t1, t2, t3) \
	VMRHF a, c, t0   \
	VMRHF b, d, t1   \
	VMRLF a, c, t2   \
	VMRLF b, d, t3   \
	VMRHF t0, t1, a  \
	VMRLF t0, t1, b  \
	VMRHF t2, t3, c  \
	VMRLF t2, t3, d
137
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
// dst. Each pass of the outer loop generates four 64-byte keystream blocks
// (256 bytes) and then advances both pointers by 256.
//
// Preconditions that follow from the code below:
//   - len(src) must be a non-zero multiple of 256: the remaining length is
//     reduced by 256 per pass and the loop exits only when it is exactly 0.
//   - len(dst) is never read; dst is assumed to have room for len(src)
//     bytes (to be guaranteed by the caller).
//
// On return *counter holds the counter value for the next unused block,
// i.e. it has been advanced by 4 for every pass of the outer loop.
//
// General register use:
//   R0  length operand for VLL
//   R1  address of ·constants<>, later the round-loop counter
//   R2  dst pointer
//   R3  src pointer
//   R4  bytes remaining
//   R5  key pointer
//   R6  nonce pointer
//   R7  counter pointer
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1 // R1=&constants
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup: load the key, then load the nonce and shift it right by
	// 4 bytes so that the three nonce words sit in elements 1-3 and
	// element 0 is zero. Element 0 is the counter position, which is
	// handled separately through CTR.
	// NOTE(review): 95 equals 12*8-1; confirm against the VLL definition
	// whether the length operand is meant as a byte index. Whatever lands
	// in the last element is shifted out by VSRLB.
	MOVD  $95, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE
	VZERO M0
	VLEIB $7, $32, M0          // shift amount for VSRLB: 32 bits
	VSRLB M0, NONCE, NONCE

	// initialize counter values:
	// CTR = {c, c+1, c+2, c+3}, then INC = {4, 4, 4, 4}
	VLREPF (R7), CTR
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC
	VAF    INC, CTR, CTR
	VREPIF $4, INC

chacha:
	// Build the initial state. Xn holds state word n replicated across
	// all four blocks; only X12 (the counter) differs between lanes.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	MOVD $(NUM_ROUNDS/2), R1

loop:
	// one column round followed by one diagonal round
	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors: transpose each group of four so that every
	// register holds four consecutive words of a single block, then add
	// the corresponding words of the initial state. The per-lane counter
	// is added to X12 before the transpose because it differs per block;
	// NONCE then contributes zero to the counter position.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext, one 64-byte block per XORV
	XORV(0*64, R2, R3, X0, X4,  X8, X12)
	XORV(1*64, R2, R3, X1, X5,  X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	// continue until the remaining length is exactly zero
	CMPBNE  R4, $0, chacha

	// write back the next unused counter value (element 0 of CTR)
	VSTEF $0, CTR, (R7)
	RET
225