1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build gc && !purego
6// +build gc,!purego
7
8#include "go_asm.h"
9#include "textflag.h"
10
11// This is an implementation of the ChaCha20 encryption algorithm as
12// specified in RFC 7539. It uses vector instructions to compute
13// 4 keystream blocks in parallel (256 bytes) which are then XORed
14// with the bytes in the input slice.
15
// constants<> is a 32-byte read-only table. xorKeyStreamVX loads its two
// 16-byte halves into the BSWAP and J0 vector registers with a single VLM.
GLOBL ·constants<>(SB), RODATA|NOPTR, $32
// BSWAP: swap bytes in each 4-byte element
// (used as the permutation mask operand of VPERM in the PERMUTE macro)
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
// J0: [j0, j1, j2, j3]
// (the four ChaCha constant words, ASCII "expand 32-byte k" read as
// little-endian 32-bit values; they fill state words 0-3)
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574
27
// Vector register aliases used by xorKeyStreamVX and the macros in this file.

// Loaded from constants<>: byte-swap permutation mask and constant words.
#define BSWAP V5
#define J0    V6
// Key words 0-3 and 4-7.
#define KEY0  V7
#define KEY1  V8
// Nonce, shifted so that element 0 is zero and elements 1-3 hold the nonce.
#define NONCE V9
// Block counters for the four blocks computed in parallel (one per element).
#define CTR   V10
// Scratch registers: message data in XORV, temporaries in SHUFFLE.
#define M0    V11
#define M1    V12
#define M2    V13
#define M3    V14
// Counter increment applied to CTR after each group of four blocks.
#define INC   V15
// Working state. During the rounds Xn holds state word n of all four blocks;
// after SHUFFLE each register holds four consecutive words of one block.
#define X0    V16
#define X1    V17
#define X2    V18
#define X3    V19
#define X4    V20
#define X5    V21
#define X6    V22
#define X7    V23
#define X8    V24
#define X9    V25
#define X10   V26
#define X11   V27
#define X12   V28
#define X13   V29
#define X14   V30
#define X15   V31
55
// Total number of ChaCha rounds. The round loop in xorKeyStreamVX runs
// NUM_ROUNDS/2 iterations, each invoking ROUND4 twice.
#define NUM_ROUNDS 20
57
// ROUND4 applies the same quarter-round sequence to four independent register
// groups (a*, b*, c*, d*), with the instructions of the four groups
// interleaved. For each group (shown for a), on 32-bit elements:
//
//	a0 += a1; a2 ^= a0; a2 <<<= 16
//	a3 += a2; a1 ^= a3; a1 <<<= 12
//	a0 += a1; a2 ^= a0; a2 <<<= 8
//	a3 += a2; a1 ^= a3; a1 <<<= 7
//
// where <<<= is a left rotation. All sixteen argument registers are updated
// in place; no other registers are written.
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $16, a2, a2 \
	VERLLF $16, b2, b2 \
	VERLLF $16, c2, c2 \
	VERLLF $16, d2, d2 \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $12, a1, a1 \
	VERLLF $12, b1, b1 \
	VERLLF $12, c1, c1 \
	VERLLF $12, d1, d1 \
	VAF    a1, a0, a0  \
	VAF    b1, b0, b0  \
	VAF    c1, c0, c0  \
	VAF    d1, d0, d0  \
	VX     a0, a2, a2  \
	VX     b0, b2, b2  \
	VX     c0, c2, c2  \
	VX     d0, d2, d2  \
	VERLLF $8, a2, a2  \
	VERLLF $8, b2, b2  \
	VERLLF $8, c2, c2  \
	VERLLF $8, d2, d2  \
	VAF    a2, a3, a3  \
	VAF    b2, b3, b3  \
	VAF    c2, c3, c3  \
	VAF    d2, d3, d3  \
	VX     a3, a1, a1  \
	VX     b3, b1, b1  \
	VX     c3, c1, c1  \
	VX     d3, d1, d1  \
	VERLLF $7, a1, a1  \
	VERLLF $7, b1, b1  \
	VERLLF $7, c1, c1  \
	VERLLF $7, d1, d1
107
// PERMUTE rearranges the bytes of each of q0..q3 in place, using sel as the
// VPERM selection vector. Both VPERM source operands are the register itself.
#define PERMUTE(sel, q0, q1, q2, q3) \
	VPERM q0, q0, sel, q0 \
	VPERM q1, q1, sel, q1 \
	VPERM q2, q2, sel, q2 \
	VPERM q3, q3, sel, q3
113
// ADDV adds the vector addend, as four 32-bit elements, into each of
// s0..s3 in place. addend itself is not modified.
#define ADDV(addend, s0, s1, s2, s3) \
	VAF addend, s0, s0 \
	VAF addend, s1, s1 \
	VAF addend, s2, s2 \
	VAF addend, s3, s3
119
// XORV processes one 64-byte block: it loads 64 bytes from disp(from) into
// M0-M3, byte-swaps each 4-byte element of the keystream vectors k0..k3
// (in place, via PERMUTE with BSWAP), XORs keystream into the loaded data
// and stores the 64-byte result to disp(to).
// Clobbers M0-M3 and k0..k3.
#define XORV(disp, to, from, k0, k1, k2, k3) \
	VLM  disp(from), M0, M3        \
	PERMUTE(BSWAP, k0, k1, k2, k3) \
	VX   k0, M0, M0                \
	VX   k1, M1, M1                \
	VX   k2, M2, M2                \
	VX   k3, M3, M3                \
	VSTM M0, M3, disp(to)
128
// SHUFFLE transposes, in place, the 4x4 matrix of 32-bit elements whose rows
// are row0..row3. tmp0..tmp3 are scratch registers and are overwritten.
// Comments follow the line continuation so they are not part of the macro.
#define SHUFFLE(row0, row1, row2, row3, tmp0, tmp1, tmp2, tmp3) \
	VMRHF row0, row2, tmp0 \ // tmp0 = {row0[0], row2[0], row0[1], row2[1]}
	VMRHF row1, row3, tmp1 \ // tmp1 = {row1[0], row3[0], row1[1], row3[1]}
	VMRLF row0, row2, tmp2 \ // tmp2 = {row0[2], row2[2], row0[3], row2[3]}
	VMRLF row1, row3, tmp3 \ // tmp3 = {row1[2], row3[2], row1[3], row3[3]}
	VMRHF tmp0, tmp1, row0 \ // row0 = column 0 of the original matrix
	VMRLF tmp0, tmp1, row1 \ // row1 = column 1 of the original matrix
	VMRHF tmp2, tmp3, row2 \ // row2 = column 2 of the original matrix
	VMRLF tmp2, tmp3, row3 // row3 = column 3 of the original matrix
138
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
// dst. Each pass of the outer loop generates four 64-byte keystream blocks in
// parallel and consumes 256 bytes of src. On return, *counter has been
// advanced by four for every 256 bytes processed.
//
// Preconditions (not checked here):
//   - len(src) must be a positive multiple of 256: the outer loop processes
//     256 bytes before testing the remaining length and exits only when that
//     length is exactly zero.
//   - Only the data pointer of dst is read; dst must have room for len(src)
//     bytes.
//
// Register usage:
//   R0 = length operand for VLL
//   R1 = address of constants<>, later reused as the round-loop counter
//   R2 = dst pointer, R3 = src pointer, R4 = bytes remaining
//   R5 = key, R6 = nonce, R7 = counter
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1 // R1=&constants (base for the VLM below)
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup: load the key, then load the nonce and shift it right so that
	// NONCE = [0, nonce[0], nonce[1], nonce[2]]. Element 0 must be zero
	// because NONCE is later added to the register that carries the counter.
	// NOTE(review): VLL takes the highest byte index to load; 95 exceeds 15,
	// so this presumably loads a full 16 bytes from a 12-byte nonce (the
	// extra 4 bytes are shifted out below) - confirm against the ISA manual.
	MOVD  $95, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE
	VZERO M0
	VLEIB $7, $32, M0          // shift amount for VSRLB: 32 bits = 4 bytes
	VSRLB M0, NONCE, NONCE

	// initialize counter values:
	// CTR = [c, c+1, c+2, c+3] (one counter per parallel block),
	// INC = [4, 4, 4, 4] (step applied after each group of four blocks).
	VLREPF (R7), CTR
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC
	VAF    INC, CTR, CTR
	VREPIF $4, INC

chacha:
	// Build the initial state in "sliced" form: Xn holds state word n for
	// each of the four blocks. Only X12 (the counter) differs per block.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	// R1 = number of double rounds (the constants address is no longer needed)
	MOVD $(NUM_ROUNDS/2), R1

loop:
	// column round followed by diagonal round
	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors: transpose each group of four registers so that
	// every register holds four consecutive words of a single block, then
	// add the corresponding words of the initial state. The per-block
	// counters are added to X12 before the transpose, while X12 still has
	// one element per block.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext (one 64-byte block per XORV)
	XORV(0*64, R2, R3, X0, X4,  X8, X12)
	XORV(1*64, R2, R3, X1, X5,  X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	// continue until no bytes remain
	CMPBNE  R4, $0, chacha

	// write the next unused block counter (element 0 of CTR) back to *counter
	VSTEF $0, CTR, (R7)
	RET
226