1 /* $OpenBSD: chacha-merged.c,v 1.10 2021/10/22 17:43:00 tb Exp $ */
2 /*
3 chacha-merged.c version 20080118
4 D. J. Bernstein
5 Public domain.
6 */
7 
#include <stddef.h>
#include <stdint.h>
9 
#define CHACHA_MINKEYLEN	16	/* bytes: 128-bit minimum key */
#define CHACHA_NONCELEN		8	/* bytes: 64-bit nonce */
#define CHACHA_CTRLEN		8	/* bytes: 64-bit block counter */
#define CHACHA_STATELEN		(CHACHA_NONCELEN+CHACHA_CTRLEN)
#define CHACHA_BLOCKLEN		64	/* bytes of keystream per block */

typedef uint8_t u8;
typedef uint32_t u32;

/* Cipher state plus saved keystream for byte-granular encryption. */
struct chacha_ctx {
	u32 input[16];		/* state: words 0-3 constants, 4-11 key, 12-13 counter, 14-15 nonce */
	u8 ks[CHACHA_BLOCKLEN];	/* keystream of the last generated (partial) block */
	u8 unused;		/* number of trailing ks bytes not consumed by the message */
};
24 
/*
 * Prototypes with OpenBSD-local __bounded__ attributes: they tell the
 * compiler the minimum byte size (__minbytes__) or the length-argument
 * index (__buffer__) of each pointer parameter so undersized callers
 * can be diagnosed at compile time.
 */
static inline void chacha_keysetup(struct chacha_ctx *x, const u8 *k, u32 kbits)
    __attribute__((__bounded__(__minbytes__, 2, CHACHA_MINKEYLEN)));
static inline void chacha_ivsetup(struct chacha_ctx *x, const u8 *iv,
    const u8 *ctr)
    __attribute__((__bounded__(__minbytes__, 2, CHACHA_NONCELEN)))
    __attribute__((__bounded__(__minbytes__, 3, CHACHA_CTRLEN)));
static inline void chacha_encrypt_bytes(struct chacha_ctx *x, const u8 *m,
    u8 *c, u32 bytes)
    __attribute__((__bounded__(__buffer__, 2, 4)))
    __attribute__((__bounded__(__buffer__, 3, 4)));

typedef struct chacha_ctx chacha_ctx;
37 
/* Unsigned-constant suffixing helpers. */
#define U8C(v) (v##U)
#define U32C(v) (v##U)

/* Truncate a value to 8 / 32 bits (keeps arithmetic well-defined if
 * the native int is wider). */
#define U8V(v) ((u8)(v) & U8C(0xFF))
#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))

/* 32-bit rotate left by n; n must be in 1..31 (n == 0 or 32 would
 * shift by the full width). */
#define ROTL32(v, n) \
  (U32V((v) << (n)) | ((v) >> (32 - (n))))

/* Little-endian 32-bit load, assembled byte by byte so it is safe for
 * unaligned pointers and independent of host endianness. */
#define U8TO32_LITTLE(p) \
  (((u32)((p)[0])) | \
   ((u32)((p)[1]) <<  8) | \
   ((u32)((p)[2]) << 16) | \
   ((u32)((p)[3]) << 24))

/* Little-endian 32-bit store, byte by byte (alignment-safe). */
#define U32TO8_LITTLE(p, v) \
  do { \
    (p)[0] = U8V((v)); \
    (p)[1] = U8V((v) >>  8); \
    (p)[2] = U8V((v) >> 16); \
    (p)[3] = U8V((v) >> 24); \
  } while (0)

#define ROTATE(v,c) (ROTL32(v,c))
#define XOR(v,w) ((v) ^ (w))
#define PLUS(v,w) (U32V((v) + (w)))
#define PLUSONE(v) (PLUS((v),1))

/*
 * ChaCha quarter round: mixes the four state words a,b,c,d in place
 * with the 16/12/8/7 rotation schedule.
 * NOTE: expands to four unbraced statements and evaluates each argument
 * several times -- call it only with plain lvalue arguments and never
 * as the lone body of an if/else.
 */
#define QUARTERROUND(a,b,c,d) \
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
71 
/* 16-byte constant "expand 32-byte k" used with 256-bit keys; the
 * string literal exactly fills the array, so no NUL is stored. */
static const char sigma[16] = "expand 32-byte k";
77 
/* 16-byte constant "expand 16-byte k" used with 128-bit keys; the
 * string literal exactly fills the array, so no NUL is stored. */
static const char tau[16] = "expand 16-byte k";
83 
84 static inline void
85 chacha_keysetup(chacha_ctx *x, const u8 *k, u32 kbits)
86 {
87 	const char *constants;
88 
89 	x->input[4] = U8TO32_LITTLE(k + 0);
90 	x->input[5] = U8TO32_LITTLE(k + 4);
91 	x->input[6] = U8TO32_LITTLE(k + 8);
92 	x->input[7] = U8TO32_LITTLE(k + 12);
93 	if (kbits == 256) { /* recommended */
94 		k += 16;
95 		constants = sigma;
96 	} else { /* kbits == 128 */
97 		constants = tau;
98 	}
99 	x->input[8] = U8TO32_LITTLE(k + 0);
100 	x->input[9] = U8TO32_LITTLE(k + 4);
101 	x->input[10] = U8TO32_LITTLE(k + 8);
102 	x->input[11] = U8TO32_LITTLE(k + 12);
103 	x->input[0] = U8TO32_LITTLE(constants + 0);
104 	x->input[1] = U8TO32_LITTLE(constants + 4);
105 	x->input[2] = U8TO32_LITTLE(constants + 8);
106 	x->input[3] = U8TO32_LITTLE(constants + 12);
107 }
108 
109 static inline void
110 chacha_ivsetup(chacha_ctx *x, const u8 *iv, const u8 *counter)
111 {
112 	x->input[12] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 0);
113 	x->input[13] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 4);
114 	x->input[14] = U8TO32_LITTLE(iv + 0);
115 	x->input[15] = U8TO32_LITTLE(iv + 4);
116 }
117 
/*
 * XOR `bytes` bytes of m with keystream into c (encryption and
 * decryption are the same operation).  Processes whole 64-byte blocks,
 * advancing the 64-bit block counter in input[12..13] once per block.
 * A trailing partial block is staged through tmp; its complete
 * keystream block is saved in x->ks and x->unused records how many
 * keystream bytes were left unconsumed.
 */
static inline void
chacha_encrypt_bytes(chacha_ctx *x, const u8 *m, u8 *c, u32 bytes)
{
	u32 x0, x1, x2, x3, x4, x5, x6, x7;
	u32 x8, x9, x10, x11, x12, x13, x14, x15;
	u32 j0, j1, j2, j3, j4, j5, j6, j7;
	u32 j8, j9, j10, j11, j12, j13, j14, j15;
	u8 *ctarget = NULL;	/* real output pointer while c is redirected to tmp */
	u8 tmp[64];		/* staging buffer for a final partial block */
	u32 i;

	if (!bytes)
		return;

	/* Copy the 16-word state into locals for the block loop. */
	j0 = x->input[0];
	j1 = x->input[1];
	j2 = x->input[2];
	j3 = x->input[3];
	j4 = x->input[4];
	j5 = x->input[5];
	j6 = x->input[6];
	j7 = x->input[7];
	j8 = x->input[8];
	j9 = x->input[9];
	j10 = x->input[10];
	j11 = x->input[11];
	j12 = x->input[12];
	j13 = x->input[13];
	j14 = x->input[14];
	j15 = x->input[15];

	for (;;) {
		if (bytes < 64) {
			/*
			 * Partial block: copy the remaining input into tmp
			 * and write the result there too; the real
			 * destination is restored from ctarget below.
			 */
			for (i = 0; i < bytes; ++i)
				tmp[i] = m[i];
			m = tmp;
			ctarget = c;
			c = tmp;
		}
		/* Working copy of the state for this block. */
		x0 = j0;
		x1 = j1;
		x2 = j2;
		x3 = j3;
		x4 = j4;
		x5 = j5;
		x6 = j6;
		x7 = j7;
		x8 = j8;
		x9 = j9;
		x10 = j10;
		x11 = j11;
		x12 = j12;
		x13 = j13;
		x14 = j14;
		x15 = j15;
		/* 20 rounds: 10 double rounds of column then diagonal mixing. */
		for (i = 20; i > 0; i -= 2) {
			QUARTERROUND(x0, x4, x8, x12)
			QUARTERROUND(x1, x5, x9, x13)
			QUARTERROUND(x2, x6, x10, x14)
			QUARTERROUND(x3, x7, x11, x15)
			QUARTERROUND(x0, x5, x10, x15)
			QUARTERROUND(x1, x6, x11, x12)
			QUARTERROUND(x2, x7, x8, x13)
			QUARTERROUND(x3, x4, x9, x14)
		}
		/* Add the original state to the round output. */
		x0 = PLUS(x0, j0);
		x1 = PLUS(x1, j1);
		x2 = PLUS(x2, j2);
		x3 = PLUS(x3, j3);
		x4 = PLUS(x4, j4);
		x5 = PLUS(x5, j5);
		x6 = PLUS(x6, j6);
		x7 = PLUS(x7, j7);
		x8 = PLUS(x8, j8);
		x9 = PLUS(x9, j9);
		x10 = PLUS(x10, j10);
		x11 = PLUS(x11, j11);
		x12 = PLUS(x12, j12);
		x13 = PLUS(x13, j13);
		x14 = PLUS(x14, j14);
		x15 = PLUS(x15, j15);

		/* Final partial block: save the whole keystream block so a
		 * caller can consume the leftover x->unused bytes later. */
		if (bytes < 64) {
			U32TO8_LITTLE(x->ks + 0, x0);
			U32TO8_LITTLE(x->ks + 4, x1);
			U32TO8_LITTLE(x->ks + 8, x2);
			U32TO8_LITTLE(x->ks + 12, x3);
			U32TO8_LITTLE(x->ks + 16, x4);
			U32TO8_LITTLE(x->ks + 20, x5);
			U32TO8_LITTLE(x->ks + 24, x6);
			U32TO8_LITTLE(x->ks + 28, x7);
			U32TO8_LITTLE(x->ks + 32, x8);
			U32TO8_LITTLE(x->ks + 36, x9);
			U32TO8_LITTLE(x->ks + 40, x10);
			U32TO8_LITTLE(x->ks + 44, x11);
			U32TO8_LITTLE(x->ks + 48, x12);
			U32TO8_LITTLE(x->ks + 52, x13);
			U32TO8_LITTLE(x->ks + 56, x14);
			U32TO8_LITTLE(x->ks + 60, x15);
		}

		/* XOR the keystream words with the message words. */
		x0 = XOR(x0, U8TO32_LITTLE(m + 0));
		x1 = XOR(x1, U8TO32_LITTLE(m + 4));
		x2 = XOR(x2, U8TO32_LITTLE(m + 8));
		x3 = XOR(x3, U8TO32_LITTLE(m + 12));
		x4 = XOR(x4, U8TO32_LITTLE(m + 16));
		x5 = XOR(x5, U8TO32_LITTLE(m + 20));
		x6 = XOR(x6, U8TO32_LITTLE(m + 24));
		x7 = XOR(x7, U8TO32_LITTLE(m + 28));
		x8 = XOR(x8, U8TO32_LITTLE(m + 32));
		x9 = XOR(x9, U8TO32_LITTLE(m + 36));
		x10 = XOR(x10, U8TO32_LITTLE(m + 40));
		x11 = XOR(x11, U8TO32_LITTLE(m + 44));
		x12 = XOR(x12, U8TO32_LITTLE(m + 48));
		x13 = XOR(x13, U8TO32_LITTLE(m + 52));
		x14 = XOR(x14, U8TO32_LITTLE(m + 56));
		x15 = XOR(x15, U8TO32_LITTLE(m + 60));

		/* Advance the 64-bit block counter, with carry. */
		j12 = PLUSONE(j12);
		if (!j12) {
			j13 = PLUSONE(j13);
			/*
			 * Stopping at 2^70 bytes per nonce is the user's
			 * responsibility.
			 */
		}

		/* Store this block's output (into tmp for a partial block). */
		U32TO8_LITTLE(c + 0, x0);
		U32TO8_LITTLE(c + 4, x1);
		U32TO8_LITTLE(c + 8, x2);
		U32TO8_LITTLE(c + 12, x3);
		U32TO8_LITTLE(c + 16, x4);
		U32TO8_LITTLE(c + 20, x5);
		U32TO8_LITTLE(c + 24, x6);
		U32TO8_LITTLE(c + 28, x7);
		U32TO8_LITTLE(c + 32, x8);
		U32TO8_LITTLE(c + 36, x9);
		U32TO8_LITTLE(c + 40, x10);
		U32TO8_LITTLE(c + 44, x11);
		U32TO8_LITTLE(c + 48, x12);
		U32TO8_LITTLE(c + 52, x13);
		U32TO8_LITTLE(c + 56, x14);
		U32TO8_LITTLE(c + 60, x15);

		if (bytes <= 64) {
			if (bytes < 64) {
				/* Copy the partial result from tmp to the
				 * real destination. */
				for (i = 0; i < bytes; ++i)
					ctarget[i] = c[i];
			}
			/* Persist the counter and leftover-keystream count. */
			x->input[12] = j12;
			x->input[13] = j13;
			x->unused = 64 - bytes;
			return;
		}
		bytes -= 64;
		c += 64;
		m += 64;
	}
}
277 
278 void
279 CRYPTO_hchacha_20(unsigned char subkey[32], const unsigned char key[32],
280     const unsigned char nonce[16])
281 {
282 	uint32_t x[16];
283 	int i;
284 
285 	x[0] = U8TO32_LITTLE(sigma + 0);
286 	x[1] = U8TO32_LITTLE(sigma + 4);
287 	x[2] = U8TO32_LITTLE(sigma + 8);
288 	x[3] = U8TO32_LITTLE(sigma + 12);
289 	x[4] = U8TO32_LITTLE(key + 0);
290 	x[5] = U8TO32_LITTLE(key + 4);
291 	x[6] = U8TO32_LITTLE(key + 8);
292 	x[7] = U8TO32_LITTLE(key + 12);
293 	x[8] = U8TO32_LITTLE(key + 16);
294 	x[9] = U8TO32_LITTLE(key + 20);
295 	x[10] = U8TO32_LITTLE(key + 24);
296 	x[11] = U8TO32_LITTLE(key + 28);
297 	x[12] = U8TO32_LITTLE(nonce + 0);
298 	x[13] = U8TO32_LITTLE(nonce + 4);
299 	x[14] = U8TO32_LITTLE(nonce + 8);
300 	x[15] = U8TO32_LITTLE(nonce + 12);
301 
302 	for (i = 20; i > 0; i -= 2) {
303 		QUARTERROUND(x[0], x[4], x[8], x[12])
304 		QUARTERROUND(x[1], x[5], x[9], x[13])
305 		QUARTERROUND(x[2], x[6], x[10], x[14])
306 		QUARTERROUND(x[3], x[7], x[11], x[15])
307 		QUARTERROUND(x[0], x[5], x[10], x[15])
308 		QUARTERROUND(x[1], x[6], x[11], x[12])
309 		QUARTERROUND(x[2], x[7], x[8], x[13])
310 		QUARTERROUND(x[3], x[4], x[9], x[14])
311 	}
312 
313 	U32TO8_LITTLE(subkey + 0, x[0]);
314 	U32TO8_LITTLE(subkey + 4, x[1]);
315 	U32TO8_LITTLE(subkey + 8, x[2]);
316 	U32TO8_LITTLE(subkey + 12, x[3]);
317 
318 	U32TO8_LITTLE(subkey + 16, x[12]);
319 	U32TO8_LITTLE(subkey + 20, x[13]);
320 	U32TO8_LITTLE(subkey + 24, x[14]);
321 	U32TO8_LITTLE(subkey + 28, x[15]);
322 }
323