1 /*	$NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 2014 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Taylor R. Campbell.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $");
34 
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/bitops.h>
38 #include <sys/cprng.h>
39 #include <sys/cpu.h>
40 #include <sys/intr.h>
41 #include <sys/percpu.h>
42 #include <sys/rnd.h>		/* rnd_initial_entropy */
43 
44 /* ChaCha core */
45 
46 #define	crypto_core_OUTPUTWORDS	16
47 #define	crypto_core_INPUTWORDS	4
48 #define	crypto_core_KEYWORDS	8
49 #define	crypto_core_CONSTWORDS	4
50 
51 #define	crypto_core_ROUNDS	8
52 
53 static uint32_t
rotate(uint32_t u,unsigned c)54 rotate(uint32_t u, unsigned c)
55 {
56 
57 	return (u << c) | (u >> (32 - c));
58 }
59 
60 #define	QUARTERROUND(a, b, c, d) do {					      \
61 	(a) += (b); (d) ^= (a); (d) = rotate((d), 16);			      \
62 	(c) += (d); (b) ^= (c); (b) = rotate((b), 12);			      \
63 	(a) += (b); (d) ^= (a); (d) = rotate((d),  8);			      \
64 	(c) += (d); (b) ^= (c); (b) = rotate((b),  7);			      \
65 } while (0)
66 
67 static void
crypto_core(uint32_t * out,const uint32_t * in,const uint32_t * k,const uint32_t * c)68 crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k,
69     const uint32_t *c)
70 {
71 	uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
72 	int i;
73 
74 	x0 = c[0];
75 	x1 = c[1];
76 	x2 = c[2];
77 	x3 = c[3];
78 	x4 = k[0];
79 	x5 = k[1];
80 	x6 = k[2];
81 	x7 = k[3];
82 	x8 = k[4];
83 	x9 = k[5];
84 	x10 = k[6];
85 	x11 = k[7];
86 	x12 = in[0];
87 	x13 = in[1];
88 	x14 = in[2];
89 	x15 = in[3];
90 
91 	for (i = crypto_core_ROUNDS; i > 0; i -= 2) {
92 		QUARTERROUND( x0, x4, x8,x12);
93 		QUARTERROUND( x1, x5, x9,x13);
94 		QUARTERROUND( x2, x6,x10,x14);
95 		QUARTERROUND( x3, x7,x11,x15);
96 		QUARTERROUND( x0, x5,x10,x15);
97 		QUARTERROUND( x1, x6,x11,x12);
98 		QUARTERROUND( x2, x7, x8,x13);
99 		QUARTERROUND( x3, x4, x9,x14);
100 	}
101 
102 	out[0] = x0 + c[0];
103 	out[1] = x1 + c[1];
104 	out[2] = x2 + c[2];
105 	out[3] = x3 + c[3];
106 	out[4] = x4 + k[0];
107 	out[5] = x5 + k[1];
108 	out[6] = x6 + k[2];
109 	out[7] = x7 + k[3];
110 	out[8] = x8 + k[4];
111 	out[9] = x9 + k[5];
112 	out[10] = x10 + k[6];
113 	out[11] = x11 + k[7];
114 	out[12] = x12 + in[0];
115 	out[13] = x13 + in[1];
116 	out[14] = x14 + in[2];
117 	out[15] = x15 + in[3];
118 }
119 
120 /* `expand 32-byte k' */
121 static const uint32_t crypto_core_constant32[4] = {
122 	0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U,
123 };
124 
125 /*
126  * Test vector for ChaCha20 from
127  * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>,
128  * test vectors for ChaCha12 and ChaCha8 generated by the same
129  * crypto_core code with crypto_core_ROUNDS varied.
130  */
131 
132 #define	check(E)	do						\
133 {									\
134 	if (!(E))							\
135 		panic("crypto self-test failed: %s", #E);		\
136 } while (0)
137 
138 static void
crypto_core_selftest(void)139 crypto_core_selftest(void)
140 {
141 	const uint32_t zero32[8] = {0};
142 	const uint8_t sigma[] = "expand 32-byte k";
143 	uint32_t block[16];
144 	unsigned i;
145 
146 #if crypto_core_ROUNDS == 8
147 	static const uint8_t out[64] = {
148 		0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6,
149 		0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1,
150 		0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b,
151 		0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e,
152 		0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41,
153 		0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19,
154 		0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01,
155 		0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42,
156 	};
157 #elif crypto_core_ROUNDS == 12
158 	static const uint8_t out[64] = {
159 		0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53,
160 		0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5,
161 		0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14,
162 		0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f,
163 		0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0,
164 		0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79,
165 		0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19,
166 		0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe,
167 	};
168 #elif crypto_core_ROUNDS == 20
169 	static const uint8_t out[64] = {
170 		0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90,
171 		0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28,
172 		0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a,
173 		0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7,
174 		0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d,
175 		0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37,
176 		0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c,
177 		0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86,
178 	};
179 #else
180 #error crypto_core_ROUNDS must be 8, 12, or 20.
181 #endif
182 
183 	check(crypto_core_constant32[0] == le32dec(&sigma[0]));
184 	check(crypto_core_constant32[1] == le32dec(&sigma[4]));
185 	check(crypto_core_constant32[2] == le32dec(&sigma[8]));
186 	check(crypto_core_constant32[3] == le32dec(&sigma[12]));
187 
188 	crypto_core(block, zero32, zero32, crypto_core_constant32);
189 	for (i = 0; i < 16; i++)
190 		check(block[i] == le32dec(&out[i*4]));
191 }
192 
193 #undef check
194 
195 #define	CPRNG_FAST_SEED_BYTES	(crypto_core_KEYWORDS * sizeof(uint32_t))
196 
197 struct cprng_fast {
198 	uint32_t 	buffer[crypto_core_OUTPUTWORDS];
199 	uint32_t 	key[crypto_core_KEYWORDS];
200 	uint32_t 	nonce[crypto_core_INPUTWORDS];
201 	bool		have_initial;
202 };
203 
204 __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES);
205 
206 static void	cprng_fast_init_cpu(void *, void *, struct cpu_info *);
207 static void	cprng_fast_schedule_reseed(struct cprng_fast *);
208 static void	cprng_fast_intr(void *);
209 
210 static void	cprng_fast_seed(struct cprng_fast *, const void *);
211 static void	cprng_fast_buf(struct cprng_fast *, void *, unsigned);
212 
213 static void	cprng_fast_buf_short(void *, size_t);
214 static void	cprng_fast_buf_long(void *, size_t);
215 
216 static percpu_t	*cprng_fast_percpu	__read_mostly;
217 static void	*cprng_fast_softint	__read_mostly;
218 
219 void
cprng_fast_init(void)220 cprng_fast_init(void)
221 {
222 
223 	crypto_core_selftest();
224 	cprng_fast_percpu = percpu_alloc(sizeof(struct cprng_fast));
225 	percpu_foreach(cprng_fast_percpu, &cprng_fast_init_cpu, NULL);
226 	cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
227 	    &cprng_fast_intr, NULL);
228 }
229 
230 static void
cprng_fast_init_cpu(void * p,void * arg __unused,struct cpu_info * ci __unused)231 cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused)
232 {
233 	struct cprng_fast *const cprng = p;
234 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
235 
236 	cprng_strong(kern_cprng, seed, sizeof seed, 0);
237 	cprng_fast_seed(cprng, seed);
238 	cprng->have_initial = rnd_initial_entropy;
239 	(void)explicit_memset(seed, 0, sizeof seed);
240 }
241 
242 static inline int
cprng_fast_get(struct cprng_fast ** cprngp)243 cprng_fast_get(struct cprng_fast **cprngp)
244 {
245 	struct cprng_fast *cprng;
246 	int s;
247 
248 	*cprngp = cprng = percpu_getref(cprng_fast_percpu);
249 	s = splvm();
250 
251 	if (__predict_false(!cprng->have_initial))
252 		cprng_fast_schedule_reseed(cprng);
253 
254 	return s;
255 }
256 
257 static inline void
cprng_fast_put(struct cprng_fast * cprng,int s)258 cprng_fast_put(struct cprng_fast *cprng, int s)
259 {
260 
261 	KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
262 	    (percpu_putref(cprng_fast_percpu), true));
263 	splx(s);
264 	percpu_putref(cprng_fast_percpu);
265 }
266 
267 static void
cprng_fast_schedule_reseed(struct cprng_fast * cprng __unused)268 cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused)
269 {
270 
271 	softint_schedule(cprng_fast_softint);
272 }
273 
274 static void
cprng_fast_intr(void * cookie __unused)275 cprng_fast_intr(void *cookie __unused)
276 {
277 	struct cprng_fast *cprng;
278 	uint8_t seed[CPRNG_FAST_SEED_BYTES];
279 	int s;
280 
281 	cprng_strong(kern_cprng, seed, sizeof(seed), 0);
282 
283 	cprng = percpu_getref(cprng_fast_percpu);
284 	s = splvm();
285 	cprng_fast_seed(cprng, seed);
286 	cprng->have_initial = rnd_initial_entropy;
287 	splx(s);
288 	percpu_putref(cprng_fast_percpu);
289 
290 	explicit_memset(seed, 0, sizeof(seed));
291 }
292 
293 /* CPRNG algorithm */
294 
295 /*
296  * The state consists of a key, the current nonce, and a 64-byte buffer
297  * of output.  Since we fill the buffer only when we need output, and
298  * eat a 32-bit word at a time, one 32-bit word of the buffer would be
299  * wasted.  Instead, we repurpose it to count the number of entries in
300  * the buffer remaining, counting from high to low in order to allow
301  * comparison to zero to detect when we need to refill it.
302  */
303 #define	CPRNG_FAST_BUFIDX	(crypto_core_OUTPUTWORDS - 1)
304 
305 static void
cprng_fast_seed(struct cprng_fast * cprng,const void * seed)306 cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
307 {
308 
309 	(void)memset(cprng->buffer, 0, sizeof cprng->buffer);
310 	(void)memcpy(cprng->key, seed, sizeof cprng->key);
311 	(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
312 }
313 
314 static inline uint32_t
cprng_fast_word(struct cprng_fast * cprng)315 cprng_fast_word(struct cprng_fast *cprng)
316 {
317 	uint32_t v;
318 
319 	if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) {
320 		v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]];
321 	} else {
322 		/* If we don't have enough words, refill the buffer.  */
323 		crypto_core(cprng->buffer, cprng->nonce, cprng->key,
324 		    crypto_core_constant32);
325 		if (__predict_false(++cprng->nonce[0] == 0)) {
326 			cprng->nonce[1]++;
327 			cprng_fast_schedule_reseed(cprng);
328 		}
329 		v = cprng->buffer[CPRNG_FAST_BUFIDX];
330 		cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX;
331 	}
332 
333 	return v;
334 }
335 
336 static inline void
cprng_fast_buf(struct cprng_fast * cprng,void * buf,unsigned n)337 cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n)
338 {
339 	uint8_t *p = buf;
340 	uint32_t v;
341 	unsigned w, r;
342 
343 	w = n / sizeof(uint32_t);
344 	while (w--) {
345 		v = cprng_fast_word(cprng);
346 		(void)memcpy(p, &v, 4);
347 		p += 4;
348 	}
349 
350 	r = n % sizeof(uint32_t);
351 	if (r) {
352 		v = cprng_fast_word(cprng);
353 		while (r--) {
354 			*p++ = (v & 0xff);
355 			v >>= 8;
356 		}
357 	}
358 }
359 
360 /*
361  * crypto_onetimestream: Expand a short unpredictable one-time seed
362  * into a long unpredictable output.
363  */
364 static void
crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS],void * buf,size_t n)365 crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf,
366     size_t n)
367 {
368 	uint32_t block[crypto_core_OUTPUTWORDS];
369 	uint32_t nonce[crypto_core_INPUTWORDS] = {0};
370 	uint8_t *p8;
371 	uint32_t *p32;
372 	size_t ni, nb, nf;
373 
374 	/*
375 	 * Guarantee we can generate up to n bytes.  We have
376 	 * 2^(32*INPUTWORDS) possible inputs yielding output of
377 	 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes.  It suffices to
378 	 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than
379 	 * (1/CHAR_BIT) log_2 of the total output stream length.  We
380 	 * have
381 	 *
382 	 *	log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i)
383 	 *	  = 2 + log_2 o + 32 i.
384 	 */
385 	__CTASSERT(CHAR_BIT*sizeof n <=
386 	    (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS));
387 
388 	p8 = buf;
389 	p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t));
390 	ni = (uint8_t *)p32 - p8;
391 	if (n < ni)
392 		ni = n;
393 	nb = (n - ni) / sizeof block;
394 	nf = (n - ni) % sizeof block;
395 
396 	KASSERT(((uintptr_t)p32 & 3) == 0);
397 	KASSERT(ni <= n);
398 	KASSERT(nb <= (n / sizeof block));
399 	KASSERT(nf <= n);
400 	KASSERT(n == (ni + (nb * sizeof block) + nf));
401 	KASSERT(ni < sizeof(uint32_t));
402 	KASSERT(nf < sizeof block);
403 
404 	if (ni) {
405 		crypto_core(block, nonce, seed, crypto_core_constant32);
406 		nonce[0]++;
407 		(void)memcpy(p8, block, ni);
408 	}
409 	while (nb--) {
410 		crypto_core(p32, nonce, seed, crypto_core_constant32);
411 		if (++nonce[0] == 0)
412 			nonce[1]++;
413 		p32 += crypto_core_OUTPUTWORDS;
414 	}
415 	if (nf) {
416 		crypto_core(block, nonce, seed, crypto_core_constant32);
417 		if (++nonce[0] == 0)
418 			nonce[1]++;
419 		(void)memcpy(p32, block, nf);
420 	}
421 
422 	if (ni | nf)
423 		(void)explicit_memset(block, 0, sizeof block);
424 }
425 
426 /* Public API */
427 
428 uint32_t
cprng_fast32(void)429 cprng_fast32(void)
430 {
431 	struct cprng_fast *cprng;
432 	uint32_t v;
433 	int s;
434 
435 	s = cprng_fast_get(&cprng);
436 	v = cprng_fast_word(cprng);
437 	cprng_fast_put(cprng, s);
438 
439 	return v;
440 }
441 
442 uint64_t
cprng_fast64(void)443 cprng_fast64(void)
444 {
445 	struct cprng_fast *cprng;
446 	uint32_t hi, lo;
447 	int s;
448 
449 	s = cprng_fast_get(&cprng);
450 	hi = cprng_fast_word(cprng);
451 	lo = cprng_fast_word(cprng);
452 	cprng_fast_put(cprng, s);
453 
454 	return ((uint64_t)hi << 32) | lo;
455 }
456 
457 static void
cprng_fast_buf_short(void * buf,size_t len)458 cprng_fast_buf_short(void *buf, size_t len)
459 {
460 	struct cprng_fast *cprng;
461 	int s;
462 
463 	s = cprng_fast_get(&cprng);
464 	cprng_fast_buf(cprng, buf, len);
465 	cprng_fast_put(cprng, s);
466 }
467 
468 static __noinline void
cprng_fast_buf_long(void * buf,size_t len)469 cprng_fast_buf_long(void *buf, size_t len)
470 {
471 	uint32_t seed[crypto_core_KEYWORDS];
472 	struct cprng_fast *cprng;
473 	int s;
474 
475 	s = cprng_fast_get(&cprng);
476 	cprng_fast_buf(cprng, seed, sizeof seed);
477 	cprng_fast_put(cprng, s);
478 
479 	crypto_onetimestream(seed, buf, len);
480 
481 	(void)explicit_memset(seed, 0, sizeof seed);
482 }
483 
484 size_t
cprng_fast(void * buf,size_t len)485 cprng_fast(void *buf, size_t len)
486 {
487 
488 	/*
489 	 * We don't want to hog the CPU, so we use the short version,
490 	 * to generate output without preemption, only if we can do it
491 	 * with at most one crypto_core.
492 	 */
493 	if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS))
494 		cprng_fast_buf_short(buf, len);
495 	else
496 		cprng_fast_buf_long(buf, len);
497 
498 	return len;
499 }
500