1 /*-
2  * Copyright (c) 2016 The FreeBSD Foundation
3  * Copyright (c) 2020 Ampere Computing
4  * All rights reserved.
5  *
6  * This software was developed by Andrew Turner under
7  * sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * This file is derived from aesni_wrap.c:
31  * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
32  * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
33  * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
34  * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
35  * Copyright (c) 2014 The FreeBSD Foundation
36  */
37 
38 /*
39  * This code is built with floating-point enabled. Make sure to have entered
40  * into floating-point context before calling any of these functions.
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/malloc.h>
49 #include <sys/queue.h>
50 
51 #include <opencrypto/cryptodev.h>
52 #include <opencrypto/gmac.h>
53 #include <crypto/rijndael/rijndael.h>
54 #include <crypto/armv8/armv8_crypto.h>
55 
56 #include <arm_neon.h>
57 
58 static uint8x16_t
59 armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
60 {
61 	uint8x16_t tmp;
62 	int i;
63 
64 	tmp = from;
65 	for (i = 0; i < rounds - 1; i += 2) {
66 		tmp = vaeseq_u8(tmp, keysched[i]);
67 		tmp = vaesmcq_u8(tmp);
68 		tmp = vaeseq_u8(tmp, keysched[i + 1]);
69 		tmp = vaesmcq_u8(tmp);
70 	}
71 
72 	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
73 	tmp = vaesmcq_u8(tmp);
74 	tmp = vaeseq_u8(tmp, keysched[rounds]);
75 	tmp = veorq_u8(tmp, keysched[rounds + 1]);
76 
77 	return (tmp);
78 }
79 
80 static uint8x16_t
81 armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
82 {
83 	uint8x16_t tmp;
84 	int i;
85 
86 	tmp = from;
87 	for (i = 0; i < rounds - 1; i += 2) {
88 		tmp = vaesdq_u8(tmp, keysched[i]);
89 		tmp = vaesimcq_u8(tmp);
90 		tmp = vaesdq_u8(tmp, keysched[i+1]);
91 		tmp = vaesimcq_u8(tmp);
92 	}
93 
94 	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
95 	tmp = vaesimcq_u8(tmp);
96 	tmp = vaesdq_u8(tmp, keysched[rounds]);
97 	tmp = veorq_u8(tmp, keysched[rounds + 1]);
98 
99 	return (tmp);
100 }
101 
102 void
103 armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
104     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
105     const uint8_t iv[static AES_BLOCK_LEN])
106 {
107 	uint8x16_t tot, ivreg, tmp;
108 	uint8_t block[AES_BLOCK_LEN], *from, *to;
109 	size_t fromseglen, oseglen, seglen, toseglen;
110 
111 	KASSERT(len % AES_BLOCK_LEN == 0,
112 	    ("%s: length %zu not a multiple of the block size", __func__, len));
113 
114 	ivreg = vld1q_u8(iv);
115 	for (; len > 0; len -= seglen) {
116 		from = crypto_cursor_segment(fromc, &fromseglen);
117 		to = crypto_cursor_segment(toc, &toseglen);
118 
119 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
120 		if (seglen < AES_BLOCK_LEN) {
121 			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
122 			tmp = vld1q_u8(block);
123 			tot = armv8_aes_enc(key->aes_rounds - 1,
124 			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
125 			ivreg = tot;
126 			vst1q_u8(block, tot);
127 			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
128 			seglen = AES_BLOCK_LEN;
129 		} else {
130 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
131 			    seglen -= AES_BLOCK_LEN) {
132 				tmp = vld1q_u8(from);
133 				tot = armv8_aes_enc(key->aes_rounds - 1,
134 				    (const void *)key->aes_key,
135 				    veorq_u8(tmp, ivreg));
136 				ivreg = tot;
137 				vst1q_u8(to, tot);
138 				from += AES_BLOCK_LEN;
139 				to += AES_BLOCK_LEN;
140 			}
141 			seglen = oseglen - seglen;
142 			crypto_cursor_advance(fromc, seglen);
143 			crypto_cursor_advance(toc, seglen);
144 		}
145 	}
146 
147 	explicit_bzero(block, sizeof(block));
148 }
149 
150 void
151 armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
152     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
153     const uint8_t iv[static AES_BLOCK_LEN])
154 {
155 	uint8x16_t ivreg, nextiv, tmp;
156 	uint8_t block[AES_BLOCK_LEN], *from, *to;
157 	size_t fromseglen, oseglen, seglen, toseglen;
158 
159 	KASSERT(len % AES_BLOCK_LEN == 0,
160 	    ("%s: length %zu not a multiple of the block size", __func__, len));
161 
162 	ivreg = vld1q_u8(iv);
163 	for (; len > 0; len -= seglen) {
164 		from = crypto_cursor_segment(fromc, &fromseglen);
165 		to = crypto_cursor_segment(toc, &toseglen);
166 
167 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
168 		if (seglen < AES_BLOCK_LEN) {
169 			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
170 			nextiv = vld1q_u8(block);
171 			tmp = armv8_aes_dec(key->aes_rounds - 1,
172 			    (const void *)key->aes_key, nextiv);
173 			vst1q_u8(block, veorq_u8(tmp, ivreg));
174 			ivreg = nextiv;
175 			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
176 			seglen = AES_BLOCK_LEN;
177 		} else {
178 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
179 			    seglen -= AES_BLOCK_LEN) {
180 				nextiv = vld1q_u8(from);
181 				tmp = armv8_aes_dec(key->aes_rounds - 1,
182 				    (const void *)key->aes_key, nextiv);
183 				vst1q_u8(to, veorq_u8(tmp, ivreg));
184 				ivreg = nextiv;
185 				from += AES_BLOCK_LEN;
186 				to += AES_BLOCK_LEN;
187 			}
188 			crypto_cursor_advance(fromc, oseglen - seglen);
189 			crypto_cursor_advance(toc, oseglen - seglen);
190 			seglen = oseglen - seglen;
191 		}
192 	}
193 
194 	explicit_bzero(block, sizeof(block));
195 }
196 
/*
 * XTS parameters: the cipher block size in bytes, the number of IV bytes
 * copied into the initial tweak, and the feedback constant used when
 * advancing the tweak from one block to the next.
 */
#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */
200 
201 static inline int32x4_t
202 xts_crank_lfsr(int32x4_t inp)
203 {
204 	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
205 	int32x4_t xtweak, ret;
206 
207 	/* set up xor mask */
208 	xtweak = vextq_s32(inp, inp, 3);
209 	xtweak = vshrq_n_s32(xtweak, 31);
210 	xtweak &= alphamask;
211 
212 	/* next term */
213 	ret = vshlq_n_s32(inp, 1);
214 	ret ^= xtweak;
215 
216 	return ret;
217 }
218 
219 static void
220 armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
221     uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
222 {
223 	uint8x16_t block;
224 
225 	block = vld1q_u8(from) ^ *tweak;
226 
227 	if (do_encrypt)
228 		block = armv8_aes_enc(rounds - 1, key_schedule, block);
229 	else
230 		block = armv8_aes_dec(rounds - 1, key_schedule, block);
231 
232 	vst1q_u8(to, block ^ *tweak);
233 
234 	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
235 }
236 
237 static void
238 armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
239     const uint8x16_t *tweak_schedule, size_t len,
240     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
241     const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
242 {
243 	uint8x16_t tweakreg;
244 	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
245 	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
246 	uint8_t *from, *to;
247 	size_t fromseglen, oseglen, seglen, toseglen;
248 
249 	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
250 	    ("%s: length %zu not a multiple of the block size", __func__, len));
251 
252 	/*
253 	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
254 	 * of a 64-bit block number which we allow to be passed in directly.
255 	 */
256 #if BYTE_ORDER == LITTLE_ENDIAN
257 	bcopy(iv, tweak, AES_XTS_IVSIZE);
258 	/* Last 64 bits of IV are always zero. */
259 	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
260 #else
261 #error Only LITTLE_ENDIAN architectures are supported.
262 #endif
263 	tweakreg = vld1q_u8(tweak);
264 	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);
265 
266 	for (; len > 0; len -= seglen) {
267 		from = crypto_cursor_segment(fromc, &fromseglen);
268 		to = crypto_cursor_segment(toc, &toseglen);
269 
270 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
271 		if (seglen < AES_XTS_BLOCKSIZE) {
272 			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
273 			armv8_aes_crypt_xts_block(rounds, data_schedule,
274 			    &tweakreg, block, block, do_encrypt);
275 			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
276 			seglen = AES_XTS_BLOCKSIZE;
277 		} else {
278 			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
279 			    seglen -= AES_XTS_BLOCKSIZE) {
280 				armv8_aes_crypt_xts_block(rounds, data_schedule,
281 				    &tweakreg, from, to, do_encrypt);
282 				from += AES_XTS_BLOCKSIZE;
283 				to += AES_XTS_BLOCKSIZE;
284 			}
285 			seglen = oseglen - seglen;
286 			crypto_cursor_advance(fromc, seglen);
287 			crypto_cursor_advance(toc, seglen);
288 		}
289 	}
290 
291 	explicit_bzero(block, sizeof(block));
292 }
293 
294 void
295 armv8_aes_encrypt_xts(AES_key_t *data_schedule,
296     const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
297     struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
298 {
299 	armv8_aes_crypt_xts(data_schedule->aes_rounds,
300 	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
301 	    toc, iv, 1);
302 }
303 
304 void
305 armv8_aes_decrypt_xts(AES_key_t *data_schedule,
306     const void *tweak_schedule, size_t len,
307     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
308     const uint8_t iv[static AES_BLOCK_LEN])
309 {
310 	armv8_aes_crypt_xts(data_schedule->aes_rounds,
311 	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
312 	    toc, iv, 0);
313 
314 }
/*
 * Increment an AES_BLOCK_LEN-byte counter in place, treating it as a
 * big-endian integer: the last byte is bumped first and the carry moves
 * toward byte 0 for as long as the incremented byte wraps to zero.  The
 * wrap test relies on the elements being a narrow unsigned type (uint8_t at
 * the use sites in this file).  `counter` is evaluated more than once.
 */
#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)
322 
/* Working state for a single AES-GCM encrypt or decrypt operation. */
struct armv8_gcm_state {
	/* Encryption of the counter block with its last byte set to 1; XORed into the final tag. */
	__uint128_val_t EK0;
	/* Encryption of the current counter block, i.e. the keystream for one data block. */
	__uint128_val_t EKi;
	/* Running GHASH value; holds the tag once armv8_aes_gmac_finish() has run. */
	__uint128_val_t Xi;
	/* Scratch for the final block carrying the AAD and payload bit lengths. */
	__uint128_val_t lenblock;
	/* Counter block: the IV followed by a counter that is incremented per data block. */
	uint8_t aes_counter[AES_BLOCK_LEN];
};
330 
331 static void
332 armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
333     const uint8_t *authdata, size_t authdatalen,
334     const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
335 {
336 	uint8_t block[AES_BLOCK_LEN];
337 	size_t trailer;
338 
339 	bzero(s->aes_counter, AES_BLOCK_LEN);
340 	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);
341 
342 	/* Setup the counter */
343 	s->aes_counter[AES_BLOCK_LEN - 1] = 1;
344 
345 	/* EK0 for a final GMAC round */
346 	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);
347 
348 	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
349 	s->aes_counter[AES_BLOCK_LEN - 1] = 2;
350 
351 	memset(s->Xi.c, 0, sizeof(s->Xi.c));
352 	trailer = authdatalen % AES_BLOCK_LEN;
353 	if (authdatalen - trailer > 0) {
354 		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
355 		authdata += authdatalen - trailer;
356 	}
357 	if (trailer > 0 || authdatalen == 0) {
358 		memset(block, 0, sizeof(block));
359 		memcpy(block, authdata, trailer);
360 		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
361 	}
362 }
363 
364 static void
365 armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
366     size_t authdatalen, const __uint128_val_t *Htable)
367 {
368 	/* Lengths block */
369 	s->lenblock.u[0] = s->lenblock.u[1] = 0;
370 	s->lenblock.d[1] = htobe32(authdatalen * 8);
371 	s->lenblock.d[3] = htobe32(len * 8);
372 	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);
373 
374 	s->Xi.u[0] ^= s->EK0.u[0];
375 	s->Xi.u[1] ^= s->EK0.u[1];
376 }
377 
378 static void
379 armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
380     const uint64_t *from, uint64_t *to)
381 {
382 	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
383 	AES_INC_COUNTER(s->aes_counter);
384 	to[0] = from[0] ^ s->EKi.u[0];
385 	to[1] = from[1] ^ s->EKi.u[1];
386 }
387 
/*
 * Decrypt one 16-byte block.  The operation is the same keystream XOR as
 * encryption, so this simply forwards to armv8_aes_encrypt_gcm_block().
 */
static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}
394 
/*
 * AES-GCM encrypt `len` bytes read through `fromc`, writing the ciphertext
 * through `toc`, and store the GMAC_DIGEST_LEN-byte authentication tag in
 * `tag`.
 *
 * `authdata`/`authdatalen` describe the additional authenticated data, `iv`
 * is the AES_GCM_IV_LEN-byte nonce and `Htable` is handed unchanged to
 * gcm_ghash_v8().  `len` does not need to be a multiple of the block size.
 * The keystream state and the bounce buffer are wiped before returning.
 */
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	/* Counter, EK0 and the GHASH of the AAD. */
	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	/* olen keeps the full payload length for the final lengths block. */
	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			/*
			 * Less than one block is contiguous in both cursors:
			 * either a block straddles a segment boundary or this
			 * is the short tail of the payload.  Bounce up to one
			 * block through the stack buffer.
			 */
			seglen = ulmin(len, AES_BLOCK_LEN);

			/* Zero first so a short tail ends up zero padded. */
			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				/*
				 * Short tail: XOR only the bytes that exist so
				 * the padding stays zero for the hash below.
				 */
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			/*
			 * NOTE(review): the length passed is seglen, which can
			 * be shorter than a block; this relies on
			 * gcm_ghash_v8() coping with a short length -- confirm.
			 */
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			/*
			 * No explicit cursor advance on this path; presumably
			 * copydata/copyback move their cursors -- confirm.
			 */
			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			/*
			 * Encrypt each whole block in place in the segments
			 * and hash the ciphertext just written.
			 */
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			/* Number of bytes actually consumed by the loop. */
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	/* Fold in the lengths block and EK0; the tag is then in s.Xi. */
	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}
457 
/*
 * AES-GCM verify and decrypt `len` bytes read through `fromc`, writing the
 * plaintext through `toc`.
 *
 * Works in two passes.  The first walks a copy of the input cursor and
 * computes the tag over the ciphertext; the second, run only when that tag
 * matches `tag`, does the actual decryption.  On a mismatch nothing is
 * written through `toc`.
 *
 * Returns 0 on success or EBADMSG when the tag does not match.  The
 * keystream state and the bounce buffer are wiped on both paths.
 */
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	/* Counter, EK0 and the GHASH of the AAD. */
	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	/*
	 * Pass 1: hash the ciphertext using a private copy of the cursor so
	 * that fromc is still at the start for the decryption pass.
	 */
	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		/* Only whole blocks are hashed directly from the segment. */
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			/*
			 * Less than one block left in this segment: gather up
			 * to one block into the zeroed bounce buffer.  No
			 * explicit advance is done; presumably copydata moves
			 * the cursor -- confirm.
			 * NOTE(review): seglen can be shorter than a block
			 * here; relies on gcm_ghash_v8() coping with a short
			 * length -- confirm.
			 */
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	/* Fold in the lengths block and EK0; the tag is then in s.Xi. */
	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	/* Constant-time comparison against the caller's tag. */
	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	/* Pass 2: the tag is good, so produce the plaintext. */
	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			/*
			 * Block straddles a segment boundary or is the short
			 * tail: bounce it through the stack buffer.
			 */
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			/*
			 * The whole buffer is transformed, but only the
			 * seglen bytes that exist are copied back.
			 */
			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			/* Decrypt each whole block directly in the segments. */
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			/* Number of bytes actually consumed by the loop. */
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	/* Wipe keystream material on both the success and failure paths. */
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}
536