1 /*-
2  * Copyright (c) 2016 The FreeBSD Foundation
3  * Copyright (c) 2020 Ampere Computing
4  * All rights reserved.
5  *
6  * This software was developed by Andrew Turner under
7  * sponsorship from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * This file is derived from aesni_wrap.c:
31  * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
32  * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
33  * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
34  * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
35  * Copyright (c) 2014 The FreeBSD Foundation
36  */
37 
38 /*
39  * This code is built with floating-point enabled. Make sure to have entered
40  * into floating-point context before calling any of these functions.
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/malloc.h>
46 #include <sys/queue.h>
47 
48 #include <opencrypto/cryptodev.h>
49 #include <opencrypto/gmac.h>
50 #include <crypto/rijndael/rijndael.h>
51 #include <crypto/armv8/armv8_crypto.h>
52 
53 #include <arm_neon.h>
54 
55 static uint8x16_t
56 armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
57 {
58 	uint8x16_t tmp;
59 	int i;
60 
61 	tmp = from;
62 	for (i = 0; i < rounds - 1; i += 2) {
63 		tmp = vaeseq_u8(tmp, keysched[i]);
64 		tmp = vaesmcq_u8(tmp);
65 		tmp = vaeseq_u8(tmp, keysched[i + 1]);
66 		tmp = vaesmcq_u8(tmp);
67 	}
68 
69 	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
70 	tmp = vaesmcq_u8(tmp);
71 	tmp = vaeseq_u8(tmp, keysched[rounds]);
72 	tmp = veorq_u8(tmp, keysched[rounds + 1]);
73 
74 	return (tmp);
75 }
76 
77 static uint8x16_t
78 armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
79 {
80 	uint8x16_t tmp;
81 	int i;
82 
83 	tmp = from;
84 	for (i = 0; i < rounds - 1; i += 2) {
85 		tmp = vaesdq_u8(tmp, keysched[i]);
86 		tmp = vaesimcq_u8(tmp);
87 		tmp = vaesdq_u8(tmp, keysched[i+1]);
88 		tmp = vaesimcq_u8(tmp);
89 	}
90 
91 	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
92 	tmp = vaesimcq_u8(tmp);
93 	tmp = vaesdq_u8(tmp, keysched[rounds]);
94 	tmp = veorq_u8(tmp, keysched[rounds + 1]);
95 
96 	return (tmp);
97 }
98 
99 void
100 armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
101     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
102     const uint8_t iv[static AES_BLOCK_LEN])
103 {
104 	uint8x16_t tot, ivreg, tmp;
105 	uint8_t block[AES_BLOCK_LEN], *from, *to;
106 	size_t fromseglen, oseglen, seglen, toseglen;
107 
108 	KASSERT(len % AES_BLOCK_LEN == 0,
109 	    ("%s: length %zu not a multiple of the block size", __func__, len));
110 
111 	ivreg = vld1q_u8(iv);
112 	for (; len > 0; len -= seglen) {
113 		from = crypto_cursor_segment(fromc, &fromseglen);
114 		to = crypto_cursor_segment(toc, &toseglen);
115 
116 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
117 		if (seglen < AES_BLOCK_LEN) {
118 			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
119 			tmp = vld1q_u8(block);
120 			tot = armv8_aes_enc(key->aes_rounds - 1,
121 			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
122 			ivreg = tot;
123 			vst1q_u8(block, tot);
124 			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
125 			seglen = AES_BLOCK_LEN;
126 		} else {
127 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
128 			    seglen -= AES_BLOCK_LEN) {
129 				tmp = vld1q_u8(from);
130 				tot = armv8_aes_enc(key->aes_rounds - 1,
131 				    (const void *)key->aes_key,
132 				    veorq_u8(tmp, ivreg));
133 				ivreg = tot;
134 				vst1q_u8(to, tot);
135 				from += AES_BLOCK_LEN;
136 				to += AES_BLOCK_LEN;
137 			}
138 			seglen = oseglen - seglen;
139 			crypto_cursor_advance(fromc, seglen);
140 			crypto_cursor_advance(toc, seglen);
141 		}
142 	}
143 
144 	explicit_bzero(block, sizeof(block));
145 }
146 
147 void
148 armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
149     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
150     const uint8_t iv[static AES_BLOCK_LEN])
151 {
152 	uint8x16_t ivreg, nextiv, tmp;
153 	uint8_t block[AES_BLOCK_LEN], *from, *to;
154 	size_t fromseglen, oseglen, seglen, toseglen;
155 
156 	KASSERT(len % AES_BLOCK_LEN == 0,
157 	    ("%s: length %zu not a multiple of the block size", __func__, len));
158 
159 	ivreg = vld1q_u8(iv);
160 	for (; len > 0; len -= seglen) {
161 		from = crypto_cursor_segment(fromc, &fromseglen);
162 		to = crypto_cursor_segment(toc, &toseglen);
163 
164 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
165 		if (seglen < AES_BLOCK_LEN) {
166 			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
167 			nextiv = vld1q_u8(block);
168 			tmp = armv8_aes_dec(key->aes_rounds - 1,
169 			    (const void *)key->aes_key, nextiv);
170 			vst1q_u8(block, veorq_u8(tmp, ivreg));
171 			ivreg = nextiv;
172 			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
173 			seglen = AES_BLOCK_LEN;
174 		} else {
175 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
176 			    seglen -= AES_BLOCK_LEN) {
177 				nextiv = vld1q_u8(from);
178 				tmp = armv8_aes_dec(key->aes_rounds - 1,
179 				    (const void *)key->aes_key, nextiv);
180 				vst1q_u8(to, veorq_u8(tmp, ivreg));
181 				ivreg = nextiv;
182 				from += AES_BLOCK_LEN;
183 				to += AES_BLOCK_LEN;
184 			}
185 			crypto_cursor_advance(fromc, oseglen - seglen);
186 			crypto_cursor_advance(toc, oseglen - seglen);
187 			seglen = oseglen - seglen;
188 		}
189 	}
190 
191 	explicit_bzero(block, sizeof(block));
192 }
193 
194 #define	AES_XTS_BLOCKSIZE	16
195 #define	AES_XTS_IVSIZE		8
196 #define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */
197 
198 static inline int32x4_t
199 xts_crank_lfsr(int32x4_t inp)
200 {
201 	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
202 	int32x4_t xtweak, ret;
203 
204 	/* set up xor mask */
205 	xtweak = vextq_s32(inp, inp, 3);
206 	xtweak = vshrq_n_s32(xtweak, 31);
207 	xtweak &= alphamask;
208 
209 	/* next term */
210 	ret = vshlq_n_s32(inp, 1);
211 	ret ^= xtweak;
212 
213 	return ret;
214 }
215 
216 static void
217 armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
218     uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
219 {
220 	uint8x16_t block;
221 
222 	block = vld1q_u8(from) ^ *tweak;
223 
224 	if (do_encrypt)
225 		block = armv8_aes_enc(rounds - 1, key_schedule, block);
226 	else
227 		block = armv8_aes_dec(rounds - 1, key_schedule, block);
228 
229 	vst1q_u8(to, block ^ *tweak);
230 
231 	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
232 }
233 
234 static void
235 armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
236     const uint8x16_t *tweak_schedule, size_t len,
237     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
238     const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
239 {
240 	uint8x16_t tweakreg;
241 	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
242 	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
243 	uint8_t *from, *to;
244 	size_t fromseglen, oseglen, seglen, toseglen;
245 
246 	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
247 	    ("%s: length %zu not a multiple of the block size", __func__, len));
248 
249 	/*
250 	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
251 	 * of a 64-bit block number which we allow to be passed in directly.
252 	 */
253 #if BYTE_ORDER == LITTLE_ENDIAN
254 	bcopy(iv, tweak, AES_XTS_IVSIZE);
255 	/* Last 64 bits of IV are always zero. */
256 	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
257 #else
258 #error Only LITTLE_ENDIAN architectures are supported.
259 #endif
260 	tweakreg = vld1q_u8(tweak);
261 	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);
262 
263 	for (; len > 0; len -= seglen) {
264 		from = crypto_cursor_segment(fromc, &fromseglen);
265 		to = crypto_cursor_segment(toc, &toseglen);
266 
267 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
268 		if (seglen < AES_XTS_BLOCKSIZE) {
269 			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
270 			armv8_aes_crypt_xts_block(rounds, data_schedule,
271 			    &tweakreg, block, block, do_encrypt);
272 			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
273 			seglen = AES_XTS_BLOCKSIZE;
274 		} else {
275 			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
276 			    seglen -= AES_XTS_BLOCKSIZE) {
277 				armv8_aes_crypt_xts_block(rounds, data_schedule,
278 				    &tweakreg, from, to, do_encrypt);
279 				from += AES_XTS_BLOCKSIZE;
280 				to += AES_XTS_BLOCKSIZE;
281 			}
282 			seglen = oseglen - seglen;
283 			crypto_cursor_advance(fromc, seglen);
284 			crypto_cursor_advance(toc, seglen);
285 		}
286 	}
287 
288 	explicit_bzero(block, sizeof(block));
289 }
290 
291 void
292 armv8_aes_encrypt_xts(AES_key_t *data_schedule,
293     const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
294     struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
295 {
296 	armv8_aes_crypt_xts(data_schedule->aes_rounds,
297 	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
298 	    toc, iv, 1);
299 }
300 
301 void
302 armv8_aes_decrypt_xts(AES_key_t *data_schedule,
303     const void *tweak_schedule, size_t len,
304     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
305     const uint8_t iv[static AES_BLOCK_LEN])
306 {
307 	armv8_aes_crypt_xts(data_schedule->aes_rounds,
308 	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
309 	    toc, iv, 0);
310 
311 }
/*
 * Increment a big-endian, AES_BLOCK_LEN-byte counter in place: bump the
 * last byte and keep carrying towards byte 0 for as long as the byte just
 * incremented wrapped around to zero.
 */
#define	AES_INC_COUNTER(counter)				\
	do {							\
		int pos = AES_BLOCK_LEN;			\
								\
		while (pos-- > 0) {				\
			if (++(counter)[pos] != 0)		\
				break;				\
		}						\
	} while (0)
319 
320 struct armv8_gcm_state {
321 	__uint128_val_t EK0;
322 	__uint128_val_t EKi;
323 	__uint128_val_t Xi;
324 	__uint128_val_t lenblock;
325 	uint8_t aes_counter[AES_BLOCK_LEN];
326 };
327 
328 static void
329 armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
330     const uint8_t *authdata, size_t authdatalen,
331     const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
332 {
333 	uint8_t block[AES_BLOCK_LEN];
334 	size_t trailer;
335 
336 	bzero(s->aes_counter, AES_BLOCK_LEN);
337 	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);
338 
339 	/* Setup the counter */
340 	s->aes_counter[AES_BLOCK_LEN - 1] = 1;
341 
342 	/* EK0 for a final GMAC round */
343 	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);
344 
345 	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
346 	s->aes_counter[AES_BLOCK_LEN - 1] = 2;
347 
348 	memset(s->Xi.c, 0, sizeof(s->Xi.c));
349 	trailer = authdatalen % AES_BLOCK_LEN;
350 	if (authdatalen - trailer > 0) {
351 		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
352 		authdata += authdatalen - trailer;
353 	}
354 	if (trailer > 0 || authdatalen == 0) {
355 		memset(block, 0, sizeof(block));
356 		memcpy(block, authdata, trailer);
357 		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
358 	}
359 }
360 
361 static void
362 armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
363     size_t authdatalen, const __uint128_val_t *Htable)
364 {
365 	/* Lengths block */
366 	s->lenblock.u[0] = s->lenblock.u[1] = 0;
367 	s->lenblock.d[1] = htobe32(authdatalen * 8);
368 	s->lenblock.d[3] = htobe32(len * 8);
369 	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);
370 
371 	s->Xi.u[0] ^= s->EK0.u[0];
372 	s->Xi.u[1] ^= s->EK0.u[1];
373 }
374 
375 static void
376 armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
377     const uint64_t *from, uint64_t *to)
378 {
379 	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
380 	AES_INC_COUNTER(s->aes_counter);
381 	to[0] = from[0] ^ s->EKi.u[0];
382 	to[1] = from[1] ^ s->EKi.u[1];
383 }
384 
385 static void
386 armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
387     const uint64_t *from, uint64_t *to)
388 {
389 	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
390 }
391 
392 void
393 armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
394     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
395     size_t authdatalen, const uint8_t *authdata,
396     uint8_t tag[static GMAC_DIGEST_LEN],
397     const uint8_t iv[static AES_GCM_IV_LEN],
398     const __uint128_val_t *Htable)
399 {
400 	struct armv8_gcm_state s;
401 	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
402 	uint64_t *from64, *to64;
403 	size_t fromseglen, i, olen, oseglen, seglen, toseglen;
404 
405 	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);
406 
407 	for (olen = len; len > 0; len -= seglen) {
408 		from64 = crypto_cursor_segment(fromc, &fromseglen);
409 		to64 = crypto_cursor_segment(toc, &toseglen);
410 
411 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
412 		if (seglen < AES_BLOCK_LEN) {
413 			seglen = ulmin(len, AES_BLOCK_LEN);
414 
415 			memset(block, 0, sizeof(block));
416 			crypto_cursor_copydata(fromc, (int)seglen, block);
417 
418 			if (seglen == AES_BLOCK_LEN) {
419 				armv8_aes_encrypt_gcm_block(&s, aes_key,
420 				    (uint64_t *)block, (uint64_t *)block);
421 			} else {
422 				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
423 				AES_INC_COUNTER(s.aes_counter);
424 				for (i = 0; i < seglen; i++)
425 					block[i] ^= s.EKi.c[i];
426 			}
427 			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
428 
429 			crypto_cursor_copyback(toc, (int)seglen, block);
430 		} else {
431 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
432 			    seglen -= AES_BLOCK_LEN) {
433 				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
434 				    to64);
435 				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
436 				    AES_BLOCK_LEN);
437 
438 				from64 += 2;
439 				to64 += 2;
440 			}
441 
442 			seglen = oseglen - seglen;
443 			crypto_cursor_advance(fromc, seglen);
444 			crypto_cursor_advance(toc, seglen);
445 		}
446 	}
447 
448 	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
449 	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);
450 
451 	explicit_bzero(block, sizeof(block));
452 	explicit_bzero(&s, sizeof(s));
453 }
454 
455 int
456 armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
457     struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
458     size_t authdatalen, const uint8_t *authdata,
459     const uint8_t tag[static GMAC_DIGEST_LEN],
460     const uint8_t iv[static AES_GCM_IV_LEN],
461     const __uint128_val_t *Htable)
462 {
463 	struct armv8_gcm_state s;
464 	struct crypto_buffer_cursor fromcc;
465 	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
466 	uint64_t *block64, *from64, *to64;
467 	size_t fromseglen, olen, oseglen, seglen, toseglen;
468 	int error;
469 
470 	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);
471 
472 	crypto_cursor_copy(fromc, &fromcc);
473 	for (olen = len; len > 0; len -= seglen) {
474 		from = crypto_cursor_segment(&fromcc, &fromseglen);
475 		seglen = ulmin(len, fromseglen);
476 		seglen -= seglen % AES_BLOCK_LEN;
477 		if (seglen > 0) {
478 			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
479 			crypto_cursor_advance(&fromcc, seglen);
480 		} else {
481 			memset(block, 0, sizeof(block));
482 			seglen = ulmin(len, AES_BLOCK_LEN);
483 			crypto_cursor_copydata(&fromcc, seglen, block);
484 			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
485 		}
486 	}
487 
488 	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
489 
490 	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
491 		error = EBADMSG;
492 		goto out;
493 	}
494 
495 	block64 = (uint64_t *)block;
496 	for (len = olen; len > 0; len -= seglen) {
497 		from64 = crypto_cursor_segment(fromc, &fromseglen);
498 		to64 = crypto_cursor_segment(toc, &toseglen);
499 
500 		seglen = ulmin(len, ulmin(fromseglen, toseglen));
501 		if (seglen < AES_BLOCK_LEN) {
502 			seglen = ulmin(len, AES_BLOCK_LEN);
503 
504 			memset(block, 0, sizeof(block));
505 			crypto_cursor_copydata(fromc, seglen, block);
506 
507 			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
508 			    block64);
509 
510 			crypto_cursor_copyback(toc, (int)seglen, block);
511 		} else {
512 			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
513 			    seglen -= AES_BLOCK_LEN) {
514 				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
515 				    to64);
516 
517 				from64 += 2;
518 				to64 += 2;
519 			}
520 
521 			seglen = oseglen - seglen;
522 			crypto_cursor_advance(fromc, seglen);
523 			crypto_cursor_advance(toc, seglen);
524 		}
525 	}
526 
527 	error = 0;
528 out:
529 	explicit_bzero(block, sizeof(block));
530 	explicit_bzero(&s, sizeof(s));
531 	return (error);
532 }
533