/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled. Callers must have
 * entered a floating-point context (e.g. via fpu_kern_enter()) before
 * calling any of these functions.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

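/*
 * Encrypt a single 16-byte block with the ARMv8 Cryptography Extensions.
 * The caller passes the number of rounds minus one; each AESE/AESMC pair
 * performs one full round, the last round omits MixColumns, and the final
 * AddRoundKey is applied with a plain XOR.
 */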
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

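/*
 * Decrypt a single 16-byte block with the AESD/AESIMC instructions. The
 * caller supplies the decryption (equivalent inverse cipher) key schedule.
 */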
static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

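/*
 * CBC-encrypt len bytes (a multiple of AES_BLOCK_LEN) from the source
 * cursor into the destination cursor. Contiguous segments are processed
 * directly; a block that straddles a segment boundary is bounced through
 * a stack buffer.
 */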
void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			tmp = vld1q_u8(block);
			tot = armv8_aes_enc(key->aes_rounds - 1,
			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
			ivreg = tot;
			vst1q_u8(block, tot);
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				tmp = vld1q_u8(from);
				tot = armv8_aes_enc(key->aes_rounds - 1,
				    (const void *)key->aes_key,
				    veorq_u8(tmp, ivreg));
				ivreg = tot;
				vst1q_u8(to, tot);
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

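/*
 * CBC-decrypt len bytes from the source cursor into the destination
 * cursor. Each ciphertext block is saved as the next IV before the
 * plaintext is written, so in-place operation is safe.
 */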
void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			nextiv = vld1q_u8(block);
			tmp = armv8_aes_dec(key->aes_rounds - 1,
			    (const void *)key->aes_key, nextiv);
			vst1q_u8(block, veorq_u8(tmp, ivreg));
			ivreg = nextiv;
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				nextiv = vld1q_u8(from);
				tmp = armv8_aes_dec(key->aes_rounds - 1,
				    (const void *)key->aes_key, nextiv);
				vst1q_u8(to, veorq_u8(tmp, ivreg));
				ivreg = nextiv;
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			crypto_cursor_advance(fromc, oseglen - seglen);
			crypto_cursor_advance(toc, oseglen - seglen);
			seglen = oseglen - seglen;
		}
	}

	explicit_bzero(block, sizeof(block));
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

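/*
 * Advance the XTS tweak to the next block: multiply the 128-bit tweak by
 * x in GF(2^128), folding the carry out of the top bit back in with the
 * AES_XTS_ALPHA feedback polynomial. The shift and conditional XOR are
 * done lane-wise with NEON.
 */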
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return ret;
}

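/*
 * Encrypt or decrypt one XTS block: XOR the input with the tweak, run the
 * block cipher, XOR the result with the tweak again, then advance the
 * tweak for the next block.
 */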
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

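/*
 * Common XTS-AES routine for both directions. The initial tweak is
 * computed as E_k2(IV) below; data is then processed per cursor segment,
 * with blocks that straddle a segment boundary bounced through a stack
 * buffer.
 */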
static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_XTS_BLOCKSIZE) {
			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
			armv8_aes_crypt_xts_block(rounds, data_schedule,
			    &tweakreg, block, block, do_encrypt);
			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
			seglen = AES_XTS_BLOCKSIZE;
		} else {
			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
			    seglen -= AES_XTS_BLOCKSIZE) {
				armv8_aes_crypt_xts_block(rounds, data_schedule,
				    &tweakreg, from, to, do_encrypt);
				from += AES_XTS_BLOCKSIZE;
				to += AES_XTS_BLOCKSIZE;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 0);
}

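/* Increment the big-endian 128-bit block counter in place. */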
#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

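/*
 * Per-request AES-GCM state: EK0 is E_K(J0), XORed into the tag at the
 * end; EKi holds the current CTR keystream block; Xi is the running GHASH
 * accumulator; lenblock carries the final AAD/payload bit lengths; and
 * aes_counter is the big-endian CTR block.
 */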
struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

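/*
 * Initialize the GCM state: build the initial counter block J0 from the
 * 96-bit IV, compute EK0 = E_K(J0) for the final tag XOR, start the data
 * counter at 2, and run GHASH over the additional authenticated data,
 * zero-padding its final partial block.
 */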
static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
	uint8_t block[AES_BLOCK_LEN];
	size_t trailer;

	bzero(s->aes_counter, AES_BLOCK_LEN);
	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s->aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s->Xi.c, 0, sizeof(s->Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
	}
}

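/*
 * Finish the GMAC computation: hash the lengths block (AAD and payload
 * lengths in bits) and XOR the result with EK0 to produce the tag.
 */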
static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
	/* Lengths block */
	s->lenblock.u[0] = s->lenblock.u[1] = 0;
	s->lenblock.d[1] = htobe32(authdatalen * 8);
	s->lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

	s->Xi.u[0] ^= s->EK0.u[0];
	s->Xi.u[1] ^= s->EK0.u[1];
}

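/*
 * Produce one block of CTR keystream, advance the counter and XOR the
 * keystream into a single 16-byte block.
 */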
static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
	AES_INC_COUNTER(s->aes_counter);
	to[0] = from[0] ^ s->EKi.u[0];
	to[1] = from[1] ^ s->EKi.u[1];
}

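/* GCM decryption applies the same CTR keystream as encryption. */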
static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

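/*
 * Encrypt len bytes in CTR mode and authenticate the resulting ciphertext
 * with GHASH, writing the tag to 'tag'. Blocks that straddle a segment
 * boundary, as well as a trailing partial block, are handled through a
 * bounce buffer.
 */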
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}

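/*
 * Verify-then-decrypt: the ciphertext is first authenticated with GHASH
 * using a copy of the source cursor, and the payload is only decrypted
 * (in CTR mode) if the computed tag matches the one supplied. Returns
 * EBADMSG on tag mismatch.
 */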
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}