1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <modes/modes.h>
27 #include <sys/crypto/common.h>
28 #include <sys/crypto/icp.h>
29 #include <sys/crypto/impl.h>
30 #include <sys/byteorder.h>
31 #include <sys/simd.h>
32 #include <modes/gcm_impl.h>
33 #ifdef CAN_USE_GCM_ASM
34 #include <aes/aes_impl.h>
35 #endif
36 
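/*
 * GHASH(c, d, t, o) folds one 16-byte block d into the running hash of
 * context c: t = (ghash XOR d) * H in GF(2^128), using the carry-less
 * multiplication routine of the selected implementation o.
 */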
37 #define	GHASH(c, d, t, o) \
38 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
39 	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
40 	(uint64_t *)(void *)(t));
41 
42 /* Select GCM implementation */
43 #define	IMPL_FASTEST	(UINT32_MAX)
44 #define	IMPL_CYCLE	(UINT32_MAX-1)
45 #ifdef CAN_USE_GCM_ASM
46 #define	IMPL_AVX	(UINT32_MAX-2)
47 #endif
48 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
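/*
 * icp_gcm_impl holds the currently active selection; user_sel_impl holds
 * the value requested via the module parameter before gcm_impl_init() ran.
 */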
49 static uint32_t icp_gcm_impl = IMPL_FASTEST;
50 static uint32_t user_sel_impl = IMPL_FASTEST;
51 
52 #ifdef CAN_USE_GCM_ASM
53 /* Does the architecture we run on support the MOVBE instruction? */
54 boolean_t gcm_avx_can_use_movbe = B_FALSE;
55 /*
56  * Whether to use the optimized openssl gcm and ghash implementations.
57  * Set to true if module parameter icp_gcm_impl == "avx".
58  */
59 static boolean_t gcm_use_avx = B_FALSE;
60 #define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
61 
62 extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
63 
64 static inline boolean_t gcm_avx_will_work(void);
65 static inline void gcm_set_avx(boolean_t);
66 static inline boolean_t gcm_toggle_avx(void);
67 static inline size_t gcm_simd_get_htab_size(boolean_t);
68 
69 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
70     crypto_data_t *, size_t);
71 
72 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
73 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
74 static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
75     size_t, size_t);
76 #endif /* ifdef CAN_USE_GCM_ASM */
77 
/*
 * Encrypt multiple blocks of data in GCM mode.  Decryption for GCM mode
 * is done in another function.
 */
82 int
83 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
84     crypto_data_t *out, size_t block_size,
85     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
86     void (*copy_block)(uint8_t *, uint8_t *),
87     void (*xor_block)(uint8_t *, uint8_t *))
88 {
89 #ifdef CAN_USE_GCM_ASM
90 	if (ctx->gcm_use_avx == B_TRUE)
91 		return (gcm_mode_encrypt_contiguous_blocks_avx(
92 		    ctx, data, length, out, block_size));
93 #endif
94 
95 	const gcm_impl_ops_t *gops;
96 	size_t remainder = length;
97 	size_t need = 0;
98 	uint8_t *datap = (uint8_t *)data;
99 	uint8_t *blockp;
100 	uint8_t *lastp;
101 	void *iov_or_mp;
102 	offset_t offset;
103 	uint8_t *out_data_1;
104 	uint8_t *out_data_2;
105 	size_t out_data_1_len;
106 	uint64_t counter;
107 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
108 
109 	if (length + ctx->gcm_remainder_len < block_size) {
110 		/* accumulate bytes here and return */
111 		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
112 		    datap,
113 		    length);
114 		ctx->gcm_remainder_len += length;
115 		if (ctx->gcm_copy_to == NULL) {
116 			ctx->gcm_copy_to = datap;
117 		}
118 		return (CRYPTO_SUCCESS);
119 	}
120 
121 	lastp = (uint8_t *)ctx->gcm_cb;
122 	crypto_init_ptrs(out, &iov_or_mp, &offset);
123 
124 	gops = gcm_impl_get_ops();
125 	do {
126 		/* Unprocessed data from last call. */
127 		if (ctx->gcm_remainder_len > 0) {
128 			need = block_size - ctx->gcm_remainder_len;
129 
130 			if (need > remainder)
131 				return (CRYPTO_DATA_LEN_RANGE);
132 
133 			memcpy(&((uint8_t *)ctx->gcm_remainder)
134 			    [ctx->gcm_remainder_len], datap, need);
135 
136 			blockp = (uint8_t *)ctx->gcm_remainder;
137 		} else {
138 			blockp = datap;
139 		}
140 
141 		/*
142 		 * Increment counter. Counter bits are confined
143 		 * to the bottom 32 bits of the counter block.
144 		 */
145 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
146 		counter = htonll(counter + 1);
147 		counter &= counter_mask;
148 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
149 
150 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
151 		    (uint8_t *)ctx->gcm_tmp);
152 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
153 
154 		lastp = (uint8_t *)ctx->gcm_tmp;
155 
156 		ctx->gcm_processed_data_len += block_size;
157 
158 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
159 		    &out_data_1_len, &out_data_2, block_size);
160 
161 		/* copy block to where it belongs */
162 		if (out_data_1_len == block_size) {
163 			copy_block(lastp, out_data_1);
164 		} else {
165 			memcpy(out_data_1, lastp, out_data_1_len);
166 			if (out_data_2 != NULL) {
167 				memcpy(out_data_2,
168 				    lastp + out_data_1_len,
169 				    block_size - out_data_1_len);
170 			}
171 		}
172 		/* update offset */
173 		out->cd_offset += block_size;
174 
175 		/* add ciphertext to the hash */
176 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
177 
178 		/* Update pointer to next block of data to be processed. */
179 		if (ctx->gcm_remainder_len != 0) {
180 			datap += need;
181 			ctx->gcm_remainder_len = 0;
182 		} else {
183 			datap += block_size;
184 		}
185 
186 		remainder = (size_t)&data[length] - (size_t)datap;
187 
188 		/* Incomplete last block. */
189 		if (remainder > 0 && remainder < block_size) {
190 			memcpy(ctx->gcm_remainder, datap, remainder);
191 			ctx->gcm_remainder_len = remainder;
192 			ctx->gcm_copy_to = datap;
193 			goto out;
194 		}
195 		ctx->gcm_copy_to = NULL;
196 
197 	} while (remainder > 0);
198 out:
199 	return (CRYPTO_SUCCESS);
200 }
201 
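/*
 * Finish GCM encryption: encrypt and hash any buffered partial block,
 * GHASH the encoded AAD/ciphertext lengths, encrypt J0 to mask the hash
 * and write out the remaining ciphertext followed by the tag.
 */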
202 int
203 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
204     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
205     void (*copy_block)(uint8_t *, uint8_t *),
206     void (*xor_block)(uint8_t *, uint8_t *))
207 {
208 	(void) copy_block;
209 #ifdef CAN_USE_GCM_ASM
210 	if (ctx->gcm_use_avx == B_TRUE)
211 		return (gcm_encrypt_final_avx(ctx, out, block_size));
212 #endif
213 
214 	const gcm_impl_ops_t *gops;
215 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
216 	uint8_t *ghash, *macp = NULL;
217 	int i, rv;
218 
219 	if (out->cd_length <
220 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
221 		return (CRYPTO_DATA_LEN_RANGE);
222 	}
223 
224 	gops = gcm_impl_get_ops();
225 	ghash = (uint8_t *)ctx->gcm_ghash;
226 
227 	if (ctx->gcm_remainder_len > 0) {
228 		uint64_t counter;
229 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
230 
231 		/*
232 		 * Here is where we deal with data that is not a
233 		 * multiple of the block size.
234 		 */
235 
236 		/*
237 		 * Increment counter.
238 		 */
239 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
240 		counter = htonll(counter + 1);
241 		counter &= counter_mask;
242 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
243 
244 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
245 		    (uint8_t *)ctx->gcm_tmp);
246 
247 		macp = (uint8_t *)ctx->gcm_remainder;
248 		memset(macp + ctx->gcm_remainder_len, 0,
249 		    block_size - ctx->gcm_remainder_len);
250 
251 		/* XOR with counter block */
252 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
253 			macp[i] ^= tmpp[i];
254 		}
255 
256 		/* add ciphertext to the hash */
257 		GHASH(ctx, macp, ghash, gops);
258 
259 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
260 	}
261 
262 	ctx->gcm_len_a_len_c[1] =
263 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
264 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
265 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
266 	    (uint8_t *)ctx->gcm_J0);
267 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
268 
269 	if (ctx->gcm_remainder_len > 0) {
270 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
271 		if (rv != CRYPTO_SUCCESS)
272 			return (rv);
273 	}
274 	out->cd_offset += ctx->gcm_remainder_len;
275 	ctx->gcm_remainder_len = 0;
276 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
277 	if (rv != CRYPTO_SUCCESS)
278 		return (rv);
279 	out->cd_offset += ctx->gcm_tag_len;
280 
281 	return (CRYPTO_SUCCESS);
282 }
283 
/*
 * This only handles decrypting the last block of the input, which might
 * not be a multiple of the block length.
 */
288 static void
289 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
290     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
291     void (*xor_block)(uint8_t *, uint8_t *))
292 {
293 	uint8_t *datap, *outp, *counterp;
294 	uint64_t counter;
295 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
296 	int i;
297 
298 	/*
299 	 * Increment counter.
300 	 * Counter bits are confined to the bottom 32 bits
301 	 */
302 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
303 	counter = htonll(counter + 1);
304 	counter &= counter_mask;
305 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
306 
307 	datap = (uint8_t *)ctx->gcm_remainder;
308 	outp = &((ctx->gcm_pt_buf)[index]);
309 	counterp = (uint8_t *)ctx->gcm_tmp;
310 
	/* Zero pad the remaining ciphertext for the GHASH computation. */
312 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
313 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
314 
315 	/* add ciphertext to the hash */
316 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
317 
318 	/* decrypt remaining ciphertext */
319 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
320 
321 	/* XOR with counter block */
322 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
323 		outp[i] = datap[i] ^ counterp[i];
324 	}
325 }
326 
327 int
328 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
329     crypto_data_t *out, size_t block_size,
330     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
331     void (*copy_block)(uint8_t *, uint8_t *),
332     void (*xor_block)(uint8_t *, uint8_t *))
333 {
334 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
335 	    (void) xor_block;
336 	size_t new_len;
337 	uint8_t *new;
338 
	/*
	 * Copy contiguous ciphertext input blocks to the plaintext buffer.
	 * The ciphertext will be decrypted in gcm_decrypt_final().
	 */
343 	if (length > 0) {
344 		new_len = ctx->gcm_pt_buf_len + length;
345 		new = vmem_alloc(new_len, KM_SLEEP);
346 		if (new == NULL) {
347 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
348 			ctx->gcm_pt_buf = NULL;
349 			return (CRYPTO_HOST_MEMORY);
350 		}
351 
352 		if (ctx->gcm_pt_buf != NULL) {
353 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
354 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
355 		} else {
356 			ASSERT0(ctx->gcm_pt_buf_len);
357 		}
358 
359 		ctx->gcm_pt_buf = new;
360 		ctx->gcm_pt_buf_len = new_len;
361 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
362 		    length);
363 		ctx->gcm_processed_data_len += length;
364 	}
365 
366 	ctx->gcm_remainder_len = 0;
367 	return (CRYPTO_SUCCESS);
368 }
369 
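/*
 * Finish GCM decryption: decrypt the buffered ciphertext, compute the tag
 * and compare it to the tag appended to the input. The plaintext is only
 * written to the output if the tags match.
 */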
370 int
371 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
372     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
373     void (*xor_block)(uint8_t *, uint8_t *))
374 {
375 #ifdef CAN_USE_GCM_ASM
376 	if (ctx->gcm_use_avx == B_TRUE)
377 		return (gcm_decrypt_final_avx(ctx, out, block_size));
378 #endif
379 
380 	const gcm_impl_ops_t *gops;
381 	size_t pt_len;
382 	size_t remainder;
383 	uint8_t *ghash;
384 	uint8_t *blockp;
385 	uint8_t *cbp;
386 	uint64_t counter;
387 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
388 	int processed = 0, rv;
389 
390 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
391 
392 	gops = gcm_impl_get_ops();
393 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
394 	ghash = (uint8_t *)ctx->gcm_ghash;
395 	blockp = ctx->gcm_pt_buf;
396 	remainder = pt_len;
397 	while (remainder > 0) {
398 		/* Incomplete last block */
399 		if (remainder < block_size) {
400 			memcpy(ctx->gcm_remainder, blockp, remainder);
401 			ctx->gcm_remainder_len = remainder;
			/*
			 * We are not expecting any more ciphertext; just
			 * compute the plaintext for the remaining input.
			 */
406 			gcm_decrypt_incomplete_block(ctx, block_size,
407 			    processed, encrypt_block, xor_block);
408 			ctx->gcm_remainder_len = 0;
409 			goto out;
410 		}
411 		/* add ciphertext to the hash */
412 		GHASH(ctx, blockp, ghash, gops);
413 
414 		/*
415 		 * Increment counter.
416 		 * Counter bits are confined to the bottom 32 bits
417 		 */
418 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
419 		counter = htonll(counter + 1);
420 		counter &= counter_mask;
421 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
422 
423 		cbp = (uint8_t *)ctx->gcm_tmp;
424 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
425 
426 		/* XOR with ciphertext */
427 		xor_block(cbp, blockp);
428 
429 		processed += block_size;
430 		blockp += block_size;
431 		remainder -= block_size;
432 	}
433 out:
434 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
435 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
436 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
437 	    (uint8_t *)ctx->gcm_J0);
438 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
439 
440 	/* compare the input authentication tag with what we calculated */
441 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
442 		/* They don't match */
443 		return (CRYPTO_INVALID_MAC);
444 	} else {
445 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
446 		if (rv != CRYPTO_SUCCESS)
447 			return (rv);
448 		out->cd_offset += pt_len;
449 	}
450 	return (CRYPTO_SUCCESS);
451 }
452 
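/*
 * Validate the GCM mechanism parameters: the tag length (in bits) must be
 * one of the values allowed by NIST SP 800-38D and the IV must not be empty.
 */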
453 static int
454 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
455 {
456 	size_t tag_len;
457 
458 	/*
459 	 * Check the length of the authentication tag (in bits).
460 	 */
461 	tag_len = gcm_param->ulTagBits;
462 	switch (tag_len) {
463 	case 32:
464 	case 64:
465 	case 96:
466 	case 104:
467 	case 112:
468 	case 120:
469 	case 128:
470 		break;
471 	default:
472 		return (CRYPTO_MECHANISM_PARAM_INVALID);
473 	}
474 
475 	if (gcm_param->ulIvLen == 0)
476 		return (CRYPTO_MECHANISM_PARAM_INVALID);
477 
478 	return (CRYPTO_SUCCESS);
479 }
480 
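/*
 * Compute the pre-counter block J0 from the IV: a 12-byte IV is simply
 * padded with 0x00000001, any other IV length is GHASHed together with
 * its encoded bit length. J0 seeds the counter block and is saved for
 * the final tag computation.
 */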
481 static void
482 gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
483     gcm_ctx_t *ctx, size_t block_size,
484     void (*copy_block)(uint8_t *, uint8_t *),
485     void (*xor_block)(uint8_t *, uint8_t *))
486 {
487 	const gcm_impl_ops_t *gops;
488 	uint8_t *cb;
489 	ulong_t remainder = iv_len;
490 	ulong_t processed = 0;
491 	uint8_t *datap, *ghash;
492 	uint64_t len_a_len_c[2];
493 
494 	gops = gcm_impl_get_ops();
495 	ghash = (uint8_t *)ctx->gcm_ghash;
496 	cb = (uint8_t *)ctx->gcm_cb;
497 	if (iv_len == 12) {
498 		memcpy(cb, iv, 12);
499 		cb[12] = 0;
500 		cb[13] = 0;
501 		cb[14] = 0;
502 		cb[15] = 1;
503 		/* J0 will be used again in the final */
504 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
505 	} else {
506 		/* GHASH the IV */
507 		do {
508 			if (remainder < block_size) {
509 				memset(cb, 0, block_size);
510 				memcpy(cb, &(iv[processed]), remainder);
511 				datap = (uint8_t *)cb;
512 				remainder = 0;
513 			} else {
514 				datap = (uint8_t *)(&(iv[processed]));
515 				processed += block_size;
516 				remainder -= block_size;
517 			}
518 			GHASH(ctx, datap, ghash, gops);
519 		} while (remainder > 0);
520 
521 		len_a_len_c[0] = 0;
522 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
523 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
524 
525 		/* J0 will be used again in the final */
526 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
527 	}
528 }
529 
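/*
 * Common GCM setup: derive the hash subkey H by encrypting the zero block,
 * build the initial counter block from the IV and GHASH the additional
 * authenticated data, zero padded to full blocks.
 */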
530 static int
531 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
532     unsigned char *auth_data, size_t auth_data_len, size_t block_size,
533     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
534     void (*copy_block)(uint8_t *, uint8_t *),
535     void (*xor_block)(uint8_t *, uint8_t *))
536 {
537 	const gcm_impl_ops_t *gops;
538 	uint8_t *ghash, *datap, *authp;
539 	size_t remainder, processed;
540 
541 	/* encrypt zero block to get subkey H */
542 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
543 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
544 	    (uint8_t *)ctx->gcm_H);
545 
546 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
547 	    copy_block, xor_block);
548 
549 	gops = gcm_impl_get_ops();
550 	authp = (uint8_t *)ctx->gcm_tmp;
551 	ghash = (uint8_t *)ctx->gcm_ghash;
552 	memset(authp, 0, block_size);
553 	memset(ghash, 0, block_size);
554 
555 	processed = 0;
556 	remainder = auth_data_len;
557 	do {
558 		if (remainder < block_size) {
			/*
			 * There's not a full block of data; pad the rest
			 * of the buffer with zeros.
			 */
563 
564 			if (auth_data != NULL) {
565 				memset(authp, 0, block_size);
566 				memcpy(authp, &(auth_data[processed]),
567 				    remainder);
568 			} else {
569 				ASSERT0(remainder);
570 			}
571 
572 			datap = (uint8_t *)authp;
573 			remainder = 0;
574 		} else {
575 			datap = (uint8_t *)(&(auth_data[processed]));
576 			processed += block_size;
577 			remainder -= block_size;
578 		}
579 
580 		/* add auth data to the hash */
581 		GHASH(ctx, datap, ghash, gops);
582 
583 	} while (remainder > 0);
584 
585 	return (CRYPTO_SUCCESS);
586 }
587 
588 /*
589  * The following function is called at encrypt or decrypt init time
590  * for AES GCM mode.
591  *
592  * Init the GCM context struct. Handle the cycle and avx implementations here.
593  */
594 int
595 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
596     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
597     void (*copy_block)(uint8_t *, uint8_t *),
598     void (*xor_block)(uint8_t *, uint8_t *))
599 {
600 	int rv;
601 	CK_AES_GCM_PARAMS *gcm_param;
602 
603 	if (param != NULL) {
604 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
605 
606 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
607 			return (rv);
608 		}
609 
610 		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
611 		gcm_ctx->gcm_tag_len >>= 3;
612 		gcm_ctx->gcm_processed_data_len = 0;
613 
614 		/* these values are in bits */
615 		gcm_ctx->gcm_len_a_len_c[0]
616 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
617 
618 		rv = CRYPTO_SUCCESS;
619 		gcm_ctx->gcm_flags |= GCM_MODE;
620 	} else {
621 		return (CRYPTO_MECHANISM_PARAM_INVALID);
622 	}
623 
624 #ifdef CAN_USE_GCM_ASM
625 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
626 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
627 	} else {
628 		/*
629 		 * Handle the "cycle" implementation by creating avx and
630 		 * non-avx contexts alternately.
631 		 */
632 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
633 		/*
634 		 * We don't handle byte swapped key schedules in the avx
635 		 * code path.
636 		 */
637 		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
638 		if (ks->ops->needs_byteswap == B_TRUE) {
639 			gcm_ctx->gcm_use_avx = B_FALSE;
640 		}
641 		/* Use the MOVBE and the BSWAP variants alternately. */
642 		if (gcm_ctx->gcm_use_avx == B_TRUE &&
643 		    zfs_movbe_available() == B_TRUE) {
644 			(void) atomic_toggle_boolean_nv(
645 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
646 		}
647 	}
648 	/* Allocate Htab memory as needed. */
649 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
650 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
651 
652 		if (htab_len == 0) {
653 			return (CRYPTO_MECHANISM_PARAM_INVALID);
654 		}
655 		gcm_ctx->gcm_htab_len = htab_len;
656 		gcm_ctx->gcm_Htable =
657 		    (uint64_t *)kmem_alloc(htab_len, KM_SLEEP);
658 
659 		if (gcm_ctx->gcm_Htable == NULL) {
660 			return (CRYPTO_HOST_MEMORY);
661 		}
662 	}
	/* AVX and non-AVX context initialization differs from here on. */
664 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
665 #endif /* ifdef CAN_USE_GCM_ASM */
666 		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
667 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
668 		    encrypt_block, copy_block, xor_block) != 0) {
669 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
670 		}
671 #ifdef CAN_USE_GCM_ASM
672 	} else {
673 		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
674 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
675 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
676 		}
677 	}
678 #endif /* ifdef CAN_USE_GCM_ASM */
679 
680 	return (rv);
681 }
682 
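/*
 * Init a GMAC context: GMAC is GCM used for authentication only (no
 * plaintext), with a 12-byte IV and a full 128-bit tag.
 */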
683 int
684 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
685     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
686     void (*copy_block)(uint8_t *, uint8_t *),
687     void (*xor_block)(uint8_t *, uint8_t *))
688 {
689 	int rv;
690 	CK_AES_GMAC_PARAMS *gmac_param;
691 
692 	if (param != NULL) {
693 		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
694 
695 		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
696 		gcm_ctx->gcm_processed_data_len = 0;
697 
698 		/* these values are in bits */
699 		gcm_ctx->gcm_len_a_len_c[0]
700 		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
701 
702 		rv = CRYPTO_SUCCESS;
703 		gcm_ctx->gcm_flags |= GMAC_MODE;
704 	} else {
705 		return (CRYPTO_MECHANISM_PARAM_INVALID);
706 	}
707 
708 #ifdef CAN_USE_GCM_ASM
	/*
	 * Handle the "cycle" implementation by creating avx and non-avx
	 * contexts alternately.
	 */
713 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
714 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
715 	} else {
716 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
717 	}
718 	/* We don't handle byte swapped key schedules in the avx code path. */
719 	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
720 	if (ks->ops->needs_byteswap == B_TRUE) {
721 		gcm_ctx->gcm_use_avx = B_FALSE;
722 	}
723 	/* Allocate Htab memory as needed. */
724 	if (gcm_ctx->gcm_use_avx == B_TRUE) {
725 		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
726 
727 		if (htab_len == 0) {
728 			return (CRYPTO_MECHANISM_PARAM_INVALID);
729 		}
730 		gcm_ctx->gcm_htab_len = htab_len;
731 		gcm_ctx->gcm_Htable =
732 		    (uint64_t *)kmem_alloc(htab_len, KM_SLEEP);
733 
734 		if (gcm_ctx->gcm_Htable == NULL) {
735 			return (CRYPTO_HOST_MEMORY);
736 		}
737 	}
738 
	/* AVX and non-AVX context initialization differs from here on. */
740 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
741 #endif	/* ifdef CAN_USE_GCM_ASM */
742 		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
743 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
744 		    encrypt_block, copy_block, xor_block) != 0) {
745 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
746 		}
747 #ifdef CAN_USE_GCM_ASM
748 	} else {
749 		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
750 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
751 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
752 		}
753 	}
754 #endif /* ifdef CAN_USE_GCM_ASM */
755 
756 	return (rv);
757 }
758 
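/*
 * Context allocators: return a zeroed gcm_ctx_t flagged for GCM or GMAC
 * use. The key schedule and IV/AAD setup happen later, in gcm_init_ctx()
 * respectively gmac_init_ctx().
 */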
759 void *
760 gcm_alloc_ctx(int kmflag)
761 {
762 	gcm_ctx_t *gcm_ctx;
763 
764 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
765 		return (NULL);
766 
767 	gcm_ctx->gcm_flags = GCM_MODE;
768 	return (gcm_ctx);
769 }
770 
771 void *
772 gmac_alloc_ctx(int kmflag)
773 {
774 	gcm_ctx_t *gcm_ctx;
775 
776 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
777 		return (NULL);
778 
779 	gcm_ctx->gcm_flags = GMAC_MODE;
780 	return (gcm_ctx);
781 }
782 
783 /* GCM implementation that contains the fastest methods */
784 static gcm_impl_ops_t gcm_fastest_impl = {
785 	.name = "fastest"
786 };
787 
788 /* All compiled in implementations */
789 static const gcm_impl_ops_t *gcm_all_impl[] = {
790 	&gcm_generic_impl,
791 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
792 	&gcm_pclmulqdq_impl,
793 #endif
794 };
795 
/* Indicate that gcm_impl_init() has completed */
797 static boolean_t gcm_impl_initialized = B_FALSE;
798 
799 /* Hold all supported implementations */
800 static size_t gcm_supp_impl_cnt = 0;
801 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
802 
/*
 * Returns the GCM operations for encrypt/decrypt/key setup.  When a
 * SIMD implementation is not allowed in the current context, fall back
 * to the generic implementation.
 */
808 const gcm_impl_ops_t *
809 gcm_impl_get_ops()
810 {
811 	if (!kfpu_allowed())
812 		return (&gcm_generic_impl);
813 
814 	const gcm_impl_ops_t *ops = NULL;
815 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
816 
817 	switch (impl) {
818 	case IMPL_FASTEST:
819 		ASSERT(gcm_impl_initialized);
820 		ops = &gcm_fastest_impl;
821 		break;
822 	case IMPL_CYCLE:
823 		/* Cycle through supported implementations */
824 		ASSERT(gcm_impl_initialized);
825 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
826 		static size_t cycle_impl_idx = 0;
827 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
828 		ops = gcm_supp_impl[idx];
829 		break;
830 #ifdef CAN_USE_GCM_ASM
831 	case IMPL_AVX:
832 		/*
833 		 * Make sure that we return a valid implementation while
834 		 * switching to the avx implementation since there still
835 		 * may be unfinished non-avx contexts around.
836 		 */
837 		ops = &gcm_generic_impl;
838 		break;
839 #endif
840 	default:
841 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
842 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
843 		if (impl < ARRAY_SIZE(gcm_all_impl))
844 			ops = gcm_supp_impl[impl];
845 		break;
846 	}
847 
848 	ASSERT3P(ops, !=, NULL);
849 
850 	return (ops);
851 }
852 
853 /*
854  * Initialize all supported implementations.
855  */
856 void
857 gcm_impl_init(void)
858 {
859 	gcm_impl_ops_t *curr_impl;
860 	int i, c;
861 
	/* Move supported implementations into gcm_supp_impl */
863 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
864 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
865 
866 		if (curr_impl->is_supported())
867 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
868 	}
869 	gcm_supp_impl_cnt = c;
870 
871 	/*
872 	 * Set the fastest implementation given the assumption that the
873 	 * hardware accelerated version is the fastest.
874 	 */
875 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
876 	if (gcm_pclmulqdq_impl.is_supported()) {
877 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
878 		    sizeof (gcm_fastest_impl));
879 	} else
880 #endif
881 	{
882 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
883 		    sizeof (gcm_fastest_impl));
884 	}
885 
886 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
887 
888 #ifdef CAN_USE_GCM_ASM
889 	/*
890 	 * Use the avx implementation if it's available and the implementation
891 	 * hasn't changed from its default value of fastest on module load.
892 	 */
893 	if (gcm_avx_will_work()) {
894 #ifdef HAVE_MOVBE
895 		if (zfs_movbe_available() == B_TRUE) {
896 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
897 		}
898 #endif
899 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
900 			gcm_set_avx(B_TRUE);
901 		}
902 	}
903 #endif
904 	/* Finish initialization */
905 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
906 	gcm_impl_initialized = B_TRUE;
907 }
908 
909 static const struct {
910 	char *name;
911 	uint32_t sel;
912 } gcm_impl_opts[] = {
913 		{ "cycle",	IMPL_CYCLE },
914 		{ "fastest",	IMPL_FASTEST },
915 #ifdef CAN_USE_GCM_ASM
916 		{ "avx",	IMPL_AVX },
917 #endif
918 };
919 
/*
 * Set the desired gcm implementation.
 *
 * If we are called before init(), the user preference is saved in
 * user_sel_impl and applied in a later init() call. This occurs when the
 * module parameter is specified on module load. Otherwise, icp_gcm_impl
 * is updated directly.
 *
 * @val		Name of the gcm implementation to use
 */
931 int
932 gcm_impl_set(const char *val)
933 {
934 	int err = -EINVAL;
935 	char req_name[GCM_IMPL_NAME_MAX];
936 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
937 	size_t i;
938 
939 	/* sanitize input */
940 	i = strnlen(val, GCM_IMPL_NAME_MAX);
941 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
942 		return (err);
943 
944 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
945 	while (i > 0 && isspace(req_name[i-1]))
946 		i--;
947 	req_name[i] = '\0';
948 
949 	/* Check mandatory options */
950 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
951 #ifdef CAN_USE_GCM_ASM
952 		/* Ignore avx implementation if it won't work. */
953 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
954 			continue;
955 		}
956 #endif
957 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
958 			impl = gcm_impl_opts[i].sel;
959 			err = 0;
960 			break;
961 		}
962 	}
963 
964 	/* check all supported impl if init() was already called */
965 	if (err != 0 && gcm_impl_initialized) {
966 		/* check all supported implementations */
967 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
968 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
969 				impl = i;
970 				err = 0;
971 				break;
972 			}
973 		}
974 	}
975 #ifdef CAN_USE_GCM_ASM
976 	/*
977 	 * Use the avx implementation if available and the requested one is
978 	 * avx or fastest.
979 	 */
980 	if (gcm_avx_will_work() == B_TRUE &&
981 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
982 		gcm_set_avx(B_TRUE);
983 	} else {
984 		gcm_set_avx(B_FALSE);
985 	}
986 #endif
987 
988 	if (err == 0) {
989 		if (gcm_impl_initialized)
990 			atomic_swap_32(&icp_gcm_impl, impl);
991 		else
992 			atomic_swap_32(&user_sel_impl, impl);
993 	}
994 
995 	return (err);
996 }
997 
998 #if defined(_KERNEL) && defined(__linux__)
999 
1000 static int
1001 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
1002 {
1003 	return (gcm_impl_set(val));
1004 }
1005 
1006 static int
1007 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
1008 {
1009 	int i, cnt = 0;
1010 	char *fmt;
1011 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1012 
1013 	ASSERT(gcm_impl_initialized);
1014 
1015 	/* list mandatory options */
1016 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1017 #ifdef CAN_USE_GCM_ASM
1018 		/* Ignore avx implementation if it won't work. */
1019 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1020 			continue;
1021 		}
1022 #endif
1023 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1024 		cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
1025 	}
1026 
1027 	/* list all supported implementations */
1028 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
1029 		fmt = (i == impl) ? "[%s] " : "%s ";
1030 		cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
1031 	}
1032 
1033 	return (cnt);
1034 }
1035 
1036 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1037     NULL, 0644);
1038 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
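/*
 * On a Linux build the active implementation can be inspected or changed
 * at runtime through sysfs, e.g. (path assuming the parameter is exposed
 * by the icp module):
 *
 *   echo avx > /sys/module/icp/parameters/icp_gcm_impl
 */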
#endif /* defined(_KERNEL) && defined(__linux__) */
1040 
1041 #ifdef CAN_USE_GCM_ASM
1042 #define	GCM_BLOCK_LEN 16
/*
 * The OpenSSL asm routines process six blocks at a time (6x aggregation)
 * and need at least that many bytes of input.
 */
1047 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1048 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Keep the chunk size reasonable, since we allocate a buffer of up to
 * GCM_AVX_MAX_CHUNK_SIZE bytes and disable preemption and interrupts
 * while processing a chunk.
 */
1053 #define	GCM_AVX_MAX_CHUNK_SIZE \
1054 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1055 
1056 /* Clear the FPU registers since they hold sensitive internal state. */
1057 #define	clear_fpu_regs() clear_fpu_regs_avx()
1058 #define	GHASH_AVX(ctx, in, len) \
1059     gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1060     in, len)
1061 
1062 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1063 
1064 /* Get the chunk size module parameter. */
1065 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1066 
/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * The value is rounded down to a multiple of GCM_AVX_MIN_DECRYPT_BYTES and
 * is ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
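/*
 * For example, the default of 32 KiB below rounds down to
 * 341 * 96 = 32736 bytes, since GCM_AVX_MIN_DECRYPT_BYTES is 16 * 6 = 96.
 */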
1072 static uint32_t gcm_avx_chunk_size =
1073 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1074 
1075 extern void clear_fpu_regs_avx(void);
1076 extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1077 extern void aes_encrypt_intel(const uint32_t rk[], int nr,
1078     const uint32_t pt[4], uint32_t ct[4]);
1079 
1080 extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1081 extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1082     const uint8_t *in, size_t len);
1083 
1084 extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1085     const void *, uint64_t *, uint64_t *);
1086 
1087 extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1088     const void *, uint64_t *, uint64_t *);
1089 
1090 static inline boolean_t
1091 gcm_avx_will_work(void)
1092 {
1093 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1094 	return (kfpu_allowed() &&
1095 	    zfs_avx_available() && zfs_aes_available() &&
1096 	    zfs_pclmulqdq_available());
1097 }
1098 
1099 static inline void
1100 gcm_set_avx(boolean_t val)
1101 {
1102 	if (gcm_avx_will_work() == B_TRUE) {
1103 		atomic_swap_32(&gcm_use_avx, val);
1104 	}
1105 }
1106 
1107 static inline boolean_t
1108 gcm_toggle_avx(void)
1109 {
1110 	if (gcm_avx_will_work() == B_TRUE) {
1111 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1112 	} else {
1113 		return (B_FALSE);
1114 	}
1115 }
1116 
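/*
 * Return the size (in bytes) of the Htable used by the avx GHASH routines:
 * 2 * 6 * 2 * sizeof (uint64_t) = 192 bytes, or 0 if SIMD is not used.
 */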
1117 static inline size_t
1118 gcm_simd_get_htab_size(boolean_t simd_mode)
1119 {
1120 	switch (simd_mode) {
1121 	case B_TRUE:
1122 		return (2 * 6 * 2 * sizeof (uint64_t));
1123 
1124 	default:
1125 		return (0);
1126 	}
1127 }
1128 
/*
 * Clear sensitive data in the context.
 *
 * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
 * ctx->gcm_Htable contain the hash subkey which protects authentication.
 *
 * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used
 * for a known plaintext attack, since they consist of the IV and the first
 * and last counter block respectively. Whether they should be cleared is
 * debatable.
 */
1139 static inline void
1140 gcm_clear_ctx(gcm_ctx_t *ctx)
1141 {
1142 	memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder));
1143 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
1144 	memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0));
1145 	memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp));
1146 }
1147 
1148 /* Increment the GCM counter block by n. */
1149 static inline void
1150 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1151 {
1152 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1153 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1154 
1155 	counter = htonll(counter + n);
1156 	counter &= counter_mask;
1157 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1158 }
1159 
1160 /*
1161  * Encrypt multiple blocks of data in GCM mode.
1162  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1163  * if possible. While processing a chunk the FPU is "locked".
1164  */
1165 static int
1166 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1167     size_t length, crypto_data_t *out, size_t block_size)
1168 {
1169 	size_t bleft = length;
1170 	size_t need = 0;
1171 	size_t done = 0;
1172 	uint8_t *datap = (uint8_t *)data;
1173 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1174 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1175 	uint64_t *ghash = ctx->gcm_ghash;
1176 	uint64_t *cb = ctx->gcm_cb;
1177 	uint8_t *ct_buf = NULL;
1178 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1179 	int rv = CRYPTO_SUCCESS;
1180 
1181 	ASSERT(block_size == GCM_BLOCK_LEN);
1182 	/*
1183 	 * If the last call left an incomplete block, try to fill
1184 	 * it first.
1185 	 */
1186 	if (ctx->gcm_remainder_len > 0) {
1187 		need = block_size - ctx->gcm_remainder_len;
1188 		if (length < need) {
1189 			/* Accumulate bytes here and return. */
1190 			memcpy((uint8_t *)ctx->gcm_remainder +
1191 			    ctx->gcm_remainder_len, datap, length);
1192 
1193 			ctx->gcm_remainder_len += length;
1194 			if (ctx->gcm_copy_to == NULL) {
1195 				ctx->gcm_copy_to = datap;
1196 			}
1197 			return (CRYPTO_SUCCESS);
1198 		} else {
1199 			/* Complete incomplete block. */
1200 			memcpy((uint8_t *)ctx->gcm_remainder +
1201 			    ctx->gcm_remainder_len, datap, need);
1202 
1203 			ctx->gcm_copy_to = NULL;
1204 		}
1205 	}
1206 
1207 	/* Allocate a buffer to encrypt to if there is enough input. */
1208 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1209 		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1210 		if (ct_buf == NULL) {
1211 			return (CRYPTO_HOST_MEMORY);
1212 		}
1213 	}
1214 
1215 	/* If we completed an incomplete block, encrypt and write it out. */
1216 	if (ctx->gcm_remainder_len > 0) {
1217 		kfpu_begin();
1218 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1219 		    (const uint32_t *)cb, (uint32_t *)tmp);
1220 
1221 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1222 		GHASH_AVX(ctx, tmp, block_size);
1223 		clear_fpu_regs();
1224 		kfpu_end();
1225 		rv = crypto_put_output_data(tmp, out, block_size);
1226 		out->cd_offset += block_size;
1227 		gcm_incr_counter_block(ctx);
1228 		ctx->gcm_processed_data_len += block_size;
1229 		bleft -= need;
1230 		datap += need;
1231 		ctx->gcm_remainder_len = 0;
1232 	}
1233 
1234 	/* Do the bulk encryption in chunk_size blocks. */
1235 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1236 		kfpu_begin();
1237 		done = aesni_gcm_encrypt(
1238 		    datap, ct_buf, chunk_size, key, cb, ghash);
1239 
1240 		clear_fpu_regs();
1241 		kfpu_end();
1242 		if (done != chunk_size) {
1243 			rv = CRYPTO_FAILED;
1244 			goto out_nofpu;
1245 		}
1246 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1247 		if (rv != CRYPTO_SUCCESS) {
1248 			goto out_nofpu;
1249 		}
1250 		out->cd_offset += chunk_size;
1251 		datap += chunk_size;
1252 		ctx->gcm_processed_data_len += chunk_size;
1253 	}
1254 	/* Check if we are already done. */
1255 	if (bleft == 0) {
1256 		goto out_nofpu;
1257 	}
1258 	/* Bulk encrypt the remaining data. */
1259 	kfpu_begin();
1260 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1261 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1262 		if (done == 0) {
1263 			rv = CRYPTO_FAILED;
1264 			goto out;
1265 		}
1266 		rv = crypto_put_output_data(ct_buf, out, done);
1267 		if (rv != CRYPTO_SUCCESS) {
1268 			goto out;
1269 		}
1270 		out->cd_offset += done;
1271 		ctx->gcm_processed_data_len += done;
1272 		datap += done;
1273 		bleft -= done;
1274 
1275 	}
1276 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1277 	while (bleft > 0) {
1278 		if (bleft < block_size) {
1279 			memcpy(ctx->gcm_remainder, datap, bleft);
1280 			ctx->gcm_remainder_len = bleft;
1281 			ctx->gcm_copy_to = datap;
1282 			goto out;
1283 		}
1284 		/* Encrypt, hash and write out. */
1285 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1286 		    (const uint32_t *)cb, (uint32_t *)tmp);
1287 
1288 		gcm_xor_avx(datap, tmp);
1289 		GHASH_AVX(ctx, tmp, block_size);
1290 		rv = crypto_put_output_data(tmp, out, block_size);
1291 		if (rv != CRYPTO_SUCCESS) {
1292 			goto out;
1293 		}
1294 		out->cd_offset += block_size;
1295 		gcm_incr_counter_block(ctx);
1296 		ctx->gcm_processed_data_len += block_size;
1297 		datap += block_size;
1298 		bleft -= block_size;
1299 	}
1300 out:
1301 	clear_fpu_regs();
1302 	kfpu_end();
1303 out_nofpu:
1304 	if (ct_buf != NULL) {
1305 		vmem_free(ct_buf, chunk_size);
1306 	}
1307 	return (rv);
1308 }
1309 
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out any
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
1314 static int
1315 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1316 {
1317 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1318 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1319 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1320 	size_t rem_len = ctx->gcm_remainder_len;
1321 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1322 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1323 	int rv;
1324 
1325 	ASSERT(block_size == GCM_BLOCK_LEN);
1326 
1327 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1328 		return (CRYPTO_DATA_LEN_RANGE);
1329 	}
1330 
1331 	kfpu_begin();
1332 	/* Pad last incomplete block with zeros, encrypt and hash. */
1333 	if (rem_len > 0) {
1334 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1335 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1336 
1337 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1338 		memset(remainder + rem_len, 0, block_size - rem_len);
1339 		for (int i = 0; i < rem_len; i++) {
1340 			remainder[i] ^= tmp[i];
1341 		}
1342 		GHASH_AVX(ctx, remainder, block_size);
1343 		ctx->gcm_processed_data_len += rem_len;
1344 		/* No need to increment counter_block, it's the last block. */
1345 	}
1346 	/* Finish tag. */
1347 	ctx->gcm_len_a_len_c[1] =
1348 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1349 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1350 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1351 
1352 	gcm_xor_avx((uint8_t *)J0, ghash);
1353 	clear_fpu_regs();
1354 	kfpu_end();
1355 
1356 	/* Output remainder. */
1357 	if (rem_len > 0) {
1358 		rv = crypto_put_output_data(remainder, out, rem_len);
1359 		if (rv != CRYPTO_SUCCESS)
1360 			return (rv);
1361 	}
1362 	out->cd_offset += rem_len;
1363 	ctx->gcm_remainder_len = 0;
1364 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1365 	if (rv != CRYPTO_SUCCESS)
1366 		return (rv);
1367 
1368 	out->cd_offset += ctx->gcm_tag_len;
1369 	/* Clear sensitive data in the context before returning. */
1370 	gcm_clear_ctx(ctx);
1371 	return (CRYPTO_SUCCESS);
1372 }
1373 
/*
 * Finalize decryption: So far we have only accumulated ciphertext; now we
 * decrypt it here in place.
 */
1378 static int
1379 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1380 {
1381 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1382 	ASSERT3U(block_size, ==, 16);
1383 
1384 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1385 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1386 	uint8_t *datap = ctx->gcm_pt_buf;
1387 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1388 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1389 	uint64_t *ghash = ctx->gcm_ghash;
1390 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1391 	int rv = CRYPTO_SUCCESS;
1392 	size_t bleft, done;
1393 
	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
1399 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1400 		kfpu_begin();
1401 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1402 		    (const void *)key, ctx->gcm_cb, ghash);
1403 		clear_fpu_regs();
1404 		kfpu_end();
1405 		if (done != chunk_size) {
1406 			return (CRYPTO_FAILED);
1407 		}
1408 		datap += done;
1409 	}
1410 	/* Decrypt remainder, which is less than chunk size, in one go. */
1411 	kfpu_begin();
1412 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1413 		done = aesni_gcm_decrypt(datap, datap, bleft,
1414 		    (const void *)key, ctx->gcm_cb, ghash);
1415 		if (done == 0) {
1416 			clear_fpu_regs();
1417 			kfpu_end();
1418 			return (CRYPTO_FAILED);
1419 		}
1420 		datap += done;
1421 		bleft -= done;
1422 	}
1423 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1424 
1425 	/*
1426 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1427 	 * decrypt them block by block.
1428 	 */
1429 	while (bleft > 0) {
1430 		/* Incomplete last block. */
1431 		if (bleft < block_size) {
1432 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1433 
1434 			memset(lastb, 0, block_size);
1435 			memcpy(lastb, datap, bleft);
1436 			/* The GCM processing. */
1437 			GHASH_AVX(ctx, lastb, block_size);
1438 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1439 			for (size_t i = 0; i < bleft; i++) {
1440 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1441 			}
1442 			break;
1443 		}
1444 		/* The GCM processing. */
1445 		GHASH_AVX(ctx, datap, block_size);
1446 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1447 		gcm_xor_avx((uint8_t *)tmp, datap);
1448 		gcm_incr_counter_block(ctx);
1449 
1450 		datap += block_size;
1451 		bleft -= block_size;
1452 	}
1453 	if (rv != CRYPTO_SUCCESS) {
1454 		clear_fpu_regs();
1455 		kfpu_end();
1456 		return (rv);
1457 	}
1458 	/* Decryption done, finish the tag. */
1459 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1460 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1461 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1462 	    (uint32_t *)ctx->gcm_J0);
1463 
1464 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1465 
1466 	/* We are done with the FPU, restore its state. */
1467 	clear_fpu_regs();
1468 	kfpu_end();
1469 
1470 	/* Compare the input authentication tag with what we calculated. */
1471 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1472 		/* They don't match. */
1473 		return (CRYPTO_INVALID_MAC);
1474 	}
1475 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1476 	if (rv != CRYPTO_SUCCESS) {
1477 		return (rv);
1478 	}
1479 	out->cd_offset += pt_len;
1480 	gcm_clear_ctx(ctx);
1481 	return (CRYPTO_SUCCESS);
1482 }
1483 
/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
1488 static int
1489 gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
1490     unsigned char *auth_data, size_t auth_data_len, size_t block_size)
1491 {
1492 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1493 	uint64_t *H = ctx->gcm_H;
1494 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1495 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1496 	uint8_t *datap = auth_data;
1497 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1498 	size_t bleft;
1499 
1500 	ASSERT(block_size == GCM_BLOCK_LEN);
1501 
1502 	/* Init H (encrypt zero block) and create the initial counter block. */
1503 	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1504 	memset(H, 0, sizeof (ctx->gcm_H));
1505 	kfpu_begin();
1506 	aes_encrypt_intel(keysched, aes_rounds,
1507 	    (const uint32_t *)H, (uint32_t *)H);
1508 
1509 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1510 
1511 	if (iv_len == 12) {
1512 		memcpy(cb, iv, 12);
1513 		cb[12] = 0;
1514 		cb[13] = 0;
1515 		cb[14] = 0;
1516 		cb[15] = 1;
1517 		/* We need the ICB later. */
1518 		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1519 	} else {
		/*
		 * Most consumers use 12-byte IVs, so it's OK to use the
		 * original routines for other IV sizes; just avoid nesting
		 * kfpu_begin calls.
		 */
1525 		clear_fpu_regs();
1526 		kfpu_end();
1527 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1528 		    aes_copy_block, aes_xor_block);
1529 		kfpu_begin();
1530 	}
1531 
	/* OpenSSL post-increments the counter; adjust for that. */
1533 	gcm_incr_counter_block(ctx);
1534 
1535 	/* Ghash AAD in chunk_size blocks. */
1536 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1537 		GHASH_AVX(ctx, datap, chunk_size);
1538 		datap += chunk_size;
1539 		clear_fpu_regs();
1540 		kfpu_end();
1541 		kfpu_begin();
1542 	}
1543 	/* Ghash the remainder and handle possible incomplete GCM block. */
1544 	if (bleft > 0) {
1545 		size_t incomp = bleft % block_size;
1546 
1547 		bleft -= incomp;
1548 		if (bleft > 0) {
1549 			GHASH_AVX(ctx, datap, bleft);
1550 			datap += bleft;
1551 		}
1552 		if (incomp > 0) {
1553 			/* Zero pad and hash incomplete last block. */
1554 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1555 
1556 			memset(authp, 0, block_size);
1557 			memcpy(authp, datap, incomp);
1558 			GHASH_AVX(ctx, authp, block_size);
1559 		}
1560 	}
1561 	clear_fpu_regs();
1562 	kfpu_end();
1563 	return (CRYPTO_SUCCESS);
1564 }
1565 
1566 #if defined(_KERNEL)
1567 static int
1568 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1569 {
1570 	unsigned long val;
1571 	char val_rounded[16];
1572 	int error = 0;
1573 
1574 	error = kstrtoul(buf, 0, &val);
1575 	if (error)
1576 		return (error);
1577 
1578 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1579 
1580 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1581 		return (-EINVAL);
1582 
1583 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1584 	error = param_set_uint(val_rounded, kp);
1585 	return (error);
1586 }
1587 
1588 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1589     param_get_uint, &gcm_avx_chunk_size, 0644);
1590 
1591 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1592 	"How many bytes to process while owning the FPU");
1593 
#endif /* defined(_KERNEL) */
1595 #endif /* ifdef CAN_USE_GCM_ASM */
1596