1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <modes/modes.h>
27 #include <sys/crypto/common.h>
28 #include <sys/crypto/icp.h>
29 #include <sys/crypto/impl.h>
30 #include <sys/byteorder.h>
31 #include <sys/simd.h>
32 #include <modes/gcm_impl.h>
33 #ifdef CAN_USE_GCM_ASM
34 #include <aes/aes_impl.h>
35 #endif
36 
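/*
 * GHASH(c, d, t, o): XOR the 16-byte block d into the running hash
 * c->gcm_ghash, then multiply the result by the hash subkey c->gcm_H in
 * GF(2^128) using the selected implementation o, storing the product in t.
 */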
37 #define	GHASH(c, d, t, o) \
38 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
39 	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
40 	(uint64_t *)(void *)(t));
41 
42 /* Select GCM implementation */
43 #define	IMPL_FASTEST	(UINT32_MAX)
44 #define	IMPL_CYCLE	(UINT32_MAX-1)
45 #ifdef CAN_USE_GCM_ASM
46 #define	IMPL_AVX	(UINT32_MAX-2)
47 #endif
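/*
 * Read the selected implementation exactly once; the volatile cast keeps the
 * compiler from caching a value that may be updated concurrently through the
 * module parameter.
 */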
48 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
49 static uint32_t icp_gcm_impl = IMPL_FASTEST;
50 static uint32_t user_sel_impl = IMPL_FASTEST;
51 
52 #ifdef CAN_USE_GCM_ASM
53 /* Does the architecture we run on support the MOVBE instruction? */
54 boolean_t gcm_avx_can_use_movbe = B_FALSE;
55 /*
56  * Whether to use the optimized OpenSSL GCM and GHASH implementations.
57  * Set to true if the module parameter icp_gcm_impl == "avx".
58  */
59 static boolean_t gcm_use_avx = B_FALSE;
60 #define	GCM_IMPL_USE_AVX	(*(volatile boolean_t *)&gcm_use_avx)
61 
62 static inline boolean_t gcm_avx_will_work(void);
63 static inline void gcm_set_avx(boolean_t);
64 static inline boolean_t gcm_toggle_avx(void);
65 extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
66 
67 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
68     crypto_data_t *, size_t);
69 
70 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
71 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
72 static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
73     size_t, size_t);
74 #endif /* ifdef CAN_USE_GCM_ASM */
75 
76 /*
77  * Encrypt multiple blocks of data in GCM mode.  Decryption for GCM mode
78  * is handled in a separate function.
79  */
80 int
81 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
82     crypto_data_t *out, size_t block_size,
83     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
84     void (*copy_block)(uint8_t *, uint8_t *),
85     void (*xor_block)(uint8_t *, uint8_t *))
86 {
87 #ifdef CAN_USE_GCM_ASM
88 	if (ctx->gcm_use_avx == B_TRUE)
89 		return (gcm_mode_encrypt_contiguous_blocks_avx(
90 		    ctx, data, length, out, block_size));
91 #endif
92 
93 	const gcm_impl_ops_t *gops;
94 	size_t remainder = length;
95 	size_t need = 0;
96 	uint8_t *datap = (uint8_t *)data;
97 	uint8_t *blockp;
98 	uint8_t *lastp;
99 	void *iov_or_mp;
100 	offset_t offset;
101 	uint8_t *out_data_1;
102 	uint8_t *out_data_2;
103 	size_t out_data_1_len;
104 	uint64_t counter;
105 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
106 
107 	if (length + ctx->gcm_remainder_len < block_size) {
108 		/* accumulate bytes here and return */
109 		bcopy(datap,
110 		    (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
111 		    length);
112 		ctx->gcm_remainder_len += length;
113 		if (ctx->gcm_copy_to == NULL) {
114 			ctx->gcm_copy_to = datap;
115 		}
116 		return (CRYPTO_SUCCESS);
117 	}
118 
119 	lastp = (uint8_t *)ctx->gcm_cb;
120 	crypto_init_ptrs(out, &iov_or_mp, &offset);
121 
122 	gops = gcm_impl_get_ops();
123 	do {
124 		/* Unprocessed data from last call. */
125 		if (ctx->gcm_remainder_len > 0) {
126 			need = block_size - ctx->gcm_remainder_len;
127 
128 			if (need > remainder)
129 				return (CRYPTO_DATA_LEN_RANGE);
130 
131 			bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
132 			    [ctx->gcm_remainder_len], need);
133 
134 			blockp = (uint8_t *)ctx->gcm_remainder;
135 		} else {
136 			blockp = datap;
137 		}
138 
139 		/*
140 		 * Increment counter. Counter bits are confined
141 		 * to the bottom 32 bits of the counter block.
142 		 */
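		/*
		 * The counter occupies the last four bytes of the counter
		 * block and is stored big-endian; ntohll()/htonll() convert
		 * it to and from host byte order so it can be incremented
		 * numerically.
		 */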
143 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
144 		counter = htonll(counter + 1);
145 		counter &= counter_mask;
146 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
147 
148 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
149 		    (uint8_t *)ctx->gcm_tmp);
150 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
151 
152 		lastp = (uint8_t *)ctx->gcm_tmp;
153 
154 		ctx->gcm_processed_data_len += block_size;
155 
156 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
157 		    &out_data_1_len, &out_data_2, block_size);
158 
159 		/* copy block to where it belongs */
160 		if (out_data_1_len == block_size) {
161 			copy_block(lastp, out_data_1);
162 		} else {
163 			bcopy(lastp, out_data_1, out_data_1_len);
164 			if (out_data_2 != NULL) {
165 				bcopy(lastp + out_data_1_len,
166 				    out_data_2,
167 				    block_size - out_data_1_len);
168 			}
169 		}
170 		/* update offset */
171 		out->cd_offset += block_size;
172 
173 		/* add ciphertext to the hash */
174 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
175 
176 		/* Update pointer to next block of data to be processed. */
177 		if (ctx->gcm_remainder_len != 0) {
178 			datap += need;
179 			ctx->gcm_remainder_len = 0;
180 		} else {
181 			datap += block_size;
182 		}
183 
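		/* Bytes of input not yet consumed. */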
184 		remainder = (size_t)&data[length] - (size_t)datap;
185 
186 		/* Incomplete last block. */
187 		if (remainder > 0 && remainder < block_size) {
188 			bcopy(datap, ctx->gcm_remainder, remainder);
189 			ctx->gcm_remainder_len = remainder;
190 			ctx->gcm_copy_to = datap;
191 			goto out;
192 		}
193 		ctx->gcm_copy_to = NULL;
194 
195 	} while (remainder > 0);
196 out:
197 	return (CRYPTO_SUCCESS);
198 }
199 
200 /* ARGSUSED */
201 int
202 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
203     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
204     void (*copy_block)(uint8_t *, uint8_t *),
205     void (*xor_block)(uint8_t *, uint8_t *))
206 {
207 #ifdef CAN_USE_GCM_ASM
208 	if (ctx->gcm_use_avx == B_TRUE)
209 		return (gcm_encrypt_final_avx(ctx, out, block_size));
210 #endif
211 
212 	const gcm_impl_ops_t *gops;
213 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
214 	uint8_t *ghash, *macp = NULL;
215 	int i, rv;
216 
217 	if (out->cd_length <
218 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
219 		return (CRYPTO_DATA_LEN_RANGE);
220 	}
221 
222 	gops = gcm_impl_get_ops();
223 	ghash = (uint8_t *)ctx->gcm_ghash;
224 
225 	if (ctx->gcm_remainder_len > 0) {
226 		uint64_t counter;
227 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
228 
229 		/*
230 		 * Here is where we deal with data that is not a
231 		 * multiple of the block size.
232 		 */
233 
234 		/*
235 		 * Increment counter.
236 		 */
237 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
238 		counter = htonll(counter + 1);
239 		counter &= counter_mask;
240 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
241 
242 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
243 		    (uint8_t *)ctx->gcm_tmp);
244 
245 		macp = (uint8_t *)ctx->gcm_remainder;
246 		bzero(macp + ctx->gcm_remainder_len,
247 		    block_size - ctx->gcm_remainder_len);
248 
249 		/* XOR with counter block */
250 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
251 			macp[i] ^= tmpp[i];
252 		}
253 
254 		/* add ciphertext to the hash */
255 		GHASH(ctx, macp, ghash, gops);
256 
257 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
258 	}
259 
260 	ctx->gcm_len_a_len_c[1] =
261 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
262 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
263 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
264 	    (uint8_t *)ctx->gcm_J0);
265 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
266 
267 	if (ctx->gcm_remainder_len > 0) {
268 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
269 		if (rv != CRYPTO_SUCCESS)
270 			return (rv);
271 	}
272 	out->cd_offset += ctx->gcm_remainder_len;
273 	ctx->gcm_remainder_len = 0;
274 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
275 	if (rv != CRYPTO_SUCCESS)
276 		return (rv);
277 	out->cd_offset += ctx->gcm_tag_len;
278 
279 	return (CRYPTO_SUCCESS);
280 }
281 
282 /*
283  * This only handles decrypting the last block of the input, which
284  * might not be a multiple of the block length.
285  */
286 static void
287 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
288     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
289     void (*xor_block)(uint8_t *, uint8_t *))
290 {
291 	uint8_t *datap, *outp, *counterp;
292 	uint64_t counter;
293 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
294 	int i;
295 
296 	/*
297 	 * Increment counter.
298 	 * Counter bits are confined to the bottom 32 bits
299 	 */
300 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
301 	counter = htonll(counter + 1);
302 	counter &= counter_mask;
303 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
304 
305 	datap = (uint8_t *)ctx->gcm_remainder;
306 	outp = &((ctx->gcm_pt_buf)[index]);
307 	counterp = (uint8_t *)ctx->gcm_tmp;
308 
309 	/* Zero pad the incomplete last ciphertext block before hashing it. */
310 	bzero((uint8_t *)ctx->gcm_tmp, block_size);
311 	bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
312 
313 	/* add ciphertext to the hash */
314 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
315 
316 	/* decrypt remaining ciphertext */
317 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
318 
319 	/* XOR with counter block */
320 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
321 		outp[i] = datap[i] ^ counterp[i];
322 	}
323 }
324 
325 /* ARGSUSED */
326 int
327 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
328     crypto_data_t *out, size_t block_size,
329     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
330     void (*copy_block)(uint8_t *, uint8_t *),
331     void (*xor_block)(uint8_t *, uint8_t *))
332 {
333 	size_t new_len;
334 	uint8_t *new;
335 
336 	/*
337 	 * Copy the contiguous ciphertext input blocks to the plaintext buffer.
338 	 * The ciphertext will be decrypted in gcm_decrypt_final().
339 	 */
340 	if (length > 0) {
341 		new_len = ctx->gcm_pt_buf_len + length;
342 		new = vmem_alloc(new_len, ctx->gcm_kmflag);
343 		if (new == NULL) {
344 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
345 			ctx->gcm_pt_buf = NULL;
346 			return (CRYPTO_HOST_MEMORY);
347 		}
348 		bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
349 		vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
350 		ctx->gcm_pt_buf = new;
351 		ctx->gcm_pt_buf_len = new_len;
352 		bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
353 		    length);
354 		ctx->gcm_processed_data_len += length;
355 	}
356 
357 	ctx->gcm_remainder_len = 0;
358 	return (CRYPTO_SUCCESS);
359 }
360 
361 int
362 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
363     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
364     void (*xor_block)(uint8_t *, uint8_t *))
365 {
366 #ifdef CAN_USE_GCM_ASM
367 	if (ctx->gcm_use_avx == B_TRUE)
368 		return (gcm_decrypt_final_avx(ctx, out, block_size));
369 #endif
370 
371 	const gcm_impl_ops_t *gops;
372 	size_t pt_len;
373 	size_t remainder;
374 	uint8_t *ghash;
375 	uint8_t *blockp;
376 	uint8_t *cbp;
377 	uint64_t counter;
378 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
379 	int processed = 0, rv;
380 
381 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
382 
383 	gops = gcm_impl_get_ops();
384 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
385 	ghash = (uint8_t *)ctx->gcm_ghash;
386 	blockp = ctx->gcm_pt_buf;
387 	remainder = pt_len;
388 	while (remainder > 0) {
389 		/* Incomplete last block */
390 		if (remainder < block_size) {
391 			bcopy(blockp, ctx->gcm_remainder, remainder);
392 			ctx->gcm_remainder_len = remainder;
393 			/*
394 			 * Not expecting any more ciphertext; just
395 			 * compute the plaintext for the remaining input.
396 			 */
397 			gcm_decrypt_incomplete_block(ctx, block_size,
398 			    processed, encrypt_block, xor_block);
399 			ctx->gcm_remainder_len = 0;
400 			goto out;
401 		}
402 		/* add ciphertext to the hash */
403 		GHASH(ctx, blockp, ghash, gops);
404 
405 		/*
406 		 * Increment counter.
407 		 * Counter bits are confined to the bottom 32 bits
408 		 */
409 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
410 		counter = htonll(counter + 1);
411 		counter &= counter_mask;
412 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
413 
414 		cbp = (uint8_t *)ctx->gcm_tmp;
415 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
416 
417 		/* XOR with ciphertext */
418 		xor_block(cbp, blockp);
419 
420 		processed += block_size;
421 		blockp += block_size;
422 		remainder -= block_size;
423 	}
424 out:
425 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
426 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
427 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
428 	    (uint8_t *)ctx->gcm_J0);
429 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
430 
431 	/* compare the input authentication tag with what we calculated */
432 	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
433 		/* They don't match */
434 		return (CRYPTO_INVALID_MAC);
435 	} else {
436 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
437 		if (rv != CRYPTO_SUCCESS)
438 			return (rv);
439 		out->cd_offset += pt_len;
440 	}
441 	return (CRYPTO_SUCCESS);
442 }
443 
444 static int
445 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
446 {
447 	size_t tag_len;
448 
449 	/*
450 	 * Check the length of the authentication tag (in bits).
451 	 */
452 	tag_len = gcm_param->ulTagBits;
453 	switch (tag_len) {
454 	case 32:
455 	case 64:
456 	case 96:
457 	case 104:
458 	case 112:
459 	case 120:
460 	case 128:
461 		break;
462 	default:
463 		return (CRYPTO_MECHANISM_PARAM_INVALID);
464 	}
465 
466 	if (gcm_param->ulIvLen == 0)
467 		return (CRYPTO_MECHANISM_PARAM_INVALID);
468 
469 	return (CRYPTO_SUCCESS);
470 }
471 
472 static void
473 gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
474     gcm_ctx_t *ctx, size_t block_size,
475     void (*copy_block)(uint8_t *, uint8_t *),
476     void (*xor_block)(uint8_t *, uint8_t *))
477 {
478 	const gcm_impl_ops_t *gops;
479 	uint8_t *cb;
480 	ulong_t remainder = iv_len;
481 	ulong_t processed = 0;
482 	uint8_t *datap, *ghash;
483 	uint64_t len_a_len_c[2];
484 
485 	gops = gcm_impl_get_ops();
486 	ghash = (uint8_t *)ctx->gcm_ghash;
487 	cb = (uint8_t *)ctx->gcm_cb;
488 	if (iv_len == 12) {
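		/*
		 * With a 96-bit IV the pre-counter block J0 is simply
		 * IV || 0^31 || 1 (see NIST SP 800-38D), so no hashing of
		 * the IV is needed.
		 */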
489 		bcopy(iv, cb, 12);
490 		cb[12] = 0;
491 		cb[13] = 0;
492 		cb[14] = 0;
493 		cb[15] = 1;
494 		/* J0 will be used again in the final */
495 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
496 	} else {
497 		/* GHASH the IV */
498 		do {
499 			if (remainder < block_size) {
500 				bzero(cb, block_size);
501 				bcopy(&(iv[processed]), cb, remainder);
502 				datap = (uint8_t *)cb;
503 				remainder = 0;
504 			} else {
505 				datap = (uint8_t *)(&(iv[processed]));
506 				processed += block_size;
507 				remainder -= block_size;
508 			}
509 			GHASH(ctx, datap, ghash, gops);
510 		} while (remainder > 0);
511 
512 		len_a_len_c[0] = 0;
513 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
514 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
515 
516 		/* J0 will be used again in the final */
517 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
518 	}
519 }
520 
521 static int
522 gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
523     unsigned char *auth_data, size_t auth_data_len, size_t block_size,
524     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
525     void (*copy_block)(uint8_t *, uint8_t *),
526     void (*xor_block)(uint8_t *, uint8_t *))
527 {
528 	const gcm_impl_ops_t *gops;
529 	uint8_t *ghash, *datap, *authp;
530 	size_t remainder, processed;
531 
532 	/* encrypt zero block to get subkey H */
533 	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
534 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
535 	    (uint8_t *)ctx->gcm_H);
536 
537 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
538 	    copy_block, xor_block);
539 
540 	gops = gcm_impl_get_ops();
541 	authp = (uint8_t *)ctx->gcm_tmp;
542 	ghash = (uint8_t *)ctx->gcm_ghash;
543 	bzero(authp, block_size);
544 	bzero(ghash, block_size);
545 
546 	processed = 0;
547 	remainder = auth_data_len;
548 	do {
549 		if (remainder < block_size) {
550 			/*
551 			 * There's not a full block of data; pad the rest of
552 			 * the buffer with zeros.
553 			 */
554 			bzero(authp, block_size);
555 			bcopy(&(auth_data[processed]), authp, remainder);
556 			datap = (uint8_t *)authp;
557 			remainder = 0;
558 		} else {
559 			datap = (uint8_t *)(&(auth_data[processed]));
560 			processed += block_size;
561 			remainder -= block_size;
562 		}
563 
564 		/* add auth data to the hash */
565 		GHASH(ctx, datap, ghash, gops);
566 
567 	} while (remainder > 0);
568 
569 	return (CRYPTO_SUCCESS);
570 }
571 
572 /*
573  * The following function is called at encrypt or decrypt init time
574  * for AES GCM mode.
575  *
576  * Init the GCM context struct. Handle the cycle and avx implementations here.
577  */
578 int
579 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
580     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
581     void (*copy_block)(uint8_t *, uint8_t *),
582     void (*xor_block)(uint8_t *, uint8_t *))
583 {
584 	int rv;
585 	CK_AES_GCM_PARAMS *gcm_param;
586 
587 	if (param != NULL) {
588 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
589 
590 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
591 			return (rv);
592 		}
593 
594 		gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
595 		gcm_ctx->gcm_tag_len >>= 3;
596 		gcm_ctx->gcm_processed_data_len = 0;
597 
598 		/* these values are in bits */
599 		gcm_ctx->gcm_len_a_len_c[0]
600 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
601 
602 		rv = CRYPTO_SUCCESS;
603 		gcm_ctx->gcm_flags |= GCM_MODE;
604 	} else {
605 		return (CRYPTO_MECHANISM_PARAM_INVALID);
606 	}
607 
608 #ifdef CAN_USE_GCM_ASM
609 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
610 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
611 	} else {
612 		/*
613 		 * Handle the "cycle" implementation by creating avx and
614 		 * non-avx contexts alternately.
615 		 */
616 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
617 		/*
618 		 * We don't handle byte swapped key schedules in the avx
619 		 * code path.
620 		 */
621 		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
622 		if (ks->ops->needs_byteswap == B_TRUE) {
623 			gcm_ctx->gcm_use_avx = B_FALSE;
624 		}
625 		/* Use the MOVBE and the BSWAP variants alternately. */
626 		if (gcm_ctx->gcm_use_avx == B_TRUE &&
627 		    zfs_movbe_available() == B_TRUE) {
628 			(void) atomic_toggle_boolean_nv(
629 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
630 		}
631 	}
632 	/* Avx and non-avx context initialization differs from here on. */
633 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
634 #endif /* ifdef CAN_USE_GCM_ASM */
635 		if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
636 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size,
637 		    encrypt_block, copy_block, xor_block) != 0) {
638 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
639 		}
640 #ifdef CAN_USE_GCM_ASM
641 	} else {
642 		if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
643 		    gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
644 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
645 		}
646 	}
647 #endif /* ifdef CAN_USE_GCM_ASM */
648 
649 	return (rv);
650 }
651 
652 int
653 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
654     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
655     void (*copy_block)(uint8_t *, uint8_t *),
656     void (*xor_block)(uint8_t *, uint8_t *))
657 {
658 	int rv;
659 	CK_AES_GMAC_PARAMS *gmac_param;
660 
661 	if (param != NULL) {
662 		gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
663 
664 		gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
665 		gcm_ctx->gcm_processed_data_len = 0;
666 
667 		/* these values are in bits */
668 		gcm_ctx->gcm_len_a_len_c[0]
669 		    = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
670 
671 		rv = CRYPTO_SUCCESS;
672 		gcm_ctx->gcm_flags |= GMAC_MODE;
673 	} else {
674 		return (CRYPTO_MECHANISM_PARAM_INVALID);
675 	}
676 
677 #ifdef CAN_USE_GCM_ASM
678 	/*
679 	 * Handle the "cycle" implementation by creating avx and non-avx
680 	 * contexts alternately.
681 	 */
682 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
683 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
684 	} else {
685 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
686 	}
687 	/* We don't handle byte swapped key schedules in the avx code path. */
688 	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
689 	if (ks->ops->needs_byteswap == B_TRUE) {
690 		gcm_ctx->gcm_use_avx = B_FALSE;
691 	}
692 	/* Avx and non-avx context initialization differs from here on. */
693 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
694 #endif	/* ifdef CAN_USE_GCM_ASM */
695 		if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
696 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size,
697 		    encrypt_block, copy_block, xor_block) != 0) {
698 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
699 		}
700 #ifdef CAN_USE_GCM_ASM
701 	} else {
702 		if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
703 		    gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
704 			rv = CRYPTO_MECHANISM_PARAM_INVALID;
705 		}
706 	}
707 #endif /* ifdef CAN_USE_GCM_ASM */
708 
709 	return (rv);
710 }
711 
712 void *
713 gcm_alloc_ctx(int kmflag)
714 {
715 	gcm_ctx_t *gcm_ctx;
716 
717 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
718 		return (NULL);
719 
720 	gcm_ctx->gcm_flags = GCM_MODE;
721 	return (gcm_ctx);
722 }
723 
724 void *
725 gmac_alloc_ctx(int kmflag)
726 {
727 	gcm_ctx_t *gcm_ctx;
728 
729 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
730 		return (NULL);
731 
732 	gcm_ctx->gcm_flags = GMAC_MODE;
733 	return (gcm_ctx);
734 }
735 
736 void
737 gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
738 {
739 	ctx->gcm_kmflag = kmflag;
740 }
741 
742 /* GCM implementation that contains the fastest methods */
743 static gcm_impl_ops_t gcm_fastest_impl = {
744 	.name = "fastest"
745 };
746 
747 /* All compiled in implementations */
748 const gcm_impl_ops_t *gcm_all_impl[] = {
749 	&gcm_generic_impl,
750 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
751 	&gcm_pclmulqdq_impl,
752 #endif
753 };
754 
755 /* Indicates that gcm_impl_init() has completed */
756 static boolean_t gcm_impl_initialized = B_FALSE;
757 
758 /* Hold all supported implementations */
759 static size_t gcm_supp_impl_cnt = 0;
760 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
761 
762 /*
763  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
764  * SIMD implementation is not allowed in the current context, fall back
765  * to the generic implementation.
766  */
767 const gcm_impl_ops_t *
768 gcm_impl_get_ops()
769 {
770 	if (!kfpu_allowed())
771 		return (&gcm_generic_impl);
772 
773 	const gcm_impl_ops_t *ops = NULL;
774 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
775 
776 	switch (impl) {
777 	case IMPL_FASTEST:
778 		ASSERT(gcm_impl_initialized);
779 		ops = &gcm_fastest_impl;
780 		break;
781 	case IMPL_CYCLE:
782 		/* Cycle through supported implementations */
783 		ASSERT(gcm_impl_initialized);
784 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
785 		static size_t cycle_impl_idx = 0;
786 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
787 		ops = gcm_supp_impl[idx];
788 		break;
789 #ifdef CAN_USE_GCM_ASM
790 	case IMPL_AVX:
791 		/*
792 		 * Make sure that we return a valid implementation while
793 		 * switching to the avx implementation since there still
794 		 * may be unfinished non-avx contexts around.
795 		 */
796 		ops = &gcm_generic_impl;
797 		break;
798 #endif
799 	default:
800 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
801 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
802 		if (impl < ARRAY_SIZE(gcm_all_impl))
803 			ops = gcm_supp_impl[impl];
804 		break;
805 	}
806 
807 	ASSERT3P(ops, !=, NULL);
808 
809 	return (ops);
810 }
811 
812 /*
813  * Initialize all supported implementations.
814  */
815 void
816 gcm_impl_init(void)
817 {
818 	gcm_impl_ops_t *curr_impl;
819 	int i, c;
820 
821 	/* Move supported implementations into gcm_supp_impl */
822 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
823 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
824 
825 		if (curr_impl->is_supported())
826 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
827 	}
828 	gcm_supp_impl_cnt = c;
829 
830 	/*
831 	 * Set the fastest implementation given the assumption that the
832 	 * hardware accelerated version is the fastest.
833 	 */
834 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
835 	if (gcm_pclmulqdq_impl.is_supported()) {
836 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
837 		    sizeof (gcm_fastest_impl));
838 	} else
839 #endif
840 	{
841 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
842 		    sizeof (gcm_fastest_impl));
843 	}
844 
845 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
846 
847 #ifdef CAN_USE_GCM_ASM
848 	/*
849 	 * Use the avx implementation if it's available and the implementation
850 	 * hasn't changed from its default value of fastest on module load.
851 	 */
852 	if (gcm_avx_will_work()) {
853 #ifdef HAVE_MOVBE
854 		if (zfs_movbe_available() == B_TRUE) {
855 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
856 		}
857 #endif
858 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
859 			gcm_set_avx(B_TRUE);
860 		}
861 	}
862 #endif
863 	/* Finish initialization */
864 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
865 	gcm_impl_initialized = B_TRUE;
866 }
867 
868 static const struct {
869 	char *name;
870 	uint32_t sel;
871 } gcm_impl_opts[] = {
872 		{ "cycle",	IMPL_CYCLE },
873 		{ "fastest",	IMPL_FASTEST },
874 #ifdef CAN_USE_GCM_ASM
875 		{ "avx",	IMPL_AVX },
876 #endif
877 };
878 
879 /*
880  * Set the desired gcm implementation.
881  *
882  * If we are called before init(), the user preference will be saved in
883  * user_sel_impl and applied in the later init() call. This occurs when the
884  * module parameter is specified on module load. Otherwise, icp_gcm_impl is
885  * updated directly.
886  *
887  * @val		Name of gcm implementation to use
888  */
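/*
 * Example (Linux): the implementation can typically be chosen at module
 * load time via the icp_gcm_impl module parameter, or at runtime by
 * writing one of the implementation names to
 * /sys/module/icp/parameters/icp_gcm_impl (the exact path may vary with
 * how the icp module is built and packaged).
 */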
890 int
891 gcm_impl_set(const char *val)
892 {
893 	int err = -EINVAL;
894 	char req_name[GCM_IMPL_NAME_MAX];
895 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
896 	size_t i;
897 
898 	/* sanitize input */
899 	i = strnlen(val, GCM_IMPL_NAME_MAX);
900 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
901 		return (err);
902 
903 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
904 	while (i > 0 && isspace(req_name[i-1]))
905 		i--;
906 	req_name[i] = '\0';
907 
908 	/* Check mandatory options */
909 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
910 #ifdef CAN_USE_GCM_ASM
911 		/* Ignore avx implementation if it won't work. */
912 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
913 			continue;
914 		}
915 #endif
916 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
917 			impl = gcm_impl_opts[i].sel;
918 			err = 0;
919 			break;
920 		}
921 	}
922 
923 	/* check all supported impl if init() was already called */
924 	if (err != 0 && gcm_impl_initialized) {
925 		/* check all supported implementations */
926 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
927 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
928 				impl = i;
929 				err = 0;
930 				break;
931 			}
932 		}
933 	}
934 #ifdef CAN_USE_GCM_ASM
935 	/*
936 	 * Use the avx implementation if available and the requested one is
937 	 * avx or fastest.
938 	 */
939 	if (gcm_avx_will_work() == B_TRUE &&
940 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
941 		gcm_set_avx(B_TRUE);
942 	} else {
943 		gcm_set_avx(B_FALSE);
944 	}
945 #endif
946 
947 	if (err == 0) {
948 		if (gcm_impl_initialized)
949 			atomic_swap_32(&icp_gcm_impl, impl);
950 		else
951 			atomic_swap_32(&user_sel_impl, impl);
952 	}
953 
954 	return (err);
955 }
956 
957 #if defined(_KERNEL) && defined(__linux__)
958 
959 static int
960 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
961 {
962 	return (gcm_impl_set(val));
963 }
964 
965 static int
966 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
967 {
968 	int i, cnt = 0;
969 	char *fmt;
970 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
971 
972 	ASSERT(gcm_impl_initialized);
973 
974 	/* list mandatory options */
975 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
976 #ifdef CAN_USE_GCM_ASM
977 		/* Ignore avx implementation if it won't work. */
978 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
979 			continue;
980 		}
981 #endif
982 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
983 		cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
984 	}
985 
986 	/* list all supported implementations */
987 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
988 		fmt = (i == impl) ? "[%s] " : "%s ";
989 		cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
990 	}
991 
992 	return (cnt);
993 }
994 
995 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
996     NULL, 0644);
997 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
998 #endif /* defined(_KERNEL) && defined(__linux__) */
999 
1000 #ifdef CAN_USE_GCM_ASM
1001 #define	GCM_BLOCK_LEN 16
1002 /*
1003  * The OpenSSL asm routines are 6x aggregated (they process six blocks at
1004  * a time) and need at least that many bytes of input.
1005  */
1006 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1007 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1008 /*
1009  * Ensure the chunk size is reasonable since we are allocating a buffer of
1010  * GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
1011  */
1012 #define	GCM_AVX_MAX_CHUNK_SIZE \
1013 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1014 
1015 /* Get the chunk size module parameter. */
1016 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1017 
1018 /* Clear the FPU registers since they hold sensitive internal state. */
1019 #define	clear_fpu_regs() clear_fpu_regs_avx()
1020 #define	GHASH_AVX(ctx, in, len) \
1021     gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \
1022     in, len)
1023 
1024 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1025 
1026 /*
1027  * Module parameter: number of bytes to process at once while owning the FPU.
1028  * Rounded down to the nearest multiple of GCM_AVX_MIN_DECRYPT_BYTES and
1029  * required to be at least GCM_AVX_MIN_ENCRYPT_BYTES (see the setter below).
1030  */
1031 static uint32_t gcm_avx_chunk_size =
1032 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1033 
1034 extern void clear_fpu_regs_avx(void);
1035 extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1036 extern void aes_encrypt_intel(const uint32_t rk[], int nr,
1037     const uint32_t pt[4], uint32_t ct[4]);
1038 
1039 extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]);
1040 extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2],
1041     const uint8_t *in, size_t len);
1042 
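/*
 * The OpenSSL-derived bulk routines below return the number of bytes they
 * actually processed, which may be less than len, and update the counter
 * block and the running GHASH through their last two arguments.
 */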
1043 extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1044     const void *, uint64_t *, uint64_t *);
1045 
1046 extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1047     const void *, uint64_t *, uint64_t *);
1048 
1049 static inline boolean_t
1050 gcm_avx_will_work(void)
1051 {
1052 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1053 	return (kfpu_allowed() &&
1054 	    zfs_avx_available() && zfs_aes_available() &&
1055 	    zfs_pclmulqdq_available());
1056 }
1057 
1058 static inline void
1059 gcm_set_avx(boolean_t val)
1060 {
1061 	if (gcm_avx_will_work() == B_TRUE) {
1062 		atomic_swap_32(&gcm_use_avx, val);
1063 	}
1064 }
1065 
1066 static inline boolean_t
1067 gcm_toggle_avx(void)
1068 {
1069 	if (gcm_avx_will_work() == B_TRUE) {
1070 		return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1071 	} else {
1072 		return (B_FALSE);
1073 	}
1074 }
1075 
1076 /*
1077  * Clear sensitive data in the context.
1078  *
1079  * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and
1080  * ctx->gcm_Htable contain the hash sub key which protects authentication.
1081  *
1082  * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for
1083  * a known plaintext attack: they consist of the IV and the first and last
1084  * counter block, respectively. Whether they should be cleared is debatable.
1085  */
1086 static inline void
1087 gcm_clear_ctx(gcm_ctx_t *ctx)
1088 {
1089 	bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
1090 	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
1091 	bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable));
1092 	bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
1093 	bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
1094 }
1095 
1096 /* Increment the GCM counter block by n. */
1097 static inline void
1098 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1099 {
1100 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1101 	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1102 
1103 	counter = htonll(counter + n);
1104 	counter &= counter_mask;
1105 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1106 }
1107 
1108 /*
1109  * Encrypt multiple blocks of data in GCM mode.
1110  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1111  * if possible. While processing a chunk, the FPU is "locked".
1112  */
1113 static int
1114 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1115     size_t length, crypto_data_t *out, size_t block_size)
1116 {
1117 	size_t bleft = length;
1118 	size_t need = 0;
1119 	size_t done = 0;
1120 	uint8_t *datap = (uint8_t *)data;
1121 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1122 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1123 	uint64_t *ghash = ctx->gcm_ghash;
1124 	uint64_t *cb = ctx->gcm_cb;
1125 	uint8_t *ct_buf = NULL;
1126 	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1127 	int rv = CRYPTO_SUCCESS;
1128 
1129 	ASSERT(block_size == GCM_BLOCK_LEN);
1130 	/*
1131 	 * If the last call left an incomplete block, try to fill
1132 	 * it first.
1133 	 */
1134 	if (ctx->gcm_remainder_len > 0) {
1135 		need = block_size - ctx->gcm_remainder_len;
1136 		if (length < need) {
1137 			/* Accumulate bytes here and return. */
1138 			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1139 			    ctx->gcm_remainder_len, length);
1140 
1141 			ctx->gcm_remainder_len += length;
1142 			if (ctx->gcm_copy_to == NULL) {
1143 				ctx->gcm_copy_to = datap;
1144 			}
1145 			return (CRYPTO_SUCCESS);
1146 		} else {
1147 			/* Complete incomplete block. */
1148 			bcopy(datap, (uint8_t *)ctx->gcm_remainder +
1149 			    ctx->gcm_remainder_len, need);
1150 
1151 			ctx->gcm_copy_to = NULL;
1152 		}
1153 	}
1154 
1155 	/* Allocate a buffer to encrypt to if there is enough input. */
1156 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1157 		ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag);
1158 		if (ct_buf == NULL) {
1159 			return (CRYPTO_HOST_MEMORY);
1160 		}
1161 	}
1162 
1163 	/* If we completed an incomplete block, encrypt and write it out. */
1164 	if (ctx->gcm_remainder_len > 0) {
1165 		kfpu_begin();
1166 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1167 		    (const uint32_t *)cb, (uint32_t *)tmp);
1168 
1169 		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1170 		GHASH_AVX(ctx, tmp, block_size);
1171 		clear_fpu_regs();
1172 		kfpu_end();
1173 		rv = crypto_put_output_data(tmp, out, block_size);
1174 		out->cd_offset += block_size;
1175 		gcm_incr_counter_block(ctx);
1176 		ctx->gcm_processed_data_len += block_size;
1177 		bleft -= need;
1178 		datap += need;
1179 		ctx->gcm_remainder_len = 0;
1180 	}
1181 
1182 	/* Do the bulk encryption in chunk_size blocks. */
1183 	for (; bleft >= chunk_size; bleft -= chunk_size) {
1184 		kfpu_begin();
1185 		done = aesni_gcm_encrypt(
1186 		    datap, ct_buf, chunk_size, key, cb, ghash);
1187 
1188 		clear_fpu_regs();
1189 		kfpu_end();
1190 		if (done != chunk_size) {
1191 			rv = CRYPTO_FAILED;
1192 			goto out_nofpu;
1193 		}
1194 		rv = crypto_put_output_data(ct_buf, out, chunk_size);
1195 		if (rv != CRYPTO_SUCCESS) {
1196 			goto out_nofpu;
1197 		}
1198 		out->cd_offset += chunk_size;
1199 		datap += chunk_size;
1200 		ctx->gcm_processed_data_len += chunk_size;
1201 	}
1202 	/* Check if we are already done. */
1203 	if (bleft == 0) {
1204 		goto out_nofpu;
1205 	}
1206 	/* Bulk encrypt the remaining data. */
1207 	kfpu_begin();
1208 	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1209 		done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1210 		if (done == 0) {
1211 			rv = CRYPTO_FAILED;
1212 			goto out;
1213 		}
1214 		rv = crypto_put_output_data(ct_buf, out, done);
1215 		if (rv != CRYPTO_SUCCESS) {
1216 			goto out;
1217 		}
1218 		out->cd_offset += done;
1219 		ctx->gcm_processed_data_len += done;
1220 		datap += done;
1221 		bleft -= done;
1222 
1223 	}
1224 	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1225 	while (bleft > 0) {
1226 		if (bleft < block_size) {
1227 			bcopy(datap, ctx->gcm_remainder, bleft);
1228 			ctx->gcm_remainder_len = bleft;
1229 			ctx->gcm_copy_to = datap;
1230 			goto out;
1231 		}
1232 		/* Encrypt, hash and write out. */
1233 		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1234 		    (const uint32_t *)cb, (uint32_t *)tmp);
1235 
1236 		gcm_xor_avx(datap, tmp);
1237 		GHASH_AVX(ctx, tmp, block_size);
1238 		rv = crypto_put_output_data(tmp, out, block_size);
1239 		if (rv != CRYPTO_SUCCESS) {
1240 			goto out;
1241 		}
1242 		out->cd_offset += block_size;
1243 		gcm_incr_counter_block(ctx);
1244 		ctx->gcm_processed_data_len += block_size;
1245 		datap += block_size;
1246 		bleft -= block_size;
1247 	}
1248 out:
1249 	clear_fpu_regs();
1250 	kfpu_end();
1251 out_nofpu:
1252 	if (ct_buf != NULL) {
1253 		vmem_free(ct_buf, chunk_size);
1254 	}
1255 	return (rv);
1256 }
1257 
1258 /*
1259  * Finalize the encryption: zero fill, encrypt, hash and write out any
1260  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1261  */
1262 static int
1263 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1264 {
1265 	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1266 	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1267 	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1268 	size_t rem_len = ctx->gcm_remainder_len;
1269 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1270 	int aes_rounds = ((aes_key_t *)keysched)->nr;
1271 	int rv;
1272 
1273 	ASSERT(block_size == GCM_BLOCK_LEN);
1274 
1275 	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1276 		return (CRYPTO_DATA_LEN_RANGE);
1277 	}
1278 
1279 	kfpu_begin();
1280 	/* Pad last incomplete block with zeros, encrypt and hash. */
1281 	if (rem_len > 0) {
1282 		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1283 		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1284 
1285 		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1286 		bzero(remainder + rem_len, block_size - rem_len);
1287 		for (int i = 0; i < rem_len; i++) {
1288 			remainder[i] ^= tmp[i];
1289 		}
1290 		GHASH_AVX(ctx, remainder, block_size);
1291 		ctx->gcm_processed_data_len += rem_len;
1292 		/* No need to increment counter_block, it's the last block. */
1293 	}
1294 	/* Finish tag. */
1295 	ctx->gcm_len_a_len_c[1] =
1296 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1297 	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1298 	aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1299 
1300 	gcm_xor_avx((uint8_t *)J0, ghash);
1301 	clear_fpu_regs();
1302 	kfpu_end();
1303 
1304 	/* Output remainder. */
1305 	if (rem_len > 0) {
1306 		rv = crypto_put_output_data(remainder, out, rem_len);
1307 		if (rv != CRYPTO_SUCCESS)
1308 			return (rv);
1309 	}
1310 	out->cd_offset += rem_len;
1311 	ctx->gcm_remainder_len = 0;
1312 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1313 	if (rv != CRYPTO_SUCCESS)
1314 		return (rv);
1315 
1316 	out->cd_offset += ctx->gcm_tag_len;
1317 	/* Clear sensitive data in the context before returning. */
1318 	gcm_clear_ctx(ctx);
1319 	return (CRYPTO_SUCCESS);
1320 }
1321 
1322 /*
1323  * Finalize decryption: so far we have only accumulated ciphertext, so now
1324  * we decrypt it here in place.
1325  */
1326 static int
1327 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1328 {
1329 	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1330 	ASSERT3U(block_size, ==, 16);
1331 
1332 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1333 	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1334 	uint8_t *datap = ctx->gcm_pt_buf;
1335 	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1336 	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1337 	uint64_t *ghash = ctx->gcm_ghash;
1338 	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1339 	int rv = CRYPTO_SUCCESS;
1340 	size_t bleft, done;
1341 
1342 	/*
1343 	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1344 	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1345 	 * GCM_AVX_MIN_DECRYPT_BYTES.
1346 	 */
1347 	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1348 		kfpu_begin();
1349 		done = aesni_gcm_decrypt(datap, datap, chunk_size,
1350 		    (const void *)key, ctx->gcm_cb, ghash);
1351 		clear_fpu_regs();
1352 		kfpu_end();
1353 		if (done != chunk_size) {
1354 			return (CRYPTO_FAILED);
1355 		}
1356 		datap += done;
1357 	}
1358 	/* Decrypt the remainder, which is less than the chunk size, in one go. */
1359 	kfpu_begin();
1360 	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1361 		done = aesni_gcm_decrypt(datap, datap, bleft,
1362 		    (const void *)key, ctx->gcm_cb, ghash);
1363 		if (done == 0) {
1364 			clear_fpu_regs();
1365 			kfpu_end();
1366 			return (CRYPTO_FAILED);
1367 		}
1368 		datap += done;
1369 		bleft -= done;
1370 	}
1371 	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1372 
1373 	/*
1374 	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain;
1375 	 * decrypt them block by block.
1376 	 */
1377 	while (bleft > 0) {
1378 		/* Incomplete last block. */
1379 		if (bleft < block_size) {
1380 			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1381 
1382 			bzero(lastb, block_size);
1383 			bcopy(datap, lastb, bleft);
1384 			/* Hash the ciphertext, then decrypt it. */
1385 			GHASH_AVX(ctx, lastb, block_size);
1386 			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1387 			for (size_t i = 0; i < bleft; i++) {
1388 				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1389 			}
1390 			break;
1391 		}
1392 		/* Hash the ciphertext block, then decrypt it in place. */
1393 		GHASH_AVX(ctx, datap, block_size);
1394 		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1395 		gcm_xor_avx((uint8_t *)tmp, datap);
1396 		gcm_incr_counter_block(ctx);
1397 
1398 		datap += block_size;
1399 		bleft -= block_size;
1400 	}
1401 	if (rv != CRYPTO_SUCCESS) {
1402 		clear_fpu_regs();
1403 		kfpu_end();
1404 		return (rv);
1405 	}
1406 	/* Decryption done, finish the tag. */
1407 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1408 	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1409 	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1410 	    (uint32_t *)ctx->gcm_J0);
1411 
1412 	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1413 
1414 	/* We are done with the FPU, restore its state. */
1415 	clear_fpu_regs();
1416 	kfpu_end();
1417 
1418 	/* Compare the input authentication tag with what we calculated. */
1419 	if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1420 		/* They don't match. */
1421 		return (CRYPTO_INVALID_MAC);
1422 	}
1423 	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1424 	if (rv != CRYPTO_SUCCESS) {
1425 		return (rv);
1426 	}
1427 	out->cd_offset += pt_len;
1428 	gcm_clear_ctx(ctx);
1429 	return (CRYPTO_SUCCESS);
1430 }
1431 
1432 /*
1433  * Initialize the GCM params H, Htable and the counter block. Save the
1434  * initial counter block.
1435  */
1436 static int
1437 gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
1438     unsigned char *auth_data, size_t auth_data_len, size_t block_size)
1439 {
1440 	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1441 	uint64_t *H = ctx->gcm_H;
1442 	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1443 	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1444 	uint8_t *datap = auth_data;
1445 	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1446 	size_t bleft;
1447 
1448 	ASSERT(block_size == GCM_BLOCK_LEN);
1449 
1450 	/* Init H (encrypt zero block) and create the initial counter block. */
1451 	bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
1452 	bzero(H, sizeof (ctx->gcm_H));
1453 	kfpu_begin();
1454 	aes_encrypt_intel(keysched, aes_rounds,
1455 	    (const uint32_t *)H, (uint32_t *)H);
1456 
1457 	gcm_init_htab_avx(ctx->gcm_Htable, H);
1458 
1459 	if (iv_len == 12) {
1460 		bcopy(iv, cb, 12);
1461 		cb[12] = 0;
1462 		cb[13] = 0;
1463 		cb[14] = 0;
1464 		cb[15] = 1;
1465 		/* We need the ICB later. */
1466 		bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
1467 	} else {
1468 		/*
1469 		 * Most consumers use 12-byte IVs, so it's OK to use the
1470 		 * original routines for other IV sizes; just avoid nesting
1471 		 * kfpu_begin calls.
1472 		 */
1473 		clear_fpu_regs();
1474 		kfpu_end();
1475 		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1476 		    aes_copy_block, aes_xor_block);
1477 		kfpu_begin();
1478 	}
1479 
1480 	/* OpenSSL post-increments the counter; adjust for that. */
1481 	gcm_incr_counter_block(ctx);
1482 
1483 	/* Ghash AAD in chunk_size blocks. */
1484 	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1485 		GHASH_AVX(ctx, datap, chunk_size);
1486 		datap += chunk_size;
1487 		clear_fpu_regs();
1488 		kfpu_end();
1489 		kfpu_begin();
1490 	}
1491 	/* Ghash the remainder and handle possible incomplete GCM block. */
1492 	if (bleft > 0) {
1493 		size_t incomp = bleft % block_size;
1494 
1495 		bleft -= incomp;
1496 		if (bleft > 0) {
1497 			GHASH_AVX(ctx, datap, bleft);
1498 			datap += bleft;
1499 		}
1500 		if (incomp > 0) {
1501 			/* Zero pad and hash incomplete last block. */
1502 			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1503 
1504 			bzero(authp, block_size);
1505 			bcopy(datap, authp, incomp);
1506 			GHASH_AVX(ctx, authp, block_size);
1507 		}
1508 	}
1509 	clear_fpu_regs();
1510 	kfpu_end();
1511 	return (CRYPTO_SUCCESS);
1512 }
1513 
1514 #if defined(_KERNEL)
1515 static int
1516 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1517 {
1518 	unsigned long val;
1519 	char val_rounded[16];
1520 	int error = 0;
1521 
1522 	error = kstrtoul(buf, 0, &val);
1523 	if (error)
1524 		return (error);
1525 
1526 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
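	/*
	 * E.g. a requested value of 100000 is rounded down to
	 * 99936 (1041 * GCM_AVX_MIN_DECRYPT_BYTES).
	 */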
1527 
1528 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1529 		return (-EINVAL);
1530 
1531 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1532 	error = param_set_uint(val_rounded, kp);
1533 	return (error);
1534 }
1535 
1536 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1537     param_get_uint, &gcm_avx_chunk_size, 0644);
1538 
1539 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1540 	"How many bytes to process while owning the FPU");
1541 
1542 #endif /* defined(_KERNEL) */
1543 #endif /* ifdef CAN_USE_GCM_ASM */
1544