1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/zfs_context.h>
26 #include <sys/cmn_err.h>
27 #include <modes/modes.h>
28 #include <sys/crypto/common.h>
29 #include <sys/crypto/icp.h>
30 #include <sys/crypto/impl.h>
31 #include <sys/byteorder.h>
32 #include <sys/simd.h>
33 #include <modes/gcm_impl.h>
34 #ifdef CAN_USE_GCM_ASM
35 #include <aes/aes_impl.h>
36 #endif
37
38 #define GHASH(c, d, t, o) \
39 xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
40 (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
41 (uint64_t *)(void *)(t));
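
/*
 * For reference, the macro above implements one step of the GHASH chain
 * defined in NIST SP 800-38D: X_i = (X_{i-1} ^ B_i) * H in GF(2^128),
 * where H is the hash subkey (the encryption of the all-zero block).
 */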
42
43 /* Select GCM implementation */
44 #define IMPL_FASTEST (UINT32_MAX)
45 #define IMPL_CYCLE (UINT32_MAX-1)
46 #ifdef CAN_USE_GCM_ASM
47 #define IMPL_AVX (UINT32_MAX-2)
48 #endif
49 #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
50 static uint32_t icp_gcm_impl = IMPL_FASTEST;
51 static uint32_t user_sel_impl = IMPL_FASTEST;
52
53 static inline int gcm_init_ctx_impl(boolean_t, gcm_ctx_t *, char *, size_t,
54 int (*)(const void *, const uint8_t *, uint8_t *),
55 void (*)(uint8_t *, uint8_t *),
56 void (*)(uint8_t *, uint8_t *));
57
58 #ifdef CAN_USE_GCM_ASM
59 /* Does the architecture we run on support the MOVBE instruction? */
60 boolean_t gcm_avx_can_use_movbe = B_FALSE;
61 /*
62 * Whether to use the optimized openssl gcm and ghash implementations.
63 * Set to true if module parameter icp_gcm_impl == "avx".
64 */
65 static boolean_t gcm_use_avx = B_FALSE;
66 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
67
68 extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);
69
70 static inline boolean_t gcm_avx_will_work(void);
71 static inline void gcm_set_avx(boolean_t);
72 static inline boolean_t gcm_toggle_avx(void);
73 static inline size_t gcm_simd_get_htab_size(boolean_t);
74
75 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
76 crypto_data_t *, size_t);
77
78 static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
79 static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
80 static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
81 size_t, size_t);
82 #endif /* ifdef CAN_USE_GCM_ASM */
83
84 /*
85 * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
86 * is done in another function.
87 */
88 int
89 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
90 crypto_data_t *out, size_t block_size,
91 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
92 void (*copy_block)(uint8_t *, uint8_t *),
93 void (*xor_block)(uint8_t *, uint8_t *))
94 {
95 #ifdef CAN_USE_GCM_ASM
96 if (ctx->gcm_use_avx == B_TRUE)
97 return (gcm_mode_encrypt_contiguous_blocks_avx(
98 ctx, data, length, out, block_size));
99 #endif
100
101 const gcm_impl_ops_t *gops;
102 size_t remainder = length;
103 size_t need = 0;
104 uint8_t *datap = (uint8_t *)data;
105 uint8_t *blockp;
106 uint8_t *lastp;
107 void *iov_or_mp;
108 offset_t offset;
109 uint8_t *out_data_1;
110 uint8_t *out_data_2;
111 size_t out_data_1_len;
112 uint64_t counter;
113 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
114
115 if (length + ctx->gcm_remainder_len < block_size) {
116 /* accumulate bytes here and return */
117 memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
118 datap,
119 length);
120 ctx->gcm_remainder_len += length;
121 if (ctx->gcm_copy_to == NULL) {
122 ctx->gcm_copy_to = datap;
123 }
124 return (CRYPTO_SUCCESS);
125 }
126
127 crypto_init_ptrs(out, &iov_or_mp, &offset);
128
129 gops = gcm_impl_get_ops();
130 do {
131 /* Unprocessed data from last call. */
132 if (ctx->gcm_remainder_len > 0) {
133 need = block_size - ctx->gcm_remainder_len;
134
135 if (need > remainder)
136 return (CRYPTO_DATA_LEN_RANGE);
137
138 memcpy(&((uint8_t *)ctx->gcm_remainder)
139 [ctx->gcm_remainder_len], datap, need);
140
141 blockp = (uint8_t *)ctx->gcm_remainder;
142 } else {
143 blockp = datap;
144 }
145
146 /*
147 * Increment counter. Counter bits are confined
148 * to the bottom 32 bits of the counter block.
149 */
150 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
151 counter = htonll(counter + 1);
152 counter &= counter_mask;
153 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
154
155 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
156 (uint8_t *)ctx->gcm_tmp);
157 xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
158
159 lastp = (uint8_t *)ctx->gcm_tmp;
160
161 ctx->gcm_processed_data_len += block_size;
162
163 crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
164 &out_data_1_len, &out_data_2, block_size);
165
166 /* copy block to where it belongs */
167 if (out_data_1_len == block_size) {
168 copy_block(lastp, out_data_1);
169 } else {
170 memcpy(out_data_1, lastp, out_data_1_len);
171 if (out_data_2 != NULL) {
172 memcpy(out_data_2,
173 lastp + out_data_1_len,
174 block_size - out_data_1_len);
175 }
176 }
177 /* update offset */
178 out->cd_offset += block_size;
179
180 /* add ciphertext to the hash */
181 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
182
183 /* Update pointer to next block of data to be processed. */
184 if (ctx->gcm_remainder_len != 0) {
185 datap += need;
186 ctx->gcm_remainder_len = 0;
187 } else {
188 datap += block_size;
189 }
190
191 remainder = (size_t)&data[length] - (size_t)datap;
192
193 /* Incomplete last block. */
194 if (remainder > 0 && remainder < block_size) {
195 memcpy(ctx->gcm_remainder, datap, remainder);
196 ctx->gcm_remainder_len = remainder;
197 ctx->gcm_copy_to = datap;
198 goto out;
199 }
200 ctx->gcm_copy_to = NULL;
201
202 } while (remainder > 0);
203 out:
204 return (CRYPTO_SUCCESS);
205 }
206
207 int
208 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
209 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
210 void (*copy_block)(uint8_t *, uint8_t *),
211 void (*xor_block)(uint8_t *, uint8_t *))
212 {
213 (void) copy_block;
214 #ifdef CAN_USE_GCM_ASM
215 if (ctx->gcm_use_avx == B_TRUE)
216 return (gcm_encrypt_final_avx(ctx, out, block_size));
217 #endif
218
219 const gcm_impl_ops_t *gops;
220 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
221 uint8_t *ghash, *macp = NULL;
222 int i, rv;
223
224 if (out->cd_length <
225 (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
226 return (CRYPTO_DATA_LEN_RANGE);
227 }
228
229 gops = gcm_impl_get_ops();
230 ghash = (uint8_t *)ctx->gcm_ghash;
231
232 if (ctx->gcm_remainder_len > 0) {
233 uint64_t counter;
234 uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
235
236 /*
237 * Here is where we deal with data that is not a
238 * multiple of the block size.
239 */
240
241 /*
242 * Increment counter.
243 */
244 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
245 counter = htonll(counter + 1);
246 counter &= counter_mask;
247 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
248
249 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
250 (uint8_t *)ctx->gcm_tmp);
251
252 macp = (uint8_t *)ctx->gcm_remainder;
253 memset(macp + ctx->gcm_remainder_len, 0,
254 block_size - ctx->gcm_remainder_len);
255
256 /* XOR with counter block */
257 for (i = 0; i < ctx->gcm_remainder_len; i++) {
258 macp[i] ^= tmpp[i];
259 }
260
261 /* add ciphertext to the hash */
262 GHASH(ctx, macp, ghash, gops);
263
264 ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
265 }
266
267 ctx->gcm_len_a_len_c[1] =
268 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
269 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
270 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
271 (uint8_t *)ctx->gcm_J0);
272 xor_block((uint8_t *)ctx->gcm_J0, ghash);
273
274 if (ctx->gcm_remainder_len > 0) {
275 rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
276 if (rv != CRYPTO_SUCCESS)
277 return (rv);
278 }
279 out->cd_offset += ctx->gcm_remainder_len;
280 ctx->gcm_remainder_len = 0;
281 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
282 if (rv != CRYPTO_SUCCESS)
283 return (rv);
284 out->cd_offset += ctx->gcm_tag_len;
285
286 return (CRYPTO_SUCCESS);
287 }
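
/*
 * For illustration, a minimal sketch of driving this streaming encrypt path
 * directly. It assumes an AES key schedule is already installed in
 * ctx->gcm_keysched, 'params' is a filled-in CK_AES_GCM_PARAMS, 'out' is a
 * prepared crypto_data_t * with room for ciphertext plus tag, and the AES
 * block helpers and AES_BLOCK_LEN come from aes/aes_impl.h; error handling
 * is omitted:
 *
 *	gcm_ctx_t *ctx = gcm_alloc_ctx(KM_SLEEP);
 *	ctx->gcm_keysched = keysched;
 *	(void) gcm_init_ctx(ctx, (char *)&params, AES_BLOCK_LEN,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *	(void) gcm_mode_encrypt_contiguous_blocks(ctx, (char *)pt, pt_len,
 *	    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
 *	    aes_xor_block);
 *	(void) gcm_encrypt_final(ctx, out, AES_BLOCK_LEN,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 */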
288
289 /*
290 * This will only deal with decrypting the last block of the input that
291 * might not be a multiple of block length.
292 */
293 static void
294 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
295 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
296 void (*xor_block)(uint8_t *, uint8_t *))
297 {
298 uint8_t *datap, *outp, *counterp;
299 uint64_t counter;
300 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
301 int i;
302
303 /*
304 * Increment counter.
305 * Counter bits are confined to the bottom 32 bits
306 */
307 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
308 counter = htonll(counter + 1);
309 counter &= counter_mask;
310 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
311
312 datap = (uint8_t *)ctx->gcm_remainder;
313 outp = &((ctx->gcm_pt_buf)[index]);
314 counterp = (uint8_t *)ctx->gcm_tmp;
315
316 /* zero pad the incomplete last ciphertext block for the hash */
317 memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
318 memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
319
320 /* add ciphertext to the hash */
321 GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
322
323 /* decrypt remaining ciphertext */
324 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
325
326 /* XOR with counter block */
327 for (i = 0; i < ctx->gcm_remainder_len; i++) {
328 outp[i] = datap[i] ^ counterp[i];
329 }
330 }
331
332 int
333 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
334 crypto_data_t *out, size_t block_size,
335 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
336 void (*copy_block)(uint8_t *, uint8_t *),
337 void (*xor_block)(uint8_t *, uint8_t *))
338 {
339 (void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
340 (void) xor_block;
341 size_t new_len;
342 uint8_t *new;
343
344 /*
345 * Copy contiguous ciphertext input blocks to plaintext buffer.
346 * Ciphertext will be decrypted in the final.
347 */
348 if (length > 0) {
349 new_len = ctx->gcm_pt_buf_len + length;
350 new = vmem_alloc(new_len, KM_SLEEP);
351 if (new == NULL) {
352 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
353 ctx->gcm_pt_buf = NULL;
354 return (CRYPTO_HOST_MEMORY);
355 }
356
357 if (ctx->gcm_pt_buf != NULL) {
358 memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
359 vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
360 } else {
361 ASSERT0(ctx->gcm_pt_buf_len);
362 }
363
364 ctx->gcm_pt_buf = new;
365 ctx->gcm_pt_buf_len = new_len;
366 memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
367 length);
368 ctx->gcm_processed_data_len += length;
369 }
370
371 ctx->gcm_remainder_len = 0;
372 return (CRYPTO_SUCCESS);
373 }
374
375 int
376 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
377 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
378 void (*xor_block)(uint8_t *, uint8_t *))
379 {
380 #ifdef CAN_USE_GCM_ASM
381 if (ctx->gcm_use_avx == B_TRUE)
382 return (gcm_decrypt_final_avx(ctx, out, block_size));
383 #endif
384
385 const gcm_impl_ops_t *gops;
386 size_t pt_len;
387 size_t remainder;
388 uint8_t *ghash;
389 uint8_t *blockp;
390 uint8_t *cbp;
391 uint64_t counter;
392 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
393 int processed = 0, rv;
394
395 ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
396
397 gops = gcm_impl_get_ops();
398 pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
399 ghash = (uint8_t *)ctx->gcm_ghash;
400 blockp = ctx->gcm_pt_buf;
401 remainder = pt_len;
402 while (remainder > 0) {
403 /* Incomplete last block */
404 if (remainder < block_size) {
405 memcpy(ctx->gcm_remainder, blockp, remainder);
406 ctx->gcm_remainder_len = remainder;
407 /*
408 * not expecting any more ciphertext, just
409 * compute plaintext for the remaining input
410 */
411 gcm_decrypt_incomplete_block(ctx, block_size,
412 processed, encrypt_block, xor_block);
413 ctx->gcm_remainder_len = 0;
414 goto out;
415 }
416 /* add ciphertext to the hash */
417 GHASH(ctx, blockp, ghash, gops);
418
419 /*
420 * Increment counter.
421 * Counter bits are confined to the bottom 32 bits
422 */
423 counter = ntohll(ctx->gcm_cb[1] & counter_mask);
424 counter = htonll(counter + 1);
425 counter &= counter_mask;
426 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
427
428 cbp = (uint8_t *)ctx->gcm_tmp;
429 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
430
431 /* XOR with ciphertext */
432 xor_block(cbp, blockp);
433
434 processed += block_size;
435 blockp += block_size;
436 remainder -= block_size;
437 }
438 out:
439 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
440 GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
441 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
442 (uint8_t *)ctx->gcm_J0);
443 xor_block((uint8_t *)ctx->gcm_J0, ghash);
444
445 /* compare the input authentication tag with what we calculated */
446 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
447 /* They don't match */
448 return (CRYPTO_INVALID_MAC);
449 } else {
450 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
451 if (rv != CRYPTO_SUCCESS)
452 return (rv);
453 out->cd_offset += pt_len;
454 }
455 return (CRYPTO_SUCCESS);
456 }
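
/*
 * For illustration, a matching decrypt sketch with the same assumptions as
 * the encrypt example above. Note that the data fed in must include the
 * appended tag, since the final splits gcm_pt_buf into ciphertext and tag
 * before verifying:
 *
 *	(void) gcm_mode_decrypt_contiguous_blocks(ctx, (char *)ct_and_tag,
 *	    ct_len + tag_len, out, AES_BLOCK_LEN, aes_encrypt_block,
 *	    aes_copy_block, aes_xor_block);
 *	rv = gcm_decrypt_final(ctx, out, AES_BLOCK_LEN, aes_encrypt_block,
 *	    aes_xor_block);
 *
 * A CRYPTO_INVALID_MAC return from the final means the tag did not verify
 * and no plaintext was written.
 */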
457
458 static int
459 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
460 {
461 size_t tag_len;
462
463 /*
464 * Check the length of the authentication tag (in bits).
465 */
466 tag_len = gcm_param->ulTagBits;
467 switch (tag_len) {
468 case 32:
469 case 64:
470 case 96:
471 case 104:
472 case 112:
473 case 120:
474 case 128:
475 break;
476 default:
477 return (CRYPTO_MECHANISM_PARAM_INVALID);
478 }
479
480 if (gcm_param->ulIvLen == 0)
481 return (CRYPTO_MECHANISM_PARAM_INVALID);
482
483 return (CRYPTO_SUCCESS);
484 }
485
486 static void
487 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
488 gcm_ctx_t *ctx, size_t block_size,
489 void (*copy_block)(uint8_t *, uint8_t *),
490 void (*xor_block)(uint8_t *, uint8_t *))
491 {
492 const gcm_impl_ops_t *gops;
493 uint8_t *cb;
494 ulong_t remainder = iv_len;
495 ulong_t processed = 0;
496 uint8_t *datap, *ghash;
497 uint64_t len_a_len_c[2];
498
499 gops = gcm_impl_get_ops();
500 ghash = (uint8_t *)ctx->gcm_ghash;
501 cb = (uint8_t *)ctx->gcm_cb;
502 if (iv_len == 12) {
503 memcpy(cb, iv, 12);
504 cb[12] = 0;
505 cb[13] = 0;
506 cb[14] = 0;
507 cb[15] = 1;
508 /* J0 will be used again in the final */
509 copy_block(cb, (uint8_t *)ctx->gcm_J0);
510 } else {
511 /* GHASH the IV */
512 do {
513 if (remainder < block_size) {
514 memset(cb, 0, block_size);
515 memcpy(cb, &(iv[processed]), remainder);
516 datap = (uint8_t *)cb;
517 remainder = 0;
518 } else {
519 datap = (uint8_t *)(&(iv[processed]));
520 processed += block_size;
521 remainder -= block_size;
522 }
523 GHASH(ctx, datap, ghash, gops);
524 } while (remainder > 0);
525
526 len_a_len_c[0] = 0;
527 len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
528 GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
529
530 /* J0 will be used again in the final */
531 copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
532 }
533 }
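
/*
 * The above follows the J0 construction from NIST SP 800-38D: for a 96-bit
 * IV, J0 = IV || 0^31 || 1; for any other IV length,
 * J0 = GHASH(IV zero-padded to a block multiple || 0^64 || [len(IV)]_64).
 */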
534
535 static int
536 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
537 const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
538 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
539 void (*copy_block)(uint8_t *, uint8_t *),
540 void (*xor_block)(uint8_t *, uint8_t *))
541 {
542 const gcm_impl_ops_t *gops;
543 uint8_t *ghash, *datap, *authp;
544 size_t remainder, processed;
545
546 /* encrypt zero block to get subkey H */
547 memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
548 encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
549 (uint8_t *)ctx->gcm_H);
550
551 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
552 copy_block, xor_block);
553
554 gops = gcm_impl_get_ops();
555 authp = (uint8_t *)ctx->gcm_tmp;
556 ghash = (uint8_t *)ctx->gcm_ghash;
557 memset(authp, 0, block_size);
558 memset(ghash, 0, block_size);
559
560 processed = 0;
561 remainder = auth_data_len;
562 do {
563 if (remainder < block_size) {
564 /*
565 * There's not a full block of data; pad the rest of
566 * the buffer with zeros.
567 */
568
569 if (auth_data != NULL) {
570 memset(authp, 0, block_size);
571 memcpy(authp, &(auth_data[processed]),
572 remainder);
573 } else {
574 ASSERT0(remainder);
575 }
576
577 datap = (uint8_t *)authp;
578 remainder = 0;
579 } else {
580 datap = (uint8_t *)(&(auth_data[processed]));
581 processed += block_size;
582 remainder -= block_size;
583 }
584
585 /* add auth data to the hash */
586 GHASH(ctx, datap, ghash, gops);
587
588 } while (remainder > 0);
589
590 return (CRYPTO_SUCCESS);
591 }
592
593 /*
594 * The following function is called at encrypt or decrypt init time
595 * for AES GCM mode.
596 */
597 int
598 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
599 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
600 void (*copy_block)(uint8_t *, uint8_t *),
601 void (*xor_block)(uint8_t *, uint8_t *))
602 {
603 return (gcm_init_ctx_impl(B_FALSE, gcm_ctx, param, block_size,
604 encrypt_block, copy_block, xor_block));
605 }
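
/*
 * A minimal sketch of the parameter block 'param' points to (only the
 * fields this file reads are shown; the values are illustrative):
 *
 *	CK_AES_GCM_PARAMS p;
 *	p.pIv = iv;		a 12 byte IV takes the fast J0 path
 *	p.ulIvLen = 12;
 *	p.pAAD = aad;
 *	p.ulAADLen = aad_len;
 *	p.ulTagBits = 128;	must be one of the lengths accepted by
 *				gcm_validate_args()
 */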
606
607 /*
608 * The following function is called at encrypt or decrypt init time
609 * for AES GMAC mode.
610 */
611 int
612 gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
613 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
614 void (*copy_block)(uint8_t *, uint8_t *),
615 void (*xor_block)(uint8_t *, uint8_t *))
616 {
617 return (gcm_init_ctx_impl(B_TRUE, gcm_ctx, param, block_size,
618 encrypt_block, copy_block, xor_block));
619 }
620
621 /*
622 * Init the GCM context struct. Handle the cycle and avx implementations here.
623 * Initialization of a GMAC context differs slightly from a GCM context.
624 */
625 static inline int
626 gcm_init_ctx_impl(boolean_t gmac_mode, gcm_ctx_t *gcm_ctx, char *param,
627 size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
628 uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
629 void (*xor_block)(uint8_t *, uint8_t *))
630 {
631 CK_AES_GCM_PARAMS *gcm_param;
632 int rv = CRYPTO_SUCCESS;
633 size_t tag_len, iv_len;
634
635 if (param != NULL) {
636 gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
637
638 if (gmac_mode == B_FALSE) {
639 /* GCM mode. */
640 if ((rv = gcm_validate_args(gcm_param)) != 0) {
641 return (rv);
642 }
643 gcm_ctx->gcm_flags |= GCM_MODE;
644
645 size_t tbits = gcm_param->ulTagBits;
646 tag_len = CRYPTO_BITS2BYTES(tbits);
647 iv_len = gcm_param->ulIvLen;
648 } else {
649 /* GMAC mode. */
650 gcm_ctx->gcm_flags |= GMAC_MODE;
651 tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
652 iv_len = AES_GMAC_IV_LEN;
653 }
654 gcm_ctx->gcm_tag_len = tag_len;
655 gcm_ctx->gcm_processed_data_len = 0;
656
657 /* these values are in bits */
658 gcm_ctx->gcm_len_a_len_c[0]
659 = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
660 } else {
661 return (CRYPTO_MECHANISM_PARAM_INVALID);
662 }
663
664 const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
665 const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
666 size_t aad_len = gcm_param->ulAADLen;
667
668 #ifdef CAN_USE_GCM_ASM
669 boolean_t needs_bswap =
670 ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
671
672 if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
673 gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
674 } else {
675 /*
676 * Handle the "cycle" implementation by creating avx and
677 * non-avx contexts alternately.
678 */
679 gcm_ctx->gcm_use_avx = gcm_toggle_avx();
680
681 /* The avx impl. doesn't handle byte swapped key schedules. */
682 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
683 gcm_ctx->gcm_use_avx = B_FALSE;
684 }
685 /*
686 * If this is a GCM context, use the MOVBE and the BSWAP
687 * variants alternately. GMAC contexts code paths do not
688 * use the MOVBE instruction.
689 */
690 if (gcm_ctx->gcm_use_avx == B_TRUE && gmac_mode == B_FALSE &&
691 zfs_movbe_available() == B_TRUE) {
692 (void) atomic_toggle_boolean_nv(
693 (volatile boolean_t *)&gcm_avx_can_use_movbe);
694 }
695 }
696 /*
697 * The avx code path does not handle byte swapped key schedules, but
698 * the aes generic implementation may still create them. Make sure
699 * not to use them, since we would corrupt data if we did.
700 */
701 if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) {
702 gcm_ctx->gcm_use_avx = B_FALSE;
703
704 cmn_err_once(CE_WARN,
705 "ICP: Can't use the aes generic or cycle implementations "
706 "in combination with the gcm avx implementation!");
707 cmn_err_once(CE_WARN,
708 "ICP: Falling back to a compatible implementation, "
709 "aes-gcm performance will likely be degraded.");
710 cmn_err_once(CE_WARN,
711 "ICP: Choose at least the x86_64 aes implementation to "
712 "restore performance.");
713 }
714
715 /* Allocate Htab memory as needed. */
716 if (gcm_ctx->gcm_use_avx == B_TRUE) {
717 size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
718
719 if (htab_len == 0) {
720 return (CRYPTO_MECHANISM_PARAM_INVALID);
721 }
722 gcm_ctx->gcm_htab_len = htab_len;
723 gcm_ctx->gcm_Htable =
724 kmem_alloc(htab_len, KM_SLEEP);
725
726 if (gcm_ctx->gcm_Htable == NULL) {
727 return (CRYPTO_HOST_MEMORY);
728 }
729 }
730 /* Avx and non avx context initialization differs from here on. */
731 if (gcm_ctx->gcm_use_avx == B_FALSE) {
732 #endif /* ifdef CAN_USE_GCM_ASM */
733 if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
734 encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
735 rv = CRYPTO_MECHANISM_PARAM_INVALID;
736 }
737 #ifdef CAN_USE_GCM_ASM
738 } else {
739 if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
740 block_size) != CRYPTO_SUCCESS) {
741 rv = CRYPTO_MECHANISM_PARAM_INVALID;
742 }
743 }
744 #endif /* ifdef CAN_USE_GCM_ASM */
745
746 return (rv);
747 }
748
749 void *
750 gcm_alloc_ctx(int kmflag)
751 {
752 gcm_ctx_t *gcm_ctx;
753
754 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
755 return (NULL);
756
757 gcm_ctx->gcm_flags = GCM_MODE;
758 return (gcm_ctx);
759 }
760
761 void *
762 gmac_alloc_ctx(int kmflag)
763 {
764 gcm_ctx_t *gcm_ctx;
765
766 if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
767 return (NULL);
768
769 gcm_ctx->gcm_flags = GMAC_MODE;
770 return (gcm_ctx);
771 }
772
773 /* GCM implementation that contains the fastest methods */
774 static gcm_impl_ops_t gcm_fastest_impl = {
775 .name = "fastest"
776 };
777
778 /* All compiled in implementations */
779 static const gcm_impl_ops_t *gcm_all_impl[] = {
780 &gcm_generic_impl,
781 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
782 &gcm_pclmulqdq_impl,
783 #endif
784 };
785
786 /* Indicate that benchmark has been completed */
787 static boolean_t gcm_impl_initialized = B_FALSE;
788
789 /* Hold all supported implementations */
790 static size_t gcm_supp_impl_cnt = 0;
791 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
792
793 /*
794 * Returns the GCM operations for encrypt/decrypt/key setup. When a
795 * SIMD implementation is not allowed in the current context, fall
796 * back to the generic implementation.
797 */
798 const gcm_impl_ops_t *
799 gcm_impl_get_ops(void)
800 {
801 if (!kfpu_allowed())
802 return (&gcm_generic_impl);
803
804 const gcm_impl_ops_t *ops = NULL;
805 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
806
807 switch (impl) {
808 case IMPL_FASTEST:
809 ASSERT(gcm_impl_initialized);
810 ops = &gcm_fastest_impl;
811 break;
812 case IMPL_CYCLE:
813 /* Cycle through supported implementations */
814 ASSERT(gcm_impl_initialized);
815 ASSERT3U(gcm_supp_impl_cnt, >, 0);
816 static size_t cycle_impl_idx = 0;
817 size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
818 ops = gcm_supp_impl[idx];
819 break;
820 #ifdef CAN_USE_GCM_ASM
821 case IMPL_AVX:
822 /*
823 * Make sure that we return a valid implementation while
824 * switching to the avx implementation since there still
825 * may be unfinished non-avx contexts around.
826 */
827 ops = &gcm_generic_impl;
828 break;
829 #endif
830 default:
831 ASSERT3U(impl, <, gcm_supp_impl_cnt);
832 ASSERT3U(gcm_supp_impl_cnt, >, 0);
833 if (impl < ARRAY_SIZE(gcm_all_impl))
834 ops = gcm_supp_impl[impl];
835 break;
836 }
837
838 ASSERT3P(ops, !=, NULL);
839
840 return (ops);
841 }
842
843 /*
844 * Initialize all supported implementations.
845 */
846 void
847 gcm_impl_init(void)
848 {
849 gcm_impl_ops_t *curr_impl;
850 int i, c;
851
852 /* Move supported implementations into gcm_supp_impls */
853 for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
854 curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
855
856 if (curr_impl->is_supported())
857 gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
858 }
859 gcm_supp_impl_cnt = c;
860
861 /*
862 * Set the fastest implementation given the assumption that the
863 * hardware accelerated version is the fastest.
864 */
865 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
866 if (gcm_pclmulqdq_impl.is_supported()) {
867 memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
868 sizeof (gcm_fastest_impl));
869 } else
870 #endif
871 {
872 memcpy(&gcm_fastest_impl, &gcm_generic_impl,
873 sizeof (gcm_fastest_impl));
874 }
875
876 strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
877
878 #ifdef CAN_USE_GCM_ASM
879 /*
880 * Use the avx implementation if it's available and the implementation
881 * hasn't changed from its default value of fastest on module load.
882 */
883 if (gcm_avx_will_work()) {
884 #ifdef HAVE_MOVBE
885 if (zfs_movbe_available() == B_TRUE) {
886 atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
887 }
888 #endif
889 if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
890 gcm_set_avx(B_TRUE);
891 }
892 }
893 #endif
894 /* Finish initialization */
895 atomic_swap_32(&icp_gcm_impl, user_sel_impl);
896 gcm_impl_initialized = B_TRUE;
897 }
898
899 static const struct {
900 const char *name;
901 uint32_t sel;
902 } gcm_impl_opts[] = {
903 { "cycle", IMPL_CYCLE },
904 { "fastest", IMPL_FASTEST },
905 #ifdef CAN_USE_GCM_ASM
906 { "avx", IMPL_AVX },
907 #endif
908 };
909
910 /*
911 * Function sets desired gcm implementation.
912 *
913 * If we are called before init(), the user preference will be saved in
914 * user_sel_impl and applied in a later init() call. This occurs when the
915 * module parameter is specified on module load. Otherwise, directly update
916 * icp_gcm_impl.
917 *
918 * @val Name of gcm implementation to use
919 * @param Unused.
920 */
921 int
922 gcm_impl_set(const char *val)
923 {
924 int err = -EINVAL;
925 char req_name[GCM_IMPL_NAME_MAX];
926 uint32_t impl = GCM_IMPL_READ(user_sel_impl);
927 size_t i;
928
929 /* sanitize input */
930 i = strnlen(val, GCM_IMPL_NAME_MAX);
931 if (i == 0 || i >= GCM_IMPL_NAME_MAX)
932 return (err);
933
934 strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
935 while (i > 0 && isspace(req_name[i-1]))
936 i--;
937 req_name[i] = '\0';
938
939 /* Check mandatory options */
940 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
941 #ifdef CAN_USE_GCM_ASM
942 /* Ignore avx implementation if it won't work. */
943 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
944 continue;
945 }
946 #endif
947 if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
948 impl = gcm_impl_opts[i].sel;
949 err = 0;
950 break;
951 }
952 }
953
954 /* check all supported impl if init() was already called */
955 if (err != 0 && gcm_impl_initialized) {
956 /* check all supported implementations */
957 for (i = 0; i < gcm_supp_impl_cnt; i++) {
958 if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
959 impl = i;
960 err = 0;
961 break;
962 }
963 }
964 }
965 #ifdef CAN_USE_GCM_ASM
966 /*
967 * Use the avx implementation if available and the requested one is
968 * avx or fastest.
969 */
970 if (gcm_avx_will_work() == B_TRUE &&
971 (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
972 gcm_set_avx(B_TRUE);
973 } else {
974 gcm_set_avx(B_FALSE);
975 }
976 #endif
977
978 if (err == 0) {
979 if (gcm_impl_initialized)
980 atomic_swap_32(&icp_gcm_impl, impl);
981 else
982 atomic_swap_32(&user_sel_impl, impl);
983 }
984
985 return (err);
986 }
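
/*
 * For example, on Linux the implementation can be selected at module load
 * time ("modprobe icp icp_gcm_impl=avx") or, assuming the usual sysfs
 * layout for the icp module, at runtime via
 * /sys/module/icp/parameters/icp_gcm_impl.
 */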
987
988 #if defined(_KERNEL) && defined(__linux__)
989
990 static int
991 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
992 {
993 return (gcm_impl_set(val));
994 }
995
996 static int
997 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
998 {
999 int i, cnt = 0;
1000 char *fmt;
1001 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
1002
1003 ASSERT(gcm_impl_initialized);
1004
1005 /* list mandatory options */
1006 for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
1007 #ifdef CAN_USE_GCM_ASM
1008 /* Ignore avx implementation if it won't work. */
1009 if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
1010 continue;
1011 }
1012 #endif
1013 fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
1014 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1015 gcm_impl_opts[i].name);
1016 }
1017
1018 /* list all supported implementations */
1019 for (i = 0; i < gcm_supp_impl_cnt; i++) {
1020 fmt = (i == impl) ? "[%s] " : "%s ";
1021 cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
1022 gcm_supp_impl[i]->name);
1023 }
1024
1025 return (cnt);
1026 }
1027
1028 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
1029 NULL, 0644);
1030 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
1031 #endif /* defined(_KERNEL) && defined(__linux__) */
1032
1033 #ifdef CAN_USE_GCM_ASM
1034 #define GCM_BLOCK_LEN 16
1035 /*
1036 * The openssl asm routines are 6x aggregated and need that many bytes
1037 * at minimum.
1038 */
1039 #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1040 #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1041 /*
1042 * Ensure the chunk size is reasonable since we are allocating a
1043 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
1044 */
1045 #define GCM_AVX_MAX_CHUNK_SIZE \
1046 (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
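/* With the constants above this works out to (131072 / 96) * 96 = 131040. */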
1047
1048 /* Clear the FPU registers since they hold sensitive internal state. */
1049 #define clear_fpu_regs() clear_fpu_regs_avx()
1050 #define GHASH_AVX(ctx, in, len) \
1051 gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
1052 in, len)
1053
1054 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1055
1056 /* Get the chunk size module parameter. */
1057 #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1058
1059 /*
1060 * Module parameter: number of bytes to process at once while owning the FPU.
1061 * It is rounded down to a GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
1062 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
1063 */
1064 static uint32_t gcm_avx_chunk_size =
1065 ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
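/* That is (32768 / 96) * 96 = 32736 bytes by default. */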
1066
1067 extern void ASMABI clear_fpu_regs_avx(void);
1068 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1069 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1070 const uint32_t pt[4], uint32_t ct[4]);
1071
1072 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1073 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1074 const uint8_t *in, size_t len);
1075
1076 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1077 const void *, uint64_t *, uint64_t *);
1078
1079 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1080 const void *, uint64_t *, uint64_t *);
1081
1082 static inline boolean_t
1083 gcm_avx_will_work(void)
1084 {
1085 /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1086 return (kfpu_allowed() &&
1087 zfs_avx_available() && zfs_aes_available() &&
1088 zfs_pclmulqdq_available());
1089 }
1090
1091 static inline void
1092 gcm_set_avx(boolean_t val)
1093 {
1094 if (gcm_avx_will_work() == B_TRUE) {
1095 atomic_swap_32(&gcm_use_avx, val);
1096 }
1097 }
1098
1099 static inline boolean_t
1100 gcm_toggle_avx(void)
1101 {
1102 if (gcm_avx_will_work() == B_TRUE) {
1103 return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
1104 } else {
1105 return (B_FALSE);
1106 }
1107 }
1108
1109 static inline size_t
1110 gcm_simd_get_htab_size(boolean_t simd_mode)
1111 {
1112 switch (simd_mode) {
1113 case B_TRUE:
1114 return (2 * 6 * 2 * sizeof (uint64_t));
1115
1116 default:
1117 return (0);
1118 }
1119 }
1120
1121
1122 /* Increment the GCM counter block by n. */
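/*
 * The 32-bit counter occupies the last four bytes of the counter block and
 * is big-endian on the wire. The ntohll()/htonll() dance below makes the
 * mask and the increment operate on exactly those bytes (mod 2^32)
 * regardless of host endianness.
 */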
1123 static inline void
1124 gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
1125 {
1126 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
1127 uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
1128
1129 counter = htonll(counter + n);
1130 counter &= counter_mask;
1131 ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
1132 }
1133
1134 /*
1135 * Encrypt multiple blocks of data in GCM mode.
1136 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1137 * if possible. While processing a chunk the FPU is "locked".
1138 */
1139 static int
1140 gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
1141 size_t length, crypto_data_t *out, size_t block_size)
1142 {
1143 size_t bleft = length;
1144 size_t need = 0;
1145 size_t done = 0;
1146 uint8_t *datap = (uint8_t *)data;
1147 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1148 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1149 uint64_t *ghash = ctx->gcm_ghash;
1150 uint64_t *cb = ctx->gcm_cb;
1151 uint8_t *ct_buf = NULL;
1152 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1153 int rv = CRYPTO_SUCCESS;
1154
1155 ASSERT(block_size == GCM_BLOCK_LEN);
1156 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1157 B_FALSE);
1158 /*
1159 * If the last call left an incomplete block, try to fill
1160 * it first.
1161 */
1162 if (ctx->gcm_remainder_len > 0) {
1163 need = block_size - ctx->gcm_remainder_len;
1164 if (length < need) {
1165 /* Accumulate bytes here and return. */
1166 memcpy((uint8_t *)ctx->gcm_remainder +
1167 ctx->gcm_remainder_len, datap, length);
1168
1169 ctx->gcm_remainder_len += length;
1170 if (ctx->gcm_copy_to == NULL) {
1171 ctx->gcm_copy_to = datap;
1172 }
1173 return (CRYPTO_SUCCESS);
1174 } else {
1175 /* Complete incomplete block. */
1176 memcpy((uint8_t *)ctx->gcm_remainder +
1177 ctx->gcm_remainder_len, datap, need);
1178
1179 ctx->gcm_copy_to = NULL;
1180 }
1181 }
1182
1183 /* Allocate a buffer to encrypt to if there is enough input. */
1184 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1185 ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
1186 if (ct_buf == NULL) {
1187 return (CRYPTO_HOST_MEMORY);
1188 }
1189 }
1190
1191 /* If we completed an incomplete block, encrypt and write it out. */
1192 if (ctx->gcm_remainder_len > 0) {
1193 kfpu_begin();
1194 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1195 (const uint32_t *)cb, (uint32_t *)tmp);
1196
1197 gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
1198 GHASH_AVX(ctx, tmp, block_size);
1199 clear_fpu_regs();
1200 kfpu_end();
1201 rv = crypto_put_output_data(tmp, out, block_size);
1202 out->cd_offset += block_size;
1203 gcm_incr_counter_block(ctx);
1204 ctx->gcm_processed_data_len += block_size;
1205 bleft -= need;
1206 datap += need;
1207 ctx->gcm_remainder_len = 0;
1208 }
1209
1210 /* Do the bulk encryption in chunk_size blocks. */
1211 for (; bleft >= chunk_size; bleft -= chunk_size) {
1212 kfpu_begin();
1213 done = aesni_gcm_encrypt(
1214 datap, ct_buf, chunk_size, key, cb, ghash);
1215
1216 clear_fpu_regs();
1217 kfpu_end();
1218 if (done != chunk_size) {
1219 rv = CRYPTO_FAILED;
1220 goto out_nofpu;
1221 }
1222 rv = crypto_put_output_data(ct_buf, out, chunk_size);
1223 if (rv != CRYPTO_SUCCESS) {
1224 goto out_nofpu;
1225 }
1226 out->cd_offset += chunk_size;
1227 datap += chunk_size;
1228 ctx->gcm_processed_data_len += chunk_size;
1229 }
1230 /* Check if we are already done. */
1231 if (bleft == 0) {
1232 goto out_nofpu;
1233 }
1234 /* Bulk encrypt the remaining data. */
1235 kfpu_begin();
1236 if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
1237 done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
1238 if (done == 0) {
1239 rv = CRYPTO_FAILED;
1240 goto out;
1241 }
1242 rv = crypto_put_output_data(ct_buf, out, done);
1243 if (rv != CRYPTO_SUCCESS) {
1244 goto out;
1245 }
1246 out->cd_offset += done;
1247 ctx->gcm_processed_data_len += done;
1248 datap += done;
1249 bleft -= done;
1250
1251 }
1252 /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
1253 while (bleft > 0) {
1254 if (bleft < block_size) {
1255 memcpy(ctx->gcm_remainder, datap, bleft);
1256 ctx->gcm_remainder_len = bleft;
1257 ctx->gcm_copy_to = datap;
1258 goto out;
1259 }
1260 /* Encrypt, hash and write out. */
1261 aes_encrypt_intel(key->encr_ks.ks32, key->nr,
1262 (const uint32_t *)cb, (uint32_t *)tmp);
1263
1264 gcm_xor_avx(datap, tmp);
1265 GHASH_AVX(ctx, tmp, block_size);
1266 rv = crypto_put_output_data(tmp, out, block_size);
1267 if (rv != CRYPTO_SUCCESS) {
1268 goto out;
1269 }
1270 out->cd_offset += block_size;
1271 gcm_incr_counter_block(ctx);
1272 ctx->gcm_processed_data_len += block_size;
1273 datap += block_size;
1274 bleft -= block_size;
1275 }
1276 out:
1277 clear_fpu_regs();
1278 kfpu_end();
1279 out_nofpu:
1280 if (ct_buf != NULL) {
1281 vmem_free(ct_buf, chunk_size);
1282 }
1283 return (rv);
1284 }
1285
1286 /*
1287 * Finalize the encryption: zero fill, encrypt, hash and write out any
1288 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1289 */
1290 static int
1291 gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1292 {
1293 uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
1294 uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
1295 uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
1296 size_t rem_len = ctx->gcm_remainder_len;
1297 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1298 int aes_rounds = ((aes_key_t *)keysched)->nr;
1299 int rv;
1300
1301 ASSERT(block_size == GCM_BLOCK_LEN);
1302 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1303 B_FALSE);
1304
1305 if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
1306 return (CRYPTO_DATA_LEN_RANGE);
1307 }
1308
1309 kfpu_begin();
1310 /* Pad last incomplete block with zeros, encrypt and hash. */
1311 if (rem_len > 0) {
1312 uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
1313 const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1314
1315 aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
1316 memset(remainder + rem_len, 0, block_size - rem_len);
1317 for (int i = 0; i < rem_len; i++) {
1318 remainder[i] ^= tmp[i];
1319 }
1320 GHASH_AVX(ctx, remainder, block_size);
1321 ctx->gcm_processed_data_len += rem_len;
1322 /* No need to increment counter_block, it's the last block. */
1323 }
1324 /* Finish tag. */
1325 ctx->gcm_len_a_len_c[1] =
1326 htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
1327 GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
1328 aes_encrypt_intel(keysched, aes_rounds, J0, J0);
1329
1330 gcm_xor_avx((uint8_t *)J0, ghash);
1331 clear_fpu_regs();
1332 kfpu_end();
1333
1334 /* Output remainder. */
1335 if (rem_len > 0) {
1336 rv = crypto_put_output_data(remainder, out, rem_len);
1337 if (rv != CRYPTO_SUCCESS)
1338 return (rv);
1339 }
1340 out->cd_offset += rem_len;
1341 ctx->gcm_remainder_len = 0;
1342 rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
1343 if (rv != CRYPTO_SUCCESS)
1344 return (rv);
1345
1346 out->cd_offset += ctx->gcm_tag_len;
1347 return (CRYPTO_SUCCESS);
1348 }
1349
1350 /*
1351 * Finalize decryption: we have only accumulated ciphertext so far, so
1352 * decrypt it in place here.
1353 */
1354 static int
1355 gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
1356 {
1357 ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
1358 ASSERT3U(block_size, ==, 16);
1359 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1360 B_FALSE);
1361
1362 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1363 size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
1364 uint8_t *datap = ctx->gcm_pt_buf;
1365 const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
1366 uint32_t *cb = (uint32_t *)ctx->gcm_cb;
1367 uint64_t *ghash = ctx->gcm_ghash;
1368 uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
1369 int rv = CRYPTO_SUCCESS;
1370 size_t bleft, done;
1371
1372 /*
1373 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
1374 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
1375 * GCM_AVX_MIN_DECRYPT_BYTES.
1376 */
1377 for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
1378 kfpu_begin();
1379 done = aesni_gcm_decrypt(datap, datap, chunk_size,
1380 (const void *)key, ctx->gcm_cb, ghash);
1381 clear_fpu_regs();
1382 kfpu_end();
1383 if (done != chunk_size) {
1384 return (CRYPTO_FAILED);
1385 }
1386 datap += done;
1387 }
1388 /* Decrypt remainder, which is less than chunk size, in one go. */
1389 kfpu_begin();
1390 if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
1391 done = aesni_gcm_decrypt(datap, datap, bleft,
1392 (const void *)key, ctx->gcm_cb, ghash);
1393 if (done == 0) {
1394 clear_fpu_regs();
1395 kfpu_end();
1396 return (CRYPTO_FAILED);
1397 }
1398 datap += done;
1399 bleft -= done;
1400 }
1401 ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
1402
1403 /*
1404 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
1405 * decrypt them block by block.
1406 */
1407 while (bleft > 0) {
1408 /* Incomplete last block. */
1409 if (bleft < block_size) {
1410 uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
1411
1412 memset(lastb, 0, block_size);
1413 memcpy(lastb, datap, bleft);
1414 /* The GCM processing. */
1415 GHASH_AVX(ctx, lastb, block_size);
1416 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1417 for (size_t i = 0; i < bleft; i++) {
1418 datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
1419 }
1420 break;
1421 }
1422 /* The GCM processing. */
1423 GHASH_AVX(ctx, datap, block_size);
1424 aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
1425 gcm_xor_avx((uint8_t *)tmp, datap);
1426 gcm_incr_counter_block(ctx);
1427
1428 datap += block_size;
1429 bleft -= block_size;
1430 }
1431 if (rv != CRYPTO_SUCCESS) {
1432 clear_fpu_regs();
1433 kfpu_end();
1434 return (rv);
1435 }
1436 /* Decryption done, finish the tag. */
1437 ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
1438 GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
1439 aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
1440 (uint32_t *)ctx->gcm_J0);
1441
1442 gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
1443
1444 /* We are done with the FPU, restore its state. */
1445 clear_fpu_regs();
1446 kfpu_end();
1447
1448 /* Compare the input authentication tag with what we calculated. */
1449 if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
1450 /* They don't match. */
1451 return (CRYPTO_INVALID_MAC);
1452 }
1453 rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
1454 if (rv != CRYPTO_SUCCESS) {
1455 return (rv);
1456 }
1457 out->cd_offset += pt_len;
1458 return (CRYPTO_SUCCESS);
1459 }
1460
1461 /*
1462 * Initialize the GCM params H, Htable and the counter block. Save the
1463 * initial counter block.
1464 */
1465 static int
1466 gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
1467 const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
1468 {
1469 uint8_t *cb = (uint8_t *)ctx->gcm_cb;
1470 uint64_t *H = ctx->gcm_H;
1471 const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
1472 int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
1473 const uint8_t *datap = auth_data;
1474 size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
1475 size_t bleft;
1476
1477 ASSERT(block_size == GCM_BLOCK_LEN);
1478 ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
1479 B_FALSE);
1480
1481 /* Init H (encrypt zero block) and create the initial counter block. */
1482 memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));
1483 memset(H, 0, sizeof (ctx->gcm_H));
1484 kfpu_begin();
1485 aes_encrypt_intel(keysched, aes_rounds,
1486 (const uint32_t *)H, (uint32_t *)H);
1487
1488 gcm_init_htab_avx(ctx->gcm_Htable, H);
1489
1490 if (iv_len == 12) {
1491 memcpy(cb, iv, 12);
1492 cb[12] = 0;
1493 cb[13] = 0;
1494 cb[14] = 0;
1495 cb[15] = 1;
1496 /* We need the ICB later. */
1497 memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
1498 } else {
1499 /*
1500 * Most consumers use 12 byte IVs, so it's OK to use the
1501 * original routines for other IV sizes, just avoid nesting
1502 * kfpu_begin calls.
1503 */
1504 clear_fpu_regs();
1505 kfpu_end();
1506 gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
1507 aes_copy_block, aes_xor_block);
1508 kfpu_begin();
1509 }
1510
1511 /* Openssl post increments the counter, adjust for that. */
1512 gcm_incr_counter_block(ctx);
1513
1514 /* Ghash AAD in chunk_size blocks. */
1515 for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
1516 GHASH_AVX(ctx, datap, chunk_size);
1517 datap += chunk_size;
1518 clear_fpu_regs();
1519 kfpu_end();
1520 kfpu_begin();
1521 }
1522 /* Ghash the remainder and handle possible incomplete GCM block. */
1523 if (bleft > 0) {
1524 size_t incomp = bleft % block_size;
1525
1526 bleft -= incomp;
1527 if (bleft > 0) {
1528 GHASH_AVX(ctx, datap, bleft);
1529 datap += bleft;
1530 }
1531 if (incomp > 0) {
1532 /* Zero pad and hash incomplete last block. */
1533 uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
1534
1535 memset(authp, 0, block_size);
1536 memcpy(authp, datap, incomp);
1537 GHASH_AVX(ctx, authp, block_size);
1538 }
1539 }
1540 clear_fpu_regs();
1541 kfpu_end();
1542 return (CRYPTO_SUCCESS);
1543 }
1544
1545 #if defined(_KERNEL)
1546 static int
1547 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1548 {
1549 unsigned long val;
1550 char val_rounded[16];
1551 int error = 0;
1552
1553 error = kstrtoul(buf, 0, &val);
1554 if (error)
1555 return (error);
1556
1557 val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1558
1559 if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1560 return (-EINVAL);
1561
1562 snprintf(val_rounded, 16, "%u", (uint32_t)val);
1563 error = param_set_uint(val_rounded, kp);
1564 return (error);
1565 }
1566
1567 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1568 param_get_uint, &gcm_avx_chunk_size, 0644);
1569
1570 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1571 "How many bytes to process while owning the FPU");
1572
1573 #endif /* defined(_KERNEL) */
1574 #endif /* ifdef CAN_USE_GCM_ASM */
1575