1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24  */
25 
26 #include <sys/simd.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_impl.h>
29 #include <sys/blake3.h>
30 
31 #include "blake3_impl.h"
32 
33 #if defined(__aarch64__) || \
34 	(defined(__x86_64) && defined(HAVE_SSE2)) || \
35 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
36 
37 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
38     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
39     uint64_t counter, uint8_t flags);
40 
41 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
42     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
43     uint64_t counter, uint8_t flags, uint8_t out[64]);
44 
45 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
46     size_t num_inputs, size_t blocks, const uint32_t key[8],
47     uint64_t counter, boolean_t increment_counter, uint8_t flags,
48     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
49 
50 static void blake3_compress_in_place_sse2(uint32_t cv[8],
51     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
52     uint64_t counter, uint8_t flags) {
53 	kfpu_begin();
54 	zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
55 	    flags);
56 	kfpu_end();
57 }
58 
59 static void blake3_compress_xof_sse2(const uint32_t cv[8],
60     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
61     uint64_t counter, uint8_t flags, uint8_t out[64]) {
62 	kfpu_begin();
63 	zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
64 	    out);
65 	kfpu_end();
66 }
67 
68 static void blake3_hash_many_sse2(const uint8_t * const *inputs,
69     size_t num_inputs, size_t blocks, const uint32_t key[8],
70     uint64_t counter, boolean_t increment_counter, uint8_t flags,
71     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
72 	kfpu_begin();
73 	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
74 	    increment_counter, flags, flags_start, flags_end, out);
75 	kfpu_end();
76 }
77 
78 static boolean_t blake3_is_sse2_supported(void)
79 {
80 #if defined(__x86_64)
81 	return (kfpu_allowed() && zfs_sse2_available());
82 #elif defined(__PPC64__)
83 	return (kfpu_allowed() && zfs_vsx_available());
84 #else
85 	return (kfpu_allowed());
86 #endif
87 }
88 
89 const blake3_ops_t blake3_sse2_impl = {
90 	.compress_in_place = blake3_compress_in_place_sse2,
91 	.compress_xof = blake3_compress_xof_sse2,
92 	.hash_many = blake3_hash_many_sse2,
93 	.is_supported = blake3_is_sse2_supported,
94 	.degree = 4,
95 	.name = "sse2"
96 };
97 #endif
98 
99 #if defined(__aarch64__) || \
100 	(defined(__x86_64) && defined(HAVE_SSE2)) || \
101 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
102 
103 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
104     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
105     uint64_t counter, uint8_t flags);
106 
107 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
108     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
109     uint64_t counter, uint8_t flags, uint8_t out[64]);
110 
111 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
112     size_t num_inputs, size_t blocks, const uint32_t key[8],
113     uint64_t counter, boolean_t increment_counter, uint8_t flags,
114     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
115 
116 static void blake3_compress_in_place_sse41(uint32_t cv[8],
117     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
118     uint64_t counter, uint8_t flags) {
119 	kfpu_begin();
120 	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
121 	    flags);
122 	kfpu_end();
123 }
124 
125 static void blake3_compress_xof_sse41(const uint32_t cv[8],
126     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
127     uint64_t counter, uint8_t flags, uint8_t out[64]) {
128 	kfpu_begin();
129 	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
130 	    out);
131 	kfpu_end();
132 }
133 
134 static void blake3_hash_many_sse41(const uint8_t * const *inputs,
135     size_t num_inputs, size_t blocks, const uint32_t key[8],
136     uint64_t counter, boolean_t increment_counter, uint8_t flags,
137     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
138 	kfpu_begin();
139 	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
140 	    increment_counter, flags, flags_start, flags_end, out);
141 	kfpu_end();
142 }
143 
144 static boolean_t blake3_is_sse41_supported(void)
145 {
146 #if defined(__x86_64)
147 	return (kfpu_allowed() && zfs_sse4_1_available());
148 #elif defined(__PPC64__)
149 	return (kfpu_allowed() && zfs_vsx_available());
150 #else
151 	return (kfpu_allowed());
152 #endif
153 }
154 
155 const blake3_ops_t blake3_sse41_impl = {
156 	.compress_in_place = blake3_compress_in_place_sse41,
157 	.compress_xof = blake3_compress_xof_sse41,
158 	.hash_many = blake3_hash_many_sse41,
159 	.is_supported = blake3_is_sse41_supported,
160 	.degree = 4,
161 	.name = "sse41"
162 };
163 #endif
164 
165 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
166 extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
167     size_t num_inputs, size_t blocks, const uint32_t key[8],
168     uint64_t counter, boolean_t increment_counter, uint8_t flags,
169     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
170 
171 static void blake3_hash_many_avx2(const uint8_t * const *inputs,
172     size_t num_inputs, size_t blocks, const uint32_t key[8],
173     uint64_t counter, boolean_t increment_counter, uint8_t flags,
174     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
175 	kfpu_begin();
176 	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
177 	    increment_counter, flags, flags_start, flags_end, out);
178 	kfpu_end();
179 }
180 
181 static boolean_t blake3_is_avx2_supported(void)
182 {
183 	return (kfpu_allowed() && zfs_sse4_1_available() &&
184 	    zfs_avx2_available());
185 }
186 
187 const blake3_ops_t
188 blake3_avx2_impl = {
189 	.compress_in_place = blake3_compress_in_place_sse41,
190 	.compress_xof = blake3_compress_xof_sse41,
191 	.hash_many = blake3_hash_many_avx2,
192 	.is_supported = blake3_is_avx2_supported,
193 	.degree = 8,
194 	.name = "avx2"
195 };
196 #endif
197 
198 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
199 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
200     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
201     uint64_t counter, uint8_t flags);
202 
203 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
204     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
205     uint64_t counter, uint8_t flags, uint8_t out[64]);
206 
207 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
208     size_t num_inputs, size_t blocks, const uint32_t key[8],
209     uint64_t counter, boolean_t increment_counter, uint8_t flags,
210     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
211 
212 static void blake3_compress_in_place_avx512(uint32_t cv[8],
213     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
214     uint64_t counter, uint8_t flags) {
215 	kfpu_begin();
216 	zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
217 	    flags);
218 	kfpu_end();
219 }
220 
221 static void blake3_compress_xof_avx512(const uint32_t cv[8],
222     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
223     uint64_t counter, uint8_t flags, uint8_t out[64]) {
224 	kfpu_begin();
225 	zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
226 	    out);
227 	kfpu_end();
228 }
229 
230 static void blake3_hash_many_avx512(const uint8_t * const *inputs,
231     size_t num_inputs, size_t blocks, const uint32_t key[8],
232     uint64_t counter, boolean_t increment_counter, uint8_t flags,
233     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
234 	kfpu_begin();
235 	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
236 	    increment_counter, flags, flags_start, flags_end, out);
237 	kfpu_end();
238 }
239 
240 static boolean_t blake3_is_avx512_supported(void)
241 {
242 	return (kfpu_allowed() && zfs_avx512f_available() &&
243 	    zfs_avx512vl_available());
244 }
245 
246 const blake3_ops_t blake3_avx512_impl = {
247 	.compress_in_place = blake3_compress_in_place_avx512,
248 	.compress_xof = blake3_compress_xof_avx512,
249 	.hash_many = blake3_hash_many_avx512,
250 	.is_supported = blake3_is_avx512_supported,
251 	.degree = 16,
252 	.name = "avx512"
253 };
254 #endif
255 
256 extern const blake3_ops_t blake3_generic_impl;
257 
258 static const blake3_ops_t *const blake3_impls[] = {
259 	&blake3_generic_impl,
260 #if defined(__aarch64__) || \
261 	(defined(__x86_64) && defined(HAVE_SSE2)) || \
262 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
263 	&blake3_sse2_impl,
264 #endif
265 #if defined(__aarch64__) || \
266 	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
267 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
268 	&blake3_sse41_impl,
269 #endif
270 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
271 	&blake3_avx2_impl,
272 #endif
273 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
274 	&blake3_avx512_impl,
275 #endif
276 };
277 
278 /* use the generic implementation functions */
279 #define	IMPL_NAME		"blake3"
280 #define	IMPL_OPS_T		blake3_ops_t
281 #define	IMPL_ARRAY		blake3_impls
282 #define	IMPL_GET_OPS		blake3_get_ops
283 #define	ZFS_IMPL_OPS		zfs_blake3_ops
284 #include <generic_impl.c>
285 
286 #ifdef _KERNEL
287 void **blake3_per_cpu_ctx;
288 
289 void
290 blake3_per_cpu_ctx_init(void)
291 {
292 	/*
293 	 * Create "The Godfather" ptr to hold all blake3 ctx
294 	 */
295 	blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
296 	for (int i = 0; i < max_ncpus; i++) {
297 		blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
298 		    KM_SLEEP);
299 	}
300 }
301 
302 void
303 blake3_per_cpu_ctx_fini(void)
304 {
305 	for (int i = 0; i < max_ncpus; i++) {
306 		memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
307 		kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
308 	}
309 	memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
310 	kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
311 }
312 
/*
 * Pick the printf format for one entry of the implementation list: the
 * entry whose id equals the currently chosen one is shown in brackets.
 */
#define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")
314 
315 #if defined(__linux__)
316 
317 static int
318 blake3_param_get(char *buffer, zfs_kernel_param_t *unused)
319 {
320 	const uint32_t impl = IMPL_READ(generic_impl_chosen);
321 	char *fmt;
322 	int cnt = 0;
323 
324 	/* cycling */
325 	fmt = IMPL_FMT(impl, IMPL_CYCLE);
326 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle");
327 
328 	/* list fastest */
329 	fmt = IMPL_FMT(impl, IMPL_FASTEST);
330 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");
331 
332 	/* list all supported implementations */
333 	generic_impl_init();
334 	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
335 		fmt = IMPL_FMT(impl, i);
336 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
337 		    blake3_impls[i]->name);
338 	}
339 
340 	return (cnt);
341 }
342 
343 static int
344 blake3_param_set(const char *val, zfs_kernel_param_t *unused)
345 {
346 	(void) unused;
347 	return (generic_impl_setname(val));
348 }
349 
350 #elif defined(__FreeBSD__)
351 
352 #include <sys/sbuf.h>
353 
354 static int
355 blake3_param(ZFS_MODULE_PARAM_ARGS)
356 {
357 	int err;
358 
359 	generic_impl_init();
360 	if (req->newptr == NULL) {
361 		const uint32_t impl = IMPL_READ(generic_impl_chosen);
362 		const int init_buflen = 64;
363 		const char *fmt;
364 		struct sbuf *s;
365 
366 		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
367 
368 		/* cycling */
369 		fmt = IMPL_FMT(impl, IMPL_CYCLE);
370 		(void) sbuf_printf(s, fmt, "cycle");
371 
372 		/* list fastest */
373 		fmt = IMPL_FMT(impl, IMPL_FASTEST);
374 		(void) sbuf_printf(s, fmt, "fastest");
375 
376 		/* list all supported implementations */
377 		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
378 			fmt = IMPL_FMT(impl, i);
379 			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
380 		}
381 
382 		err = sbuf_finish(s);
383 		sbuf_delete(s);
384 
385 		return (err);
386 	}
387 
388 	char buf[16];
389 
390 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
391 	if (err) {
392 		return (err);
393 	}
394 
395 	return (-generic_impl_setname(buf));
396 }
397 #endif
398 
399 #undef IMPL_FMT
400 
401 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl,
402     blake3_param_set, blake3_param_get, ZMOD_RW, \
403 	"Select BLAKE3 implementation.");
404 #endif
405