1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24  */
25 
26 #include <sys/simd.h>
27 #include <sys/zfs_context.h>
28 #include <sys/zfs_impl.h>
29 #include <sys/blake3.h>
30 
31 #include "blake3_impl.h"
32 
/*
 * USE_SIMD enables the sse2/sse41 sections below and the SIMD entries of
 * blake3_impls[].  It is defined unless OMIT_SIMD is set, and only for
 * aarch64, x86_64 with HAVE_SSE2, or little-endian PPC64 targets.
 */
#if !defined(OMIT_SIMD)
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
#define	USE_SIMD
#endif
#endif
38 
39 #ifdef USE_SIMD
40 extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
41     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
42     uint64_t counter, uint8_t flags);
43 
44 extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
45     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
46     uint64_t counter, uint8_t flags, uint8_t out[64]);
47 
48 extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
49     size_t num_inputs, size_t blocks, const uint32_t key[8],
50     uint64_t counter, boolean_t increment_counter, uint8_t flags,
51     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
52 
53 static void blake3_compress_in_place_sse2(uint32_t cv[8],
54     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
55     uint64_t counter, uint8_t flags) {
56 	kfpu_begin();
57 	zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
58 	    flags);
59 	kfpu_end();
60 }
61 
62 static void blake3_compress_xof_sse2(const uint32_t cv[8],
63     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
64     uint64_t counter, uint8_t flags, uint8_t out[64]) {
65 	kfpu_begin();
66 	zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
67 	    out);
68 	kfpu_end();
69 }
70 
71 static void blake3_hash_many_sse2(const uint8_t * const *inputs,
72     size_t num_inputs, size_t blocks, const uint32_t key[8],
73     uint64_t counter, boolean_t increment_counter, uint8_t flags,
74     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
75 	kfpu_begin();
76 	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
77 	    increment_counter, flags, flags_start, flags_end, out);
78 	kfpu_end();
79 }
80 
81 static boolean_t blake3_is_sse2_supported(void)
82 {
83 #if defined(__x86_64)
84 	return (kfpu_allowed() && zfs_sse2_available());
85 #elif defined(__PPC64__)
86 	return (kfpu_allowed() && zfs_vsx_available());
87 #else
88 	return (kfpu_allowed());
89 #endif
90 }
91 
92 const blake3_ops_t blake3_sse2_impl = {
93 	.compress_in_place = blake3_compress_in_place_sse2,
94 	.compress_xof = blake3_compress_xof_sse2,
95 	.hash_many = blake3_hash_many_sse2,
96 	.is_supported = blake3_is_sse2_supported,
97 	.degree = 4,
98 	.name = "sse2"
99 };
100 #endif
101 
102 #ifdef USE_SIMD
103 
104 extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
105     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
106     uint64_t counter, uint8_t flags);
107 
108 extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
109     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
110     uint64_t counter, uint8_t flags, uint8_t out[64]);
111 
112 extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
113     size_t num_inputs, size_t blocks, const uint32_t key[8],
114     uint64_t counter, boolean_t increment_counter, uint8_t flags,
115     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
116 
117 static void blake3_compress_in_place_sse41(uint32_t cv[8],
118     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
119     uint64_t counter, uint8_t flags) {
120 	kfpu_begin();
121 	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
122 	    flags);
123 	kfpu_end();
124 }
125 
126 static void blake3_compress_xof_sse41(const uint32_t cv[8],
127     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
128     uint64_t counter, uint8_t flags, uint8_t out[64]) {
129 	kfpu_begin();
130 	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
131 	    out);
132 	kfpu_end();
133 }
134 
135 static void blake3_hash_many_sse41(const uint8_t * const *inputs,
136     size_t num_inputs, size_t blocks, const uint32_t key[8],
137     uint64_t counter, boolean_t increment_counter, uint8_t flags,
138     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
139 	kfpu_begin();
140 	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
141 	    increment_counter, flags, flags_start, flags_end, out);
142 	kfpu_end();
143 }
144 
145 static boolean_t blake3_is_sse41_supported(void)
146 {
147 #if defined(__x86_64)
148 	return (kfpu_allowed() && zfs_sse4_1_available());
149 #elif defined(__PPC64__)
150 	return (kfpu_allowed() && zfs_vsx_available());
151 #else
152 	return (kfpu_allowed());
153 #endif
154 }
155 
156 const blake3_ops_t blake3_sse41_impl = {
157 	.compress_in_place = blake3_compress_in_place_sse41,
158 	.compress_xof = blake3_compress_xof_sse41,
159 	.hash_many = blake3_hash_many_sse41,
160 	.is_supported = blake3_is_sse41_supported,
161 	.degree = 4,
162 	.name = "sse41"
163 };
164 #endif
165 
/*
 * AVX2 implementation (x86_64 with HAVE_SSE4_1 and HAVE_AVX2).
 *
 * Only hash_many has a dedicated AVX2 routine; compress_in_place and
 * compress_xof reuse the SSE4.1 wrappers.  Those wrappers are defined
 * only when USE_SIMD is set, and blake3_impls[] lists this table only
 * under USE_SIMD, so the whole section requires USE_SIMD as well.
 * Without that, a build with OMIT_SIMD but HAVE_AVX2 would reference
 * undeclared functions.
 */
#if defined(USE_SIMD) && defined(__x86_64) && defined(HAVE_SSE4_1) && \
	defined(HAVE_AVX2)
/* Declared here with ASMABI; defined outside this file. */
extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out);

/*
 * hash_many hook of the "avx2" ops table: forwards every argument
 * unchanged to zfs_blake3_hash_many_avx2(), with the call enclosed in a
 * kfpu_begin()/kfpu_end() pair.
 */
static void
blake3_hash_many_avx2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
	    increment_counter, flags, flags_start, flags_end, out);
	kfpu_end();
}

/*
 * is_supported hook of the "avx2" ops table.  SSE4.1 is checked in
 * addition to AVX2 because two of the three operations are the SSE4.1
 * ones.
 */
static boolean_t
blake3_is_avx2_supported(void)
{
	return (kfpu_allowed() && zfs_sse4_1_available() &&
	    zfs_avx2_available());
}

/* Ops table for the "avx2" implementation (degree 8). */
const blake3_ops_t
blake3_avx2_impl = {
	.compress_in_place = blake3_compress_in_place_sse41,
	.compress_xof = blake3_compress_xof_sse41,
	.hash_many = blake3_hash_many_avx2,
	.is_supported = blake3_is_avx2_supported,
	.degree = 8,
	.name = "avx2"
};
#endif
198 
199 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
200 extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
201     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
202     uint64_t counter, uint8_t flags);
203 
204 extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
205     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
206     uint64_t counter, uint8_t flags, uint8_t out[64]);
207 
208 extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
209     size_t num_inputs, size_t blocks, const uint32_t key[8],
210     uint64_t counter, boolean_t increment_counter, uint8_t flags,
211     uint8_t flags_start, uint8_t flags_end, uint8_t *out);
212 
213 static void blake3_compress_in_place_avx512(uint32_t cv[8],
214     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
215     uint64_t counter, uint8_t flags) {
216 	kfpu_begin();
217 	zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
218 	    flags);
219 	kfpu_end();
220 }
221 
222 static void blake3_compress_xof_avx512(const uint32_t cv[8],
223     const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
224     uint64_t counter, uint8_t flags, uint8_t out[64]) {
225 	kfpu_begin();
226 	zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
227 	    out);
228 	kfpu_end();
229 }
230 
231 static void blake3_hash_many_avx512(const uint8_t * const *inputs,
232     size_t num_inputs, size_t blocks, const uint32_t key[8],
233     uint64_t counter, boolean_t increment_counter, uint8_t flags,
234     uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
235 	kfpu_begin();
236 	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
237 	    increment_counter, flags, flags_start, flags_end, out);
238 	kfpu_end();
239 }
240 
241 static boolean_t blake3_is_avx512_supported(void)
242 {
243 	return (kfpu_allowed() && zfs_avx512f_available() &&
244 	    zfs_avx512vl_available());
245 }
246 
247 const blake3_ops_t blake3_avx512_impl = {
248 	.compress_in_place = blake3_compress_in_place_avx512,
249 	.compress_xof = blake3_compress_xof_avx512,
250 	.hash_many = blake3_hash_many_avx512,
251 	.is_supported = blake3_is_avx512_supported,
252 	.degree = 16,
253 	.name = "avx512"
254 };
255 #endif
256 
257 extern const blake3_ops_t blake3_generic_impl;
258 
259 static const blake3_ops_t *const blake3_impls[] = {
260 	&blake3_generic_impl,
261 #ifdef USE_SIMD
262 #if defined(__aarch64__) || \
263 	(defined(__x86_64) && defined(HAVE_SSE2)) || \
264 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
265 	&blake3_sse2_impl,
266 #endif
267 #if defined(__aarch64__) || \
268 	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
269 	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
270 	&blake3_sse41_impl,
271 #endif
272 #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
273 	&blake3_avx2_impl,
274 #endif
275 #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
276 	&blake3_avx512_impl,
277 #endif
278 #endif
279 };
280 
281 /* use the generic implementation functions */
282 #define	IMPL_NAME		"blake3"
283 #define	IMPL_OPS_T		blake3_ops_t
284 #define	IMPL_ARRAY		blake3_impls
285 #define	IMPL_GET_OPS		blake3_get_ops
286 #define	ZFS_IMPL_OPS		zfs_blake3_ops
287 #include <generic_impl.c>
288 
289 #ifdef _KERNEL
290 void **blake3_per_cpu_ctx;
291 
292 void
293 blake3_per_cpu_ctx_init(void)
294 {
295 	/*
296 	 * Create "The Godfather" ptr to hold all blake3 ctx
297 	 */
298 	blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
299 	for (int i = 0; i < max_ncpus; i++) {
300 		blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
301 		    KM_SLEEP);
302 	}
303 }
304 
305 void
306 blake3_per_cpu_ctx_fini(void)
307 {
308 	for (int i = 0; i < max_ncpus; i++) {
309 		memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
310 		kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
311 	}
312 	memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
313 	kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
314 }
315 
/*
 * Format string for one entry of the implementation list: the entry
 * whose index i equals the selected impl is printed as "[name] ", every
 * other entry as "name ".
 */
#define	IMPL_FMT(impl, i) \
	((impl) == (i) ? "[%s] " : "%s ")
317 
318 #if defined(__linux__)
319 
320 static int
321 blake3_param_get(char *buffer, zfs_kernel_param_t *unused)
322 {
323 	const uint32_t impl = IMPL_READ(generic_impl_chosen);
324 	char *fmt;
325 	int cnt = 0;
326 
327 	/* cycling */
328 	fmt = IMPL_FMT(impl, IMPL_CYCLE);
329 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle");
330 
331 	/* list fastest */
332 	fmt = IMPL_FMT(impl, IMPL_FASTEST);
333 	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");
334 
335 	/* list all supported implementations */
336 	generic_impl_init();
337 	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
338 		fmt = IMPL_FMT(impl, i);
339 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
340 		    blake3_impls[i]->name);
341 	}
342 
343 	return (cnt);
344 }
345 
346 static int
347 blake3_param_set(const char *val, zfs_kernel_param_t *unused)
348 {
349 	(void) unused;
350 	return (generic_impl_setname(val));
351 }
352 
353 #elif defined(__FreeBSD__)
354 
355 #include <sys/sbuf.h>
356 
357 static int
358 blake3_param(ZFS_MODULE_PARAM_ARGS)
359 {
360 	int err;
361 
362 	generic_impl_init();
363 	if (req->newptr == NULL) {
364 		const uint32_t impl = IMPL_READ(generic_impl_chosen);
365 		const int init_buflen = 64;
366 		const char *fmt;
367 		struct sbuf *s;
368 
369 		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
370 
371 		/* cycling */
372 		fmt = IMPL_FMT(impl, IMPL_CYCLE);
373 		(void) sbuf_printf(s, fmt, "cycle");
374 
375 		/* list fastest */
376 		fmt = IMPL_FMT(impl, IMPL_FASTEST);
377 		(void) sbuf_printf(s, fmt, "fastest");
378 
379 		/* list all supported implementations */
380 		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
381 			fmt = IMPL_FMT(impl, i);
382 			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
383 		}
384 
385 		err = sbuf_finish(s);
386 		sbuf_delete(s);
387 
388 		return (err);
389 	}
390 
391 	char buf[16];
392 
393 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
394 	if (err) {
395 		return (err);
396 	}
397 
398 	return (-generic_impl_setname(buf));
399 }
400 #endif
401 
402 #undef IMPL_FMT
403 
404 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl,
405     blake3_param_set, blake3_param_get, ZMOD_RW, \
406 	"Select BLAKE3 implementation.");
407 #endif
408