1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24  */
25 
26 #include <sys/types.h>
27 #include <sys/spa.h>
28 #include <sys/zio_checksum.h>
29 #include <sys/zfs_context.h>
30 #include <sys/zfs_chksum.h>
31 
32 #include <sys/blake3.h>
33 
/* limit benchmarking to max 256KiB, when EdonR is slower than this: */
35 #define	LIMIT_PERF_MBS	300
36 
37 typedef struct {
38 	const char *name;
39 	const char *impl;
40 	uint64_t bs1k;
41 	uint64_t bs4k;
42 	uint64_t bs16k;
43 	uint64_t bs64k;
44 	uint64_t bs256k;
45 	uint64_t bs1m;
46 	uint64_t bs4m;
47 	uint64_t bs16m;
48 	zio_cksum_salt_t salt;
49 	zio_checksum_t *(func);
50 	zio_checksum_tmpl_init_t *(init);
51 	zio_checksum_tmpl_free_t *(free);
52 } chksum_stat_t;
53 
54 static chksum_stat_t *chksum_stat_data = 0;
55 static int chksum_stat_cnt = 0;
56 static kstat_t *chksum_kstat = NULL;
57 
58 /*
59  * i3-1005G1 test output:
60  *
61  * implementation     1k      4k     16k     64k    256k      1m      4m
62  * fletcher-4       5421   15001   26468   32555   34720   32801   18847
63  * edonr-generic    1196    1602    1761    1749    1762    1759    1751
64  * skein-generic     546     591     608     615     619     612     616
65  * sha256-generic    246     270     274     274     277     275     276
66  * sha256-avx        262     296     304     307     307     307     306
67  * sha256-sha-ni     769    1072    1172    1220    1219    1232    1228
68  * sha256-openssl    240     300     316     314     304     285     276
69  * sha512-generic    333     374     385     392     391     393     392
70  * sha512-openssl    353     441     467     476     472     467     426
71  * sha512-avx        362     444     473     475     479     476     478
72  * sha512-avx2       394     500     530     538     543     545     542
73  * blake3-generic    308     313     313     313     312     313     312
74  * blake3-sse2       402    1289    1423    1446    1432    1458    1413
75  * blake3-sse41      427    1470    1625    1704    1679    1607    1629
76  * blake3-avx2       428    1920    3095    3343    3356    3318    3204
77  * blake3-avx512     473    2687    4905    5836    5844    5643    5374
78  */
/*
 * Raw kstat "headers" callback: write the column title line into buf.
 *
 * buf  - output buffer
 * size - capacity of buf in bytes
 *
 * Always returns 0.
 */
static int
chksum_kstat_headers(char *buf, size_t size)
{
	/* every block size column except the last, which ends the line */
	static const char *const cols[] = {
		"1k", "4k", "16k", "64k", "256k", "1m", "4m"
	};
	ssize_t pos;
	size_t i;

	pos = kmem_scnprintf(buf, size, "%-23s", "implementation");
	for (i = 0; i < sizeof (cols) / sizeof (cols[0]); i++)
		pos += kmem_scnprintf(buf + pos, size - pos, "%8s", cols[i]);
	(void) kmem_scnprintf(buf + pos, size - pos, "%8s\n", "16m");

	return (0);
}
96 
97 static int
98 chksum_kstat_data(char *buf, size_t size, void *data)
99 {
100 	chksum_stat_t *cs;
101 	ssize_t off = 0;
102 	char b[24];
103 
104 	cs = (chksum_stat_t *)data;
105 	kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl);
106 	off += kmem_scnprintf(buf + off, size - off, "%-23s", b);
107 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
108 	    (u_longlong_t)cs->bs1k);
109 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
110 	    (u_longlong_t)cs->bs4k);
111 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
112 	    (u_longlong_t)cs->bs16k);
113 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
114 	    (u_longlong_t)cs->bs64k);
115 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
116 	    (u_longlong_t)cs->bs256k);
117 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
118 	    (u_longlong_t)cs->bs1m);
119 	off += kmem_scnprintf(buf + off, size - off, "%8llu",
120 	    (u_longlong_t)cs->bs4m);
121 	(void) kmem_scnprintf(buf + off, size - off, "%8llu\n",
122 	    (u_longlong_t)cs->bs16m);
123 
124 	return (0);
125 }
126 
127 static void *
128 chksum_kstat_addr(kstat_t *ksp, loff_t n)
129 {
130 	if (n < chksum_stat_cnt)
131 		ksp->ks_private = (void *)(chksum_stat_data + n);
132 	else
133 		ksp->ks_private = NULL;
134 
135 	return (ksp->ks_private);
136 }
137 
138 static void
139 chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
140     uint64_t *result)
141 {
142 	hrtime_t start;
143 	uint64_t run_bw, run_time_ns, run_count = 0, size = 0;
144 	uint32_t l, loops = 0;
145 	zio_cksum_t zcp;
146 
147 	switch (round) {
148 	case 1: /* 1k */
149 		size = 1<<10; loops = 128; break;
150 	case 2: /* 2k */
151 		size = 1<<12; loops = 64; break;
152 	case 3: /* 4k */
153 		size = 1<<14; loops = 32; break;
154 	case 4: /* 16k */
155 		size = 1<<16; loops = 16; break;
156 	case 5: /* 256k */
157 		size = 1<<18; loops = 8; break;
158 	case 6: /* 1m */
159 		size = 1<<20; loops = 4; break;
160 	case 7: /* 4m */
161 		size = 1<<22; loops = 1; break;
162 	case 8: /* 16m */
163 		size = 1<<24; loops = 1; break;
164 	}
165 
166 	kpreempt_disable();
167 	start = gethrtime();
168 	do {
169 		for (l = 0; l < loops; l++, run_count++)
170 			cs->func(abd, size, ctx, &zcp);
171 
172 		run_time_ns = gethrtime() - start;
173 	} while (run_time_ns < MSEC2NSEC(1));
174 	kpreempt_enable();
175 
176 	run_bw = size * run_count * NANOSEC;
177 	run_bw /= run_time_ns;	/* B/s */
178 	*result = run_bw/1024/1024; /* MiB/s */
179 }
180 
181 #define	LIMIT_INIT	0
182 #define	LIMIT_NEEDED	1
183 #define	LIMIT_NOLIMIT	2
184 
185 static void
186 chksum_benchit(chksum_stat_t *cs)
187 {
188 	abd_t *abd;
189 	void *ctx = 0;
190 	void *salt = &cs->salt.zcs_bytes;
191 	static int chksum_stat_limit = LIMIT_INIT;
192 
193 	memset(salt, 0, sizeof (cs->salt.zcs_bytes));
194 	if (cs->init)
195 		ctx = cs->init(&cs->salt);
196 
197 	/* allocate test memory via abd linear interface */
198 	abd = abd_alloc_linear(1<<20, B_FALSE);
199 	chksum_run(cs, abd, ctx, 1, &cs->bs1k);
200 	chksum_run(cs, abd, ctx, 2, &cs->bs4k);
201 	chksum_run(cs, abd, ctx, 3, &cs->bs16k);
202 	chksum_run(cs, abd, ctx, 4, &cs->bs64k);
203 	chksum_run(cs, abd, ctx, 5, &cs->bs256k);
204 
205 	/* check if we ran on a slow cpu */
206 	if (chksum_stat_limit == LIMIT_INIT) {
207 		if (cs->bs1k < LIMIT_PERF_MBS) {
208 			chksum_stat_limit = LIMIT_NEEDED;
209 		} else {
210 			chksum_stat_limit = LIMIT_NOLIMIT;
211 		}
212 	}
213 
214 	/* skip benchmarks >= 1MiB when the CPU is to slow */
215 	if (chksum_stat_limit == LIMIT_NEEDED)
216 		goto abort;
217 
218 	chksum_run(cs, abd, ctx, 6, &cs->bs1m);
219 	abd_free(abd);
220 
221 	/* allocate test memory via abd non linear interface */
222 	abd = abd_alloc(1<<24, B_FALSE);
223 	chksum_run(cs, abd, ctx, 7, &cs->bs4m);
224 	chksum_run(cs, abd, ctx, 8, &cs->bs16m);
225 
226 abort:
227 	abd_free(abd);
228 
229 	/* free up temp memory */
230 	if (cs->free)
231 		cs->free(ctx);
232 }
233 
234 /*
235  * Initialize and benchmark all supported implementations.
236  */
237 static void
238 chksum_benchmark(void)
239 {
240 
241 #ifndef _KERNEL
242 	/* we need the benchmark only for the kernel module */
243 	return;
244 #endif
245 
246 	chksum_stat_t *cs;
247 	int cbid = 0;
248 	uint64_t max = 0;
249 	uint32_t id, id_save;
250 
251 	/* space for the benchmark times */
252 	chksum_stat_cnt = 4;
253 	chksum_stat_cnt += blake3_impl_getcnt();
254 	chksum_stat_data = kmem_zalloc(
255 	    sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
256 
257 	/* edonr - needs to be the first one here (slow CPU check) */
258 	cs = &chksum_stat_data[cbid++];
259 	cs->init = abd_checksum_edonr_tmpl_init;
260 	cs->func = abd_checksum_edonr_native;
261 	cs->free = abd_checksum_edonr_tmpl_free;
262 	cs->name = "edonr";
263 	cs->impl = "generic";
264 	chksum_benchit(cs);
265 
266 	/* skein */
267 	cs = &chksum_stat_data[cbid++];
268 	cs->init = abd_checksum_skein_tmpl_init;
269 	cs->func = abd_checksum_skein_native;
270 	cs->free = abd_checksum_skein_tmpl_free;
271 	cs->name = "skein";
272 	cs->impl = "generic";
273 	chksum_benchit(cs);
274 
275 	/* sha256 */
276 	cs = &chksum_stat_data[cbid++];
277 	cs->init = 0;
278 	cs->func = abd_checksum_SHA256;
279 	cs->free = 0;
280 	cs->name = "sha256";
281 	cs->impl = "generic";
282 	chksum_benchit(cs);
283 
284 	/* sha512 */
285 	cs = &chksum_stat_data[cbid++];
286 	cs->init = 0;
287 	cs->func = abd_checksum_SHA512_native;
288 	cs->free = 0;
289 	cs->name = "sha512";
290 	cs->impl = "generic";
291 	chksum_benchit(cs);
292 
293 	/* blake3 */
294 	id_save = blake3_impl_getid();
295 	for (id = 0; id < blake3_impl_getcnt(); id++) {
296 		blake3_impl_setid(id);
297 		cs = &chksum_stat_data[cbid++];
298 		cs->init = abd_checksum_blake3_tmpl_init;
299 		cs->func = abd_checksum_blake3_native;
300 		cs->free = abd_checksum_blake3_tmpl_free;
301 		cs->name = "blake3";
302 		cs->impl = blake3_impl_getname();
303 		chksum_benchit(cs);
304 		if (cs->bs256k > max) {
305 			max = cs->bs256k;
306 			blake3_impl_set_fastest(id);
307 		}
308 	}
309 
310 	/* restore initial value */
311 	blake3_impl_setid(id_save);
312 }
313 
314 void
315 chksum_init(void)
316 {
317 #ifdef _KERNEL
318 	blake3_per_cpu_ctx_init();
319 #endif
320 
321 	/* Benchmark supported implementations */
322 	chksum_benchmark();
323 
324 	/* Install kstats for all implementations */
325 	chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc",
326 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
327 
328 	if (chksum_kstat != NULL) {
329 		chksum_kstat->ks_data = NULL;
330 		chksum_kstat->ks_ndata = UINT32_MAX;
331 		kstat_set_raw_ops(chksum_kstat,
332 		    chksum_kstat_headers,
333 		    chksum_kstat_data,
334 		    chksum_kstat_addr);
335 		kstat_install(chksum_kstat);
336 	}
337 }
338 
339 void
340 chksum_fini(void)
341 {
342 	if (chksum_kstat != NULL) {
343 		kstat_delete(chksum_kstat);
344 		chksum_kstat = NULL;
345 	}
346 
347 	if (chksum_stat_cnt) {
348 		kmem_free(chksum_stat_data,
349 		    sizeof (chksum_stat_t) * chksum_stat_cnt);
350 		chksum_stat_cnt = 0;
351 		chksum_stat_data = 0;
352 	}
353 
354 #ifdef _KERNEL
355 	blake3_per_cpu_ctx_fini();
356 #endif
357 }
358