1 /*
2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * 1. Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * 3. Neither the name of the copyright holder nor the names of its
15  * contributors may be used to endorse or promote products derived from this
16  * software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Copyright (c) 2016-2018, Klara Inc.
33  * Copyright (c) 2016-2018, Allan Jude
34  * Copyright (c) 2018-2020, Sebastian Gottschall
35  * Copyright (c) 2019-2020, Michael Niewöhner
36  * Copyright (c) 2020, The FreeBSD Foundation [1]
37  *
38  * [1] Portions of this software were developed by Allan Jude
39  *     under sponsorship from the FreeBSD Foundation.
40  */
41 
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
46 #include <sys/spa.h>
47 #include <sys/zstd/zstd.h>
48 
49 #define	ZSTD_STATIC_LINKING_ONLY
50 #include "lib/zstd.h"
51 #include "lib/common/zstd_errors.h"
52 
#ifndef IN_LIBSA
/* Tunable: nonzero enables the LZ4/zstd-1 early-abort probe passes. */
static int zstd_earlyabort_pass = 1;
/* Tunable: minimum requested zstd level for which early abort is tried. */
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
/* Tunable: minimum input size in bytes for which early abort is tried. */
static unsigned int zstd_abort_size = (128 * 1024);
#endif

/* kstat handle for the "zstd" statistics; created in zstd_init. */
static kstat_t *zstd_ksp = NULL;
60 
/*
 * Counters exported through the "zstd" kstat; field order must match the
 * name table in zstd_stats below.
 */
typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	/* Live accounting of pooled buffers: count and total bytes */
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;
89 
/* kstat name table; entry order must match the zstd_stats_t fields above. */
static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
109 
110 #ifdef _KERNEL
/*
 * kstat update callback: a KSTAT_WRITE to the "zstd" kstat resets the
 * error and early-abort counters. zstd_stat_buffers and zstd_stat_size
 * are deliberately left untouched, since they track live pool state.
 */
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
136 #endif
137 
138 /* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of valid types; used only for bounds assertions */
	ZSTD_KMEM_COUNT,
};
149 
150 /* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;		/* cached buffer, NULL when slot is empty */
	size_t size;		/* size of mem in bytes */
	kmutex_t barrier;	/* held while the slot's buffer is in use */
	hrtime_t timeout;	/* gethrestime_sec() deadline for reaping */
};
157 
158 /* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;	/* how this buffer was obtained */
	size_t kmem_size;		/* total allocation size in bytes */
	struct zstd_pool *pool;		/* owning pool slot, or NULL */
};
164 
165 /* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;	/* size of the reserved region */
	void *mem;		/* pre-allocated reserve memory */
	kmutex_t barrier;	/* serializes the single fallback user */
};
171 
/* One entry of the ZFS-enum-to-zstd-level translation table */
struct zstd_levelmap {
	int16_t zstd_level;		/* level passed to the zstd library */
	enum zio_zstd_levels level;	/* corresponding ZFS enum value */
};
176 
177 /*
178  * ZSTD memory handlers
179  *
180  * For decompression we use a different handler which also provides fallback
181  * memory allocation in case memory runs out.
182  *
183  * The ZSTD handlers were split up for the most simplified implementation.
184  */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,	/* opaque state, unused by our handlers */
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,	/* opaque state, unused by our handlers */
};
202 
/*
 * Level map for converting ZFS internal levels to ZSTD levels and vice versa.
 * Table order is relied upon by zstd_enum_to_level(): indices 0-18 are the
 * positive levels 1-19, directly followed by the negative "fast" levels.
 */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
246 
247 /*
248  * This variable represents the maximum count of the pool based on the number
249  * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd.
250  */
251 static int pool_count = 16;
252 
253 #define	ZSTD_POOL_MAX		pool_count
254 #define	ZSTD_POOL_TIMEOUT	60 * 2
255 
256 static struct zstd_fallback_mem zstd_dctx_fallback;
257 static struct zstd_pool *zstd_mempool_cctx;
258 static struct zstd_pool *zstd_mempool_dctx;
259 
260 /*
261  * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
262  * and while ASAN does this, KASAN defines that and does not. So to avoid
263  * changing the external code, we do this.
264  */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
/* No-op stubs: the zstd library only needs these symbols to link. */
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif
274 
275 
/*
 * Walk one memory pool and free every cached buffer whose idle timeout
 * (2 minutes, see ZSTD_POOL_TIMEOUT) has expired. Slots whose barrier
 * is currently held (buffer in use) are skipped via mutex_tryenter.
 */
static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	/* Nothing to do without a pool or with no cached buffers at all */
	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/* First check is an unlocked fast path; re-check under lock */
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}
302 
303 /*
304  * Try to get a cached allocated buffer from memory pool or allocate a new one
305  * if necessary. If a object is older than 2 minutes and does not fit the
306  * requested size, it will be released and a new cached entry will be allocated.
307  * If other pooled objects are detected without being used for 2 minutes, they
308  * will be released, too.
309  *
310  * The concept is that high frequency memory allocations of bigger objects are
311  * expensive. So if a lot of work is going on, allocations will be kept for a
312  * while and can be reused in that time frame.
313  *
314  * The scheduled release will be updated every time a object is reused.
315  */
316 
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size; if so we take it
			 * and update the timestamp.
			 *
			 * NOTE: we return with the barrier still held - it is
			 * only dropped by zstd_mempool_free once the caller
			 * is done with the buffer.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			/* As above: return with the barrier still held */
			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}
407 
408 /* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	/* Barrier was taken in zstd_mempool_alloc when the slot was handed out */
	mutex_exit(&z->pool->barrier);
}
414 
415 /* Convert ZFS internal enum to ZSTD level */
416 static int
417 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
418 {
419 	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
420 		*zstd_level = zstd_levels[level - 1].zstd_level;
421 		return (0);
422 	}
423 	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
424 	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
425 		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
426 		    + ZIO_ZSTD_LEVEL_19].zstd_level;
427 		return (0);
428 	}
429 
430 	/* Invalid/unknown zfs compression enum - this should never happen. */
431 	return (1);
432 }
433 
434 #ifndef IN_LIBSA
/*
 * Compression entry point implementing the early-abort heuristic: run
 * cheap probe passes (LZ4, then zstd-1) and skip the expensive pass at
 * the requested level when the data looks incompressible. Returns the
 * compressed size, or s_len when compression is skipped or rejected.
 */
size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		/* LZ4 probe: fitting in d_len means the data compresses */
		pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		/* Second probe with the cheapest real zstd level */
		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}
492 #endif
493 
494 /* Compress block using zstd */
/*
 * Compress s_len bytes from s_start into d_start at the given ZFS level.
 * The destination starts with a zfs_zstdhdr_t recording the compressed
 * length, version and level. Returns the total size written (payload
 * plus header), or s_len to signal "do not compress".
 */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	/* Our header is written at the start of the destination buffer */
	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	/* Compress into the space remaining after our header */
	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to be ever reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way, where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
599 
600 /* Decompress block using zstd and return its stored level */
601 int
602 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
603     size_t d_len, uint8_t *level)
604 {
605 	ZSTD_DCtx *dctx;
606 	size_t result;
607 	int16_t zstd_level;
608 	uint32_t c_len;
609 	const zfs_zstdhdr_t *hdr;
610 	zfs_zstdhdr_t hdr_copy;
611 
612 	hdr = (const zfs_zstdhdr_t *)s_start;
613 	c_len = BE_32(hdr->c_len);
614 
615 	/*
616 	 * Make a copy instead of directly converting the header, since we must
617 	 * not modify the original data that may be used again later.
618 	 */
619 	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
620 	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
621 
622 	/*
623 	 * NOTE: We ignore the ZSTD version for now. As soon as any
624 	 * incompatibility occurs, it has to be handled accordingly.
625 	 * The version can be accessed via `hdr_copy.version`.
626 	 */
627 
628 	/*
629 	 * Convert and check the level
630 	 * An invalid level is a strong indicator for data corruption! In such
631 	 * case return an error so the upper layers can try to fix it.
632 	 */
633 	if (zstd_enum_to_level(curlevel, &zstd_level)) {
634 		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
635 		return (1);
636 	}
637 
638 	ASSERT3U(d_len, >=, s_len);
639 	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
640 
641 	/* Invalid compressed buffer size encoded at start */
642 	if (c_len + sizeof (*hdr) > s_len) {
643 		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
644 		return (1);
645 	}
646 
647 	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
648 	if (!dctx) {
649 		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
650 		return (1);
651 	}
652 
653 	/* Set header type to "magicless" */
654 	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
655 
656 	/* Decompress the data and release the context */
657 	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
658 	ZSTD_freeDCtx(dctx);
659 
660 	/*
661 	 * Returns 0 on success (decompression function returned non-negative)
662 	 * and non-zero on failure (decompression function returned negative.
663 	 */
664 	if (ZSTD_isError(result)) {
665 		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
666 		return (1);
667 	}
668 
669 	if (level) {
670 		*level = curlevel;
671 	}
672 
673 	return (0);
674 }
675 
676 /* Decompress datablock using zstd */
/* Thin wrapper: decompress without reporting the stored level */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{

	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}
685 
686 /* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	/* Reserve room for our zstd_kmem bookkeeping header up front */
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	/* Hand zstd the bytes right after the header (void* arithmetic: GNU extension) */
	return ((void*)z + (sizeof (struct zstd_kmem)));
}
702 
703 /*
704  * Allocator for zstd decompression context using mempool_allocator with
705  * fallback to reserved memory if allocation fails
706  */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	/* Reserve room for our zstd_kmem bookkeeping header up front */
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		/*
		 * NOTE(review): bumped even when the vmem fallback above
		 * succeeds, so this counts pool misses rather than hard
		 * allocation failures.
		 */
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		/* Pool hit: type and size were already set by the pool */
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we only can handle it in a single thread. All
		 * other following threads need to wait here until decompression
		 * is completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
750 
751 /* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	/* Step back from the user pointer to our bookkeeping header */
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		/* Lazily vmem-allocated buffer: hand it back to the allocator */
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		/* Pooled buffer: just drop the slot's barrier mutex */
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		/* Reserved fallback memory: release the single-user barrier */
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}
776 
777 /* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	/* Sleeping, zeroed allocation of the reserve region */
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}
785 
786 /* Initialize memory pool barrier mutexes */
787 static void __init
788 zstd_mempool_init(void)
789 {
790 	zstd_mempool_cctx = (struct zstd_pool *)
791 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
792 	zstd_mempool_dctx = (struct zstd_pool *)
793 	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
794 
795 	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
796 		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
797 		    MUTEX_DEFAULT, NULL);
798 		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
799 		    MUTEX_DEFAULT, NULL);
800 	}
801 }
802 
803 /* Initialize zstd-related memory handling */
/* Set up the memory pools and the reserved decompression fallback area */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
819 
820 /* Release object from pool and free memory */
821 static void
822 release_pool(struct zstd_pool *pool)
823 {
824 	mutex_destroy(&pool->barrier);
825 	vmem_free(pool->mem, pool->size);
826 	pool->mem = NULL;
827 	pool->size = 0;
828 }
829 
830 /* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	/* Release every slot of both pools, then the pool arrays themselves */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}
844 
845 /* release unused memory from pool */
846 
void
zfs_zstd_cache_reap_now(void)
{

	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Walk both pools and release any cached buffer whose idle
	 * timeout has expired.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}
864 
/* Module init: size the pools, set up memory handling, export the kstat */
extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		/* Allow a KSTAT_WRITE to reset the counters */
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}
886 
/* Module teardown: remove the kstat and release all pool/fallback memory */
extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}
903 
#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

/*
 * NOTE(review): zstd_cutoff_level has no ZFS_MODULE_PARAM entry, so the
 * early-abort level cutoff is not runtime-tunable - confirm intended.
 */
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, INT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
915