/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
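/*
 * Early-abort tunables; zstd_earlyabort_pass and zstd_abort_size are also
 * exposed as module parameters (see the ZFS_MODULE_PARAM declarations at
 * the end of this file):
 * - zstd_earlyabort_pass: non-zero enables the early-abort heuristic.
 * - zstd_cutoff_level: minimum zstd level at which early abort is tried.
 * - zstd_abort_size: minimum input size at which early abort is tried.
 */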
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

static kstat_t *zstd_ksp = NULL;

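/*
 * Note: the fields below are initialized positionally by the zstd_stats
 * definition that follows, so the two must be kept in the same order.
 */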
typedef struct zstd_stats {
	kstat_named_t zstd_stat_alloc_fail;
	kstat_named_t zstd_stat_alloc_fallback;
	kstat_named_t zstd_stat_com_alloc_fail;
	kstat_named_t zstd_stat_dec_alloc_fail;
	kstat_named_t zstd_stat_com_inval;
	kstat_named_t zstd_stat_dec_inval;
	kstat_named_t zstd_stat_dec_header_inval;
	kstat_named_t zstd_stat_com_fail;
	kstat_named_t zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t zstd_stat_lz4pass_allowed;
	kstat_named_t zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t zstd_stat_zstdpass_allowed;
	kstat_named_t zstd_stat_zstdpass_rejected;
	/*
	 * Blocks excluded from early abort (level below the cutoff or
	 * input smaller than zstd_abort_size)
	 */
	kstat_named_t zstd_stat_passignored;
	kstat_named_t zstd_stat_passignored_size;
	kstat_named_t zstd_stat_buffers;
	kstat_named_t zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail", KSTAT_DATA_UINT64 },
	{ "alloc_fallback", KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail", KSTAT_DATA_UINT64 },
	{ "compress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid", KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid", KSTAT_DATA_UINT64 },
	{ "compress_failed", KSTAT_DATA_UINT64 },
	{ "decompress_failed", KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed", KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected", KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed", KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected", KSTAT_DATA_UINT64 },
	{ "passignored", KSTAT_DATA_UINT64 },
	{ "passignored_size", KSTAT_DATA_UINT64 },
	{ "buffers", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enum describing the allocator type recorded in zstd_kmem's kmem_type */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation obtained directly via vmem_alloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using zstd_mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Allocation header prepended to every buffer handed out to zstd */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers are kept separate to keep the implementation as simple as
 * possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	(60 * 2)	/* seconds */

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The vendored zstd code expects the ASAN poisoning hooks to exist whenever
 * ADDRESS_SANITIZER is defined. Userland ASAN provides them, but KASAN
 * defines ADDRESS_SANITIZER without providing them, so we supply no-op stubs
 * here rather than modifying the external code.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}
void __asan_poison_memory_region(void const volatile *addr, size_t size) {}
#endif

static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* Free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if the unused object is older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached, allocated buffer from the memory pool or allocate a new
 * one if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * Any other pooled object that has gone unused for 2 minutes is released, too.
 *
 * The idea is that high-frequency allocations of large objects are expensive.
 * So while a lot of work is going on, allocations are kept around and can be
 * reused in that time frame.
 *
 * The scheduled release is pushed back every time an object is reused.
 */
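/*
 * Pool object lifecycle, as implemented below:
 *
 *	zstd_mempool_alloc()	takes pool->barrier and hands out pool->mem
 *	zstd_mempool_free()	drops pool->barrier, marking the slot reusable
 *	zstd_mempool_reap()	frees slots whose timeout expired while unused
 *
 * Holding the barrier mutex is what marks a slot "in use"; the memory itself
 * stays allocated until the reaper or deinit releases it.
 */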

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, the slot is skipped.
		 *
		 * We need to take the lock before checking the slot to avoid
		 * race conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size; if so,
			 * take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first pass may leave holes in the list when objects are
	 * released; we make sure those holes get filled instead of constantly
	 * appending new allocations at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate a new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, fall back to a
	 * direct, unpooled allocation.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
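	/*
	 * Fast levels sit right after the 19 standard levels in zstd_levels,
	 * so index 19 + (level - ZIO_ZSTD_LEVEL_FAST_1) lands on the matching
	 * entry; e.g. ZIO_ZSTD_LEVEL_FAST_1 maps to index 19, whose
	 * zstd_level is -1.
	 */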
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or smaller than zstd_abort_size
	 *   (currently 128k), don't bother with any of this and just compress,
	 *   because experimentally that was a reasonable cutoff for a
	 *   performance win with only a tiny change in ratio.
	 * - First, we try LZ4 compression; if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indicators on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
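	/*
	 * For example, with the default tunables, a 1M record written at
	 * zstd-9 first runs through LZ4; only if LZ4 finds enough redundancy
	 * (or, failing that, zstd-1 succeeds) do we pay for the zstd-9 pass.
	 * Incompressible data is rejected after the cheap passes instead.
	 */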
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = lz4_compress_zfs(s_start, d_start, s_len,
		    d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
}

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory; gently fall through - this will disable
	 * compression in zio_compress_data.
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to ever be reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
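	/*
	 * Resulting on-disk header layout, as packed below via the
	 * zfs_set_hdrversion/zfs_set_hdrlevel helpers (both words stored
	 * big-endian):
	 *
	 *	bytes 0-3: c_len - size of the compressed payload
	 *	bytes 4-7: raw_version_level - 24-bit ZSTD version plus
	 *	           8-bit compression level
	 *	bytes 8+:  the compressed payload itself
	 */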
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
#endif

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator of data corruption! In that
	 * case, return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Return 0 on success (decompression succeeded) and non-zero on
	 * failure (ZSTD reported an error).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

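/*
 * Buffer layout used by the allocators below: every allocation handed to
 * zstd is prefixed with a struct zstd_kmem header recording how it was
 * obtained, and zstd receives the address just past that header:
 *
 *	[ struct zstd_kmem | buffer used by zstd ... ]
 *	^                   ^
 *	vmem/pool address   pointer returned to zstd
 *
 * zstd_free() steps back over the header to find out how to release the
 * buffer (direct vmem_free, pool barrier release, or fallback barrier).
 */
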
#ifndef IN_LIBSA
/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
#endif

/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails.
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder; decompression must not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other threads need to wait here until decompression is
		 * completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reap both pools, freeing objects whose 2-minute timeout has
	 * expired while they were unused.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

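/*
 * Module bring-up: size the pools to four buffers per CPU and publish the
 * statistics kstat. On Linux the counters typically show up under
 * /proc/spl/kstat/zfs/zstd, on FreeBSD as kstat.zfs.misc.zstd sysctls
 * (the exact paths come from the per-platform kstat glue, so treat them
 * as a convention rather than a contract). Writing to the kstat zeroes
 * the counters via kstat_zstd_update() above.
 */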
extern int __init
zstd_init(void)
{
	/* Set pool size to four buffers per CPU, a sane thread-count bound */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

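/*
 * These surface as zstd_earlyabort_pass and zstd_abort_size; on Linux that
 * usually means /sys/module/zfs/parameters/, on FreeBSD a vfs.zfs.* sysctl
 * (exact names come from the per-platform ZFS_MODULE_PARAM glue).
 */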
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif