/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static int zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * The block was excluded from the early abort heuristic entirely
	 * (the feature is disabled, the level is below the cutoff, or the
	 * block is smaller than zstd_abort_size).
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif
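/*
 * Note: with the usual OpenZFS kstat plumbing these counters typically
 * surface under /proc/spl/kstat/zfs/zstd on Linux, and under the
 * kstat.zfs.misc.zstd sysctl tree on FreeBSD; the exact paths depend on
 * the platform glue. Writing to the kstat resets the counters via
 * kstat_zstd_update() above.
 */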
/* Enum describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers are split up to keep the implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};
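/*
 * For illustration (assuming the zio_zstd_levels enumerators are
 * contiguous, which zstd_enum_to_level() below relies on): positive
 * levels index the table directly, e.g. ZIO_ZSTD_LEVEL_3 (= 3) is found
 * at zstd_levels[3 - 1] = {3, ZIO_ZSTD_LEVEL_3}. Fast levels are offset
 * past the 19 positive entries, e.g. ZIO_ZSTD_LEVEL_FAST_5 maps to index
 * 19 + (ZIO_ZSTD_LEVEL_FAST_5 - ZIO_ZSTD_LEVEL_FAST_1) = 23, whose
 * zstd_level is -5.
 */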
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	(60 * 2)

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The zstd library code expects these symbols if ADDRESS_SANITIZER is
 * defined. ASAN provides them, but KASAN defines ADDRESS_SANITIZER without
 * providing them, so we supply no-op stubs to avoid changing the external
 * code.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* Free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}
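/*
 * Example of the timeout bookkeeping above (times are illustrative):
 * a buffer handed out at t = 100 s gets timeout = 100 + ZSTD_POOL_TIMEOUT
 * = 220 s; a reuse at t = 150 s pushes that out to 270 s. A reap (e.g.
 * via zfs_zstd_cache_reap_now()) at t = 300 s then frees the buffer,
 * while a reap at t = 250 s would have left it alone.
 */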
/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size; if so, take it
			 * and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get
	 * released. We always make sure that these holes get filled instead
	 * of adding new allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}

#ifndef IN_LIBSA
size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf
	 *   win with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = lz4_compress_zfs(s_start, d_start, s_len,
		    d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
}
#endif

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be
	 * added to the compressed buffer and which, if unhandled, would
	 * confuse the hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);
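	/*
	 * On-disk layout sketch (assuming the usual 8-byte zfs_zstdhdr_t:
	 * a 32-bit c_len followed by a 32-bit version/level word, both
	 * stored big-endian here, then the magicless zstd frame):
	 *
	 *   +--------------+-----------------------+--------------------+
	 *   | c_len (BE32) | version:24 | level:8  | zstd frame (c_len) |
	 *   +--------------+-----------------------+--------------------+
	 *
	 * E.g. zstd 1.4.5 encodes as ZSTD_VERSION_NUMBER =
	 * 1 * 10000 + 4 * 100 + 5 = 10405, well below the 24-bit limit
	 * checked right after this.
	 */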
	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to be ever reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way where the compressed output changes.
	 *
	 * As soon as such an incompatibility occurs, handling code needs to
	 * be added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we
	 * must not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{
	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
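/*
 * Layout of an allocation as handed to the zstd library (sketch):
 *
 *   returned pointer ------------------+
 *                                      v
 *   +--------------------------------+----------------------------+
 *   | struct zstd_kmem (bookkeeping) | buffer used by zstd (size) |
 *   +--------------------------------+----------------------------+
 *
 * zstd_free() below recovers the bookkeeping header by stepping the
 * pointer back by sizeof (struct zstd_kmem) and then dispatches on
 * kmem_type to release the memory appropriately.
 */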
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx = (struct zstd_pool *)
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
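/*
 * Worked example for the rounding above (numbers are illustrative; the
 * actual estimate depends on the zstd version and platform): with
 * ZSTD_estimateDCtxSize() of about 160 KB (163840 bytes), a 24-byte
 * struct zstd_kmem on LP64, and a 4 KB PAGESIZE, P2ROUNDUP(163864, 4096)
 * yields 167936 bytes, i.e. 41 pages reserved for the fallback context.
 */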
/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Walk the pools and release objects that have sat unused past
	 * their timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, INT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
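/*
 * Note: with the standard OpenZFS module-parameter glue, these tunables
 * are expected to surface as zstd_earlyabort_pass and zstd_abort_size,
 * e.g. under /sys/module/zfs/parameters/ on Linux, with corresponding
 * vfs.zfs sysctls on FreeBSD; the exact names depend on the platform
 * wrappers.
 */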