1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 3. Neither the name of The DragonFly Project nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific, prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 33 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 /* 37 * This module handles low level logical file I/O (strategy) which backs 38 * the logical buffer cache. 39 * 40 * [De]compression, zero-block, check codes, and buffer cache operations 41 * for file data is handled here. 42 * 43 * Live dedup makes its home here as well. 44 */ 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/kernel.h> 49 #include <sys/fcntl.h> 50 #include <sys/buf.h> 51 #include <sys/proc.h> 52 #include <sys/namei.h> 53 #include <sys/mount.h> 54 #include <sys/vnode.h> 55 #include <sys/mountctl.h> 56 #include <sys/dirent.h> 57 #include <sys/uio.h> 58 #include <sys/objcache.h> 59 #include <sys/event.h> 60 #include <sys/file.h> 61 #include <vfs/fifofs/fifo.h> 62 63 #include "hammer2.h" 64 #include "hammer2_lz4.h" 65 66 #include "zlib/hammer2_zlib.h" 67 68 struct objcache *cache_buffer_read; 69 struct objcache *cache_buffer_write; 70 71 /* 72 * Strategy code (async logical file buffer I/O from system) 73 * 74 * WARNING: The strategy code cannot safely use hammer2 transactions 75 * as this can deadlock against vfs_sync's vfsync() call 76 * if multiple flushes are queued. All H2 structures must 77 * already be present and ready for the DIO. 78 * 79 * Reads can be initiated asynchronously, writes have to be 80 * spooled to a separate thread for action to avoid deadlocks. 81 */ 82 static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex); 83 static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex); 84 static int hammer2_strategy_read(struct vop_strategy_args *ap); 85 static int hammer2_strategy_write(struct vop_strategy_args *ap); 86 static void hammer2_strategy_read_completion(hammer2_chain_t *chain, 87 char *data, struct bio *bio); 88 89 static void hammer2_dedup_record(hammer2_chain_t *chain, char *data); 90 static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp, 91 char **datap, int pblksize); 92 93 int 94 hammer2_vop_strategy(struct vop_strategy_args *ap) 95 { 96 struct bio *biop; 97 struct buf *bp; 98 int error; 99 100 biop = ap->a_bio; 101 bp = biop->bio_buf; 102 103 switch(bp->b_cmd) { 104 case BUF_CMD_READ: 105 error = hammer2_strategy_read(ap); 106 ++hammer2_iod_file_read; 107 break; 108 case BUF_CMD_WRITE: 109 error = hammer2_strategy_write(ap); 110 ++hammer2_iod_file_write; 111 break; 112 default: 113 bp->b_error = error = EINVAL; 114 bp->b_flags |= B_ERROR; 115 biodone(biop); 116 break; 117 } 118 return (error); 119 } 120 121 /* 122 * Return the largest contiguous physical disk range for the logical 123 * request, in bytes. 124 * 125 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 126 * 127 * Basically disabled, the logical buffer write thread has to deal with 128 * buffers one-at-a-time. 129 */ 130 int 131 hammer2_vop_bmap(struct vop_bmap_args *ap) 132 { 133 *ap->a_doffsetp = NOOFFSET; 134 if (ap->a_runp) 135 *ap->a_runp = 0; 136 if (ap->a_runb) 137 *ap->a_runb = 0; 138 return (EOPNOTSUPP); 139 } 140 141 /**************************************************************************** 142 * READ SUPPORT * 143 ****************************************************************************/ 144 /* 145 * Callback used in read path in case that a block is compressed with LZ4. 146 */ 147 static 148 void 149 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio) 150 { 151 struct buf *bp; 152 char *compressed_buffer; 153 int compressed_size; 154 int result; 155 156 bp = bio->bio_buf; 157 158 #if 0 159 if bio->bio_caller_info2.index && 160 bio->bio_caller_info1.uvalue32 != 161 crc32(bp->b_data, bp->b_bufsize) --- return error 162 #endif 163 164 KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE); 165 compressed_size = *(const int *)data; 166 KKASSERT(compressed_size <= bytes - sizeof(int)); 167 168 compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT); 169 result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]), 170 compressed_buffer, 171 compressed_size, 172 bp->b_bufsize); 173 if (result < 0) { 174 kprintf("READ PATH: Error during decompression." 175 "bio %016jx/%d\n", 176 (intmax_t)bio->bio_offset, bytes); 177 /* make sure it isn't random garbage */ 178 bzero(compressed_buffer, bp->b_bufsize); 179 } 180 KKASSERT(result <= bp->b_bufsize); 181 bcopy(compressed_buffer, bp->b_data, bp->b_bufsize); 182 if (result < bp->b_bufsize) 183 bzero(bp->b_data + result, bp->b_bufsize - result); 184 objcache_put(cache_buffer_read, compressed_buffer); 185 bp->b_resid = 0; 186 bp->b_flags |= B_AGE; 187 } 188 189 /* 190 * Callback used in read path in case that a block is compressed with ZLIB. 191 * It is almost identical to LZ4 callback, so in theory they can be unified, 192 * but we didn't want to make changes in bio structure for that. 193 */ 194 static 195 void 196 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio) 197 { 198 struct buf *bp; 199 char *compressed_buffer; 200 z_stream strm_decompress; 201 int result; 202 int ret; 203 204 bp = bio->bio_buf; 205 206 KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE); 207 strm_decompress.avail_in = 0; 208 strm_decompress.next_in = Z_NULL; 209 210 ret = inflateInit(&strm_decompress); 211 212 if (ret != Z_OK) 213 kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n"); 214 215 compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT); 216 strm_decompress.next_in = __DECONST(char *, data); 217 218 /* XXX supply proper size, subset of device bp */ 219 strm_decompress.avail_in = bytes; 220 strm_decompress.next_out = compressed_buffer; 221 strm_decompress.avail_out = bp->b_bufsize; 222 223 ret = inflate(&strm_decompress, Z_FINISH); 224 if (ret != Z_STREAM_END) { 225 kprintf("HAMMER2 ZLIB: Fatar error during decompression.\n"); 226 bzero(compressed_buffer, bp->b_bufsize); 227 } 228 bcopy(compressed_buffer, bp->b_data, bp->b_bufsize); 229 result = bp->b_bufsize - strm_decompress.avail_out; 230 if (result < bp->b_bufsize) 231 bzero(bp->b_data + result, strm_decompress.avail_out); 232 objcache_put(cache_buffer_read, compressed_buffer); 233 ret = inflateEnd(&strm_decompress); 234 235 bp->b_resid = 0; 236 bp->b_flags |= B_AGE; 237 } 238 239 /* 240 * Logical buffer I/O, async read. 241 */ 242 static 243 int 244 hammer2_strategy_read(struct vop_strategy_args *ap) 245 { 246 hammer2_xop_strategy_t *xop; 247 struct buf *bp; 248 struct bio *bio; 249 struct bio *nbio; 250 hammer2_inode_t *ip; 251 hammer2_key_t lbase; 252 253 bio = ap->a_bio; 254 bp = bio->bio_buf; 255 ip = VTOI(ap->a_vp); 256 nbio = push_bio(bio); 257 258 lbase = bio->bio_offset; 259 KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0); 260 261 xop = hammer2_xop_alloc(ip, 0); 262 xop->finished = 0; 263 xop->bio = bio; 264 xop->lbase = lbase; 265 hammer2_mtx_init(&xop->lock, "h2bio"); 266 hammer2_xop_start(&xop->head, hammer2_strategy_xop_read); 267 /* asynchronous completion */ 268 269 return(0); 270 } 271 272 /* 273 * Per-node XOP (threaded), do a synchronous lookup of the chain and 274 * its data. The frontend is asynchronous, so we are also responsible 275 * for racing to terminate the frontend. 276 */ 277 static 278 void 279 hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex) 280 { 281 hammer2_xop_strategy_t *xop = &arg->xop_strategy; 282 hammer2_chain_t *parent; 283 hammer2_chain_t *chain; 284 hammer2_key_t key_dummy; 285 hammer2_key_t lbase; 286 struct bio *bio; 287 struct buf *bp; 288 int cache_index = -1; 289 int error; 290 291 lbase = xop->lbase; 292 bio = xop->bio; 293 bp = bio->bio_buf; 294 295 parent = hammer2_inode_chain(xop->head.ip1, clindex, 296 HAMMER2_RESOLVE_ALWAYS | 297 HAMMER2_RESOLVE_SHARED); 298 if (parent) { 299 chain = hammer2_chain_lookup(&parent, &key_dummy, 300 lbase, lbase, 301 &cache_index, 302 HAMMER2_LOOKUP_ALWAYS | 303 HAMMER2_LOOKUP_SHARED); 304 error = chain ? chain->error : 0; 305 } else { 306 error = EIO; 307 chain = NULL; 308 } 309 error = hammer2_xop_feed(&xop->head, chain, clindex, error); 310 if (chain) 311 hammer2_chain_drop(chain); 312 if (parent) { 313 hammer2_chain_unlock(parent); 314 hammer2_chain_drop(parent); 315 } 316 chain = NULL; /* safety */ 317 parent = NULL; /* safety */ 318 319 /* 320 * Race to finish the frontend 321 */ 322 if (xop->finished) 323 return; 324 hammer2_mtx_ex(&xop->lock); 325 if (xop->finished) { 326 hammer2_mtx_unlock(&xop->lock); 327 return; 328 } 329 330 /* 331 * Async operation has not completed and we now own the lock. 332 * Determine if we can complete the operation by issuing the 333 * frontend collection non-blocking. 334 */ 335 error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT); 336 337 switch(error) { 338 case 0: 339 xop->finished = 1; 340 hammer2_mtx_unlock(&xop->lock); 341 chain = xop->head.cluster.focus; 342 hammer2_strategy_read_completion(chain, (char *)chain->data, 343 xop->bio); 344 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 345 biodone(bio); 346 break; 347 case ENOENT: 348 xop->finished = 1; 349 hammer2_mtx_unlock(&xop->lock); 350 bp->b_resid = 0; 351 bp->b_error = 0; 352 bzero(bp->b_data, bp->b_bcount); 353 biodone(bio); 354 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 355 break; 356 case EINPROGRESS: 357 hammer2_mtx_unlock(&xop->lock); 358 break; 359 default: 360 xop->finished = 1; 361 hammer2_mtx_unlock(&xop->lock); 362 bp->b_flags |= B_ERROR; 363 bp->b_error = EIO; 364 biodone(bio); 365 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 366 break; 367 } 368 } 369 370 static 371 void 372 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data, 373 struct bio *bio) 374 { 375 struct buf *bp = bio->bio_buf; 376 377 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 378 /* 379 * Data is embedded in the inode (copy from inode). 380 */ 381 bcopy(((hammer2_inode_data_t *)data)->u.data, 382 bp->b_data, HAMMER2_EMBEDDED_BYTES); 383 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES, 384 bp->b_bcount - HAMMER2_EMBEDDED_BYTES); 385 bp->b_resid = 0; 386 bp->b_error = 0; 387 } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) { 388 /* 389 * Data is on-media, record for live dedup. 390 */ 391 hammer2_dedup_record(chain, data); 392 393 /* 394 * Decopmression and copy. 395 */ 396 switch (HAMMER2_DEC_COMP(chain->bref.methods)) { 397 case HAMMER2_COMP_LZ4: 398 hammer2_decompress_LZ4_callback(data, chain->bytes, 399 bio); 400 break; 401 case HAMMER2_COMP_ZLIB: 402 hammer2_decompress_ZLIB_callback(data, chain->bytes, 403 bio); 404 break; 405 case HAMMER2_COMP_NONE: 406 KKASSERT(chain->bytes <= bp->b_bcount); 407 bcopy(data, bp->b_data, chain->bytes); 408 if (chain->bytes < bp->b_bcount) { 409 bzero(bp->b_data + chain->bytes, 410 bp->b_bcount - chain->bytes); 411 } 412 bp->b_flags |= B_NOTMETA; 413 bp->b_resid = 0; 414 bp->b_error = 0; 415 break; 416 default: 417 panic("hammer2_strategy_read: " 418 "unknown compression type"); 419 } 420 } else { 421 panic("hammer2_strategy_read: unknown bref type"); 422 } 423 } 424 425 /**************************************************************************** 426 * WRITE SUPPORT * 427 ****************************************************************************/ 428 429 /* 430 * Functions for compression in threads, 431 * from hammer2_vnops.c 432 */ 433 static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip, 434 hammer2_chain_t **parentp, 435 hammer2_key_t lbase, int ioflag, int pblksize, 436 hammer2_tid_t mtid, int *errorp); 437 static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip, 438 hammer2_chain_t **parentp, 439 hammer2_key_t lbase, int ioflag, int pblksize, 440 hammer2_tid_t mtid, int *errorp, 441 int comp_algo, int check_algo); 442 static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip, 443 hammer2_chain_t **parentp, 444 hammer2_key_t lbase, int ioflag, int pblksize, 445 hammer2_tid_t mtid, int *errorp, 446 int check_algo); 447 static int test_block_zeros(const char *buf, size_t bytes); 448 static void zero_write(struct buf *bp, hammer2_inode_t *ip, 449 hammer2_chain_t **parentp, 450 hammer2_key_t lbase, 451 hammer2_tid_t mtid, int *errorp); 452 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, 453 int ioflag, int pblksize, 454 hammer2_tid_t mtid, int *errorp, 455 int check_algo); 456 457 static 458 int 459 hammer2_strategy_write(struct vop_strategy_args *ap) 460 { 461 hammer2_xop_strategy_t *xop; 462 hammer2_pfs_t *pmp; 463 struct bio *bio; 464 struct buf *bp; 465 hammer2_inode_t *ip; 466 467 bio = ap->a_bio; 468 bp = bio->bio_buf; 469 ip = VTOI(ap->a_vp); 470 pmp = ip->pmp; 471 472 hammer2_lwinprog_ref(pmp); 473 hammer2_trans_assert_strategy(pmp); 474 475 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 476 xop->finished = 0; 477 xop->bio = bio; 478 xop->lbase = bio->bio_offset; 479 hammer2_xop_start(&xop->head, hammer2_strategy_xop_write); 480 /* asynchronous completion */ 481 482 hammer2_lwinprog_wait(pmp, hammer2_flush_pipe); 483 484 return(0); 485 } 486 487 /* 488 * Per-node XOP (threaded). Write the logical buffer to the media. 489 */ 490 static 491 void 492 hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex) 493 { 494 hammer2_xop_strategy_t *xop = &arg->xop_strategy; 495 hammer2_chain_t *parent; 496 hammer2_key_t lbase; 497 hammer2_inode_t *ip; 498 struct bio *bio; 499 struct buf *bp; 500 int error; 501 int lblksize; 502 int pblksize; 503 504 lbase = xop->lbase; 505 bio = xop->bio; 506 bp = bio->bio_buf; 507 ip = xop->head.ip1; 508 509 /* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */ 510 511 lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL); 512 pblksize = hammer2_calc_physical(ip, lbase); 513 parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS); 514 hammer2_write_file_core(bp, ip, &parent, 515 lbase, IO_ASYNC, pblksize, 516 xop->head.mtid, &error); 517 if (parent) { 518 hammer2_chain_unlock(parent); 519 hammer2_chain_drop(parent); 520 parent = NULL; /* safety */ 521 } 522 error = hammer2_xop_feed(&xop->head, NULL, clindex, error); 523 524 /* 525 * Race to finish the frontend 526 */ 527 if (xop->finished) 528 return; 529 hammer2_mtx_ex(&xop->lock); 530 if (xop->finished) { 531 hammer2_mtx_unlock(&xop->lock); 532 return; 533 } 534 535 /* 536 * Async operation has not completed and we now own the lock. 537 * Determine if we can complete the operation by issuing the 538 * frontend collection non-blocking. 539 */ 540 error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT); 541 542 switch(error) { 543 case ENOENT: 544 case 0: 545 xop->finished = 1; 546 hammer2_mtx_unlock(&xop->lock); 547 bp->b_resid = 0; 548 bp->b_error = 0; 549 biodone(bio); 550 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 551 hammer2_lwinprog_drop(ip->pmp); 552 break; 553 case EINPROGRESS: 554 hammer2_mtx_unlock(&xop->lock); 555 break; 556 default: 557 xop->finished = 1; 558 hammer2_mtx_unlock(&xop->lock); 559 bp->b_flags |= B_ERROR; 560 bp->b_error = EIO; 561 biodone(bio); 562 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 563 hammer2_lwinprog_drop(ip->pmp); 564 break; 565 } 566 } 567 568 /* 569 * Wait for pending I/O to complete 570 */ 571 void 572 hammer2_bioq_sync(hammer2_pfs_t *pmp) 573 { 574 hammer2_lwinprog_wait(pmp, 0); 575 } 576 577 /* 578 * Create a new cluster at (cparent, lbase) and assign physical storage, 579 * returning a cluster suitable for I/O. The cluster will be in a modified 580 * state. 581 * 582 * cparent can wind up being anything. 583 * 584 * If datap is not NULL, *datap points to the real data we intend to write. 585 * If we can dedup the storage location we set *datap to NULL to indicate 586 * to the caller that a dedup occurred. 587 * 588 * NOTE: Special case for data embedded in inode. 589 */ 590 static 591 hammer2_chain_t * 592 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp, 593 hammer2_key_t lbase, int pblksize, 594 hammer2_tid_t mtid, char **datap, int *errorp) 595 { 596 hammer2_chain_t *chain; 597 hammer2_key_t key_dummy; 598 hammer2_off_t dedup_off; 599 int pradix = hammer2_getradix(pblksize); 600 int cache_index = -1; 601 602 /* 603 * Locate the chain associated with lbase, return a locked chain. 604 * However, do not instantiate any data reference (which utilizes a 605 * device buffer) because we will be using direct IO via the 606 * logical buffer cache buffer. 607 */ 608 *errorp = 0; 609 KKASSERT(pblksize >= HAMMER2_ALLOC_MIN); 610 retry: 611 chain = hammer2_chain_lookup(parentp, &key_dummy, 612 lbase, lbase, 613 &cache_index, 614 HAMMER2_LOOKUP_NODATA); 615 if (chain == NULL) { 616 /* 617 * We found a hole, create a new chain entry. 618 * 619 * NOTE: DATA chains are created without device backing 620 * store (nor do we want any). 621 */ 622 dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap, 623 pblksize); 624 *errorp = hammer2_chain_create(parentp, &chain, ip->pmp, 625 lbase, HAMMER2_PBUFRADIX, 626 HAMMER2_BREF_TYPE_DATA, 627 pblksize, mtid, 628 dedup_off, 0); 629 if (chain == NULL) { 630 panic("hammer2_chain_create: par=%p error=%d\n", 631 *parentp, *errorp); 632 goto retry; 633 } 634 /*ip->delta_dcount += pblksize;*/ 635 } else { 636 switch (chain->bref.type) { 637 case HAMMER2_BREF_TYPE_INODE: 638 /* 639 * The data is embedded in the inode, which requires 640 * a bit more finess. 641 */ 642 hammer2_chain_modify_ip(ip, chain, mtid, 0); 643 break; 644 case HAMMER2_BREF_TYPE_DATA: 645 dedup_off = hammer2_dedup_lookup(chain->hmp, datap, 646 pblksize); 647 if (chain->bytes != pblksize) { 648 hammer2_chain_resize(ip, *parentp, chain, 649 mtid, dedup_off, 650 pradix, 651 HAMMER2_MODIFY_OPTDATA); 652 } 653 654 /* 655 * DATA buffers must be marked modified whether the 656 * data is in a logical buffer or not. We also have 657 * to make this call to fixup the chain data pointers 658 * after resizing in case this is an encrypted or 659 * compressed buffer. 660 */ 661 hammer2_chain_modify(chain, mtid, dedup_off, 662 HAMMER2_MODIFY_OPTDATA); 663 break; 664 default: 665 panic("hammer2_assign_physical: bad type"); 666 /* NOT REACHED */ 667 break; 668 } 669 } 670 return (chain); 671 } 672 673 /* 674 * hammer2_write_file_core() - hammer2_write_thread() helper 675 * 676 * The core write function which determines which path to take 677 * depending on compression settings. We also have to locate the 678 * related chains so we can calculate and set the check data for 679 * the blockref. 680 */ 681 static 682 void 683 hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip, 684 hammer2_chain_t **parentp, 685 hammer2_key_t lbase, int ioflag, int pblksize, 686 hammer2_tid_t mtid, int *errorp) 687 { 688 hammer2_chain_t *chain; 689 char *data = bp->b_data; 690 691 switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) { 692 case HAMMER2_COMP_NONE: 693 /* 694 * We have to assign physical storage to the buffer 695 * we intend to dirty or write now to avoid deadlocks 696 * in the strategy code later. 697 * 698 * This can return NOOFFSET for inode-embedded data. 699 * The strategy code will take care of it in that case. 700 */ 701 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize, 702 mtid, &data, errorp); 703 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 704 hammer2_inode_data_t *wipdata; 705 706 wipdata = &chain->data->ipdata; 707 KKASSERT(wipdata->meta.op_flags & 708 HAMMER2_OPFLAG_DIRECTDATA); 709 KKASSERT(bp->b_loffset == 0); 710 bcopy(bp->b_data, wipdata->u.data, 711 HAMMER2_EMBEDDED_BYTES); 712 ++hammer2_iod_file_wembed; 713 } else if (data == NULL) { 714 /* 715 * Copy of data already present on-media. 716 */ 717 chain->bref.methods = 718 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) + 719 HAMMER2_ENC_CHECK(ip->meta.check_algo); 720 hammer2_chain_setcheck(chain, bp->b_data); 721 } else { 722 hammer2_write_bp(chain, bp, ioflag, pblksize, 723 mtid, errorp, ip->meta.check_algo); 724 } 725 if (chain) { 726 hammer2_chain_unlock(chain); 727 hammer2_chain_drop(chain); 728 } 729 break; 730 case HAMMER2_COMP_AUTOZERO: 731 /* 732 * Check for zero-fill only 733 */ 734 hammer2_zero_check_and_write(bp, ip, parentp, 735 lbase, ioflag, pblksize, 736 mtid, errorp, 737 ip->meta.check_algo); 738 break; 739 case HAMMER2_COMP_LZ4: 740 case HAMMER2_COMP_ZLIB: 741 default: 742 /* 743 * Check for zero-fill and attempt compression. 744 */ 745 hammer2_compress_and_write(bp, ip, parentp, 746 lbase, ioflag, pblksize, 747 mtid, errorp, 748 ip->meta.comp_algo, 749 ip->meta.check_algo); 750 break; 751 } 752 } 753 754 /* 755 * Helper 756 * 757 * Generic function that will perform the compression in compression 758 * write path. The compression algorithm is determined by the settings 759 * obtained from inode. 760 */ 761 static 762 void 763 hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip, 764 hammer2_chain_t **parentp, 765 hammer2_key_t lbase, int ioflag, int pblksize, 766 hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo) 767 { 768 hammer2_chain_t *chain; 769 int comp_size; 770 int comp_block_size; 771 char *comp_buffer; 772 char *data; 773 774 if (test_block_zeros(bp->b_data, pblksize)) { 775 zero_write(bp, ip, parentp, lbase, mtid, errorp); 776 return; 777 } 778 779 comp_size = 0; 780 comp_buffer = NULL; 781 782 KKASSERT(pblksize / 2 <= 32768); 783 784 if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) { 785 z_stream strm_compress; 786 int comp_level; 787 int ret; 788 789 switch(HAMMER2_DEC_ALGO(comp_algo)) { 790 case HAMMER2_COMP_LZ4: 791 comp_buffer = objcache_get(cache_buffer_write, 792 M_INTWAIT); 793 comp_size = LZ4_compress_limitedOutput( 794 bp->b_data, 795 &comp_buffer[sizeof(int)], 796 pblksize, 797 pblksize / 2 - sizeof(int)); 798 /* 799 * We need to prefix with the size, LZ4 800 * doesn't do it for us. Add the related 801 * overhead. 802 */ 803 *(int *)comp_buffer = comp_size; 804 if (comp_size) 805 comp_size += sizeof(int); 806 break; 807 case HAMMER2_COMP_ZLIB: 808 comp_level = HAMMER2_DEC_LEVEL(comp_algo); 809 if (comp_level == 0) 810 comp_level = 6; /* default zlib compression */ 811 else if (comp_level < 6) 812 comp_level = 6; 813 else if (comp_level > 9) 814 comp_level = 9; 815 ret = deflateInit(&strm_compress, comp_level); 816 if (ret != Z_OK) { 817 kprintf("HAMMER2 ZLIB: fatal error " 818 "on deflateInit.\n"); 819 } 820 821 comp_buffer = objcache_get(cache_buffer_write, 822 M_INTWAIT); 823 strm_compress.next_in = bp->b_data; 824 strm_compress.avail_in = pblksize; 825 strm_compress.next_out = comp_buffer; 826 strm_compress.avail_out = pblksize / 2; 827 ret = deflate(&strm_compress, Z_FINISH); 828 if (ret == Z_STREAM_END) { 829 comp_size = pblksize / 2 - 830 strm_compress.avail_out; 831 } else { 832 comp_size = 0; 833 } 834 ret = deflateEnd(&strm_compress); 835 break; 836 default: 837 kprintf("Error: Unknown compression method.\n"); 838 kprintf("Comp_method = %d.\n", comp_algo); 839 break; 840 } 841 } 842 843 if (comp_size == 0) { 844 /* 845 * compression failed or turned off 846 */ 847 comp_block_size = pblksize; /* safety */ 848 if (++ip->comp_heuristic > 128) 849 ip->comp_heuristic = 8; 850 } else { 851 /* 852 * compression succeeded 853 */ 854 ip->comp_heuristic = 0; 855 if (comp_size <= 1024) { 856 comp_block_size = 1024; 857 } else if (comp_size <= 2048) { 858 comp_block_size = 2048; 859 } else if (comp_size <= 4096) { 860 comp_block_size = 4096; 861 } else if (comp_size <= 8192) { 862 comp_block_size = 8192; 863 } else if (comp_size <= 16384) { 864 comp_block_size = 16384; 865 } else if (comp_size <= 32768) { 866 comp_block_size = 32768; 867 } else { 868 panic("hammer2: WRITE PATH: " 869 "Weird comp_size value."); 870 /* NOT REACHED */ 871 comp_block_size = pblksize; 872 } 873 874 /* 875 * Must zero the remainder or dedup (which operates on a 876 * physical block basis) will not find matches. 877 */ 878 if (comp_size < comp_block_size) { 879 bzero(comp_buffer + comp_size, 880 comp_block_size - comp_size); 881 } 882 } 883 884 /* 885 * Assign physical storage, data will be set to NULL if a live-dedup 886 * was successful. 887 */ 888 data = comp_size ? comp_buffer : bp->b_data; 889 chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size, 890 mtid, &data, errorp); 891 892 if (*errorp) { 893 kprintf("WRITE PATH: An error occurred while " 894 "assigning physical space.\n"); 895 KKASSERT(chain == NULL); 896 goto done; 897 } 898 899 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 900 hammer2_inode_data_t *wipdata; 901 902 hammer2_chain_modify_ip(ip, chain, mtid, 0); 903 wipdata = &chain->data->ipdata; 904 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA); 905 KKASSERT(bp->b_loffset == 0); 906 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 907 ++hammer2_iod_file_wembed; 908 } else if (data == NULL) { 909 /* 910 * Live deduplication, a copy of the data is already present 911 * on the media. 912 */ 913 char *bdata; 914 915 if (comp_size) { 916 chain->bref.methods = 917 HAMMER2_ENC_COMP(comp_algo) + 918 HAMMER2_ENC_CHECK(check_algo); 919 } else { 920 chain->bref.methods = 921 HAMMER2_ENC_COMP( 922 HAMMER2_COMP_NONE) + 923 HAMMER2_ENC_CHECK(check_algo); 924 } 925 bdata = comp_size ? comp_buffer : bp->b_data; 926 hammer2_chain_setcheck(chain, bdata); 927 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 928 } else { 929 hammer2_io_t *dio; 930 char *bdata; 931 932 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 933 934 switch(chain->bref.type) { 935 case HAMMER2_BREF_TYPE_INODE: 936 panic("hammer2_write_bp: unexpected inode\n"); 937 break; 938 case HAMMER2_BREF_TYPE_DATA: 939 /* 940 * Optimize out the read-before-write 941 * if possible. 942 */ 943 *errorp = hammer2_io_newnz(chain->hmp, 944 chain->bref.data_off, 945 chain->bytes, 946 &dio); 947 if (*errorp) { 948 hammer2_io_brelse(&dio); 949 kprintf("hammer2: WRITE PATH: " 950 "dbp bread error\n"); 951 break; 952 } 953 bdata = hammer2_io_data(dio, chain->bref.data_off); 954 955 /* 956 * When loading the block make sure we don't 957 * leave garbage after the compressed data. 958 */ 959 if (comp_size) { 960 chain->bref.methods = 961 HAMMER2_ENC_COMP(comp_algo) + 962 HAMMER2_ENC_CHECK(check_algo); 963 bcopy(comp_buffer, bdata, comp_size); 964 } else { 965 chain->bref.methods = 966 HAMMER2_ENC_COMP( 967 HAMMER2_COMP_NONE) + 968 HAMMER2_ENC_CHECK(check_algo); 969 bcopy(bp->b_data, bdata, pblksize); 970 } 971 972 /* 973 * The flush code doesn't calculate check codes for 974 * file data (doing so can result in excessive I/O), 975 * so we do it here. 976 */ 977 hammer2_chain_setcheck(chain, bdata); 978 hammer2_dedup_record(chain, bdata); 979 980 /* 981 * Device buffer is now valid, chain is no longer in 982 * the initial state. 983 * 984 * (No blockref table worries with file data) 985 */ 986 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 987 988 /* Now write the related bdp. */ 989 if (ioflag & IO_SYNC) { 990 /* 991 * Synchronous I/O requested. 992 */ 993 hammer2_io_bwrite(&dio); 994 /* 995 } else if ((ioflag & IO_DIRECT) && 996 loff + n == pblksize) { 997 hammer2_io_bdwrite(&dio); 998 */ 999 } else if (ioflag & IO_ASYNC) { 1000 hammer2_io_bawrite(&dio); 1001 } else { 1002 hammer2_io_bdwrite(&dio); 1003 } 1004 break; 1005 default: 1006 panic("hammer2_write_bp: bad chain type %d\n", 1007 chain->bref.type); 1008 /* NOT REACHED */ 1009 break; 1010 } 1011 } 1012 done: 1013 if (chain) { 1014 hammer2_chain_unlock(chain); 1015 hammer2_chain_drop(chain); 1016 } 1017 if (comp_buffer) 1018 objcache_put(cache_buffer_write, comp_buffer); 1019 } 1020 1021 /* 1022 * Helper 1023 * 1024 * Function that performs zero-checking and writing without compression, 1025 * it corresponds to default zero-checking path. 1026 */ 1027 static 1028 void 1029 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip, 1030 hammer2_chain_t **parentp, 1031 hammer2_key_t lbase, int ioflag, int pblksize, 1032 hammer2_tid_t mtid, int *errorp, 1033 int check_algo) 1034 { 1035 hammer2_chain_t *chain; 1036 char *data = bp->b_data; 1037 1038 if (test_block_zeros(bp->b_data, pblksize)) { 1039 zero_write(bp, ip, parentp, lbase, mtid, errorp); 1040 } else { 1041 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize, 1042 mtid, &data, errorp); 1043 if (data) { 1044 hammer2_write_bp(chain, bp, ioflag, pblksize, 1045 mtid, errorp, check_algo); 1046 } /* else dedup occurred */ 1047 if (chain) { 1048 hammer2_chain_unlock(chain); 1049 hammer2_chain_drop(chain); 1050 } 1051 } 1052 } 1053 1054 /* 1055 * Helper 1056 * 1057 * A function to test whether a block of data contains only zeros, 1058 * returns TRUE (non-zero) if the block is all zeros. 1059 */ 1060 static 1061 int 1062 test_block_zeros(const char *buf, size_t bytes) 1063 { 1064 size_t i; 1065 1066 for (i = 0; i < bytes; i += sizeof(long)) { 1067 if (*(const long *)(buf + i) != 0) 1068 return (0); 1069 } 1070 return (1); 1071 } 1072 1073 /* 1074 * Helper 1075 * 1076 * Function to "write" a block that contains only zeros. 1077 */ 1078 static 1079 void 1080 zero_write(struct buf *bp, hammer2_inode_t *ip, 1081 hammer2_chain_t **parentp, 1082 hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused) 1083 { 1084 hammer2_chain_t *chain; 1085 hammer2_key_t key_dummy; 1086 int cache_index = -1; 1087 1088 chain = hammer2_chain_lookup(parentp, &key_dummy, 1089 lbase, lbase, 1090 &cache_index, 1091 HAMMER2_LOOKUP_NODATA); 1092 if (chain) { 1093 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 1094 hammer2_inode_data_t *wipdata; 1095 1096 hammer2_chain_modify_ip(ip, chain, mtid, 0); 1097 wipdata = &chain->data->ipdata; 1098 KKASSERT(wipdata->meta.op_flags & 1099 HAMMER2_OPFLAG_DIRECTDATA); 1100 KKASSERT(bp->b_loffset == 0); 1101 bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1102 ++hammer2_iod_file_wembed; 1103 } else { 1104 hammer2_chain_delete(*parentp, chain, 1105 mtid, HAMMER2_DELETE_PERMANENT); 1106 ++hammer2_iod_file_wzero; 1107 } 1108 hammer2_chain_unlock(chain); 1109 hammer2_chain_drop(chain); 1110 } else { 1111 ++hammer2_iod_file_wzero; 1112 } 1113 } 1114 1115 /* 1116 * Helper 1117 * 1118 * Function to write the data as it is, without performing any sort of 1119 * compression. This function is used in path without compression and 1120 * default zero-checking path. 1121 */ 1122 static 1123 void 1124 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag, 1125 int pblksize, 1126 hammer2_tid_t mtid, int *errorp, int check_algo) 1127 { 1128 hammer2_inode_data_t *wipdata; 1129 hammer2_io_t *dio; 1130 char *bdata; 1131 int error; 1132 1133 error = 0; /* XXX TODO below */ 1134 1135 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 1136 1137 switch(chain->bref.type) { 1138 case HAMMER2_BREF_TYPE_INODE: 1139 wipdata = &chain->data->ipdata; 1140 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1141 KKASSERT(bp->b_loffset == 0); 1142 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1143 error = 0; 1144 ++hammer2_iod_file_wembed; 1145 break; 1146 case HAMMER2_BREF_TYPE_DATA: 1147 error = hammer2_io_newnz(chain->hmp, 1148 chain->bref.data_off, 1149 chain->bytes, &dio); 1150 if (error) { 1151 hammer2_io_bqrelse(&dio); 1152 kprintf("hammer2: WRITE PATH: " 1153 "dbp bread error\n"); 1154 break; 1155 } 1156 bdata = hammer2_io_data(dio, chain->bref.data_off); 1157 1158 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) + 1159 HAMMER2_ENC_CHECK(check_algo); 1160 bcopy(bp->b_data, bdata, chain->bytes); 1161 1162 /* 1163 * The flush code doesn't calculate check codes for 1164 * file data (doing so can result in excessive I/O), 1165 * so we do it here. 1166 */ 1167 hammer2_chain_setcheck(chain, bdata); 1168 hammer2_dedup_record(chain, bdata); 1169 1170 /* 1171 * Device buffer is now valid, chain is no longer in 1172 * the initial state. 1173 * 1174 * (No blockref table worries with file data) 1175 */ 1176 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1177 1178 if (ioflag & IO_SYNC) { 1179 /* 1180 * Synchronous I/O requested. 1181 */ 1182 hammer2_io_bwrite(&dio); 1183 /* 1184 } else if ((ioflag & IO_DIRECT) && 1185 loff + n == pblksize) { 1186 hammer2_io_bdwrite(&dio); 1187 */ 1188 } else if (ioflag & IO_ASYNC) { 1189 hammer2_io_bawrite(&dio); 1190 } else { 1191 hammer2_io_bdwrite(&dio); 1192 } 1193 break; 1194 default: 1195 panic("hammer2_write_bp: bad chain type %d\n", 1196 chain->bref.type); 1197 /* NOT REACHED */ 1198 error = 0; 1199 break; 1200 } 1201 KKASSERT(error == 0); /* XXX TODO */ 1202 *errorp = error; 1203 } 1204 1205 /* 1206 * LIVE DEDUP HEURISTIC 1207 * 1208 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1209 * All fields must be loaded into locals and validated. 1210 */ 1211 static 1212 void 1213 hammer2_dedup_record(hammer2_chain_t *chain, char *data) 1214 { 1215 hammer2_dev_t *hmp; 1216 hammer2_dedup_t *dedup; 1217 int32_t crc; 1218 int best = 0; 1219 int i; 1220 int dticks; 1221 1222 hmp = chain->hmp; 1223 crc = hammer2_icrc32(data, chain->bytes); 1224 dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)]; 1225 for (i = 0; i < 4; ++i) { 1226 if (dedup[i].data_crc == crc) { 1227 best = i; 1228 break; 1229 } 1230 dticks = (int)(dedup[i].ticks - dedup[best].ticks); 1231 if (dticks < 0 || dticks > hz * 60 * 30) 1232 best = i; 1233 } 1234 dedup += best; 1235 if (hammer2_debug & 0x40000) { 1236 kprintf("REC %04x %08x %016jx\n", 1237 (int)(dedup - hmp->heur_dedup), 1238 crc, 1239 chain->bref.data_off); 1240 } 1241 dedup->ticks = ticks; 1242 dedup->data_off = chain->bref.data_off; 1243 dedup->data_crc = crc; 1244 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP); 1245 } 1246 1247 static 1248 hammer2_off_t 1249 hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize) 1250 { 1251 hammer2_dedup_t *dedup; 1252 hammer2_io_t *dio; 1253 hammer2_off_t off; 1254 uint32_t crc; 1255 char *data; 1256 int i; 1257 1258 data = *datap; 1259 if (data == NULL) 1260 return 0; 1261 1262 crc = hammer2_icrc32(data, pblksize); 1263 dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)]; 1264 1265 if (hammer2_debug & 0x40000) { 1266 kprintf("LOC %04x/4 %08x\n", 1267 (int)(dedup - hmp->heur_dedup), 1268 crc); 1269 } 1270 1271 for (i = 0; i < 4; ++i) { 1272 off = dedup[i].data_off; 1273 cpu_ccfence(); 1274 if (dedup[i].data_crc != crc) 1275 continue; 1276 if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize) 1277 continue; 1278 dio = hammer2_io_getquick(hmp, off, pblksize); 1279 if (dio && 1280 bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) { 1281 if (hammer2_debug & 0x40000) { 1282 kprintf("DEDUP SUCCESS %016jx\n", 1283 (intmax_t)off); 1284 } 1285 hammer2_io_putblk(&dio); 1286 *datap = NULL; 1287 dedup[i].ticks = ticks; /* update use */ 1288 ++hammer2_iod_file_wdedup; 1289 return off; /* RETURN */ 1290 } 1291 if (dio) 1292 hammer2_io_putblk(&dio); 1293 } 1294 return 0; 1295 } 1296 1297 /* 1298 * Poof. Races are ok, if someone gets in and reuses a dedup offset 1299 * before or while we are clearing it they will also recover the freemap 1300 * entry (set it to fully allocated), so a bulkfree race can only set it 1301 * to a possibly-free state. 1302 * 1303 * XXX ok, well, not really sure races are ok but going to run with it 1304 * for the moment. 1305 */ 1306 void 1307 hammer2_dedup_clear(hammer2_dev_t *hmp) 1308 { 1309 int i; 1310 1311 for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) { 1312 hmp->heur_dedup[i].data_off = 0; 1313 hmp->heur_dedup[i].ticks = ticks - 1; 1314 } 1315 } 1316