1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 3. Neither the name of The DragonFly Project nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific, prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 33 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 /* 37 * This module handles low level logical file I/O (strategy) which backs 38 * the logical buffer cache. 39 * 40 * [De]compression, zero-block, check codes, and buffer cache operations 41 * for file data is handled here. 42 * 43 * Live dedup makes its home here as well. 44 */ 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/kernel.h> 49 #include <sys/fcntl.h> 50 #include <sys/buf.h> 51 #include <sys/proc.h> 52 #include <sys/namei.h> 53 #include <sys/mount.h> 54 #include <sys/vnode.h> 55 #include <sys/mountctl.h> 56 #include <sys/dirent.h> 57 #include <sys/uio.h> 58 #include <sys/objcache.h> 59 #include <sys/event.h> 60 #include <sys/file.h> 61 #include <vfs/fifofs/fifo.h> 62 63 #include "hammer2.h" 64 #include "hammer2_lz4.h" 65 66 #include "zlib/hammer2_zlib.h" 67 68 struct objcache *cache_buffer_read; 69 struct objcache *cache_buffer_write; 70 71 /* 72 * Strategy code (async logical file buffer I/O from system) 73 * 74 * WARNING: The strategy code cannot safely use hammer2 transactions 75 * as this can deadlock against vfs_sync's vfsync() call 76 * if multiple flushes are queued. All H2 structures must 77 * already be present and ready for the DIO. 78 * 79 * Reads can be initiated asynchronously, writes have to be 80 * spooled to a separate thread for action to avoid deadlocks. 81 */ 82 static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex); 83 static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex); 84 static int hammer2_strategy_read(struct vop_strategy_args *ap); 85 static int hammer2_strategy_write(struct vop_strategy_args *ap); 86 static void hammer2_strategy_read_completion(hammer2_chain_t *chain, 87 char *data, struct bio *bio); 88 89 static void hammer2_dedup_record(hammer2_chain_t *chain, char *data); 90 static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp, 91 char **datap, int pblksize); 92 93 int 94 hammer2_vop_strategy(struct vop_strategy_args *ap) 95 { 96 struct bio *biop; 97 struct buf *bp; 98 int error; 99 100 biop = ap->a_bio; 101 bp = biop->bio_buf; 102 103 switch(bp->b_cmd) { 104 case BUF_CMD_READ: 105 error = hammer2_strategy_read(ap); 106 ++hammer2_iod_file_read; 107 break; 108 case BUF_CMD_WRITE: 109 error = hammer2_strategy_write(ap); 110 ++hammer2_iod_file_write; 111 break; 112 default: 113 bp->b_error = error = EINVAL; 114 bp->b_flags |= B_ERROR; 115 biodone(biop); 116 break; 117 } 118 return (error); 119 } 120 121 /* 122 * Return the largest contiguous physical disk range for the logical 123 * request, in bytes. 124 * 125 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb) 126 * 127 * Basically disabled, the logical buffer write thread has to deal with 128 * buffers one-at-a-time. 129 */ 130 int 131 hammer2_vop_bmap(struct vop_bmap_args *ap) 132 { 133 *ap->a_doffsetp = NOOFFSET; 134 if (ap->a_runp) 135 *ap->a_runp = 0; 136 if (ap->a_runb) 137 *ap->a_runb = 0; 138 return (EOPNOTSUPP); 139 } 140 141 /**************************************************************************** 142 * READ SUPPORT * 143 ****************************************************************************/ 144 /* 145 * Callback used in read path in case that a block is compressed with LZ4. 146 */ 147 static 148 void 149 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio) 150 { 151 struct buf *bp; 152 char *compressed_buffer; 153 int compressed_size; 154 int result; 155 156 bp = bio->bio_buf; 157 158 #if 0 159 if bio->bio_caller_info2.index && 160 bio->bio_caller_info1.uvalue32 != 161 crc32(bp->b_data, bp->b_bufsize) --- return error 162 #endif 163 164 KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE); 165 compressed_size = *(const int *)data; 166 KKASSERT(compressed_size <= bytes - sizeof(int)); 167 168 compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT); 169 result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]), 170 compressed_buffer, 171 compressed_size, 172 bp->b_bufsize); 173 if (result < 0) { 174 kprintf("READ PATH: Error during decompression." 175 "bio %016jx/%d\n", 176 (intmax_t)bio->bio_offset, bytes); 177 /* make sure it isn't random garbage */ 178 bzero(compressed_buffer, bp->b_bufsize); 179 } 180 KKASSERT(result <= bp->b_bufsize); 181 bcopy(compressed_buffer, bp->b_data, bp->b_bufsize); 182 if (result < bp->b_bufsize) 183 bzero(bp->b_data + result, bp->b_bufsize - result); 184 objcache_put(cache_buffer_read, compressed_buffer); 185 bp->b_resid = 0; 186 bp->b_flags |= B_AGE; 187 } 188 189 /* 190 * Callback used in read path in case that a block is compressed with ZLIB. 191 * It is almost identical to LZ4 callback, so in theory they can be unified, 192 * but we didn't want to make changes in bio structure for that. 193 */ 194 static 195 void 196 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio) 197 { 198 struct buf *bp; 199 char *compressed_buffer; 200 z_stream strm_decompress; 201 int result; 202 int ret; 203 204 bp = bio->bio_buf; 205 206 KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE); 207 strm_decompress.avail_in = 0; 208 strm_decompress.next_in = Z_NULL; 209 210 ret = inflateInit(&strm_decompress); 211 212 if (ret != Z_OK) 213 kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n"); 214 215 compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT); 216 strm_decompress.next_in = __DECONST(char *, data); 217 218 /* XXX supply proper size, subset of device bp */ 219 strm_decompress.avail_in = bytes; 220 strm_decompress.next_out = compressed_buffer; 221 strm_decompress.avail_out = bp->b_bufsize; 222 223 ret = inflate(&strm_decompress, Z_FINISH); 224 if (ret != Z_STREAM_END) { 225 kprintf("HAMMER2 ZLIB: Fatar error during decompression.\n"); 226 bzero(compressed_buffer, bp->b_bufsize); 227 } 228 bcopy(compressed_buffer, bp->b_data, bp->b_bufsize); 229 result = bp->b_bufsize - strm_decompress.avail_out; 230 if (result < bp->b_bufsize) 231 bzero(bp->b_data + result, strm_decompress.avail_out); 232 objcache_put(cache_buffer_read, compressed_buffer); 233 ret = inflateEnd(&strm_decompress); 234 235 bp->b_resid = 0; 236 bp->b_flags |= B_AGE; 237 } 238 239 /* 240 * Logical buffer I/O, async read. 241 */ 242 static 243 int 244 hammer2_strategy_read(struct vop_strategy_args *ap) 245 { 246 hammer2_xop_strategy_t *xop; 247 struct buf *bp; 248 struct bio *bio; 249 struct bio *nbio; 250 hammer2_inode_t *ip; 251 hammer2_key_t lbase; 252 253 bio = ap->a_bio; 254 bp = bio->bio_buf; 255 ip = VTOI(ap->a_vp); 256 nbio = push_bio(bio); 257 258 lbase = bio->bio_offset; 259 KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0); 260 261 xop = hammer2_xop_alloc(ip, 0); 262 xop->finished = 0; 263 xop->bio = bio; 264 xop->lbase = lbase; 265 hammer2_mtx_init(&xop->lock, "h2bio"); 266 hammer2_xop_start(&xop->head, hammer2_strategy_xop_read); 267 268 return(0); 269 } 270 271 /* 272 * Per-node XOP (threaded), do a synchronous lookup of the chain and 273 * its data. The frontend is asynchronous, so we are also responsible 274 * for racing to terminate the frontend. 275 */ 276 static 277 void 278 hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex) 279 { 280 hammer2_xop_strategy_t *xop = &arg->xop_strategy; 281 hammer2_chain_t *parent; 282 hammer2_chain_t *chain; 283 hammer2_key_t key_dummy; 284 hammer2_key_t lbase; 285 struct bio *bio; 286 struct buf *bp; 287 int cache_index = -1; 288 int error; 289 290 lbase = xop->lbase; 291 bio = xop->bio; 292 bp = bio->bio_buf; 293 294 parent = hammer2_inode_chain(xop->head.ip1, clindex, 295 HAMMER2_RESOLVE_ALWAYS | 296 HAMMER2_RESOLVE_SHARED); 297 if (parent) { 298 chain = hammer2_chain_lookup(&parent, &key_dummy, 299 lbase, lbase, 300 &cache_index, 301 HAMMER2_LOOKUP_ALWAYS | 302 HAMMER2_LOOKUP_SHARED); 303 error = chain ? chain->error : 0; 304 } else { 305 error = EIO; 306 chain = NULL; 307 } 308 error = hammer2_xop_feed(&xop->head, chain, clindex, error); 309 if (chain) 310 hammer2_chain_drop(chain); 311 if (parent) { 312 hammer2_chain_unlock(parent); 313 hammer2_chain_drop(parent); 314 } 315 chain = NULL; /* safety */ 316 parent = NULL; /* safety */ 317 318 /* 319 * Race to finish the frontend 320 */ 321 if (xop->finished) 322 return; 323 hammer2_mtx_ex(&xop->lock); 324 if (xop->finished) { 325 hammer2_mtx_unlock(&xop->lock); 326 return; 327 } 328 329 /* 330 * Async operation has not completed and we now own the lock. 331 * Determine if we can complete the operation by issuing the 332 * frontend collection non-blocking. 333 */ 334 error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT); 335 336 switch(error) { 337 case 0: 338 xop->finished = 1; 339 hammer2_mtx_unlock(&xop->lock); 340 chain = xop->head.cluster.focus; 341 hammer2_strategy_read_completion(chain, (char *)chain->data, 342 xop->bio); 343 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 344 biodone(bio); 345 break; 346 case ENOENT: 347 xop->finished = 1; 348 hammer2_mtx_unlock(&xop->lock); 349 bp->b_resid = 0; 350 bp->b_error = 0; 351 bzero(bp->b_data, bp->b_bcount); 352 biodone(bio); 353 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 354 break; 355 case EINPROGRESS: 356 hammer2_mtx_unlock(&xop->lock); 357 break; 358 default: 359 xop->finished = 1; 360 hammer2_mtx_unlock(&xop->lock); 361 bp->b_flags |= B_ERROR; 362 bp->b_error = EIO; 363 biodone(bio); 364 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 365 break; 366 } 367 } 368 369 static 370 void 371 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data, 372 struct bio *bio) 373 { 374 struct buf *bp = bio->bio_buf; 375 376 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 377 /* 378 * Data is embedded in the inode (copy from inode). 379 */ 380 bcopy(((hammer2_inode_data_t *)data)->u.data, 381 bp->b_data, HAMMER2_EMBEDDED_BYTES); 382 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES, 383 bp->b_bcount - HAMMER2_EMBEDDED_BYTES); 384 bp->b_resid = 0; 385 bp->b_error = 0; 386 } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) { 387 /* 388 * Data is on-media, record for live dedup. 389 */ 390 hammer2_dedup_record(chain, data); 391 392 /* 393 * Decopmression and copy. 394 */ 395 switch (HAMMER2_DEC_COMP(chain->bref.methods)) { 396 case HAMMER2_COMP_LZ4: 397 hammer2_decompress_LZ4_callback(data, chain->bytes, 398 bio); 399 break; 400 case HAMMER2_COMP_ZLIB: 401 hammer2_decompress_ZLIB_callback(data, chain->bytes, 402 bio); 403 break; 404 case HAMMER2_COMP_NONE: 405 KKASSERT(chain->bytes <= bp->b_bcount); 406 bcopy(data, bp->b_data, chain->bytes); 407 if (chain->bytes < bp->b_bcount) { 408 bzero(bp->b_data + chain->bytes, 409 bp->b_bcount - chain->bytes); 410 } 411 bp->b_flags |= B_NOTMETA; 412 bp->b_resid = 0; 413 bp->b_error = 0; 414 break; 415 default: 416 panic("hammer2_strategy_read: " 417 "unknown compression type"); 418 } 419 } else { 420 panic("hammer2_strategy_read: unknown bref type"); 421 } 422 } 423 424 /**************************************************************************** 425 * WRITE SUPPORT * 426 ****************************************************************************/ 427 428 /* 429 * Functions for compression in threads, 430 * from hammer2_vnops.c 431 */ 432 static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip, 433 hammer2_chain_t **parentp, 434 hammer2_key_t lbase, int ioflag, int pblksize, 435 hammer2_tid_t mtid, int *errorp); 436 static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip, 437 hammer2_chain_t **parentp, 438 hammer2_key_t lbase, int ioflag, int pblksize, 439 hammer2_tid_t mtid, int *errorp, 440 int comp_algo, int check_algo); 441 static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip, 442 hammer2_chain_t **parentp, 443 hammer2_key_t lbase, int ioflag, int pblksize, 444 hammer2_tid_t mtid, int *errorp, 445 int check_algo); 446 static int test_block_zeros(const char *buf, size_t bytes); 447 static void zero_write(struct buf *bp, hammer2_inode_t *ip, 448 hammer2_chain_t **parentp, 449 hammer2_key_t lbase, 450 hammer2_tid_t mtid, int *errorp); 451 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, 452 int ioflag, int pblksize, 453 hammer2_tid_t mtid, int *errorp, 454 int check_algo); 455 456 static 457 int 458 hammer2_strategy_write(struct vop_strategy_args *ap) 459 { 460 hammer2_xop_strategy_t *xop; 461 hammer2_pfs_t *pmp; 462 struct bio *bio; 463 struct buf *bp; 464 hammer2_inode_t *ip; 465 466 bio = ap->a_bio; 467 bp = bio->bio_buf; 468 ip = VTOI(ap->a_vp); 469 pmp = ip->pmp; 470 471 hammer2_lwinprog_ref(pmp); 472 hammer2_trans_assert_strategy(pmp); 473 474 xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); 475 xop->finished = 0; 476 xop->bio = bio; 477 xop->lbase = bio->bio_offset; 478 hammer2_xop_start(&xop->head, hammer2_strategy_xop_write); 479 /* asynchronous completion */ 480 481 hammer2_lwinprog_wait(pmp, hammer2_flush_pipe); 482 483 return(0); 484 } 485 486 /* 487 * Per-node XOP (threaded). Write the logical buffer to the media. 488 */ 489 static 490 void 491 hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex) 492 { 493 hammer2_xop_strategy_t *xop = &arg->xop_strategy; 494 hammer2_chain_t *parent; 495 hammer2_key_t lbase; 496 hammer2_inode_t *ip; 497 struct bio *bio; 498 struct buf *bp; 499 int error; 500 int lblksize; 501 int pblksize; 502 503 lbase = xop->lbase; 504 bio = xop->bio; 505 bp = bio->bio_buf; 506 ip = xop->head.ip1; 507 508 /* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */ 509 510 lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL); 511 pblksize = hammer2_calc_physical(ip, lbase); 512 parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS); 513 hammer2_write_file_core(bp, ip, &parent, 514 lbase, IO_ASYNC, pblksize, 515 xop->head.mtid, &error); 516 if (parent) { 517 hammer2_chain_unlock(parent); 518 hammer2_chain_drop(parent); 519 parent = NULL; /* safety */ 520 } 521 error = hammer2_xop_feed(&xop->head, NULL, clindex, error); 522 523 /* 524 * Race to finish the frontend 525 */ 526 if (xop->finished) 527 return; 528 hammer2_mtx_ex(&xop->lock); 529 if (xop->finished) { 530 hammer2_mtx_unlock(&xop->lock); 531 return; 532 } 533 534 /* 535 * Async operation has not completed and we now own the lock. 536 * Determine if we can complete the operation by issuing the 537 * frontend collection non-blocking. 538 */ 539 error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT); 540 541 switch(error) { 542 case ENOENT: 543 case 0: 544 xop->finished = 1; 545 hammer2_mtx_unlock(&xop->lock); 546 bp->b_resid = 0; 547 bp->b_error = 0; 548 biodone(bio); 549 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 550 hammer2_lwinprog_drop(ip->pmp); 551 break; 552 case EINPROGRESS: 553 hammer2_mtx_unlock(&xop->lock); 554 break; 555 default: 556 xop->finished = 1; 557 hammer2_mtx_unlock(&xop->lock); 558 bp->b_flags |= B_ERROR; 559 bp->b_error = EIO; 560 biodone(bio); 561 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); 562 hammer2_lwinprog_drop(ip->pmp); 563 break; 564 } 565 } 566 567 /* 568 * Wait for pending I/O to complete 569 */ 570 void 571 hammer2_bioq_sync(hammer2_pfs_t *pmp) 572 { 573 hammer2_lwinprog_wait(pmp, 0); 574 } 575 576 /* 577 * Create a new cluster at (cparent, lbase) and assign physical storage, 578 * returning a cluster suitable for I/O. The cluster will be in a modified 579 * state. 580 * 581 * cparent can wind up being anything. 582 * 583 * If datap is not NULL, *datap points to the real data we intend to write. 584 * If we can dedup the storage location we set *datap to NULL to indicate 585 * to the caller that a dedup occurred. 586 * 587 * NOTE: Special case for data embedded in inode. 588 */ 589 static 590 hammer2_chain_t * 591 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp, 592 hammer2_key_t lbase, int pblksize, 593 hammer2_tid_t mtid, char **datap, int *errorp) 594 { 595 hammer2_chain_t *chain; 596 hammer2_key_t key_dummy; 597 hammer2_off_t dedup_off; 598 int pradix = hammer2_getradix(pblksize); 599 int cache_index = -1; 600 601 /* 602 * Locate the chain associated with lbase, return a locked chain. 603 * However, do not instantiate any data reference (which utilizes a 604 * device buffer) because we will be using direct IO via the 605 * logical buffer cache buffer. 606 */ 607 *errorp = 0; 608 KKASSERT(pblksize >= HAMMER2_ALLOC_MIN); 609 retry: 610 chain = hammer2_chain_lookup(parentp, &key_dummy, 611 lbase, lbase, 612 &cache_index, 613 HAMMER2_LOOKUP_NODATA); 614 if (chain == NULL) { 615 /* 616 * We found a hole, create a new chain entry. 617 * 618 * NOTE: DATA chains are created without device backing 619 * store (nor do we want any). 620 */ 621 dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap, 622 pblksize); 623 *errorp = hammer2_chain_create(parentp, &chain, ip->pmp, 624 lbase, HAMMER2_PBUFRADIX, 625 HAMMER2_BREF_TYPE_DATA, 626 pblksize, mtid, 627 dedup_off, 0); 628 if (chain == NULL) { 629 panic("hammer2_chain_create: par=%p error=%d\n", 630 *parentp, *errorp); 631 goto retry; 632 } 633 /*ip->delta_dcount += pblksize;*/ 634 } else { 635 switch (chain->bref.type) { 636 case HAMMER2_BREF_TYPE_INODE: 637 /* 638 * The data is embedded in the inode, which requires 639 * a bit more finess. 640 */ 641 hammer2_chain_modify_ip(ip, chain, mtid, 0); 642 break; 643 case HAMMER2_BREF_TYPE_DATA: 644 dedup_off = hammer2_dedup_lookup(chain->hmp, datap, 645 pblksize); 646 if (chain->bytes != pblksize) { 647 hammer2_chain_resize(ip, *parentp, chain, 648 mtid, dedup_off, 649 pradix, 650 HAMMER2_MODIFY_OPTDATA); 651 } 652 653 /* 654 * DATA buffers must be marked modified whether the 655 * data is in a logical buffer or not. We also have 656 * to make this call to fixup the chain data pointers 657 * after resizing in case this is an encrypted or 658 * compressed buffer. 659 */ 660 hammer2_chain_modify(chain, mtid, dedup_off, 661 HAMMER2_MODIFY_OPTDATA); 662 break; 663 default: 664 panic("hammer2_assign_physical: bad type"); 665 /* NOT REACHED */ 666 break; 667 } 668 } 669 return (chain); 670 } 671 672 /* 673 * hammer2_write_file_core() - hammer2_write_thread() helper 674 * 675 * The core write function which determines which path to take 676 * depending on compression settings. We also have to locate the 677 * related chains so we can calculate and set the check data for 678 * the blockref. 679 */ 680 static 681 void 682 hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip, 683 hammer2_chain_t **parentp, 684 hammer2_key_t lbase, int ioflag, int pblksize, 685 hammer2_tid_t mtid, int *errorp) 686 { 687 hammer2_chain_t *chain; 688 char *data = bp->b_data; 689 690 switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) { 691 case HAMMER2_COMP_NONE: 692 /* 693 * We have to assign physical storage to the buffer 694 * we intend to dirty or write now to avoid deadlocks 695 * in the strategy code later. 696 * 697 * This can return NOOFFSET for inode-embedded data. 698 * The strategy code will take care of it in that case. 699 */ 700 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize, 701 mtid, &data, errorp); 702 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 703 hammer2_inode_data_t *wipdata; 704 705 wipdata = &chain->data->ipdata; 706 KKASSERT(wipdata->meta.op_flags & 707 HAMMER2_OPFLAG_DIRECTDATA); 708 KKASSERT(bp->b_loffset == 0); 709 bcopy(bp->b_data, wipdata->u.data, 710 HAMMER2_EMBEDDED_BYTES); 711 ++hammer2_iod_file_wembed; 712 } else if (data == NULL) { 713 /* 714 * Copy of data already present on-media. 715 */ 716 chain->bref.methods = 717 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) + 718 HAMMER2_ENC_CHECK(ip->meta.check_algo); 719 hammer2_chain_setcheck(chain, bp->b_data); 720 } else { 721 hammer2_write_bp(chain, bp, ioflag, pblksize, 722 mtid, errorp, ip->meta.check_algo); 723 } 724 if (chain) { 725 hammer2_chain_unlock(chain); 726 hammer2_chain_drop(chain); 727 } 728 break; 729 case HAMMER2_COMP_AUTOZERO: 730 /* 731 * Check for zero-fill only 732 */ 733 hammer2_zero_check_and_write(bp, ip, parentp, 734 lbase, ioflag, pblksize, 735 mtid, errorp, 736 ip->meta.check_algo); 737 break; 738 case HAMMER2_COMP_LZ4: 739 case HAMMER2_COMP_ZLIB: 740 default: 741 /* 742 * Check for zero-fill and attempt compression. 743 */ 744 hammer2_compress_and_write(bp, ip, parentp, 745 lbase, ioflag, pblksize, 746 mtid, errorp, 747 ip->meta.comp_algo, 748 ip->meta.check_algo); 749 break; 750 } 751 } 752 753 /* 754 * Helper 755 * 756 * Generic function that will perform the compression in compression 757 * write path. The compression algorithm is determined by the settings 758 * obtained from inode. 759 */ 760 static 761 void 762 hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip, 763 hammer2_chain_t **parentp, 764 hammer2_key_t lbase, int ioflag, int pblksize, 765 hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo) 766 { 767 hammer2_chain_t *chain; 768 int comp_size; 769 int comp_block_size; 770 char *comp_buffer; 771 char *data; 772 773 if (test_block_zeros(bp->b_data, pblksize)) { 774 zero_write(bp, ip, parentp, lbase, mtid, errorp); 775 return; 776 } 777 778 comp_size = 0; 779 comp_buffer = NULL; 780 781 KKASSERT(pblksize / 2 <= 32768); 782 783 if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) { 784 z_stream strm_compress; 785 int comp_level; 786 int ret; 787 788 switch(HAMMER2_DEC_ALGO(comp_algo)) { 789 case HAMMER2_COMP_LZ4: 790 comp_buffer = objcache_get(cache_buffer_write, 791 M_INTWAIT); 792 comp_size = LZ4_compress_limitedOutput( 793 bp->b_data, 794 &comp_buffer[sizeof(int)], 795 pblksize, 796 pblksize / 2 - sizeof(int)); 797 /* 798 * We need to prefix with the size, LZ4 799 * doesn't do it for us. Add the related 800 * overhead. 801 */ 802 *(int *)comp_buffer = comp_size; 803 if (comp_size) 804 comp_size += sizeof(int); 805 break; 806 case HAMMER2_COMP_ZLIB: 807 comp_level = HAMMER2_DEC_LEVEL(comp_algo); 808 if (comp_level == 0) 809 comp_level = 6; /* default zlib compression */ 810 else if (comp_level < 6) 811 comp_level = 6; 812 else if (comp_level > 9) 813 comp_level = 9; 814 ret = deflateInit(&strm_compress, comp_level); 815 if (ret != Z_OK) { 816 kprintf("HAMMER2 ZLIB: fatal error " 817 "on deflateInit.\n"); 818 } 819 820 comp_buffer = objcache_get(cache_buffer_write, 821 M_INTWAIT); 822 strm_compress.next_in = bp->b_data; 823 strm_compress.avail_in = pblksize; 824 strm_compress.next_out = comp_buffer; 825 strm_compress.avail_out = pblksize / 2; 826 ret = deflate(&strm_compress, Z_FINISH); 827 if (ret == Z_STREAM_END) { 828 comp_size = pblksize / 2 - 829 strm_compress.avail_out; 830 } else { 831 comp_size = 0; 832 } 833 ret = deflateEnd(&strm_compress); 834 break; 835 default: 836 kprintf("Error: Unknown compression method.\n"); 837 kprintf("Comp_method = %d.\n", comp_algo); 838 break; 839 } 840 } 841 842 if (comp_size == 0) { 843 /* 844 * compression failed or turned off 845 */ 846 comp_block_size = pblksize; /* safety */ 847 if (++ip->comp_heuristic > 128) 848 ip->comp_heuristic = 8; 849 } else { 850 /* 851 * compression succeeded 852 */ 853 ip->comp_heuristic = 0; 854 if (comp_size <= 1024) { 855 comp_block_size = 1024; 856 } else if (comp_size <= 2048) { 857 comp_block_size = 2048; 858 } else if (comp_size <= 4096) { 859 comp_block_size = 4096; 860 } else if (comp_size <= 8192) { 861 comp_block_size = 8192; 862 } else if (comp_size <= 16384) { 863 comp_block_size = 16384; 864 } else if (comp_size <= 32768) { 865 comp_block_size = 32768; 866 } else { 867 panic("hammer2: WRITE PATH: " 868 "Weird comp_size value."); 869 /* NOT REACHED */ 870 comp_block_size = pblksize; 871 } 872 873 /* 874 * Must zero the remainder or dedup (which operates on a 875 * physical block basis) will not find matches. 876 */ 877 if (comp_size < comp_block_size) { 878 bzero(comp_buffer + comp_size, 879 comp_block_size - comp_size); 880 } 881 } 882 883 /* 884 * Assign physical storage, data will be set to NULL if a live-dedup 885 * was successful. 886 */ 887 data = comp_size ? comp_buffer : bp->b_data; 888 chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size, 889 mtid, &data, errorp); 890 891 if (*errorp) { 892 kprintf("WRITE PATH: An error occurred while " 893 "assigning physical space.\n"); 894 KKASSERT(chain == NULL); 895 goto done; 896 } 897 898 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 899 hammer2_inode_data_t *wipdata; 900 901 hammer2_chain_modify_ip(ip, chain, mtid, 0); 902 wipdata = &chain->data->ipdata; 903 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA); 904 KKASSERT(bp->b_loffset == 0); 905 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 906 ++hammer2_iod_file_wembed; 907 } else if (data == NULL) { 908 /* 909 * Live deduplication, a copy of the data is already present 910 * on the media. 911 */ 912 char *bdata; 913 914 if (comp_size) { 915 chain->bref.methods = 916 HAMMER2_ENC_COMP(comp_algo) + 917 HAMMER2_ENC_CHECK(check_algo); 918 } else { 919 chain->bref.methods = 920 HAMMER2_ENC_COMP( 921 HAMMER2_COMP_NONE) + 922 HAMMER2_ENC_CHECK(check_algo); 923 } 924 bdata = comp_size ? comp_buffer : bp->b_data; 925 hammer2_chain_setcheck(chain, bdata); 926 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 927 } else { 928 hammer2_io_t *dio; 929 char *bdata; 930 931 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 932 933 switch(chain->bref.type) { 934 case HAMMER2_BREF_TYPE_INODE: 935 panic("hammer2_write_bp: unexpected inode\n"); 936 break; 937 case HAMMER2_BREF_TYPE_DATA: 938 /* 939 * Optimize out the read-before-write 940 * if possible. 941 */ 942 *errorp = hammer2_io_newnz(chain->hmp, 943 chain->bref.data_off, 944 chain->bytes, 945 &dio); 946 if (*errorp) { 947 hammer2_io_brelse(&dio); 948 kprintf("hammer2: WRITE PATH: " 949 "dbp bread error\n"); 950 break; 951 } 952 bdata = hammer2_io_data(dio, chain->bref.data_off); 953 954 /* 955 * When loading the block make sure we don't 956 * leave garbage after the compressed data. 957 */ 958 if (comp_size) { 959 chain->bref.methods = 960 HAMMER2_ENC_COMP(comp_algo) + 961 HAMMER2_ENC_CHECK(check_algo); 962 bcopy(comp_buffer, bdata, comp_size); 963 } else { 964 chain->bref.methods = 965 HAMMER2_ENC_COMP( 966 HAMMER2_COMP_NONE) + 967 HAMMER2_ENC_CHECK(check_algo); 968 bcopy(bp->b_data, bdata, pblksize); 969 } 970 971 /* 972 * The flush code doesn't calculate check codes for 973 * file data (doing so can result in excessive I/O), 974 * so we do it here. 975 */ 976 hammer2_chain_setcheck(chain, bdata); 977 hammer2_dedup_record(chain, bdata); 978 979 /* 980 * Device buffer is now valid, chain is no longer in 981 * the initial state. 982 * 983 * (No blockref table worries with file data) 984 */ 985 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 986 987 /* Now write the related bdp. */ 988 if (ioflag & IO_SYNC) { 989 /* 990 * Synchronous I/O requested. 991 */ 992 hammer2_io_bwrite(&dio); 993 /* 994 } else if ((ioflag & IO_DIRECT) && 995 loff + n == pblksize) { 996 hammer2_io_bdwrite(&dio); 997 */ 998 } else if (ioflag & IO_ASYNC) { 999 hammer2_io_bawrite(&dio); 1000 } else { 1001 hammer2_io_bdwrite(&dio); 1002 } 1003 break; 1004 default: 1005 panic("hammer2_write_bp: bad chain type %d\n", 1006 chain->bref.type); 1007 /* NOT REACHED */ 1008 break; 1009 } 1010 } 1011 done: 1012 if (chain) { 1013 hammer2_chain_unlock(chain); 1014 hammer2_chain_drop(chain); 1015 } 1016 if (comp_buffer) 1017 objcache_put(cache_buffer_write, comp_buffer); 1018 } 1019 1020 /* 1021 * Helper 1022 * 1023 * Function that performs zero-checking and writing without compression, 1024 * it corresponds to default zero-checking path. 1025 */ 1026 static 1027 void 1028 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip, 1029 hammer2_chain_t **parentp, 1030 hammer2_key_t lbase, int ioflag, int pblksize, 1031 hammer2_tid_t mtid, int *errorp, 1032 int check_algo) 1033 { 1034 hammer2_chain_t *chain; 1035 char *data = bp->b_data; 1036 1037 if (test_block_zeros(bp->b_data, pblksize)) { 1038 zero_write(bp, ip, parentp, lbase, mtid, errorp); 1039 } else { 1040 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize, 1041 mtid, &data, errorp); 1042 if (data) { 1043 hammer2_write_bp(chain, bp, ioflag, pblksize, 1044 mtid, errorp, check_algo); 1045 } /* else dedup occurred */ 1046 if (chain) { 1047 hammer2_chain_unlock(chain); 1048 hammer2_chain_drop(chain); 1049 } 1050 } 1051 } 1052 1053 /* 1054 * Helper 1055 * 1056 * A function to test whether a block of data contains only zeros, 1057 * returns TRUE (non-zero) if the block is all zeros. 1058 */ 1059 static 1060 int 1061 test_block_zeros(const char *buf, size_t bytes) 1062 { 1063 size_t i; 1064 1065 for (i = 0; i < bytes; i += sizeof(long)) { 1066 if (*(const long *)(buf + i) != 0) 1067 return (0); 1068 } 1069 return (1); 1070 } 1071 1072 /* 1073 * Helper 1074 * 1075 * Function to "write" a block that contains only zeros. 1076 */ 1077 static 1078 void 1079 zero_write(struct buf *bp, hammer2_inode_t *ip, 1080 hammer2_chain_t **parentp, 1081 hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused) 1082 { 1083 hammer2_chain_t *chain; 1084 hammer2_key_t key_dummy; 1085 int cache_index = -1; 1086 1087 chain = hammer2_chain_lookup(parentp, &key_dummy, 1088 lbase, lbase, 1089 &cache_index, 1090 HAMMER2_LOOKUP_NODATA); 1091 if (chain) { 1092 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 1093 hammer2_inode_data_t *wipdata; 1094 1095 hammer2_chain_modify_ip(ip, chain, mtid, 0); 1096 wipdata = &chain->data->ipdata; 1097 KKASSERT(wipdata->meta.op_flags & 1098 HAMMER2_OPFLAG_DIRECTDATA); 1099 KKASSERT(bp->b_loffset == 0); 1100 bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1101 ++hammer2_iod_file_wembed; 1102 } else { 1103 hammer2_chain_delete(*parentp, chain, 1104 mtid, HAMMER2_DELETE_PERMANENT); 1105 ++hammer2_iod_file_wzero; 1106 } 1107 hammer2_chain_unlock(chain); 1108 hammer2_chain_drop(chain); 1109 } else { 1110 ++hammer2_iod_file_wzero; 1111 } 1112 } 1113 1114 /* 1115 * Helper 1116 * 1117 * Function to write the data as it is, without performing any sort of 1118 * compression. This function is used in path without compression and 1119 * default zero-checking path. 1120 */ 1121 static 1122 void 1123 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag, 1124 int pblksize, 1125 hammer2_tid_t mtid, int *errorp, int check_algo) 1126 { 1127 hammer2_inode_data_t *wipdata; 1128 hammer2_io_t *dio; 1129 char *bdata; 1130 int error; 1131 1132 error = 0; /* XXX TODO below */ 1133 1134 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED); 1135 1136 switch(chain->bref.type) { 1137 case HAMMER2_BREF_TYPE_INODE: 1138 wipdata = &chain->data->ipdata; 1139 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA); 1140 KKASSERT(bp->b_loffset == 0); 1141 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES); 1142 error = 0; 1143 ++hammer2_iod_file_wembed; 1144 break; 1145 case HAMMER2_BREF_TYPE_DATA: 1146 error = hammer2_io_newnz(chain->hmp, 1147 chain->bref.data_off, 1148 chain->bytes, &dio); 1149 if (error) { 1150 hammer2_io_bqrelse(&dio); 1151 kprintf("hammer2: WRITE PATH: " 1152 "dbp bread error\n"); 1153 break; 1154 } 1155 bdata = hammer2_io_data(dio, chain->bref.data_off); 1156 1157 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) + 1158 HAMMER2_ENC_CHECK(check_algo); 1159 bcopy(bp->b_data, bdata, chain->bytes); 1160 1161 /* 1162 * The flush code doesn't calculate check codes for 1163 * file data (doing so can result in excessive I/O), 1164 * so we do it here. 1165 */ 1166 hammer2_chain_setcheck(chain, bdata); 1167 hammer2_dedup_record(chain, bdata); 1168 1169 /* 1170 * Device buffer is now valid, chain is no longer in 1171 * the initial state. 1172 * 1173 * (No blockref table worries with file data) 1174 */ 1175 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1176 1177 if (ioflag & IO_SYNC) { 1178 /* 1179 * Synchronous I/O requested. 1180 */ 1181 hammer2_io_bwrite(&dio); 1182 /* 1183 } else if ((ioflag & IO_DIRECT) && 1184 loff + n == pblksize) { 1185 hammer2_io_bdwrite(&dio); 1186 */ 1187 } else if (ioflag & IO_ASYNC) { 1188 hammer2_io_bawrite(&dio); 1189 } else { 1190 hammer2_io_bdwrite(&dio); 1191 } 1192 break; 1193 default: 1194 panic("hammer2_write_bp: bad chain type %d\n", 1195 chain->bref.type); 1196 /* NOT REACHED */ 1197 error = 0; 1198 break; 1199 } 1200 KKASSERT(error == 0); /* XXX TODO */ 1201 *errorp = error; 1202 } 1203 1204 /* 1205 * LIVE DEDUP HEURISTIC 1206 * 1207 * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 1208 * All fields must be loaded into locals and validated. 1209 */ 1210 static 1211 void 1212 hammer2_dedup_record(hammer2_chain_t *chain, char *data) 1213 { 1214 hammer2_dev_t *hmp; 1215 hammer2_dedup_t *dedup; 1216 int32_t crc; 1217 int best = 0; 1218 int i; 1219 int dticks; 1220 1221 hmp = chain->hmp; 1222 crc = hammer2_icrc32(data, chain->bytes); 1223 dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)]; 1224 for (i = 0; i < 4; ++i) { 1225 if (dedup[i].data_crc == crc) { 1226 best = i; 1227 break; 1228 } 1229 dticks = (int)(dedup[i].ticks - dedup[best].ticks); 1230 if (dticks < 0 || dticks > hz * 60 * 30) 1231 best = i; 1232 } 1233 dedup += best; 1234 if (hammer2_debug & 0x40000) { 1235 kprintf("REC %04x %08x %016jx\n", 1236 (int)(dedup - hmp->heur_dedup), 1237 crc, 1238 chain->bref.data_off); 1239 } 1240 dedup->ticks = ticks; 1241 dedup->data_off = chain->bref.data_off; 1242 dedup->data_crc = crc; 1243 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP); 1244 } 1245 1246 static 1247 hammer2_off_t 1248 hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize) 1249 { 1250 hammer2_dedup_t *dedup; 1251 hammer2_io_t *dio; 1252 hammer2_off_t off; 1253 uint32_t crc; 1254 char *data; 1255 int i; 1256 1257 data = *datap; 1258 if (data == NULL) 1259 return 0; 1260 1261 crc = hammer2_icrc32(data, pblksize); 1262 dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)]; 1263 1264 if (hammer2_debug & 0x40000) { 1265 kprintf("LOC %04x/4 %08x\n", 1266 (int)(dedup - hmp->heur_dedup), 1267 crc); 1268 } 1269 1270 for (i = 0; i < 4; ++i) { 1271 off = dedup[i].data_off; 1272 cpu_ccfence(); 1273 if (dedup[i].data_crc != crc) 1274 continue; 1275 if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize) 1276 continue; 1277 dio = hammer2_io_getquick(hmp, off, pblksize); 1278 if (dio && 1279 bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) { 1280 if (hammer2_debug & 0x40000) { 1281 kprintf("DEDUP SUCCESS %016jx\n", 1282 (intmax_t)off); 1283 } 1284 hammer2_io_putblk(&dio); 1285 *datap = NULL; 1286 dedup[i].ticks = ticks; /* update use */ 1287 ++hammer2_iod_file_wdedup; 1288 return off; /* RETURN */ 1289 } 1290 if (dio) 1291 hammer2_io_putblk(&dio); 1292 } 1293 return 0; 1294 } 1295 1296 /* 1297 * Poof. Races are ok, if someone gets in and reuses a dedup offset 1298 * before or while we are clearing it they will also recover the freemap 1299 * entry (set it to fully allocated), so a bulkfree race can only set it 1300 * to a possibly-free state. 1301 * 1302 * XXX ok, well, not really sure races are ok but going to run with it 1303 * for the moment. 1304 */ 1305 void 1306 hammer2_dedup_clear(hammer2_dev_t *hmp) 1307 { 1308 int i; 1309 1310 for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) { 1311 hmp->heur_dedup[i].data_off = 0; 1312 hmp->heur_dedup[i].ticks = ticks - 1; 1313 } 1314 } 1315