/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.48 2008/06/29 07:50:40 dillon Exp $
 */
/*
 * IO Primitives and buffer cache management
 *
 * All major data-tracking structures in HAMMER contain a struct hammer_io
 * which is used to manage their backing store.  We use filesystem buffers
 * for backing store and we leave them passively associated with their
 * HAMMER structures.
 *
 * If the kernel tries to destroy a passively associated buf which we cannot
 * yet let go we set B_LOCKED in the buffer and then actively release it
 * later when we can.
 */

#include "hammer.h"
#include <sys/fcntl.h>
#include <sys/nlookup.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static void hammer_io_modify(hammer_io_t io, int count);
static void hammer_io_deallocate(struct buf *bp);
static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);

/*
 * Initialize a new, already-zero'd hammer_io structure, or reinitialize
 * an existing hammer_io structure which may have switched to another type.
 */
void
hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type)
{
        io->hmp = hmp;
        io->type = type;
}
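/*
 * Example (editor's sketch, hypothetical caller): a HAMMER structure
 * embedding a hammer_io initializes it before attaching backing store.
 * "phys_offset" is an assumed placeholder; the offset/bytes fields follow
 * the conventions used by hammer_io_read() later in this file:
 *
 *      hammer_io_init(&buffer->io, hmp, HAMMER_STRUCTURE_DATA_BUFFER);
 *      buffer->io.offset = phys_offset;        /+ device offset +/
 *      buffer->io.bytes = HAMMER_BUFSIZE;      /+ backing store size +/
 *
 * (The /+ +/ markers stand in for nested comment delimiters.)
 */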
/*
 * Helper routine to disassociate a buffer cache buffer from an I/O
 * structure.  The buffer is unlocked and marked appropriately for
 * reclamation.
 *
 * The io may have 0 or 1 references depending on who called us.  The
 * caller is responsible for dealing with the refs.
 *
 * This call can only be made when no action is required on the buffer.
 *
 * The caller must own the buffer and the IO must indicate that the
 * structure no longer owns it (io.released != 0).
 */
static void
hammer_io_disassociate(hammer_io_structure_t iou)
{
        struct buf *bp = iou->io.bp;

        KKASSERT(iou->io.released);
        KKASSERT(iou->io.modified == 0);
        KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
        buf_dep_init(bp);
        iou->io.bp = NULL;

        /*
         * If the buffer was locked someone wanted to get rid of it.
         */
        if (bp->b_flags & B_LOCKED) {
                --hammer_count_io_locked;
                bp->b_flags &= ~B_LOCKED;
        }
        if (iou->io.reclaim) {
                bp->b_flags |= B_NOCACHE|B_RELBUF;
                iou->io.reclaim = 0;
        }

        switch(iou->io.type) {
        case HAMMER_STRUCTURE_VOLUME:
                iou->volume.ondisk = NULL;
                break;
        case HAMMER_STRUCTURE_DATA_BUFFER:
        case HAMMER_STRUCTURE_META_BUFFER:
        case HAMMER_STRUCTURE_UNDO_BUFFER:
                iou->buffer.ondisk = NULL;
                break;
        }
}

/*
 * Wait for any physical IO to complete
 */
static void
hammer_io_wait(hammer_io_t io)
{
        if (io->running) {
                crit_enter();
                tsleep_interlock(io);
                io->waiting = 1;
                for (;;) {
                        tsleep(io, 0, "hmrflw", 0);
                        if (io->running == 0)
                                break;
                        tsleep_interlock(io);
                        io->waiting = 1;
                        if (io->running == 0)
                                break;
                }
                crit_exit();
        }
}

/*
 * Wait for all hammer_io-initiated write I/Os to complete.  This is not
 * supposed to count direct I/Os but some can leak through (for
 * non-full-sized direct I/Os).
 */
void
hammer_io_wait_all(hammer_mount_t hmp, const char *ident)
{
        crit_enter();
        while (hmp->io_running_space)
                tsleep(&hmp->io_running_space, 0, ident, 0);
        crit_exit();
}

#define HAMMER_MAXRA    4

/*
 * Load bp for a HAMMER structure.  The io must be exclusively locked by
 * the caller.
 *
 * This routine is mostly used on meta-data and small-data blocks.  Generally
 * speaking HAMMER assumes some locality of reference and will cluster
 * a 64K read.
 *
 * Note that clustering occurs at the device layer, not the logical layer.
 * If the buffers do not apply to the current operation they may apply to
 * some other.
 */
int
hammer_io_read(struct vnode *devvp, struct hammer_io *io, hammer_off_t limit)
{
        struct buf *bp;
        int error;

        if ((bp = io->bp) == NULL) {
                hammer_count_io_running_read += io->bytes;
#if 1
                error = cluster_read(devvp, limit, io->offset, io->bytes,
                                     HAMMER_CLUSTER_SIZE,
                                     HAMMER_CLUSTER_BUFS, &io->bp);
#else
                error = bread(devvp, io->offset, io->bytes, &io->bp);
#endif
                hammer_count_io_running_read -= io->bytes;
                if (error == 0) {
                        bp = io->bp;
                        bp->b_ops = &hammer_bioops;
                        KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
                        LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
                        BUF_KERNPROC(bp);
                }
                KKASSERT(io->modified == 0);
                KKASSERT(io->running == 0);
                KKASSERT(io->waiting == 0);
                io->released = 0;       /* we hold an active lock on bp */
        } else {
                error = 0;
        }
        return(error);
}
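/*
 * Example (editor's sketch, hypothetical caller): a buffer-load path
 * holding an exclusive lock on the io brings in the backing store and
 * maps the ondisk data; volume->maxbuf_off serves as the clustering
 * limit, as elsewhere in this file:
 *
 *      error = hammer_io_read(volume->devvp, &buffer->io,
 *                             volume->maxbuf_off);
 *      if (error == 0)
 *              buffer->ondisk = (void *)buffer->io.bp->b_data;
 */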
/*
 * Similar to hammer_io_read() but returns a zero'd out buffer instead.
 * Must be called with the IO exclusively locked.
 *
 * vfs_bio_clrbuf() is kinda nasty; enforce serialization against background
 * I/O by forcing the buffer to not be in a released state before calling
 * it.
 *
 * This function will also mark the IO as modified but it will not
 * increment the modify_refs count.
 */
int
hammer_io_new(struct vnode *devvp, struct hammer_io *io)
{
        struct buf *bp;

        if ((bp = io->bp) == NULL) {
                io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
                bp = io->bp;
                bp->b_ops = &hammer_bioops;
                KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
                LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
                io->released = 0;
                KKASSERT(io->running == 0);
                io->waiting = 0;
                BUF_KERNPROC(bp);
        } else {
                if (io->released) {
                        regetblk(bp);
                        BUF_KERNPROC(bp);
                        io->released = 0;
                }
        }
        hammer_io_modify(io, 0);
        vfs_bio_clrbuf(bp);
        return(0);
}

/*
 * Remove potential device level aliases against buffers managed by high level
 * vnodes.
 */
void
hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
{
        hammer_io_structure_t iou;
        hammer_off_t phys_offset;
        struct buf *bp;

        phys_offset = volume->ondisk->vol_buf_beg +
                      (zone2_offset & HAMMER_OFF_SHORT_MASK);
        crit_enter();
        if ((bp = findblk(volume->devvp, phys_offset)) != NULL) {
                bp = getblk(volume->devvp, phys_offset, bp->b_bufsize, 0, 0);
                if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
                        hammer_io_clear_modify(&iou->io, 1);
                        bundirty(bp);
                        iou->io.reclaim = 1;
                        hammer_io_deallocate(bp);
                } else {
                        KKASSERT((bp->b_flags & B_LOCKED) == 0);
                        bundirty(bp);
                        bp->b_flags |= B_NOCACHE|B_RELBUF;
                }
                brelse(bp);
        }
        crit_exit();
}
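/*
 * Example (editor's sketch, hypothetical): before zone-2 storage is
 * reused for a direct write, any stale device-level alias of it must be
 * shot down so old contents cannot be re-read from the buffer cache:
 *
 *      hammer_io_inval(volume, zone2_offset);
 */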
/*
 * This routine is called on the last reference to a hammer structure.
 * The io is usually interlocked with io.loading and io.refs must be 1.
 *
 * This routine may return a non-NULL bp to the caller for disposal.
 * Disposal simply means the caller finishes decrementing the ref-count
 * on the IO structure and then brelse()'s the bp.  The bp may or may not
 * still be passively associated with the IO.
 *
 * The only requirement here is that modified meta-data and volume-header
 * buffers may NOT be disassociated from the IO structure, and consequently
 * we also leave such buffers actively associated with the IO if they already
 * are (since the kernel can't do anything with them anyway).  Only the
 * flusher is allowed to write such buffers out.  Modified pure-data and
 * undo buffers are returned to the kernel but left passively associated
 * so we can track when the kernel writes the bp out.
 */
struct buf *
hammer_io_release(struct hammer_io *io, int flush)
{
        union hammer_io_structure *iou = (void *)io;
        struct buf *bp;

        if ((bp = io->bp) == NULL)
                return(NULL);

        /*
         * Try to flush a dirty IO to disk if asked to by the
         * caller or if the kernel tried to flush the buffer in the past.
         *
         * Kernel-initiated flushes are only allowed for pure-data buffers.
         * meta-data and volume buffers can only be flushed explicitly
         * by HAMMER.
         */
        if (io->modified) {
                if (flush) {
                        hammer_io_flush(io);
                } else if (bp->b_flags & B_LOCKED) {
                        switch(io->type) {
                        case HAMMER_STRUCTURE_DATA_BUFFER:
                        case HAMMER_STRUCTURE_UNDO_BUFFER:
                                hammer_io_flush(io);
                                break;
                        default:
                                break;
                        }
                } /* else no explicit request to flush the buffer */
        }

        /*
         * Wait for the IO to complete if asked to.
         */
        if (io->waitdep && io->running) {
                hammer_io_wait(io);
        }

        /*
         * Return control of the buffer to the kernel (with the proviso
         * that our bioops can override kernel decisions with regard to
         * the buffer).
         */
        if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
                /*
                 * Always disassociate the bp if an explicit flush
                 * was requested and the IO completed with no error
                 * (so unmount can really clean up the structure).
                 */
                if (io->released) {
                        regetblk(bp);
                        BUF_KERNPROC(bp);
                } else {
                        io->released = 1;
                }
                hammer_io_disassociate((hammer_io_structure_t)io);
                /* return the bp */
        } else if (io->modified) {
                /*
                 * Only certain IO types can be released to the kernel if
                 * the buffer has been modified.
                 *
                 * volume and meta-data IO types may only be explicitly
                 * flushed by HAMMER.
                 */
                switch(io->type) {
                case HAMMER_STRUCTURE_DATA_BUFFER:
                case HAMMER_STRUCTURE_UNDO_BUFFER:
                        if (io->released == 0) {
                                io->released = 1;
                                bdwrite(bp);
                        }
                        break;
                default:
                        break;
                }
                bp = NULL;      /* bp left associated */
        } else if (io->released == 0) {
                /*
                 * Clean buffers can be generally released to the kernel.
                 * We leave the bp passively associated with the HAMMER
                 * structure and use bioops to disconnect it later on
                 * if the kernel wants to discard the buffer.
                 *
                 * We can steal the structure's ownership of the bp.
                 */
                io->released = 1;
                if (bp->b_flags & B_LOCKED) {
                        hammer_io_disassociate(iou);
                        /* return the bp */
                } else {
                        if (io->reclaim) {
                                hammer_io_disassociate(iou);
                                /* return the bp */
                        } else {
                                /* return the bp (bp passively associated) */
                        }
                }
        } else {
                /*
                 * A released buffer is passively associated with our
                 * hammer_io structure.  The kernel cannot destroy it
                 * without making a bioops call.  If the kernel (B_LOCKED)
                 * or we (reclaim) requested that the buffer be destroyed
                 * we destroy it, otherwise we do a quick get/release to
                 * reset its position in the kernel's LRU list.
                 *
                 * Leaving the buffer passively associated allows us to
                 * use the kernel's LRU buffer flushing mechanisms rather
                 * than rolling our own.
                 *
                 * XXX there are two ways of doing this.  We can re-acquire
                 * and passively release to reset the LRU, or not.
                 */
                if (io->running == 0) {
                        regetblk(bp);
                        if ((bp->b_flags & B_LOCKED) || io->reclaim) {
                                hammer_io_disassociate(iou);
                                /* return the bp */
                        } else {
                                /* return the bp (bp passively associated) */
                        }
                } else {
                        /*
                         * bp is left passively associated but we do not
                         * try to reacquire it.  Interactions with the io
                         * structure will occur on completion of the bp's
                         * I/O.
                         */
                        bp = NULL;
                }
        }
        return(bp);
}
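/*
 * Example (editor's sketch, hypothetical disposal sequence per the
 * comment above hammer_io_release()): the caller finishes dropping its
 * structure reference and then disposes of any returned bp:
 *
 *      bp = hammer_io_release(&buffer->io, flush);
 *      hammer_unref(&buffer->io.lock);         (hypothetical ref drop)
 *      if (bp)
 *              brelse(bp);
 */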
/*
 * This routine is called with a locked IO when a flush is desired and
 * no other references to the structure exist other than ours.  This
 * routine is ONLY called when HAMMER believes it is safe to flush a
 * potentially modified buffer out.
 */
void
hammer_io_flush(struct hammer_io *io)
{
        struct buf *bp;

        /*
         * Degenerate case - nothing to flush if nothing is dirty.
         */
        if (io->modified == 0) {
                return;
        }

        KKASSERT(io->bp);
        KKASSERT(io->modify_refs <= 0);

        /*
         * Acquire ownership of the bp, particularly before we clear our
         * modified flag.
         *
         * We are going to bawrite() this bp.  Don't leave a window where
         * io->released is set while we actually own the bp.
         */
        bp = io->bp;
        if (io->released) {
                regetblk(bp);
                /* BUF_KERNPROC(io->bp); */
                /* io->released = 0; */
                KKASSERT(io->released);
                KKASSERT(io->bp == bp);
        }
        io->released = 1;

        /*
         * Acquire exclusive access to the bp and then clear the modified
         * state of the buffer prior to issuing I/O to interlock any
         * modifications made while the I/O is in progress.  This shouldn't
         * happen anyway but losing data would be worse.  The modified bit
         * will be rechecked after the IO completes.
         *
         * NOTE: This call also finalizes the buffer's content (inval == 0).
         *
         * This is only legal when lock.refs == 1 (otherwise we might clear
         * the modified bit while there are still users of the cluster
         * modifying the data).
         *
         * Do this before potentially blocking so any attempt to modify the
         * ondisk while we are blocked blocks waiting for us.
         */
        hammer_io_clear_modify(io, 0);

        /*
         * Transfer ownership to the kernel and initiate I/O.
         */
        io->running = 1;
        io->hmp->io_running_space += io->bytes;
        hammer_count_io_running_write += io->bytes;
        bawrite(bp);
}
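/*
 * Example (editor's sketch, hypothetical flusher fragment): flush a
 * dirty io and then drain all write I/O initiated through this module.
 * The wait ident string here is illustrative only:
 *
 *      hammer_io_flush(&buffer->io);
 *      hammer_io_wait_all(hmp, "hmrfls");
 */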
/************************************************************************
 *                              BUFFER DIRTYING                         *
 ************************************************************************
 *
 * These routines deal with dependencies created when IO buffers get
 * modified.  The caller must call hammer_modify_*() on a referenced
 * HAMMER structure prior to modifying its on-disk data.
 *
 * Any intent to modify an IO buffer acquires the related bp and imposes
 * various write ordering dependencies.
 */

/*
 * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
 * are locked until the flusher can deal with them, pure data buffers
 * can be written out.
 */
static
void
hammer_io_modify(hammer_io_t io, int count)
{
        struct hammer_mount *hmp = io->hmp;

        /*
         * io->modify_refs must be >= 0
         */
        while (io->modify_refs < 0) {
                io->waitmod = 1;
                tsleep(io, 0, "hmrmod", 0);
        }

        /*
         * Shortcut if nothing to do.
         */
        KKASSERT(io->lock.refs != 0 && io->bp != NULL);
        io->modify_refs += count;
        if (io->modified && io->released == 0)
                return;

        hammer_lock_ex(&io->lock);
        if (io->modified == 0) {
                KKASSERT(io->mod_list == NULL);
                switch(io->type) {
                case HAMMER_STRUCTURE_VOLUME:
                        io->mod_list = &hmp->volu_list;
                        hmp->locked_dirty_space += io->bytes;
                        hammer_count_dirtybufspace += io->bytes;
                        break;
                case HAMMER_STRUCTURE_META_BUFFER:
                        io->mod_list = &hmp->meta_list;
                        hmp->locked_dirty_space += io->bytes;
                        hammer_count_dirtybufspace += io->bytes;
                        break;
                case HAMMER_STRUCTURE_UNDO_BUFFER:
                        io->mod_list = &hmp->undo_list;
                        break;
                case HAMMER_STRUCTURE_DATA_BUFFER:
                        io->mod_list = &hmp->data_list;
                        break;
                }
                TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry);
                io->modified = 1;
        }
        if (io->released) {
                regetblk(io->bp);
                BUF_KERNPROC(io->bp);
                io->released = 0;
                KKASSERT(io->modified != 0);
        }
        hammer_unlock(&io->lock);
}

static __inline
void
hammer_io_modify_done(hammer_io_t io)
{
        KKASSERT(io->modify_refs > 0);
        --io->modify_refs;
        if (io->modify_refs == 0 && io->waitmod) {
                io->waitmod = 0;
                wakeup(io);
        }
}

void
hammer_io_write_interlock(hammer_io_t io)
{
        while (io->modify_refs != 0) {
                io->waitmod = 1;
                tsleep(io, 0, "hmrmod", 0);
        }
        io->modify_refs = -1;
}

void
hammer_io_done_interlock(hammer_io_t io)
{
        KKASSERT(io->modify_refs == -1);
        io->modify_refs = 0;
        if (io->waitmod) {
                io->waitmod = 0;
                wakeup(io);
        }
}

/*
 * Caller intends to modify a volume's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
                     void *base, int len)
{
        KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

        hammer_io_modify(&volume->io, 1);
        if (len) {
                intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
                KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
                hammer_generate_undo(trans, &volume->io,
                        HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
                        base, len);
        }
}

/*
 * Caller intends to modify a buffer's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
                     void *base, int len)
{
        KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

        hammer_io_modify(&buffer->io, 1);
        if (len) {
                intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
                KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
                hammer_generate_undo(trans, &buffer->io,
                                     buffer->zone2_offset + rel_offset,
                                     base, len);
        }
}

void
hammer_modify_volume_done(hammer_volume_t volume)
{
        hammer_io_modify_done(&volume->io);
}

void
hammer_modify_buffer_done(hammer_buffer_t buffer)
{
        hammer_io_modify_done(&buffer->io);
}
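/*
 * Example (editor's sketch, hypothetical): on-disk modifications are
 * bracketed so that undo generation and modify_refs stay balanced.
 * "elm" and "nleaf" stand in for some element within buffer->ondisk and
 * its new value:
 *
 *      hammer_modify_buffer(trans, buffer, &elm->leaf, sizeof(elm->leaf));
 *      elm->leaf = *nleaf;
 *      hammer_modify_buffer_done(buffer);
 */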
/*
 * Mark an entity as not being dirty any more and finalize any
 * delayed adjustments to the buffer.
 *
 * Delayed adjustments are an important performance enhancement, allowing
 * us to avoid recalculating B-Tree node CRCs over and over again when
 * making bulk-modifications to the B-Tree.
 *
 * If inval is non-zero delayed adjustments are ignored.
 */
void
hammer_io_clear_modify(struct hammer_io *io, int inval)
{
        if (io->modified == 0)
                return;

        /*
         * Take us off the mod-list and clear the modified bit.
         */
        KKASSERT(io->mod_list != NULL);
        if (io->mod_list == &io->hmp->volu_list ||
            io->mod_list == &io->hmp->meta_list) {
                io->hmp->locked_dirty_space -= io->bytes;
                hammer_count_dirtybufspace -= io->bytes;
        }
        TAILQ_REMOVE(io->mod_list, io, mod_entry);
        io->mod_list = NULL;
        io->modified = 0;

        /*
         * If this bit is not set there are no delayed adjustments.
         */
        if (io->gencrc == 0)
                return;
        io->gencrc = 0;

        /*
         * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
         * on the node (& underlying buffer).  Release the node after clearing
         * the flag.
         */
        if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
                hammer_buffer_t buffer = (void *)io;
                hammer_node_t node;

restart:
                TAILQ_FOREACH(node, &buffer->clist, entry) {
                        if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
                                continue;
                        node->flags &= ~HAMMER_NODE_NEEDSCRC;
                        KKASSERT(node->ondisk);
                        if (inval == 0)
                                node->ondisk->crc =
                                        crc32(&node->ondisk->crc + 1,
                                              HAMMER_BTREE_CRCSIZE);
                        hammer_rel_node(node);
                        goto restart;
                }
        }
}

/*
 * Clear the IO's modify list.  Even though the IO is no longer modified
 * it may still be on the lose_list.  This routine is called just before
 * the governing hammer_buffer is destroyed.
 */
void
hammer_io_clear_modlist(struct hammer_io *io)
{
        KKASSERT(io->modified == 0);
        if (io->mod_list) {
                crit_enter();   /* biodone race against list */
                KKASSERT(io->mod_list == &io->hmp->lose_list);
                TAILQ_REMOVE(io->mod_list, io, mod_entry);
                io->mod_list = NULL;
                crit_exit();
        }
}
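/*
 * Example (editor's sketch, hypothetical destruction path): a buffer
 * being torn down is force-cleaned (inval != 0, discarding delayed CRC
 * adjustments) and pulled off any residual list such as the lose_list:
 *
 *      hammer_io_clear_modify(&buffer->io, 1);
 *      hammer_io_clear_modlist(&buffer->io);
 */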
/************************************************************************
 *                              HAMMER_BIOOPS                           *
 ************************************************************************
 *
 */

/*
 * Pre-IO initiation kernel callback - cluster build only
 */
static void
hammer_io_start(struct buf *bp)
{
}

/*
 * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
 *
 * NOTE: HAMMER may modify a buffer after initiating I/O.  The modified bit
 * may also be set if we were marking a cluster header open.  Only remove
 * our dependency if the modified bit is clear.
 */
static void
hammer_io_complete(struct buf *bp)
{
        union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);

        KKASSERT(iou->io.released == 1);

        /*
         * Deal with people waiting for I/O to drain
         */
        if (iou->io.running) {
                hammer_count_io_running_write -= iou->io.bytes;
                iou->io.hmp->io_running_space -= iou->io.bytes;
                if (iou->io.hmp->io_running_space == 0)
                        wakeup(&iou->io.hmp->io_running_space);
                KKASSERT(iou->io.hmp->io_running_space >= 0);
                iou->io.running = 0;
        }

        if (iou->io.waiting) {
                iou->io.waiting = 0;
                wakeup(iou);
        }

        /*
         * If B_LOCKED is set someone wanted to deallocate the bp at some
         * point, do it now if refs has become zero.
         */
        if ((bp->b_flags & B_LOCKED) && iou->io.lock.refs == 0) {
                KKASSERT(iou->io.modified == 0);
                --hammer_count_io_locked;
                bp->b_flags &= ~B_LOCKED;
                hammer_io_deallocate(bp);
                /* structure may be dead now */
        }
}

/*
 * Callback from kernel when it wishes to deallocate a passively
 * associated structure.  This mostly occurs with clean buffers
 * but it may be possible for a holding structure to be marked dirty
 * while its buffer is passively associated.  The caller owns the bp.
 *
 * If we cannot disassociate we set B_LOCKED to prevent the buffer
 * from getting reused.
 *
 * WARNING: Because this can be called directly by getnewbuf we cannot
 * recurse into the tree.  If a bp cannot be immediately disassociated
 * our only recourse is to set B_LOCKED.
 *
 * WARNING: This may be called from an interrupt via hammer_io_complete()
 */
static void
hammer_io_deallocate(struct buf *bp)
{
        hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);

        KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
        if (iou->io.lock.refs > 0 || iou->io.modified) {
                /*
                 * It is not legal to disassociate a modified buffer.  This
                 * case really shouldn't ever occur.
                 */
                bp->b_flags |= B_LOCKED;
                ++hammer_count_io_locked;
        } else {
                /*
                 * Disassociate the BP.  If the io has no refs left we
                 * have to add it to the loose list.
                 */
                hammer_io_disassociate(iou);
                if (iou->io.type != HAMMER_STRUCTURE_VOLUME) {
                        KKASSERT(iou->io.bp == NULL);
                        KKASSERT(iou->io.mod_list == NULL);
                        crit_enter();   /* biodone race against list */
                        iou->io.mod_list = &iou->io.hmp->lose_list;
                        TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io,
                                          mod_entry);
                        crit_exit();
                }
        }
}
static int
hammer_io_fsync(struct vnode *vp)
{
        return(0);
}

/*
 * NOTE: will not be called unless we tell the kernel about the
 * bioops.  Unused... we use the mount's VFS_SYNC instead.
 */
static int
hammer_io_sync(struct mount *mp)
{
        return(0);
}

static void
hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
{
}

/*
 * I/O pre-check for reading and writing.  HAMMER only uses this for
 * B_CACHE buffers so checkread just shouldn't happen, but if it does
 * allow it.
 *
 * Writing is a different case.  We don't want the kernel to try to write
 * out a buffer that HAMMER may be modifying passively or which has a
 * dependency.  In addition, kernel-demanded writes can only proceed for
 * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
 * buffer types can only be explicitly written by the flusher.
 *
 * checkwrite will only be called for bdwrite()n buffers.  If we return
 * success the kernel is guaranteed to initiate the buffer write.
 */
static int
hammer_io_checkread(struct buf *bp)
{
        return(0);
}

static int
hammer_io_checkwrite(struct buf *bp)
{
        hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);

        /*
         * This shouldn't happen under normal operation.
         */
        if (io->type == HAMMER_STRUCTURE_VOLUME ||
            io->type == HAMMER_STRUCTURE_META_BUFFER) {
                if (!panicstr)
                        panic("hammer_io_checkwrite: illegal buffer");
                if ((bp->b_flags & B_LOCKED) == 0) {
                        bp->b_flags |= B_LOCKED;
                        ++hammer_count_io_locked;
                }
                return(1);
        }

        /*
         * We can only clear the modified bit if the IO is not currently
         * undergoing modification.  Otherwise we may miss changes.
         */
        if (io->modify_refs == 0 && io->modified)
                hammer_io_clear_modify(io, 0);

        /*
         * The kernel is going to start the IO, set io->running.
         */
        KKASSERT(io->running == 0);
        io->running = 1;
        io->hmp->io_running_space += io->bytes;
        hammer_count_io_running_write += io->bytes;
        return(0);
}

/*
 * Return non-zero if we wish to delay the kernel's attempt to flush
 * this buffer to disk.
 */
static int
hammer_io_countdeps(struct buf *bp, int n)
{
        return(0);
}

struct bio_ops hammer_bioops = {
        .io_start       = hammer_io_start,
        .io_complete    = hammer_io_complete,
        .io_deallocate  = hammer_io_deallocate,
        .io_fsync       = hammer_io_fsync,
        .io_sync        = hammer_io_sync,
        .io_movedeps    = hammer_io_movedeps,
        .io_countdeps   = hammer_io_countdeps,
        .io_checkread   = hammer_io_checkread,
        .io_checkwrite  = hammer_io_checkwrite,
};

/************************************************************************
 *                              DIRECT IO OPS                           *
 ************************************************************************
 *
 * These functions operate directly on the buffer cache buffer associated
 * with a front-end vnode rather than a back-end device vnode.
 */

/*
 * Read a buffer associated with a front-end vnode directly from the
 * disk media.  The bio may be issued asynchronously.
 *
 * A second-level bio already resolved to a zone-2 offset (typically by
 * the BMAP code, or by a previous hammer_io_direct_write()), is passed.
 */
int
hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio)
{
        hammer_off_t zone2_offset;
        hammer_volume_t volume;
        struct buf *bp;
        struct bio *nbio;
        int vol_no;
        int error;

        zone2_offset = bio->bio_offset;

        KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
                 HAMMER_ZONE_RAW_BUFFER);

        vol_no = HAMMER_VOL_DECODE(zone2_offset);
        volume = hammer_get_volume(hmp, vol_no, &error);
        if (error == 0 && zone2_offset >= volume->maxbuf_off)
                error = EIO;

        /*
         * Third level bio - raw offset specific to the
         * correct volume.
         */
        if (error == 0) {
                zone2_offset &= HAMMER_OFF_SHORT_MASK;

                nbio = push_bio(bio);
                nbio->bio_offset = volume->ondisk->vol_buf_beg +
                                   zone2_offset;
                vn_strategy(volume->devvp, nbio);
        }
        hammer_rel_volume(volume, 0);

        if (error) {
                kprintf("hammer_direct_read: failed @ %016llx\n",
                        zone2_offset);
                bp = bio->bio_buf;
                bp->b_error = error;
                bp->b_flags |= B_ERROR;
                biodone(bio);
        }
        return(error);
}
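/*
 * Example (editor's sketch, hypothetical front-end strategy fragment):
 * once VOP_BMAP (or a prior direct write) has resolved a zone-2 offset,
 * the read can be pushed straight to the device via this routine:
 *
 *      nbio = push_bio(bio);
 *      nbio->bio_offset = zone2_offset;
 *      error = hammer_io_direct_read(hmp, nbio);
 */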
/*
 * Write a buffer associated with a front-end vnode directly to the
 * disk media.  The bio may be issued asynchronously.
 */
int
hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
                       struct bio *bio)
{
        hammer_off_t buf_offset;
        hammer_off_t zone2_offset;
        hammer_volume_t volume;
        hammer_buffer_t buffer;
        struct buf *bp;
        struct bio *nbio;
        char *ptr;
        int vol_no;
        int error;

        buf_offset = leaf->data_offset;

        KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
        KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);

        if ((buf_offset & HAMMER_BUFMASK) == 0 &&
            leaf->data_len >= HAMMER_BUFSIZE) {
                /*
                 * We are using the vnode's bio to write directly to the
                 * media, any hammer_buffer at the same zone-X offset will
                 * now have stale data.
                 */
                zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
                vol_no = HAMMER_VOL_DECODE(zone2_offset);
                volume = hammer_get_volume(hmp, vol_no, &error);

                if (error == 0 && zone2_offset >= volume->maxbuf_off)
                        error = EIO;
                if (error == 0) {
                        bp = bio->bio_buf;
                        KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
                        hammer_del_buffers(hmp, buf_offset,
                                           zone2_offset, bp->b_bufsize);

                        /*
                         * Second level bio - cached zone2 offset.
                         */
                        nbio = push_bio(bio);
                        nbio->bio_offset = zone2_offset;

                        /*
                         * Third level bio - raw offset specific to the
                         * correct volume.
                         */
                        zone2_offset &= HAMMER_OFF_SHORT_MASK;
                        nbio = push_bio(nbio);
                        nbio->bio_offset = volume->ondisk->vol_buf_beg +
                                           zone2_offset;
                        vn_strategy(volume->devvp, nbio);
                }
                hammer_rel_volume(volume, 0);
        } else {
                /* must fit in a standard HAMMER buffer */
                KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) &
                         ~HAMMER_BUFMASK64) == 0);
                buffer = NULL;
                ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
                if (error == 0) {
                        bp = bio->bio_buf;
                        bp->b_flags |= B_AGE;
                        hammer_io_modify(&buffer->io, 1);
                        bcopy(bp->b_data, ptr, leaf->data_len);
                        hammer_io_modify_done(&buffer->io);
                        hammer_rel_buffer(buffer, 0);
                        bp->b_resid = 0;
                        biodone(bio);
                }
        }
        if (error) {
                kprintf("hammer_direct_write: failed @ %016llx\n",
                        leaf->data_offset);
                bp = bio->bio_buf;
                bp->b_resid = 0;
                bp->b_error = EIO;
                bp->b_flags |= B_ERROR;
                biodone(bio);
        }
        return(error);
}
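/*
 * Worked example (editor's note, illustrative): with the standard
 * HAMMER_BUFSIZE of 16KiB, a 16KiB-aligned write of at least one full
 * buffer takes the direct vn_strategy() path above, while e.g. a 4KiB
 * tail fragment is bounced through hammer_bread()/bcopy() into a
 * backing hammer_buffer instead.
 */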
/*
 * This is called to remove the second-level cached zone-2 offset from
 * frontend buffer cache buffers, now stale due to a data relocation.
 * These offsets are generated by cluster_read() via VOP_BMAP, or directly
 * by hammer_vop_strategy_read().
 *
 * This is rather nasty because here we have something like the reblocker
 * scanning the raw B-Tree with no held references on anything, really,
 * other than a shared lock on the B-Tree node, and we have to access the
 * frontend's buffer cache to check for and clean out the association.
 * Specifically, if the reblocker is moving data on the disk, these cached
 * offsets will become invalid.
 *
 * Only data record types associated with the large-data zone are subject
 * to direct-io and need to be checked.
 */
void
hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
{
        struct hammer_inode_info iinfo;
        int zone;

        if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
                return;
        zone = HAMMER_ZONE_DECODE(leaf->data_offset);
        if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
                return;
        iinfo.obj_id = leaf->base.obj_id;
        iinfo.obj_asof = 0;     /* unused */
        iinfo.obj_localization = leaf->base.localization &
                                 HAMMER_LOCALIZE_PSEUDOFS_MASK;
        iinfo.u.leaf = leaf;
        hammer_scan_inode_snapshots(hmp, &iinfo,
                                    hammer_io_direct_uncache_callback,
                                    leaf);
}

static int
hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
{
        hammer_inode_info_t iinfo = data;
        hammer_off_t data_offset;
        hammer_off_t file_offset;
        struct vnode *vp;
        struct buf *bp;
        int blksize;

        if (ip->vp == NULL)
                return(0);
        data_offset = iinfo->u.leaf->data_offset;
        file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
        blksize = iinfo->u.leaf->data_len;
        KKASSERT((blksize & HAMMER_BUFMASK) == 0);

        hammer_ref(&ip->lock);
        if (hammer_get_vnode(ip, &vp) == 0) {
                if ((bp = findblk(ip->vp, file_offset)) != NULL &&
                    bp->b_bio2.bio_offset != NOOFFSET) {
                        bp = getblk(ip->vp, file_offset, blksize, 0, 0);
                        bp->b_bio2.bio_offset = NOOFFSET;
                        brelse(bp);
                }
                vput(vp);
        }
        hammer_rel_inode(ip, 0);
        return(0);
}
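/*
 * Example (editor's sketch, hypothetical reblocker fragment): after
 * relocating a large-data record the stale cached zone-2 offsets are
 * flushed from the frontend before the move is committed ("elm" is a
 * hypothetical B-Tree element):
 *
 *      hammer_io_direct_uncache(hmp, &elm->leaf);
 */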