/*
 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * IO Primitives and buffer cache management
 *
 * All major data-tracking structures in HAMMER contain a struct hammer_io
 * which is used to manage their backing store.  We use filesystem buffers
 * for backing store and we leave them passively associated with their
 * HAMMER structures.
 *
 * If the kernel tries to destroy a passively associated buf which we cannot
 * yet let go we set B_LOCKED in the buffer and then actively release it
 * later when we can.
 *
 * The io_token is required for anything which might race bioops and bio_done
 * callbacks, with one exception: A successful hammer_try_interlock_norefs().
 * The fs_token will be held in all other cases.
 */

#include <sys/buf2.h>

#include "hammer.h"

static void hammer_io_modify(hammer_io_t io, int count);
static void hammer_io_deallocate(struct buf *bp);
static void hammer_indirect_callback(struct bio *bio);
static void hammer_io_direct_write_complete(struct bio *nbio);
static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
static void hammer_io_set_modlist(struct hammer_io *io);
static __inline void hammer_io_flush_mark(hammer_volume_t volume);
static struct bio_ops hammer_bioops;

static int
hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2)
{
	hammer_off_t io1_offset;
	hammer_off_t io2_offset;

	/*
	 * Encoded offsets are neither valid block device offsets
	 * nor valid zone-X offsets.
	 */
	io1_offset = HAMMER_ENCODE(0, io1->volume->vol_no, io1->offset);
	io2_offset = HAMMER_ENCODE(0, io2->volume->vol_no, io2->offset);

	if (io1_offset < io2_offset)
		return(-1);
	if (io1_offset > io2_offset)
		return(1);
	return(0);
}

RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare);

/*
 * Initialize a new, already-zero'd hammer_io structure, or reinitialize
 * an existing hammer_io structure which may have switched to another type.
 */
void
hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
{
	io->volume = volume;
	io->hmp = volume->io.hmp;
	io->type = type;
}

/*
 * Helper routine to disassociate a buffer cache buffer from an I/O
 * structure.  The io must be interlocked and marked appropriately for
 * reclamation.
 *
 * The io must be in a released state with the io->bp owned and
 * locked by the caller of this function.  When not called from an
 * io_deallocate() this cannot race an io_deallocate() since the
 * kernel would be unable to get the buffer lock in that case.
 * (The released state in this case means we own the bp, not the
 * hammer_io structure).
 *
 * The io may have 0 or 1 references depending on who called us.  The
 * caller is responsible for dealing with the refs.
 *
 * This call can only be made when no action is required on the buffer.
 *
 * This function is guaranteed not to race against anything because we
 * own both the io lock and the bp lock and are interlocked with no
 * references.
 */
static void
hammer_io_disassociate(hammer_io_t io)
{
	struct buf *bp = io->bp;

	KKASSERT(io->released);
	KKASSERT(io->modified == 0);
	KKASSERT(hammer_buf_peek_io(bp) == io);
	buf_dep_init(bp);
	io->bp = NULL;

	/*
	 * If the buffer was locked someone wanted to get rid of it.
	 */
	if (bp->b_flags & B_LOCKED) {
		atomic_add_int(&hammer_count_io_locked, -1);
		bp->b_flags &= ~B_LOCKED;
	}
	if (io->reclaim) {
		bp->b_flags |= B_NOCACHE|B_RELBUF;
		io->reclaim = 0;
	}

	switch(io->type) {
	case HAMMER_STRUCTURE_VOLUME:
		HAMMER_ITOV(io)->ondisk = NULL;
		break;
	case HAMMER_STRUCTURE_DATA_BUFFER:
	case HAMMER_STRUCTURE_META_BUFFER:
	case HAMMER_STRUCTURE_UNDO_BUFFER:
		HAMMER_ITOB(io)->ondisk = NULL;
		break;
	case HAMMER_STRUCTURE_DUMMY:
		hpanic("bad io type");
		break;
	}
}

/*
 * Wait for any physical IO to complete
 *
 * XXX we aren't interlocked against a spinlock or anything so there
 *     is a small window in the interlock / io->running == 0 test.
 */
void
hammer_io_wait(hammer_io_t io)
{
	if (io->running) {
		hammer_mount_t hmp = io->hmp;

		lwkt_gettoken(&hmp->io_token);
		while (io->running) {
			io->waiting = 1;
			tsleep_interlock(io, 0);
			if (io->running)
				tsleep(io, PINTERLOCKED, "hmrflw", hz);
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

/*
 * Wait for all currently queued HAMMER-initiated I/Os to complete.
 *
 * This is not supposed to count direct I/O's but some can leak
 * through (for non-full-sized direct I/Os).
 */
void
hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
{
	struct hammer_io iodummy;
	hammer_io_t io;

	/*
	 * Degenerate case, no I/O is running
	 */
	lwkt_gettoken(&hmp->io_token);
	if (TAILQ_EMPTY(&hmp->iorun_list)) {
		lwkt_reltoken(&hmp->io_token);
		if (doflush)
			hammer_io_flush_sync(hmp);
		return;
	}
	bzero(&iodummy, sizeof(iodummy));
	iodummy.type = HAMMER_STRUCTURE_DUMMY;

	/*
	 * Add placemarker and then wait until it becomes the head of
	 * the list.
	 */
	TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
	while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
		tsleep(&iodummy, 0, ident, 0);
	}

	/*
	 * Chain in case several placemarkers are present.
	 */
	TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
	io = TAILQ_FIRST(&hmp->iorun_list);
	if (io && io->type == HAMMER_STRUCTURE_DUMMY)
		wakeup(io);
	lwkt_reltoken(&hmp->io_token);

	if (doflush)
		hammer_io_flush_sync(hmp);
}

/*
 * Clear a flagged error condition on an I/O buffer.  The caller must hold
 * its own ref on the buffer.
 */
void
hammer_io_clear_error(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	if (io->ioerror) {
		io->ioerror = 0;
		hammer_rel(&io->lock);
		KKASSERT(hammer_isactive(&io->lock));
	}
	lwkt_reltoken(&hmp->io_token);
}

void
hammer_io_clear_error_noassert(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	if (io->ioerror) {
		io->ioerror = 0;
		hammer_rel(&io->lock);
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * This is an advisory function only which tells the buffer cache
 * the bp is not a meta-data buffer, even though it is backed by
 * a block device.
 *
 * This is used by HAMMER's reblocking code to avoid trying to
 * swapcache the filesystem's data when it is read or written
 * by the reblocking code.
 *
 * The caller has a ref on the buffer preventing the bp from
 * being disassociated from it.
 */
void
hammer_io_notmeta(hammer_buffer_t buffer)
{
	if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
		hammer_mount_t hmp = buffer->io.hmp;

		lwkt_gettoken(&hmp->io_token);
		buffer->io.bp->b_flags |= B_NOTMETA;
		lwkt_reltoken(&hmp->io_token);
	}
}
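/*
 * Illustrative sketch only (not compiled): the rough lifecycle of the
 * backing store managed by the routines below, for some hammer_io "io"
 * embedded in a hammer_buffer or hammer_volume.  Locking, reference
 * counting and error handling are omitted; "devvp" and "limit" stand in
 * for the device vnode and cluster limit supplied by hammer_ondisk.c.
 *
 *	hammer_io_read(devvp, io, limit);   // bread()/cluster_read() the bp
 *	...modify the ondisk data via hammer_modify_*()...
 *	hammer_io_flush(io, 0);             // flusher writes dirty buffers
 *	bp = hammer_io_release(io, 0);      // on last ref; may return the bp
 *	if (bp)
 *		brelse(bp);                 // caller typically brelse()'s it
 */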
/*
 * Load bp for a HAMMER structure.  The io must be exclusively locked by
 * the caller.
 *
 * This routine is mostly used on meta-data and small-data blocks.  Generally
 * speaking HAMMER assumes some locality of reference and will cluster.
 *
 * Note that the caller (hammer_ondisk.c) may place further restrictions
 * on clusterability via the limit (in bytes).  Typically large-data
 * zones cannot be clustered due to their mixed buffer sizes.  This is
 * not an issue since such clustering occurs in hammer_vnops at the
 * regular file layer, whereas this is the buffered block device layer.
 *
 * No I/O callbacks can occur while we hold the buffer locked.
 */
int
hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
{
	struct buf *bp;
	int error;

	if ((bp = io->bp) == NULL) {
		atomic_add_long(&hammer_count_io_running_read, io->bytes);
		if (hammer_cluster_enable && limit > io->bytes) {
			error = cluster_read(devvp, io->offset + limit,
					     io->offset, io->bytes,
					     HAMMER_CLUSTER_SIZE,
					     HAMMER_CLUSTER_SIZE,
					     &io->bp);
		} else {
			error = bread(devvp, io->offset, io->bytes, &io->bp);
		}
		hammer_stats_disk_read += io->bytes;
		atomic_add_long(&hammer_count_io_running_read, -io->bytes);

		/*
		 * The code generally assumes b_ops/b_dep has been set-up,
		 * even if we error out here.
		 */
		bp = io->bp;
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			const char *metatype;

			switch(io->type) {
			case HAMMER_STRUCTURE_VOLUME:
				metatype = "volume";
				break;
			case HAMMER_STRUCTURE_META_BUFFER:
				switch(HAMMER_ITOB(io)->zoneX_offset
					& HAMMER_OFF_ZONE_MASK) {
				case HAMMER_ZONE_BTREE:
					metatype = "btree";
					break;
				case HAMMER_ZONE_META:
					metatype = "meta";
					break;
				case HAMMER_ZONE_FREEMAP:
					metatype = "freemap";
					break;
				default:
					metatype = "meta?";
					break;
				}
				break;
			case HAMMER_STRUCTURE_DATA_BUFFER:
				metatype = "data";
				break;
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				metatype = "undo";
				break;
			default:
				metatype = "unknown";
				break;
			}
			hdkprintf("zone2_offset %016jx %s\n",
				(intmax_t)bp->b_bio2.bio_offset,
				metatype);
		}
		bp->b_flags &= ~B_IODEBUG;
		bp->b_ops = &hammer_bioops;

		hammer_buf_attach_io(bp, io);	/* locked by the io lock */
		BUF_KERNPROC(bp);
		KKASSERT(io->modified == 0);
		KKASSERT(io->running == 0);
		KKASSERT(io->waiting == 0);
		io->released = 0;	/* we hold an active lock on bp */
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Similar to hammer_io_read() but returns a zero'd out buffer instead.
 * Must be called with the IO exclusively locked.
 *
 * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background
 * I/O by forcing the buffer to not be in a released state before calling
 * it.
 *
 * This function will also mark the IO as modified but it will not
 * increment the modify_refs count.
 *
 * No I/O callbacks can occur while we hold the buffer locked.
 */
int
hammer_io_new(struct vnode *devvp, struct hammer_io *io)
{
	struct buf *bp;

	if ((bp = io->bp) == NULL) {
		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
		bp = io->bp;
		bp->b_ops = &hammer_bioops;

		hammer_buf_attach_io(bp, io);	/* locked by the io lock */
		io->released = 0;
		KKASSERT(io->running == 0);
		io->waiting = 0;
		BUF_KERNPROC(bp);
	} else {
		if (io->released) {
			regetblk(bp);
			BUF_KERNPROC(bp);
			io->released = 0;
		}
	}
	hammer_io_modify(io, 0);
	vfs_bio_clrbuf(bp);
	return(0);
}

/*
 * Advance the activity count on the underlying buffer because
 * HAMMER does not getblk/brelse on every access.
 *
 * The io->bp cannot go away while the buffer is referenced.
 */
void
hammer_io_advance(struct hammer_io *io)
{
	if (io->bp)
		buf_act_advance(io->bp);
}

/*
 * Remove potential device level aliases against buffers managed by high level
 * vnodes.  Aliases can also be created due to mixed buffer sizes or via
 * direct access to the backing store device.
 *
 * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
 * does not exist its backing VM pages might, and we have to invalidate
 * those as well or a getblk() will reinstate them.
 *
 * Buffer cache buffers associated with hammer_buffers cannot be
 * invalidated.
 */
int
hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
{
	hammer_io_t io;
	hammer_mount_t hmp;
	hammer_off_t phys_offset;
	struct buf *bp;
	int error;

	hmp = volume->io.hmp;
	lwkt_gettoken(&hmp->io_token);

	/*
	 * If a device buffer already exists for the specified physical
	 * offset use that, otherwise instantiate a buffer to cover any
	 * related VM pages, set B_NOCACHE, and brelse().
	 */
	phys_offset = hammer_xlate_to_phys(volume->ondisk, zone2_offset);
	if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
		bremfree(bp);
	else
		bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);

	if ((io = hammer_buf_peek_io(bp)) != NULL) {
#if 0
		hammer_ref(&io->lock);
		hammer_io_clear_modify(io, 1);
		bundirty(bp);
		io->released = 0;
		BUF_KERNPROC(bp);
		io->reclaim = 1;
		io->waitdep = 1;	/* XXX this is a fs_token field */
		KKASSERT(hammer_isactive(&io->lock) == 1);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		/*hammer_io_deallocate(bp);*/
#endif
		bqrelse(bp);
		error = EAGAIN;
	} else {
		KKASSERT((bp->b_flags & B_LOCKED) == 0);
		bundirty(bp);
		bp->b_flags |= B_NOCACHE|B_RELBUF;
		brelse(bp);
		error = 0;
	}
	lwkt_reltoken(&hmp->io_token);
	return(error);
}

/*
 * This routine is called on the last reference to a hammer structure.
 * The io must be interlocked with a refcount of zero.  The hammer structure
 * will remain interlocked on return.
 *
 * This routine may return a non-NULL bp to the caller for disposal.
 * The caller typically brelse()'s the bp.
 *
 * The bp may or may not still be passively associated with the IO.  It
 * will remain passively associated if it is unreleasable (e.g. a modified
 * meta-data buffer).
 *
 * The only requirement here is that modified meta-data and volume-header
 * buffers may NOT be disassociated from the IO structure, and consequently
 * we also leave such buffers actively associated with the IO if they already
 * are (since the kernel can't do anything with them anyway).  Only the
 * flusher is allowed to write such buffers out.  Modified pure-data and
 * undo buffers are returned to the kernel but left passively associated
 * so we can track when the kernel writes the bp out.
 */
struct buf *
hammer_io_release(struct hammer_io *io, int flush)
{
	struct buf *bp;

	if ((bp = io->bp) == NULL)
		return(NULL);

	/*
	 * Try to flush a dirty IO to disk if asked to by the
	 * caller or if the kernel tried to flush the buffer in the past.
	 *
	 * Kernel-initiated flushes are only allowed for pure-data buffers.
	 * meta-data and volume buffers can only be flushed explicitly
	 * by HAMMER.
	 */
	if (io->modified) {
		if (flush) {
			hammer_io_flush(io, 0);
		} else if (bp->b_flags & B_LOCKED) {
			switch(io->type) {
			case HAMMER_STRUCTURE_DATA_BUFFER:
				hammer_io_flush(io, 0);
				break;
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				hammer_io_flush(io, hammer_undo_reclaim(io));
				break;
			default:
				break;
			}
		} /* else no explicit request to flush the buffer */
	}

	/*
	 * Wait for the IO to complete if asked to.  This occurs when
	 * the buffer must be disposed of definitively during an umount
	 * or buffer invalidation.
	 */
	if (io->waitdep && io->running) {
		hammer_io_wait(io);
	}

	/*
	 * Return control of the buffer to the kernel (with the proviso
	 * that our bioops can override kernel decisions with regard to
	 * the buffer).
	 */
	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
		/*
		 * Always disassociate the bp if an explicit flush
		 * was requested and the IO completed with no error
		 * (so unmount can really clean up the structure).
		 */
		if (io->released) {
			regetblk(bp);
			BUF_KERNPROC(bp);
		} else {
			io->released = 1;
		}
		hammer_io_disassociate(io);
		/* return the bp */
	} else if (io->modified) {
		/*
		 * Only certain IO types can be released to the kernel if
		 * the buffer has been modified.
		 *
		 * volume and meta-data IO types may only be explicitly
		 * flushed by HAMMER.
		 */
		switch(io->type) {
		case HAMMER_STRUCTURE_DATA_BUFFER:
		case HAMMER_STRUCTURE_UNDO_BUFFER:
			if (io->released == 0) {
				io->released = 1;
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
			break;
		default:
			break;
		}
		bp = NULL;	/* bp left associated */
	} else if (io->released == 0) {
		/*
		 * Clean buffers can be generally released to the kernel.
		 * We leave the bp passively associated with the HAMMER
		 * structure and use bioops to disconnect it later on
		 * if the kernel wants to discard the buffer.
		 *
		 * We can steal the structure's ownership of the bp.
		 */
		io->released = 1;
		if (bp->b_flags & B_LOCKED) {
			hammer_io_disassociate(io);
			/* return the bp */
		} else {
			if (io->reclaim) {
				hammer_io_disassociate(io);
				/* return the bp */
			} else {
				/* return the bp (bp passively associated) */
			}
		}
	} else {
		/*
		 * A released buffer is passively associated with our
		 * hammer_io structure.  The kernel cannot destroy it
		 * without making a bioops call.  If the kernel (B_LOCKED)
		 * or we (reclaim) requested that the buffer be destroyed
		 * we destroy it, otherwise we do a quick get/release to
		 * reset its position in the kernel's LRU list.
		 *
		 * Leaving the buffer passively associated allows us to
		 * use the kernel's LRU buffer flushing mechanisms rather
		 * than rolling our own.
		 *
		 * XXX there are two ways of doing this.  We can re-acquire
		 * and passively release to reset the LRU, or not.
		 */
		if (io->running == 0) {
			regetblk(bp);
			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
				hammer_io_disassociate(io);
				/* return the bp */
			} else {
				/* return the bp (bp passively associated) */
			}
		} else {
			/*
			 * bp is left passively associated but we do not
			 * try to reacquire it.  Interactions with the io
			 * structure will occur on completion of the bp's
			 * I/O.
			 */
			bp = NULL;
		}
	}
	return(bp);
}

/*
 * This routine is called with a locked IO when a flush is desired and
 * no other references to the structure exists other than ours.  This
 * routine is ONLY called when HAMMER believes it is safe to flush a
 * potentially modified buffer out.
 *
 * The locked io or io reference prevents a flush from being initiated
 * by the kernel.
 */
void
hammer_io_flush(struct hammer_io *io, int reclaim)
{
	struct buf *bp;
	hammer_mount_t hmp;

	/*
	 * Degenerate case - nothing to flush if nothing is dirty.
	 */
	if (io->modified == 0)
		return;

	KKASSERT(io->bp);
	KKASSERT(io->modify_refs <= 0);

	/*
	 * Acquire ownership of the bp, particularly before we clear our
	 * modified flag.
	 *
	 * We are going to bawrite() this bp.  Don't leave a window where
	 * io->released is set, we actually own the bp rather than the
	 * buffer.
	 *
	 * The io_token should not be required here as only
	 */
	hmp = io->hmp;
	bp = io->bp;
	if (io->released) {
		regetblk(bp);
		/* BUF_KERNPROC(io->bp); */
		/* io->released = 0; */
		KKASSERT(io->released);
		KKASSERT(io->bp == bp);
	} else {
		io->released = 1;
	}

	if (reclaim) {
		io->reclaim = 1;
		if ((bp->b_flags & B_LOCKED) == 0) {
			bp->b_flags |= B_LOCKED;
			atomic_add_int(&hammer_count_io_locked, 1);
		}
	}

	/*
	 * Acquire exclusive access to the bp and then clear the modified
	 * state of the buffer prior to issuing I/O to interlock any
	 * modifications made while the I/O is in progress.  This shouldn't
	 * happen anyway but losing data would be worse.  The modified bit
	 * will be rechecked after the IO completes.
	 *
	 * NOTE: This call also finalizes the buffer's content (inval == 0).
	 *
	 * This is only legal when lock.refs == 1 (otherwise we might clear
	 * the modified bit while there are still users of the cluster
	 * modifying the data).
	 *
	 * Do this before potentially blocking so any attempt to modify the
	 * ondisk while we are blocked blocks waiting for us.
	 */
	hammer_ref(&io->lock);
	hammer_io_clear_modify(io, 0);
	hammer_rel(&io->lock);

	if (hammer_debug_io & 0x0002)
		hdkprintf("%016jx\n", bp->b_bio1.bio_offset);

	/*
	 * Transfer ownership to the kernel and initiate I/O.
	 *
	 * NOTE: We do not hold io_token so an atomic op is required to
	 *	 update io_running_space.
	 */
	io->running = 1;
	atomic_add_long(&hmp->io_running_space, io->bytes);
	atomic_add_long(&hammer_count_io_running_write, io->bytes);
	lwkt_gettoken(&hmp->io_token);
	TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
	lwkt_reltoken(&hmp->io_token);
	cluster_awrite(bp);
	hammer_io_flush_mark(io->volume);
}

/************************************************************************
 *				BUFFER DIRTYING				*
 ************************************************************************
 *
 * These routines deal with dependencies created when IO buffers get
 * modified.  The caller must call hammer_modify_*() on a referenced
 * HAMMER structure prior to modifying its on-disk data.
 *
 * Any intent to modify an IO buffer acquires the related bp and imposes
 * various write ordering dependencies.
 */
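/*
 * Usage sketch only (illustrative, not compiled): the pattern the comment
 * above describes.  A caller holding a ref on a hammer_buffer (and, when
 * undo generation is wanted, a transaction holding sync_lock_refs)
 * brackets its on-disk changes with the modify/done pair:
 *
 *	hammer_modify_buffer(trans, buffer, &ondisk->field,
 *			     sizeof(ondisk->field));
 *	ondisk->field = new_value;		// mutate the on-disk image
 *	hammer_modify_buffer_done(buffer);
 *
 * hammer_modify_buffer() calls hammer_io_modify() to reacquire the bp and
 * place the io on the appropriate mod_root, and generates undo for the
 * affected range.  "ondisk", "field" and "new_value" are placeholders,
 * not names from this file.
 */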
/*
 * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
 * are locked until the flusher can deal with them, pure data buffers
 * can be written out.
 *
 * The referenced io prevents races.
 */
static
void
hammer_io_modify(hammer_io_t io, int count)
{
	/*
	 * io->modify_refs must be >= 0
	 */
	while (io->modify_refs < 0) {
		io->waitmod = 1;
		tsleep(io, 0, "hmrmod", 0);
	}

	/*
	 * Shortcut if nothing to do.
	 */
	KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
	io->modify_refs += count;
	if (io->modified && io->released == 0)
		return;

	/*
	 * NOTE: It is important not to set the modified bit
	 *	 until after we have acquired the bp or we risk
	 *	 racing against checkwrite.
	 */
	hammer_lock_ex(&io->lock);
	if (io->released) {
		regetblk(io->bp);
		BUF_KERNPROC(io->bp);
		io->released = 0;
	}
	if (io->modified == 0) {
		hammer_io_set_modlist(io);
		io->modified = 1;
	}
	hammer_unlock(&io->lock);
}

static __inline
void
hammer_io_modify_done(hammer_io_t io)
{
	KKASSERT(io->modify_refs > 0);
	--io->modify_refs;
	if (io->modify_refs == 0 && io->waitmod) {
		io->waitmod = 0;
		wakeup(io);
	}
}

/*
 * The write interlock blocks other threads trying to modify a buffer
 * (they block in hammer_io_modify()) after us, or blocks us while other
 * threads are in the middle of modifying a buffer.
 *
 * The caller also has a ref on the io, however if we are not careful
 * we will race bioops callbacks (checkwrite).  To deal with this
 * we must at least acquire and release the io_token, and it is probably
 * better to hold it through the setting of modify_refs.
 */
void
hammer_io_write_interlock(hammer_io_t io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	while (io->modify_refs != 0) {
		io->waitmod = 1;
		tsleep(io, 0, "hmrmod", 0);
	}
	io->modify_refs = -1;
	lwkt_reltoken(&hmp->io_token);
}

void
hammer_io_done_interlock(hammer_io_t io)
{
	KKASSERT(io->modify_refs == -1);
	io->modify_refs = 0;
	if (io->waitmod) {
		io->waitmod = 0;
		wakeup(io);
	}
}

/*
 * Caller intends to modify a volume's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
		     void *base, int len)
{
	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

	hammer_io_modify(&volume->io, 1);
	if (len) {
		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
		hammer_generate_undo(trans,
			 HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
			 base, len);
	}
}

/*
 * Caller intends to modify a buffer's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
		     void *base, int len)
{
	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

	hammer_io_modify(&buffer->io, 1);
	if (len) {
		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
		hammer_generate_undo(trans,
				     buffer->zone2_offset + rel_offset,
				     base, len);
	}
}

void
hammer_modify_volume_done(hammer_volume_t volume)
{
	hammer_io_modify_done(&volume->io);
}

void
hammer_modify_buffer_done(hammer_buffer_t buffer)
{
	hammer_io_modify_done(&buffer->io);
}

/*
 * Mark an entity as not being dirty any more and finalize any
 * delayed adjustments to the buffer.
 *
 * Delayed adjustments are an important performance enhancement, allowing
 * us to avoid recalculating B-Tree node CRCs over and over again when
 * making bulk-modifications to the B-Tree.
 *
 * If inval is non-zero delayed adjustments are ignored.
 *
 * This routine may dereference related btree nodes and cause the
 * buffer to be dereferenced.  The caller must own a reference on io.
 */
void
hammer_io_clear_modify(struct hammer_io *io, int inval)
{
	hammer_mount_t hmp;

	/*
	 * io_token is needed to avoid races on mod_root
	 */
	if (io->modified == 0)
		return;
	hmp = io->hmp;
	lwkt_gettoken(&hmp->io_token);
	if (io->modified == 0) {
		lwkt_reltoken(&hmp->io_token);
		return;
	}

	/*
	 * Take us off the mod-list and clear the modified bit.
	 */
	KKASSERT(io->mod_root != NULL);
	if (io->mod_root == &io->hmp->volu_root ||
	    io->mod_root == &io->hmp->meta_root) {
		io->hmp->locked_dirty_space -= io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, -io->bytes);
	}
	RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
	io->mod_root = NULL;
	io->modified = 0;

	lwkt_reltoken(&hmp->io_token);

	/*
	 * If this bit is not set there are no delayed adjustments.
	 */
	if (io->gencrc == 0)
		return;
	io->gencrc = 0;

	/*
	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
	 * on the node (& underlying buffer).  Release the node after clearing
	 * the flag.
	 */
	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
		hammer_buffer_t buffer = HAMMER_ITOB(io);
		hammer_node_t node;

restart:
		TAILQ_FOREACH(node, &buffer->clist, entry) {
			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
				continue;
			node->flags &= ~HAMMER_NODE_NEEDSCRC;
			KKASSERT(node->ondisk);
			if (inval == 0)
				node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE);
			hammer_rel_node(node);
			goto restart;
		}
	}
	/* caller must still have ref on io */
	KKASSERT(hammer_isactive(&io->lock));
}

/*
 * Clear the IO's modify list.  Even though the IO is no longer modified
 * it may still be on the lose_root.  This routine is called just before
 * the governing hammer_buffer is destroyed.
 *
 * mod_root requires io_token protection.
 */
void
hammer_io_clear_modlist(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	KKASSERT(io->modified == 0);
	if (io->mod_root) {
		lwkt_gettoken(&hmp->io_token);
		if (io->mod_root) {
			KKASSERT(io->mod_root == &io->hmp->lose_root);
			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
			io->mod_root = NULL;
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

static void
hammer_io_set_modlist(struct hammer_io *io)
{
	struct hammer_mount *hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	KKASSERT(io->mod_root == NULL);

	switch(io->type) {
	case HAMMER_STRUCTURE_VOLUME:
		io->mod_root = &hmp->volu_root;
		hmp->locked_dirty_space += io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
		break;
	case HAMMER_STRUCTURE_META_BUFFER:
		io->mod_root = &hmp->meta_root;
		hmp->locked_dirty_space += io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
		break;
	case HAMMER_STRUCTURE_UNDO_BUFFER:
		io->mod_root = &hmp->undo_root;
		break;
	case HAMMER_STRUCTURE_DATA_BUFFER:
		io->mod_root = &hmp->data_root;
		break;
	case HAMMER_STRUCTURE_DUMMY:
		hpanic("bad io type");
		break;	/* NOT REACHED */
	}
	if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
		hpanic("duplicate entry @ %d:%015jx",
			io->volume->vol_no, io->offset);
		/* NOT REACHED */
	}
	lwkt_reltoken(&hmp->io_token);
}

/************************************************************************
 *				HAMMER_BIOOPS				*
 ************************************************************************
 *
 */

/*
 * Pre-IO initiation kernel callback - cluster build only
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_start(struct buf *bp)
{
	/* nothing to do, so io_token not needed */
}

/*
 * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
 *
 * NOTE: HAMMER may modify a data buffer after we have initiated write
 *	 I/O.
 *
 * NOTE: MPSAFE callback
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_complete(struct buf *bp)
{
	hammer_io_t io = hammer_buf_peek_io(bp);
	struct hammer_mount *hmp = io->hmp;
	struct hammer_io *ionext;

	lwkt_gettoken(&hmp->io_token);

	KKASSERT(io->released == 1);

	/*
	 * Deal with people waiting for I/O to drain
	 */
	if (io->running) {
		/*
		 * Deal with critical write errors.  Once a critical error
		 * has been flagged in hmp the UNDO FIFO will not be updated.
		 * That way crash recovery will give us a consistent
		 * filesystem.
		 *
		 * Because of this we can throw away failed UNDO buffers.  If
		 * we throw away META or DATA buffers we risk corrupting
		 * the now read-only version of the filesystem visible to
		 * the user.  Clear B_ERROR so the buffer is not re-dirtied
		 * by the kernel and ref the io so it doesn't get thrown
		 * away.
		 */
		if (bp->b_flags & B_ERROR) {
			lwkt_gettoken(&hmp->fs_token);
			hammer_critical_error(hmp, NULL, bp->b_error,
					      "while flushing meta-data");
			lwkt_reltoken(&hmp->fs_token);

			switch(io->type) {
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				break;
			default:
				if (io->ioerror == 0) {
					io->ioerror = 1;
					hammer_ref(&io->lock);
				}
				break;
			}
			bp->b_flags &= ~B_ERROR;
			bundirty(bp);
#if 0
			hammer_io_set_modlist(io);
			io->modified = 1;
#endif
		}
		hammer_stats_disk_write += io->bytes;
		atomic_add_long(&hammer_count_io_running_write, -io->bytes);
		atomic_add_long(&hmp->io_running_space, -io->bytes);
		KKASSERT(hmp->io_running_space >= 0);
		io->running = 0;

		/*
		 * Remove from iorun list and wakeup any multi-io waiter(s).
		 */
		if (TAILQ_FIRST(&hmp->iorun_list) == io) {
			ionext = TAILQ_NEXT(io, iorun_entry);
			if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
				wakeup(ionext);
		}
		TAILQ_REMOVE(&hmp->iorun_list, io, iorun_entry);
	} else {
		hammer_stats_disk_read += io->bytes;
	}

	if (io->waiting) {
		io->waiting = 0;
		wakeup(io);
	}

	/*
	 * If B_LOCKED is set someone wanted to deallocate the bp at some
	 * point, try to do it now.  The operation will fail if there are
	 * refs or if hammer_io_deallocate() is unable to gain the
	 * interlock.
	 */
	if (bp->b_flags & B_LOCKED) {
		atomic_add_int(&hammer_count_io_locked, -1);
		bp->b_flags &= ~B_LOCKED;
		hammer_io_deallocate(bp);
		/* structure may be dead now */
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * Callback from kernel when it wishes to deallocate a passively
 * associated structure.  This mostly occurs with clean buffers
 * but it may be possible for a holding structure to be marked dirty
 * while its buffer is passively associated.  The caller owns the bp.
 *
 * If we cannot disassociate we set B_LOCKED to prevent the buffer
 * from getting reused.
 *
 * WARNING: Because this can be called directly by getnewbuf we cannot
 *	    recurse into the tree.  If a bp cannot be immediately
 *	    disassociated our only recourse is to set B_LOCKED.
 *
 * WARNING: This may be called from an interrupt via hammer_io_complete()
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_deallocate(struct buf *bp)
{
	hammer_io_t io = hammer_buf_peek_io(bp);
	hammer_mount_t hmp;

	hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);

	KKASSERT((bp->b_flags & B_LOCKED) == 0 && io->running == 0);
	if (hammer_try_interlock_norefs(&io->lock) == 0) {
		/*
		 * We cannot safely disassociate a bp from a referenced
		 * or interlocked HAMMER structure.
		 */
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
	} else if (io->modified) {
		/*
		 * It is not legal to disassociate a modified buffer.  This
		 * case really shouldn't ever occur.
		 */
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
		hammer_put_interlock(&io->lock, 0);
	} else {
		/*
		 * Disassociate the BP.  If the io has no refs left we
		 * have to add it to the loose list.  The kernel has
		 * locked the buffer and therefore our io must be
		 * in a released state.
		 */
		hammer_io_disassociate(io);
		if (io->type != HAMMER_STRUCTURE_VOLUME) {
			KKASSERT(io->bp == NULL);
			KKASSERT(io->mod_root == NULL);
			io->mod_root = &hmp->lose_root;
			if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
				hpanic("duplicate entry @ %d:%015jx",
					io->volume->vol_no, io->offset);
				/* NOT REACHED */
			}
		}
		hammer_put_interlock(&io->lock, 1);
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * bioops callback - hold io_token
 */
static int
hammer_io_fsync(struct vnode *vp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * NOTE: will not be called unless we tell the kernel about the
 * bioops.  Unused... we use the mount's VFS_SYNC instead.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_sync(struct mount *mp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * bioops callback - hold io_token
 */
static void
hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
{
	/* nothing to do, so io_token not needed */
}

/*
 * I/O pre-check for reading and writing.  HAMMER only uses this for
 * B_CACHE buffers so checkread just shouldn't happen, but if it does
 * allow it.
 *
 * Writing is a different case.  We don't want the kernel to try to write
 * out a buffer that HAMMER may be modifying passively or which has a
 * dependency.  In addition, kernel-demanded writes can only proceed for
 * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
 * buffer types can only be explicitly written by the flusher.
 *
 * checkwrite will only be called for bdwrite()n buffers.  If we return
 * success the kernel is guaranteed to initiate the buffer write.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_checkread(struct buf *bp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * The kernel is asking us whether it can write out a dirty buffer or not.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_checkwrite(struct buf *bp)
{
	hammer_io_t io = hammer_buf_peek_io(bp);
	hammer_mount_t hmp = io->hmp;

	/*
	 * This shouldn't happen under normal operation.
	 */
	lwkt_gettoken(&hmp->io_token);
	if (io->type == HAMMER_STRUCTURE_VOLUME ||
	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
		if (!panicstr)
			hpanic("illegal buffer");
		if ((bp->b_flags & B_LOCKED) == 0) {
			bp->b_flags |= B_LOCKED;
			atomic_add_int(&hammer_count_io_locked, 1);
		}
		lwkt_reltoken(&hmp->io_token);
		return(1);
	}

	/*
	 * We have to be able to interlock the IO to safely modify any
	 * of its fields without holding the fs_token.  If we can't lock
	 * it then we are racing someone.
	 *
	 * Our ownership of the bp lock prevents the io from being ripped
	 * out from under us.
	 */
	if (hammer_try_interlock_norefs(&io->lock) == 0) {
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
		lwkt_reltoken(&hmp->io_token);
		return(1);
	}

	/*
	 * The modified bit must be cleared prior to the initiation of
	 * any IO (returning 0 initiates the IO).  Because this is a
	 * normal data buffer hammer_io_clear_modify() runs through a
	 * simple degenerate case.
1325 * 1326 * Return 0 will cause the kernel to initiate the IO, and we 1327 * must normally clear the modified bit before we begin. If 1328 * the io has modify_refs we do not clear the modified bit, 1329 * otherwise we may miss changes. 1330 * 1331 * Only data and undo buffers can reach here. These buffers do 1332 * not have terminal crc functions but we temporarily reference 1333 * the IO anyway, just in case. 1334 */ 1335 if (io->modify_refs == 0 && io->modified) { 1336 hammer_ref(&io->lock); 1337 hammer_io_clear_modify(io, 0); 1338 hammer_rel(&io->lock); 1339 } else if (io->modified) { 1340 KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER); 1341 } 1342 1343 /* 1344 * The kernel is going to start the IO, set io->running. 1345 */ 1346 KKASSERT(io->running == 0); 1347 io->running = 1; 1348 atomic_add_long(&io->hmp->io_running_space, io->bytes); 1349 atomic_add_long(&hammer_count_io_running_write, io->bytes); 1350 TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); 1351 1352 hammer_put_interlock(&io->lock, 1); 1353 lwkt_reltoken(&hmp->io_token); 1354 1355 return(0); 1356 } 1357 1358 /* 1359 * Return non-zero if we wish to delay the kernel's attempt to flush 1360 * this buffer to disk. 1361 * 1362 * bioops callback - hold io_token 1363 */ 1364 static int 1365 hammer_io_countdeps(struct buf *bp, int n) 1366 { 1367 /* nothing to do, so io_token not needed */ 1368 return(0); 1369 } 1370 1371 static struct bio_ops hammer_bioops = { 1372 .io_start = hammer_io_start, 1373 .io_complete = hammer_io_complete, 1374 .io_deallocate = hammer_io_deallocate, 1375 .io_fsync = hammer_io_fsync, 1376 .io_sync = hammer_io_sync, 1377 .io_movedeps = hammer_io_movedeps, 1378 .io_countdeps = hammer_io_countdeps, 1379 .io_checkread = hammer_io_checkread, 1380 .io_checkwrite = hammer_io_checkwrite, 1381 }; 1382 1383 /************************************************************************ 1384 * DIRECT IO OPS * 1385 ************************************************************************ 1386 * 1387 * These functions operate directly on the buffer cache buffer associated 1388 * with a front-end vnode rather then a back-end device vnode. 1389 */ 1390 1391 /* 1392 * Read a buffer associated with a front-end vnode directly from the 1393 * disk media. The bio may be issued asynchronously. If leaf is non-NULL 1394 * we validate the CRC. 1395 * 1396 * We must check for the presence of a HAMMER buffer to handle the case 1397 * where the reblocker has rewritten the data (which it does via the HAMMER 1398 * buffer system, not via the high-level vnode buffer cache), but not yet 1399 * committed the buffer to the media. 1400 */ 1401 int 1402 hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio, 1403 hammer_btree_leaf_elm_t leaf) 1404 { 1405 hammer_off_t buf_offset; 1406 hammer_off_t zone2_offset; 1407 hammer_volume_t volume; 1408 struct buf *bp; 1409 struct bio *nbio; 1410 int vol_no; 1411 int error; 1412 1413 buf_offset = bio->bio_offset; 1414 KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == 1415 HAMMER_ZONE_LARGE_DATA); 1416 1417 /* 1418 * The buffer cache may have an aliased buffer (the reblocker can 1419 * write them). If it does we have to sync any dirty data before 1420 * we can build our direct-read. This is a non-critical code path. 1421 */ 1422 bp = bio->bio_buf; 1423 hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize); 1424 1425 /* 1426 * Resolve to a zone-2 offset. 
/*
 * Read a buffer associated with a front-end vnode directly from the
 * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
 * we validate the CRC.
 *
 * We must check for the presence of a HAMMER buffer to handle the case
 * where the reblocker has rewritten the data (which it does via the HAMMER
 * buffer system, not via the high-level vnode buffer cache), but not yet
 * committed the buffer to the media.
 */
int
hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
		      hammer_btree_leaf_elm_t leaf)
{
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	struct buf *bp;
	struct bio *nbio;
	int vol_no;
	int error;

	buf_offset = bio->bio_offset;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	/*
	 * The buffer cache may have an aliased buffer (the reblocker can
	 * write them).  If it does we have to sync any dirty data before
	 * we can build our direct-read.  This is a non-critical code path.
	 */
	bp = bio->bio_buf;
	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);

	/*
	 * Resolve to a zone-2 offset.  The conversion just requires
	 * munging the top 4 bits but we want to abstract it anyway
	 * so the blockmap code can verify the zone assignment.
	 */
	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
	if (error)
		goto done;
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);

	/*
	 * Resolve volume and raw-offset for 3rd level bio.  The
	 * offset will be specific to the volume.
	 */
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &error);
	if (error == 0 && zone2_offset >= volume->maxbuf_off)
		error = EIO;

	if (error == 0) {
		/*
		 * 3rd level bio (the caller has already pushed once)
		 */
		nbio = push_bio(bio);
		nbio->bio_offset = hammer_xlate_to_phys(volume->ondisk,
							zone2_offset);
		hammer_stats_disk_read += bp->b_bufsize;
		vn_strategy(volume->devvp, nbio);
	}
	hammer_rel_volume(volume, 0);
done:
	if (error) {
		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(bio);
	}
	return(error);
}

/*
 * This works similarly to hammer_io_direct_read() except instead of
 * directly reading from the device into the bio we instead indirectly
 * read through the device's buffer cache and then copy the data into
 * the bio.
 *
 * If leaf is non-NULL and validation is enabled, the CRC will be checked.
 *
 * This routine also executes asynchronously.  It allows hammer strategy
 * calls to operate asynchronously when in double_buffer mode (in addition
 * to operating asynchronously when in normal mode).
 */
int
hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
			hammer_btree_leaf_elm_t leaf)
{
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	struct buf *bp;
	int vol_no;
	int error;

	buf_offset = bio->bio_offset;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	/*
	 * The buffer cache may have an aliased buffer (the reblocker can
	 * write them).  If it does we have to sync any dirty data before
	 * we can build our direct-read.  This is a non-critical code path.
	 */
	bp = bio->bio_buf;
	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);

	/*
	 * Resolve to a zone-2 offset.  The conversion just requires
	 * munging the top 4 bits but we want to abstract it anyway
	 * so the blockmap code can verify the zone assignment.
	 */
	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
	if (error)
		goto done;
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);

	/*
	 * Resolve volume and raw-offset for 3rd level bio.  The
	 * offset will be specific to the volume.
	 */
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &error);
	if (error == 0 && zone2_offset >= volume->maxbuf_off)
		error = EIO;

	if (error == 0) {
		/*
		 * Convert to the raw volume->devvp offset and acquire
		 * the buf, issuing async I/O if necessary.
		 */
		buf_offset = hammer_xlate_to_phys(volume->ondisk, zone2_offset);

		if (leaf && hammer_verify_data) {
			bio->bio_caller_info1.uvalue32 = leaf->data_crc;
			bio->bio_caller_info2.index = 1;
		} else {
			bio->bio_caller_info2.index = 0;
		}
		breadcb(volume->devvp, buf_offset, bp->b_bufsize,
			hammer_indirect_callback, bio);
	}
	hammer_rel_volume(volume, 0);
done:
	if (error) {
		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(bio);
	}
	return(error);
}

/*
 * Indirect callback on completion.  bio/bp specify the device-backed
 * buffer.  bio->bio_caller_info1.ptr holds obio.
 *
 * obio/obp is the original regular file buffer.  obio->bio_caller_info*
 * contains the crc specification.
 *
 * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
 * for calling biodone() on obio.
 */
static void
hammer_indirect_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *obp;
	struct bio *obio;

	/*
	 * If BIO_DONE is already set the device buffer was already
	 * fully valid (B_CACHE).  If it is not set then I/O was issued
	 * and we have to run I/O completion as the last bio.
	 *
	 * Nobody is waiting for our device I/O to complete, we are
	 * responsible for bqrelse()ing it which means we also have to do
	 * the equivalent of biowait() and clear BIO_DONE (which breadcb()
	 * may have set).
	 *
	 * Any preexisting device buffer should match the requested size,
	 * but due to big-block recycling and other factors there is some
	 * fragility there, so we assert that the device buffer covers
	 * the request.
	 */
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(bp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);

	obio = bio->bio_caller_info1.ptr;
	obp = obio->bio_buf;

	if (bp->b_flags & B_ERROR) {
		obp->b_flags |= B_ERROR;
		obp->b_error = bp->b_error;
	} else if (obio->bio_caller_info2.index &&
		   obio->bio_caller_info1.uvalue32 !=
		    crc32(bp->b_data, bp->b_bufsize)) {
		obp->b_flags |= B_ERROR;
		obp->b_error = EIO;
	} else {
		KKASSERT(bp->b_bufsize >= obp->b_bufsize);
		bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
		obp->b_resid = 0;
		obp->b_flags |= B_AGE;
	}
	biodone(obio);
	bqrelse(bp);
}

/*
 * Write a buffer associated with a front-end vnode directly to the
 * disk media.  The bio may be issued asynchronously.
 *
 * The BIO is associated with the specified record and RECG_DIRECT_IO
 * is set.  The record is added to its object.
 */
int
hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
		       hammer_record_t record)
{
	hammer_btree_leaf_elm_t leaf = &record->leaf;
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	struct buf *bp;
	struct bio *nbio;
	char *ptr;
	int vol_no;
	int error;

	buf_offset = leaf->data_offset;

	KKASSERT(hammer_is_zone2_mapped_index(
		HAMMER_ZONE_DECODE(buf_offset)));
	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);

	/*
	 * Issue or execute the I/O.  The new memory record must replace
	 * the old one before the I/O completes, otherwise a reacquisition of
	 * the buffer will load the old media data instead of the new.
	 */
	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
	    leaf->data_len >= HAMMER_BUFSIZE) {
		/*
		 * We are using the vnode's bio to write directly to the
		 * media, any hammer_buffer at the same zone-X offset will
		 * now have stale data.
		 */
		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
		vol_no = HAMMER_VOL_DECODE(zone2_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);

		if (error == 0 && zone2_offset >= volume->maxbuf_off)
			error = EIO;
		if (error == 0) {
			bp = bio->bio_buf;
			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
			/*
			hammer_del_buffers(hmp, buf_offset,
					   zone2_offset, bp->b_bufsize);
			*/

			/*
			 * Second level bio - cached zone2 offset.
			 *
			 * (We can put our bio_done function in either the
			 *  2nd or 3rd level).
			 */
			nbio = push_bio(bio);
			nbio->bio_offset = zone2_offset;
			nbio->bio_done = hammer_io_direct_write_complete;
			nbio->bio_caller_info1.ptr = record;
			record->zone2_offset = zone2_offset;
			record->gflags |= HAMMER_RECG_DIRECT_IO |
					 HAMMER_RECG_DIRECT_INVAL;

			/*
			 * Third level bio - raw offset specific to the
			 * correct volume.
			 */
			nbio = push_bio(nbio);
			nbio->bio_offset = hammer_xlate_to_phys(volume->ondisk,
								zone2_offset);
			hammer_stats_disk_write += bp->b_bufsize;
			hammer_ip_replace_bulk(hmp, record);
			vn_strategy(volume->devvp, nbio);
			hammer_io_flush_mark(volume);
		}
		hammer_rel_volume(volume, 0);
	} else {
		/*
		 * Must fit in a standard HAMMER buffer.  In this case all
		 * consumers use the HAMMER buffer system and RECG_DIRECT_IO
		 * does not need to be set-up.
		 */
		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
		buffer = NULL;
		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
		if (error == 0) {
			bp = bio->bio_buf;
			bp->b_flags |= B_AGE;
			hammer_io_modify(&buffer->io, 1);
			bcopy(bp->b_data, ptr, leaf->data_len);
			hammer_io_modify_done(&buffer->io);
			hammer_rel_buffer(buffer, 0);
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(bio);
		}
	}
	if (error) {
		/*
		 * Major suckage occurred.  Also note:  The record was
		 * never added to the tree so we do not have to worry
		 * about the backend.
		 */
		hdkprintf("failed @ %016llx\n", (long long)leaf->data_offset);
		bp = bio->bio_buf;
		bp->b_resid = 0;
		bp->b_error = EIO;
		bp->b_flags |= B_ERROR;
		biodone(bio);
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
	}
	return(error);
}

/*
 * On completion of the BIO this callback must disconnect
 * it from the hammer_record and chain to the previous bio.
 *
 * An I/O error forces the mount to read-only.  Data buffers
 * are not B_LOCKED like meta-data buffers are, so we have to
 * throw the buffer away to prevent the kernel from retrying.
 *
 * NOTE: MPSAFE callback, only modify fields we have explicit
 *	 access to (the bp and the record->gflags).
 */
static
void
hammer_io_direct_write_complete(struct bio *nbio)
{
	struct bio *obio;
	struct buf *bp;
	hammer_record_t record;
	hammer_mount_t hmp;

	record = nbio->bio_caller_info1.ptr;
	KKASSERT(record != NULL);
	hmp = record->ip->hmp;

	lwkt_gettoken(&hmp->io_token);

	bp = nbio->bio_buf;
	obio = pop_bio(nbio);
	if (bp->b_flags & B_ERROR) {
		lwkt_gettoken(&hmp->fs_token);
		hammer_critical_error(hmp, record->ip, bp->b_error,
				      "while writing bulk data");
		lwkt_reltoken(&hmp->fs_token);
		bp->b_flags |= B_INVAL;
	}
	biodone(obio);

	KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
	if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
		record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
				    HAMMER_RECG_DIRECT_WAIT);
		/* record can disappear once DIRECT_IO flag is cleared */
		wakeup(&record->flags);
	} else {
		record->gflags &= ~HAMMER_RECG_DIRECT_IO;
		/* record can disappear once DIRECT_IO flag is cleared */
	}
	lwkt_reltoken(&hmp->io_token);
}


/*
 * This is called before a record is either committed to the B-Tree
 * or destroyed, to resolve any associated direct-IO.
 *
 * (1) We must wait for any direct-IO related to the record to complete.
 *
 * (2) We must remove any buffer cache aliases for data accessed via
 *     leaf->data_offset or zone2_offset so non-direct-IO consumers
 *     (the mirroring and reblocking code) do not see stale data.
 */
void
hammer_io_direct_wait(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Wait for I/O to complete
	 */
	if (record->gflags & HAMMER_RECG_DIRECT_IO) {
		lwkt_gettoken(&hmp->io_token);
		while (record->gflags & HAMMER_RECG_DIRECT_IO) {
			record->gflags |= HAMMER_RECG_DIRECT_WAIT;
			tsleep(&record->flags, 0, "hmdiow", 0);
		}
		lwkt_reltoken(&hmp->io_token);
	}

	/*
	 * Invalidate any related buffer cache aliases associated with the
	 * backing device.  This is needed because the buffer cache buffer
	 * for file data is associated with the file vnode, not the backing
	 * device vnode.
	 *
	 * XXX I do not think this case can occur any more now that
	 * reservations ensure that all such buffers are removed before
	 * an area can be reused.
	 */
	if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
		KKASSERT(record->leaf.data_offset);
		hammer_del_buffers(hmp, record->leaf.data_offset,
				   record->zone2_offset, record->leaf.data_len,
				   1);
		record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
	}
}

/*
 * This is called to remove the second-level cached zone-2 offset from
 * frontend buffer cache buffers, now stale due to a data relocation.
 * These offsets are generated by cluster_read() via VOP_BMAP, or directly
 * by hammer_vop_strategy_read().
 *
 * This is rather nasty because here we have something like the reblocker
 * scanning the raw B-Tree with no held references on anything, really,
 * other than a shared lock on the B-Tree node, and we have to access the
 * frontend's buffer cache to check for and clean out the association.
 * Specifically, if the reblocker is moving data on the disk, these cached
 * offsets will become invalid.
 *
 * Only data record types associated with the large-data zone are subject
 * to direct-io and need to be checked.
1837 * 1838 */ 1839 void 1840 hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf) 1841 { 1842 struct hammer_inode_info iinfo; 1843 int zone; 1844 1845 if (leaf->base.rec_type != HAMMER_RECTYPE_DATA) 1846 return; 1847 zone = HAMMER_ZONE_DECODE(leaf->data_offset); 1848 if (zone != HAMMER_ZONE_LARGE_DATA_INDEX) 1849 return; 1850 iinfo.obj_id = leaf->base.obj_id; 1851 iinfo.obj_asof = 0; /* unused */ 1852 iinfo.obj_localization = leaf->base.localization & 1853 HAMMER_LOCALIZE_PSEUDOFS_MASK; 1854 iinfo.u.leaf = leaf; 1855 hammer_scan_inode_snapshots(hmp, &iinfo, 1856 hammer_io_direct_uncache_callback, 1857 leaf); 1858 } 1859 1860 static int 1861 hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data) 1862 { 1863 hammer_inode_info_t iinfo = data; 1864 hammer_off_t file_offset; 1865 struct vnode *vp; 1866 struct buf *bp; 1867 int blksize; 1868 1869 if (ip->vp == NULL) 1870 return(0); 1871 file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len; 1872 blksize = iinfo->u.leaf->data_len; 1873 KKASSERT((blksize & HAMMER_BUFMASK) == 0); 1874 1875 /* 1876 * Warning: FINDBLK_TEST return stable storage but not stable 1877 * contents. It happens to be ok in this case. 1878 */ 1879 hammer_ref(&ip->lock); 1880 if (hammer_get_vnode(ip, &vp) == 0) { 1881 if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL && 1882 bp->b_bio2.bio_offset != NOOFFSET) { 1883 bp = getblk(ip->vp, file_offset, blksize, 0, 0); 1884 bp->b_bio2.bio_offset = NOOFFSET; 1885 brelse(bp); 1886 } 1887 vput(vp); 1888 } 1889 hammer_rel_inode(ip, 0); 1890 return(0); 1891 } 1892 1893 1894 /* 1895 * This function is called when writes may have occured on the volume, 1896 * indicating that the device may be holding cached writes. 1897 */ 1898 static __inline void 1899 hammer_io_flush_mark(hammer_volume_t volume) 1900 { 1901 atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH); 1902 } 1903 1904 /* 1905 * This function ensures that the device has flushed any cached writes out. 1906 */ 1907 void 1908 hammer_io_flush_sync(hammer_mount_t hmp) 1909 { 1910 hammer_volume_t volume; 1911 struct buf *bp_base = NULL; 1912 struct buf *bp; 1913 1914 RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) { 1915 if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) { 1916 atomic_clear_int(&volume->vol_flags, 1917 HAMMER_VOLF_NEEDFLUSH); 1918 bp = getpbuf(NULL); 1919 bp->b_bio1.bio_offset = 0; 1920 bp->b_bufsize = 0; 1921 bp->b_bcount = 0; 1922 bp->b_cmd = BUF_CMD_FLUSH; 1923 bp->b_bio1.bio_caller_info1.cluster_head = bp_base; 1924 bp->b_bio1.bio_done = biodone_sync; 1925 bp->b_bio1.bio_flags |= BIO_SYNC; 1926 bp_base = bp; 1927 vn_strategy(volume->devvp, &bp->b_bio1); 1928 } 1929 } 1930 while ((bp = bp_base) != NULL) { 1931 bp_base = bp->b_bio1.bio_caller_info1.cluster_head; 1932 biowait(&bp->b_bio1, "hmrFLS"); 1933 relpbuf(bp, NULL); 1934 } 1935 } 1936 1937 /* 1938 * Limit the amount of backlog which we allow to build up 1939 */ 1940 void 1941 hammer_io_limit_backlog(hammer_mount_t hmp) 1942 { 1943 waitrunningbufspace(); 1944 } 1945