/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * IO Primitives and buffer cache management
 *
 * All major data-tracking structures in HAMMER contain a struct hammer_io
 * which is used to manage their backing store.  We use filesystem buffers
 * for backing store and we leave them passively associated with their
 * HAMMER structures.
 *
 * If the kernel tries to destroy a passively associated buf which we cannot
 * yet let go we set B_LOCKED in the buffer and then actively release it
 * later when we can.
 *
 * The io_token is required for anything which might race bioops and bio_done
 * callbacks, with one exception: A successful hammer_try_interlock_norefs().
 * The fs_token will be held in all other cases.
 */

#include <sys/buf2.h>

#include "hammer.h"

static void hammer_io_modify(hammer_io_t io, int count);
static void hammer_io_deallocate(struct buf *bp);
static void hammer_indirect_callback(struct bio *bio);
static void hammer_io_direct_write_complete(struct bio *nbio);
static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
static void hammer_io_set_modlist(struct hammer_io *io);
static void hammer_io_flush_mark(hammer_volume_t volume);

static int
hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2)
{
	hammer_off_t io1_offset;
	hammer_off_t io2_offset;

	io1_offset = ((io1->offset & HAMMER_OFF_SHORT_MASK) << 8) |
		     io1->volume->vol_no;
	io2_offset = ((io2->offset & HAMMER_OFF_SHORT_MASK) << 8) |
		     io2->volume->vol_no;

	if (io1_offset < io2_offset)
		return(-1);
	if (io1_offset > io2_offset)
		return(1);
	return(0);
}

RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare);

/*
 * Initialize a new, already-zero'd hammer_io structure, or reinitialize
 * an existing hammer_io structure which may have switched to another type.
 */
void
hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
{
	io->volume = volume;
	io->hmp = volume->io.hmp;
	io->type = type;
}

/*
 * Helper routine to disassociate a buffer cache buffer from an I/O
 * structure.  The io must be interlocked and marked appropriately for
 * reclamation.
 *
 * The io must be in a released state with the io->bp owned and
 * locked by the caller of this function.  When not called from an
 * io_deallocate() this cannot race an io_deallocate() since the
 * kernel would be unable to get the buffer lock in that case.
 * (The released state in this case means we own the bp, not the
 * hammer_io structure).
 *
 * The io may have 0 or 1 references depending on who called us.  The
 * caller is responsible for dealing with the refs.
 *
 * This call can only be made when no action is required on the buffer.
 *
 * This function is guaranteed not to race against anything because we
 * own both the io lock and the bp lock and are interlocked with no
 * references.
 */
static void
hammer_io_disassociate(hammer_io_structure_t iou)
{
	struct buf *bp = iou->io.bp;

	KKASSERT(iou->io.released);
	KKASSERT(iou->io.modified == 0);
	KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
	buf_dep_init(bp);
	iou->io.bp = NULL;

	/*
	 * If the buffer was locked someone wanted to get rid of it.
	 */
	if (bp->b_flags & B_LOCKED) {
		atomic_add_int(&hammer_count_io_locked, -1);
		bp->b_flags &= ~B_LOCKED;
	}
	if (iou->io.reclaim) {
		bp->b_flags |= B_NOCACHE|B_RELBUF;
		iou->io.reclaim = 0;
	}

	switch(iou->io.type) {
	case HAMMER_STRUCTURE_VOLUME:
		iou->volume.ondisk = NULL;
		break;
	case HAMMER_STRUCTURE_DATA_BUFFER:
	case HAMMER_STRUCTURE_META_BUFFER:
	case HAMMER_STRUCTURE_UNDO_BUFFER:
		iou->buffer.ondisk = NULL;
		break;
	case HAMMER_STRUCTURE_DUMMY:
		hpanic("bad io type");
		break;
	}
}

/*
 * Wait for any physical IO to complete
 *
 * XXX we aren't interlocked against a spinlock or anything so there
 *     is a small window in the interlock / io->running == 0 test.
 */
void
hammer_io_wait(hammer_io_t io)
{
	if (io->running) {
		hammer_mount_t hmp = io->hmp;

		lwkt_gettoken(&hmp->io_token);
		while (io->running) {
			io->waiting = 1;
			tsleep_interlock(io, 0);
			if (io->running)
				tsleep(io, PINTERLOCKED, "hmrflw", hz);
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

/*
 * Wait for all currently queued HAMMER-initiated I/Os to complete.
 *
 * This is not supposed to count direct I/O's but some can leak
 * through (for non-full-sized direct I/Os).
 */
void
hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
{
	struct hammer_io iodummy;
	hammer_io_t io;

	/*
	 * Degenerate case, no I/O is running
	 */
	lwkt_gettoken(&hmp->io_token);
	if (TAILQ_EMPTY(&hmp->iorun_list)) {
		lwkt_reltoken(&hmp->io_token);
		if (doflush)
			hammer_io_flush_sync(hmp);
		return;
	}
	bzero(&iodummy, sizeof(iodummy));
	iodummy.type = HAMMER_STRUCTURE_DUMMY;

	/*
	 * Add placemarker and then wait until it becomes the head of
	 * the list.
	 */
	TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
	while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
		tsleep(&iodummy, 0, ident, 0);
	}

	/*
	 * Chain in case several placemarkers are present.
	 */
	TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
	io = TAILQ_FIRST(&hmp->iorun_list);
	if (io && io->type == HAMMER_STRUCTURE_DUMMY)
		wakeup(io);
	lwkt_reltoken(&hmp->io_token);

	if (doflush)
		hammer_io_flush_sync(hmp);
}

/*
 * Clear a flagged error condition on an I/O buffer.  The caller must hold
 * its own ref on the buffer.
 */
void
hammer_io_clear_error(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	if (io->ioerror) {
		io->ioerror = 0;
		hammer_rel(&io->lock);
		KKASSERT(hammer_isactive(&io->lock));
	}
	lwkt_reltoken(&hmp->io_token);
}

void
hammer_io_clear_error_noassert(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	if (io->ioerror) {
		io->ioerror = 0;
		hammer_rel(&io->lock);
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * This is an advisory function only which tells the buffer cache
 * the bp is not a meta-data buffer, even though it is backed by
 * a block device.
 *
 * This is used by HAMMER's reblocking code to avoid trying to
 * swapcache the filesystem's data when it is read or written
 * by the reblocking code.
 *
 * The caller has a ref on the buffer preventing the bp from
 * being disassociated from it.
 */
void
hammer_io_notmeta(hammer_buffer_t buffer)
{
	if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
		hammer_mount_t hmp = buffer->io.hmp;

		lwkt_gettoken(&hmp->io_token);
		buffer->io.bp->b_flags |= B_NOTMETA;
		lwkt_reltoken(&hmp->io_token);
	}
}

/*
 * Load bp for a HAMMER structure.  The io must be exclusively locked by
 * the caller.
 *
 * This routine is mostly used on meta-data and small-data blocks.  Generally
 * speaking HAMMER assumes some locality of reference and will cluster.
 *
 * Note that the caller (hammer_ondisk.c) may place further restrictions
 * on clusterability via the limit (in bytes).  Typically large-data
 * zones cannot be clustered due to their mixed buffer sizes.  This is
 * not an issue since such clustering occurs in hammer_vnops at the
 * regular file layer, whereas this is the buffered block device layer.
 *
 * No I/O callbacks can occur while we hold the buffer locked.
 */
int
hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
{
	struct buf *bp;
	int error;

	if ((bp = io->bp) == NULL) {
		atomic_add_long(&hammer_count_io_running_read, io->bytes);
		if (hammer_cluster_enable && limit > io->bytes) {
			error = cluster_read(devvp, io->offset + limit,
					     io->offset, io->bytes,
					     HAMMER_CLUSTER_SIZE,
					     HAMMER_CLUSTER_SIZE,
					     &io->bp);
		} else {
			error = bread(devvp, io->offset, io->bytes, &io->bp);
		}
		hammer_stats_disk_read += io->bytes;
		atomic_add_long(&hammer_count_io_running_read, -io->bytes);

		/*
		 * The code generally assumes b_ops/b_dep has been set-up,
		 * even if we error out here.
		 */
		bp = io->bp;
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			const char *metatype;

			switch(io->type) {
			case HAMMER_STRUCTURE_VOLUME:
				metatype = "volume";
				break;
			case HAMMER_STRUCTURE_META_BUFFER:
				switch(((struct hammer_buffer *)io)->
					zoneX_offset & HAMMER_OFF_ZONE_MASK) {
				case HAMMER_ZONE_BTREE:
					metatype = "btree";
					break;
				case HAMMER_ZONE_META:
					metatype = "meta";
					break;
				case HAMMER_ZONE_FREEMAP:
					metatype = "freemap";
					break;
				default:
					metatype = "meta?";
					break;
				}
				break;
			case HAMMER_STRUCTURE_DATA_BUFFER:
				metatype = "data";
				break;
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				metatype = "undo";
				break;
			default:
				metatype = "unknown";
				break;
			}
			hdkprintf("doff %016jx %s\n",
				  (intmax_t)bp->b_bio2.bio_offset,
				  metatype);
		}
		bp->b_flags &= ~B_IODEBUG;
		bp->b_ops = &hammer_bioops;
		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);

		/* io->worklist is locked by the io lock */
		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
		BUF_KERNPROC(bp);
		KKASSERT(io->modified == 0);
		KKASSERT(io->running == 0);
		KKASSERT(io->waiting == 0);
		io->released = 0;	/* we hold an active lock on bp */
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Similar to hammer_io_read() but returns a zero'd out buffer instead.
 * Must be called with the IO exclusively locked.
 *
 * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background
 * I/O by forcing the buffer to not be in a released state before calling
 * it.
 *
 * This function will also mark the IO as modified but it will not
 * increment the modify_refs count.
 *
 * No I/O callbacks can occur while we hold the buffer locked.
 */
int
hammer_io_new(struct vnode *devvp, struct hammer_io *io)
{
	struct buf *bp;

	if ((bp = io->bp) == NULL) {
		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
		bp = io->bp;
		bp->b_ops = &hammer_bioops;
		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);

		/* io->worklist is locked by the io lock */
		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
		io->released = 0;
		KKASSERT(io->running == 0);
		io->waiting = 0;
		BUF_KERNPROC(bp);
	} else {
		if (io->released) {
			regetblk(bp);
			BUF_KERNPROC(bp);
			io->released = 0;
		}
	}
	hammer_io_modify(io, 0);
	vfs_bio_clrbuf(bp);
	return(0);
}

/*
 * Advance the activity count on the underlying buffer because
 * HAMMER does not getblk/brelse on every access.
 *
 * The io->bp cannot go away while the buffer is referenced.
 */
void
hammer_io_advance(struct hammer_io *io)
{
	if (io->bp)
		buf_act_advance(io->bp);
}

/*
 * Remove potential device level aliases against buffers managed by high level
 * vnodes.  Aliases can also be created due to mixed buffer sizes or via
 * direct access to the backing store device.
 *
 * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
 * does not exist its backing VM pages might, and we have to invalidate
 * those as well or a getblk() will reinstate them.
 *
 * Buffer cache buffers associated with hammer_buffers cannot be
 * invalidated.
 */
int
hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
{
	hammer_io_structure_t iou;
	hammer_mount_t hmp;
	hammer_off_t phys_offset;
	struct buf *bp;
	int error;

	hmp = volume->io.hmp;
	lwkt_gettoken(&hmp->io_token);

	/*
	 * If a device buffer already exists for the specified physical
	 * offset use that, otherwise instantiate a buffer to cover any
	 * related VM pages, set B_NOCACHE, and brelse().
	 */
	phys_offset = volume->ondisk->vol_buf_beg +
		      (zone2_offset & HAMMER_OFF_SHORT_MASK);
	if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
		bremfree(bp);
	else
		bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);

	if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
#if 0
		hammer_ref(&iou->io.lock);
		hammer_io_clear_modify(&iou->io, 1);
		bundirty(bp);
		iou->io.released = 0;
		BUF_KERNPROC(bp);
		iou->io.reclaim = 1;
		iou->io.waitdep = 1;	/* XXX this is a fs_token field */
		KKASSERT(hammer_isactive(&iou->io.lock) == 1);
		hammer_rel_buffer(&iou->buffer, 0);
		/*hammer_io_deallocate(bp);*/
#endif
		bqrelse(bp);
		error = EAGAIN;
	} else {
		KKASSERT((bp->b_flags & B_LOCKED) == 0);
		bundirty(bp);
		bp->b_flags |= B_NOCACHE|B_RELBUF;
		brelse(bp);
		error = 0;
	}
	lwkt_reltoken(&hmp->io_token);
	return(error);
}

/*
 * This routine is called on the last reference to a hammer structure.
 * The io must be interlocked with a refcount of zero.  The hammer structure
 * will remain interlocked on return.
 *
 * This routine may return a non-NULL bp to the caller for disposal.
 * The caller typically brelse()'s the bp.
 *
 * The bp may or may not still be passively associated with the IO.  It
 * will remain passively associated if it is unreleasable (e.g. a modified
 * meta-data buffer).
 *
 * The only requirement here is that modified meta-data and volume-header
 * buffers may NOT be disassociated from the IO structure, and consequently
 * we also leave such buffers actively associated with the IO if they already
 * are (since the kernel can't do anything with them anyway).  Only the
 * flusher is allowed to write such buffers out.  Modified pure-data and
 * undo buffers are returned to the kernel but left passively associated
 * so we can track when the kernel writes the bp out.
 */
struct buf *
hammer_io_release(struct hammer_io *io, int flush)
{
	union hammer_io_structure *iou = (void *)io;
	struct buf *bp;

	if ((bp = io->bp) == NULL)
		return(NULL);

	/*
	 * Try to flush a dirty IO to disk if asked to by the
	 * caller or if the kernel tried to flush the buffer in the past.
	 *
	 * Kernel-initiated flushes are only allowed for pure-data buffers.
	 * meta-data and volume buffers can only be flushed explicitly
	 * by HAMMER.
	 */
	if (io->modified) {
		if (flush) {
			hammer_io_flush(io, 0);
		} else if (bp->b_flags & B_LOCKED) {
			switch(io->type) {
			case HAMMER_STRUCTURE_DATA_BUFFER:
				hammer_io_flush(io, 0);
				break;
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				hammer_io_flush(io, hammer_undo_reclaim(io));
				break;
			default:
				break;
			}
		} /* else no explicit request to flush the buffer */
	}

	/*
	 * Wait for the IO to complete if asked to.  This occurs when
	 * the buffer must be disposed of definitively during an umount
	 * or buffer invalidation.
	 */
	if (io->waitdep && io->running) {
		hammer_io_wait(io);
	}

	/*
	 * Return control of the buffer to the kernel (with the proviso
	 * that our bioops can override kernel decisions with regard to
	 * the buffer).
	 */
	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
		/*
		 * Always disassociate the bp if an explicit flush
		 * was requested and the IO completed with no error
		 * (so unmount can really clean up the structure).
		 */
		if (io->released) {
			regetblk(bp);
			BUF_KERNPROC(bp);
		} else {
			io->released = 1;
		}
		hammer_io_disassociate((hammer_io_structure_t)io);
		/* return the bp */
	} else if (io->modified) {
		/*
		 * Only certain IO types can be released to the kernel if
		 * the buffer has been modified.
		 *
		 * volume and meta-data IO types may only be explicitly
		 * flushed by HAMMER.
		 */
		switch(io->type) {
		case HAMMER_STRUCTURE_DATA_BUFFER:
		case HAMMER_STRUCTURE_UNDO_BUFFER:
			if (io->released == 0) {
				io->released = 1;
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
			break;
		default:
			break;
		}
		bp = NULL;	/* bp left associated */
	} else if (io->released == 0) {
		/*
		 * Clean buffers can be generally released to the kernel.
		 * We leave the bp passively associated with the HAMMER
		 * structure and use bioops to disconnect it later on
		 * if the kernel wants to discard the buffer.
		 *
		 * We can steal the structure's ownership of the bp.
		 */
		io->released = 1;
		if (bp->b_flags & B_LOCKED) {
			hammer_io_disassociate(iou);
			/* return the bp */
		} else {
			if (io->reclaim) {
				hammer_io_disassociate(iou);
				/* return the bp */
			} else {
				/* return the bp (bp passively associated) */
			}
		}
	} else {
		/*
		 * A released buffer is passively associated with our
		 * hammer_io structure.  The kernel cannot destroy it
		 * without making a bioops call.  If the kernel (B_LOCKED)
		 * or we (reclaim) requested that the buffer be destroyed
		 * we destroy it, otherwise we do a quick get/release to
		 * reset its position in the kernel's LRU list.
		 *
		 * Leaving the buffer passively associated allows us to
		 * use the kernel's LRU buffer flushing mechanisms rather
		 * than rolling our own.
		 *
		 * XXX there are two ways of doing this.  We can re-acquire
		 * and passively release to reset the LRU, or not.
		 */
		if (io->running == 0) {
			regetblk(bp);
			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
				hammer_io_disassociate(iou);
				/* return the bp */
			} else {
				/* return the bp (bp passively associated) */
			}
		} else {
			/*
			 * bp is left passively associated but we do not
			 * try to reacquire it.  Interactions with the io
			 * structure will occur on completion of the bp's
			 * I/O.
			 */
			bp = NULL;
		}
	}
	return(bp);
}

/*
 * This routine is called with a locked IO when a flush is desired and
 * no other references to the structure exist other than ours.  This
 * routine is ONLY called when HAMMER believes it is safe to flush a
 * potentially modified buffer out.
 *
 * The locked io or io reference prevents a flush from being initiated
 * by the kernel.
 */
void
hammer_io_flush(struct hammer_io *io, int reclaim)
{
	struct buf *bp;
	hammer_mount_t hmp;

	/*
	 * Degenerate case - nothing to flush if nothing is dirty.
	 */
	if (io->modified == 0)
		return;

	KKASSERT(io->bp);
	KKASSERT(io->modify_refs <= 0);

	/*
	 * Acquire ownership of the bp, particularly before we clear our
	 * modified flag.
	 *
	 * We are going to bawrite() this bp.  Don't leave a window where
	 * io->released is set, we actually own the bp rather than our
	 * buffer.
	 *
	 * The io_token should not be required here as only
	 */
	hmp = io->hmp;
	bp = io->bp;
	if (io->released) {
		regetblk(bp);
		/* BUF_KERNPROC(io->bp); */
		/* io->released = 0; */
		KKASSERT(io->released);
		KKASSERT(io->bp == bp);
	} else {
		io->released = 1;
	}

	if (reclaim) {
		io->reclaim = 1;
		if ((bp->b_flags & B_LOCKED) == 0) {
			bp->b_flags |= B_LOCKED;
			atomic_add_int(&hammer_count_io_locked, 1);
		}
	}

	/*
	 * Acquire exclusive access to the bp and then clear the modified
	 * state of the buffer prior to issuing I/O to interlock any
	 * modifications made while the I/O is in progress.  This shouldn't
	 * happen anyway but losing data would be worse.  The modified bit
	 * will be rechecked after the IO completes.
	 *
	 * NOTE: This call also finalizes the buffer's content (inval == 0).
	 *
	 * This is only legal when lock.refs == 1 (otherwise we might clear
	 * the modified bit while there are still users of the cluster
	 * modifying the data).
	 *
	 * Do this before potentially blocking so any attempt to modify the
	 * ondisk while we are blocked blocks waiting for us.
	 */
	hammer_ref(&io->lock);
	hammer_io_clear_modify(io, 0);
	hammer_rel(&io->lock);

	if (hammer_debug_io & 0x0002)
		hdkprintf("%016jx\n", bp->b_bio1.bio_offset);

	/*
	 * Transfer ownership to the kernel and initiate I/O.
	 *
	 * NOTE: We do not hold io_token so an atomic op is required to
	 *	 update io_running_space.
	 */
	io->running = 1;
	atomic_add_long(&hmp->io_running_space, io->bytes);
	atomic_add_long(&hammer_count_io_running_write, io->bytes);
	lwkt_gettoken(&hmp->io_token);
	TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
	lwkt_reltoken(&hmp->io_token);
	cluster_awrite(bp);
	hammer_io_flush_mark(io->volume);
}

/************************************************************************
 *				BUFFER DIRTYING				*
 ************************************************************************
 *
 * These routines deal with dependencies created when IO buffers get
 * modified.  The caller must call hammer_modify_*() on a referenced
 * HAMMER structure prior to modifying its on-disk data.
 *
 * Any intent to modify an IO buffer acquires the related bp and imposes
 * various write ordering dependencies.
 */

/*
 * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
 * are locked until the flusher can deal with them, pure-data buffers
 * can be written out.
 *
 * The referenced io prevents races.
 */
static
void
hammer_io_modify(hammer_io_t io, int count)
{
	/*
	 * io->modify_refs must be >= 0
	 */
	while (io->modify_refs < 0) {
		io->waitmod = 1;
		tsleep(io, 0, "hmrmod", 0);
	}

	/*
	 * Shortcut if nothing to do.
	 */
	KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
	io->modify_refs += count;
	if (io->modified && io->released == 0)
		return;

	/*
	 * NOTE: It is important not to set the modified bit
	 *	 until after we have acquired the bp or we risk
	 *	 racing against checkwrite.
	 */
	hammer_lock_ex(&io->lock);
	if (io->released) {
		regetblk(io->bp);
		BUF_KERNPROC(io->bp);
		io->released = 0;
	}
	if (io->modified == 0) {
		hammer_io_set_modlist(io);
		io->modified = 1;
	}
	hammer_unlock(&io->lock);
}

static __inline
void
hammer_io_modify_done(hammer_io_t io)
{
	KKASSERT(io->modify_refs > 0);
	--io->modify_refs;
	if (io->modify_refs == 0 && io->waitmod) {
		io->waitmod = 0;
		wakeup(io);
	}
}

/*
 * The write interlock blocks other threads trying to modify a buffer
 * (they block in hammer_io_modify()) after us, or blocks us while other
 * threads are in the middle of modifying a buffer.
 *
 * The caller also has a ref on the io, however if we are not careful
 * we will race bioops callbacks (checkwrite).  To deal with this
 * we must at least acquire and release the io_token, and it is probably
 * better to hold it through the setting of modify_refs.
 */
void
hammer_io_write_interlock(hammer_io_t io)
{
	hammer_mount_t hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	while (io->modify_refs != 0) {
		io->waitmod = 1;
		tsleep(io, 0, "hmrmod", 0);
	}
	io->modify_refs = -1;
	lwkt_reltoken(&hmp->io_token);
}

void
hammer_io_done_interlock(hammer_io_t io)
{
	KKASSERT(io->modify_refs == -1);
	io->modify_refs = 0;
	if (io->waitmod) {
		io->waitmod = 0;
		wakeup(io);
	}
}
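
#if 0
/*
 * Illustrative sketch only (not part of the original source): the calling
 * pattern expected by the buffer-dirtying API defined below.  A caller
 * holding a referenced buffer brackets its ondisk edits with
 * hammer_modify_buffer() and hammer_modify_buffer_done() so that an undo
 * record is generated and modify_refs stays balanced.  The helper name and
 * its arguments are hypothetical.
 */
static void
example_modify_ondisk_bytes(hammer_transaction_t trans,
			    hammer_buffer_t buffer, void *base, int len)
{
	hammer_modify_buffer(trans, buffer, base, len);	/* undo + interlock */
	bzero(base, len);				/* edit ondisk data */
	hammer_modify_buffer_done(buffer);		/* drop modify_refs */
}
#endif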

/*
 * Caller intends to modify a volume's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
		     void *base, int len)
{
	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

	hammer_io_modify(&volume->io, 1);
	if (len) {
		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
		hammer_generate_undo(trans,
			HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
			base, len);
	}
}

/*
 * Caller intends to modify a buffer's ondisk structure.
 *
 * This is only allowed if we are the flusher or we have a ref on the
 * sync_lock.
 */
void
hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
		     void *base, int len)
{
	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);

	hammer_io_modify(&buffer->io, 1);
	if (len) {
		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
		hammer_generate_undo(trans,
				     buffer->zone2_offset + rel_offset,
				     base, len);
	}
}

void
hammer_modify_volume_done(hammer_volume_t volume)
{
	hammer_io_modify_done(&volume->io);
}

void
hammer_modify_buffer_done(hammer_buffer_t buffer)
{
	hammer_io_modify_done(&buffer->io);
}

/*
 * Mark an entity as not being dirty any more and finalize any
 * delayed adjustments to the buffer.
 *
 * Delayed adjustments are an important performance enhancement, allowing
 * us to avoid recalculating B-Tree node CRCs over and over again when
 * making bulk-modifications to the B-Tree.
 *
 * If inval is non-zero delayed adjustments are ignored.
 *
 * This routine may dereference related btree nodes and cause the
 * buffer to be dereferenced.  The caller must own a reference on io.
 */
void
hammer_io_clear_modify(struct hammer_io *io, int inval)
{
	hammer_mount_t hmp;

	/*
	 * io_token is needed to avoid races on mod_root
	 */
	if (io->modified == 0)
		return;
	hmp = io->hmp;
	lwkt_gettoken(&hmp->io_token);
	if (io->modified == 0) {
		lwkt_reltoken(&hmp->io_token);
		return;
	}

	/*
	 * Take us off the mod-list and clear the modified bit.
	 */
	KKASSERT(io->mod_root != NULL);
	if (io->mod_root == &io->hmp->volu_root ||
	    io->mod_root == &io->hmp->meta_root) {
		io->hmp->locked_dirty_space -= io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, -io->bytes);
	}
	RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
	io->mod_root = NULL;
	io->modified = 0;

	lwkt_reltoken(&hmp->io_token);

	/*
	 * If this bit is not set there are no delayed adjustments.
	 */
	if (io->gencrc == 0)
		return;
	io->gencrc = 0;

	/*
	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
	 * on the node (& underlying buffer).  Release the node after clearing
	 * the flag.
	 */
	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
		hammer_buffer_t buffer = (void *)io;
		hammer_node_t node;

restart:
		TAILQ_FOREACH(node, &buffer->clist, entry) {
			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
				continue;
			node->flags &= ~HAMMER_NODE_NEEDSCRC;
			KKASSERT(node->ondisk);
			if (inval == 0)
				node->ondisk->crc = crc32(&node->ondisk->crc + 1,
							  HAMMER_BTREE_CRCSIZE);
			hammer_rel_node(node);
			goto restart;
		}
	}
	/* caller must still have ref on io */
	KKASSERT(hammer_isactive(&io->lock));
}

/*
 * Clear the IO's modify list.  Even though the IO is no longer modified
 * it may still be on the lose_root.  This routine is called just before
 * the governing hammer_buffer is destroyed.
 *
 * mod_root requires io_token protection.
 */
void
hammer_io_clear_modlist(struct hammer_io *io)
{
	hammer_mount_t hmp = io->hmp;

	KKASSERT(io->modified == 0);
	if (io->mod_root) {
		lwkt_gettoken(&hmp->io_token);
		if (io->mod_root) {
			KKASSERT(io->mod_root == &io->hmp->lose_root);
			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
			io->mod_root = NULL;
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

static void
hammer_io_set_modlist(struct hammer_io *io)
{
	struct hammer_mount *hmp = io->hmp;

	lwkt_gettoken(&hmp->io_token);
	KKASSERT(io->mod_root == NULL);

	switch(io->type) {
	case HAMMER_STRUCTURE_VOLUME:
		io->mod_root = &hmp->volu_root;
		hmp->locked_dirty_space += io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
		break;
	case HAMMER_STRUCTURE_META_BUFFER:
		io->mod_root = &hmp->meta_root;
		hmp->locked_dirty_space += io->bytes;
		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
		break;
	case HAMMER_STRUCTURE_UNDO_BUFFER:
		io->mod_root = &hmp->undo_root;
		break;
	case HAMMER_STRUCTURE_DATA_BUFFER:
		io->mod_root = &hmp->data_root;
		break;
	case HAMMER_STRUCTURE_DUMMY:
		hpanic("bad io type");
		break;	/* NOT REACHED */
	}
	if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
		hpanic("duplicate entry");
		/* NOT REACHED */
	}
	lwkt_reltoken(&hmp->io_token);
}

/************************************************************************
 *				HAMMER_BIOOPS				*
 ************************************************************************
 *
 */

/*
 * Pre-IO initiation kernel callback - cluster build only
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_start(struct buf *bp)
{
	/* nothing to do, so io_token not needed */
}

/*
 * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
 *
 * NOTE: HAMMER may modify a data buffer after we have initiated write
 *	 I/O.
 *
 * NOTE: MPSAFE callback
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_complete(struct buf *bp)
{
	union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
	struct hammer_mount *hmp = iou->io.hmp;
	struct hammer_io *ionext;

	lwkt_gettoken(&hmp->io_token);

	KKASSERT(iou->io.released == 1);

	/*
	 * Deal with people waiting for I/O to drain
	 */
	if (iou->io.running) {
		/*
		 * Deal with critical write errors.  Once a critical error
		 * has been flagged in hmp the UNDO FIFO will not be updated.
		 * That way crash recovery will give us a consistent
		 * filesystem.
		 *
		 * Because of this we can throw away failed UNDO buffers.  If
		 * we throw away META or DATA buffers we risk corrupting
		 * the now read-only version of the filesystem visible to
		 * the user.  Clear B_ERROR so the buffer is not re-dirtied
		 * by the kernel and ref the io so it doesn't get thrown
		 * away.
		 */
		if (bp->b_flags & B_ERROR) {
			lwkt_gettoken(&hmp->fs_token);
			hammer_critical_error(hmp, NULL, bp->b_error,
					      "while flushing meta-data");
			lwkt_reltoken(&hmp->fs_token);

			switch(iou->io.type) {
			case HAMMER_STRUCTURE_UNDO_BUFFER:
				break;
			default:
				if (iou->io.ioerror == 0) {
					iou->io.ioerror = 1;
					hammer_ref(&iou->io.lock);
				}
				break;
			}
			bp->b_flags &= ~B_ERROR;
			bundirty(bp);
#if 0
			hammer_io_set_modlist(&iou->io);
			iou->io.modified = 1;
#endif
		}
		hammer_stats_disk_write += iou->io.bytes;
		atomic_add_long(&hammer_count_io_running_write, -iou->io.bytes);
		atomic_add_long(&hmp->io_running_space, -iou->io.bytes);
		KKASSERT(hmp->io_running_space >= 0);
		iou->io.running = 0;

		/*
		 * Remove from iorun list and wakeup any multi-io waiter(s).
		 */
		if (TAILQ_FIRST(&hmp->iorun_list) == &iou->io) {
			ionext = TAILQ_NEXT(&iou->io, iorun_entry);
			if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
				wakeup(ionext);
		}
		TAILQ_REMOVE(&hmp->iorun_list, &iou->io, iorun_entry);
	} else {
		hammer_stats_disk_read += iou->io.bytes;
	}

	if (iou->io.waiting) {
		iou->io.waiting = 0;
		wakeup(iou);
	}

	/*
	 * If B_LOCKED is set someone wanted to deallocate the bp at some
	 * point, try to do it now.  The operation will fail if there are
	 * refs or if hammer_io_deallocate() is unable to gain the
	 * interlock.
	 */
	if (bp->b_flags & B_LOCKED) {
		atomic_add_int(&hammer_count_io_locked, -1);
		bp->b_flags &= ~B_LOCKED;
		hammer_io_deallocate(bp);
		/* structure may be dead now */
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * Callback from kernel when it wishes to deallocate a passively
 * associated structure.  This mostly occurs with clean buffers
 * but it may be possible for a holding structure to be marked dirty
 * while its buffer is passively associated.  The caller owns the bp.
 *
 * If we cannot disassociate we set B_LOCKED to prevent the buffer
 * from getting reused.
 *
 * WARNING: Because this can be called directly by getnewbuf we cannot
 *	    recurse into the tree.  If a bp cannot be immediately
 *	    disassociated our only recourse is to set B_LOCKED.
 *
 * WARNING: This may be called from an interrupt via hammer_io_complete()
 *
 * bioops callback - hold io_token
 */
static void
hammer_io_deallocate(struct buf *bp)
{
	hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);
	hammer_mount_t hmp;

	hmp = iou->io.hmp;

	lwkt_gettoken(&hmp->io_token);

	KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
	if (hammer_try_interlock_norefs(&iou->io.lock) == 0) {
		/*
		 * We cannot safely disassociate a bp from a referenced
		 * or interlocked HAMMER structure.
		 */
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
	} else if (iou->io.modified) {
		/*
		 * It is not legal to disassociate a modified buffer.  This
		 * case really shouldn't ever occur.
		 */
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
		hammer_put_interlock(&iou->io.lock, 0);
	} else {
		/*
		 * Disassociate the BP.  If the io has no refs left we
		 * have to add it to the loose list.  The kernel has
		 * locked the buffer and therefore our io must be
		 * in a released state.
		 */
		hammer_io_disassociate(iou);
		if (iou->io.type != HAMMER_STRUCTURE_VOLUME) {
			KKASSERT(iou->io.bp == NULL);
			KKASSERT(iou->io.mod_root == NULL);
			iou->io.mod_root = &hmp->lose_root;
			if (RB_INSERT(hammer_mod_rb_tree, iou->io.mod_root,
				      &iou->io)) {
				hpanic("duplicate entry");
			}
		}
		hammer_put_interlock(&iou->io.lock, 1);
	}
	lwkt_reltoken(&hmp->io_token);
}

/*
 * bioops callback - hold io_token
 */
static int
hammer_io_fsync(struct vnode *vp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * NOTE: will not be called unless we tell the kernel about the
 * bioops.  Unused... we use the mount's VFS_SYNC instead.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_sync(struct mount *mp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * bioops callback - hold io_token
 */
static void
hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
{
	/* nothing to do, so io_token not needed */
}

/*
 * I/O pre-check for reading and writing.  HAMMER only uses this for
 * B_CACHE buffers so checkread just shouldn't happen, but if it does
 * allow it.
 *
 * Writing is a different case.  We don't want the kernel to try to write
 * out a buffer that HAMMER may be modifying passively or which has a
 * dependency.  In addition, kernel-demanded writes can only proceed for
 * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
 * buffer types can only be explicitly written by the flusher.
 *
 * checkwrite will only be called for bdwrite()n buffers.  If we return
 * success the kernel is guaranteed to initiate the buffer write.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_checkread(struct buf *bp)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

/*
 * The kernel is asking us whether it can write out a dirty buffer or not.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_checkwrite(struct buf *bp)
{
	hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);
	hammer_mount_t hmp = io->hmp;

	/*
	 * This shouldn't happen under normal operation.
	 */
	lwkt_gettoken(&hmp->io_token);
	if (io->type == HAMMER_STRUCTURE_VOLUME ||
	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
		if (!panicstr)
			hpanic("illegal buffer");
		if ((bp->b_flags & B_LOCKED) == 0) {
			bp->b_flags |= B_LOCKED;
			atomic_add_int(&hammer_count_io_locked, 1);
		}
		lwkt_reltoken(&hmp->io_token);
		return(1);
	}

	/*
	 * We have to be able to interlock the IO to safely modify any
	 * of its fields without holding the fs_token.  If we can't lock
	 * it then we are racing someone.
	 *
	 * Our ownership of the bp lock prevents the io from being ripped
	 * out from under us.
	 */
	if (hammer_try_interlock_norefs(&io->lock) == 0) {
		bp->b_flags |= B_LOCKED;
		atomic_add_int(&hammer_count_io_locked, 1);
		lwkt_reltoken(&hmp->io_token);
		return(1);
	}

	/*
	 * The modified bit must be cleared prior to the initiation of
	 * any IO (returning 0 initiates the IO).  Because this is a
	 * normal data buffer hammer_io_clear_modify() runs through a
	 * simple degenerate case.
	 *
	 * Return 0 will cause the kernel to initiate the IO, and we
	 * must normally clear the modified bit before we begin.  If
	 * the io has modify_refs we do not clear the modified bit,
	 * otherwise we may miss changes.
	 *
	 * Only data and undo buffers can reach here.  These buffers do
	 * not have terminal crc functions but we temporarily reference
	 * the IO anyway, just in case.
	 */
	if (io->modify_refs == 0 && io->modified) {
		hammer_ref(&io->lock);
		hammer_io_clear_modify(io, 0);
		hammer_rel(&io->lock);
	} else if (io->modified) {
		KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
	}

	/*
	 * The kernel is going to start the IO, set io->running.
	 */
	KKASSERT(io->running == 0);
	io->running = 1;
	atomic_add_long(&io->hmp->io_running_space, io->bytes);
	atomic_add_long(&hammer_count_io_running_write, io->bytes);
	TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);

	hammer_put_interlock(&io->lock, 1);
	lwkt_reltoken(&hmp->io_token);

	return(0);
}

/*
 * Return non-zero if we wish to delay the kernel's attempt to flush
 * this buffer to disk.
 *
 * bioops callback - hold io_token
 */
static int
hammer_io_countdeps(struct buf *bp, int n)
{
	/* nothing to do, so io_token not needed */
	return(0);
}

struct bio_ops hammer_bioops = {
	.io_start	= hammer_io_start,
	.io_complete	= hammer_io_complete,
	.io_deallocate	= hammer_io_deallocate,
	.io_fsync	= hammer_io_fsync,
	.io_sync	= hammer_io_sync,
	.io_movedeps	= hammer_io_movedeps,
	.io_countdeps	= hammer_io_countdeps,
	.io_checkread	= hammer_io_checkread,
	.io_checkwrite	= hammer_io_checkwrite,
};

/************************************************************************
 *				DIRECT IO OPS				*
 ************************************************************************
 *
 * These functions operate directly on the buffer cache buffer associated
 * with a front-end vnode rather than a back-end device vnode.
 */
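
#if 0
/*
 * Illustrative helper only (not part of the original source): how the
 * direct I/O routines in this section convert a zone-2 (raw buffer)
 * offset into a raw byte offset on the owning volume's device.  The
 * volume is looked up via HAMMER_VOL_DECODE() and the short offset is
 * added to vol_buf_beg, exactly as the functions below do inline.
 */
static __inline hammer_off_t
example_zone2_to_device_offset(hammer_volume_t volume,
			       hammer_off_t zone2_offset)
{
	KKASSERT(HAMMER_VOL_DECODE(zone2_offset) == volume->vol_no);
	return(volume->ondisk->vol_buf_beg +
	       (zone2_offset & HAMMER_OFF_SHORT_MASK));
}
#endif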

/*
 * Read a buffer associated with a front-end vnode directly from the
 * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
 * we validate the CRC.
 *
 * We must check for the presence of a HAMMER buffer to handle the case
 * where the reblocker has rewritten the data (which it does via the HAMMER
 * buffer system, not via the high-level vnode buffer cache), but not yet
 * committed the buffer to the media.
 */
int
hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
		      hammer_btree_leaf_elm_t leaf)
{
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	struct buf *bp;
	struct bio *nbio;
	int vol_no;
	int error;

	buf_offset = bio->bio_offset;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	/*
	 * The buffer cache may have an aliased buffer (the reblocker can
	 * write them).  If it does we have to sync any dirty data before
	 * we can build our direct-read.  This is a non-critical code path.
	 */
	bp = bio->bio_buf;
	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);

	/*
	 * Resolve to a zone-2 offset.  The conversion just requires
	 * munging the top 4 bits but we want to abstract it anyway
	 * so the blockmap code can verify the zone assignment.
	 */
	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
	if (error)
		goto done;
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);

	/*
	 * Resolve volume and raw-offset for 3rd level bio.  The
	 * offset will be specific to the volume.
	 */
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &error);
	if (error == 0 && zone2_offset >= volume->maxbuf_off)
		error = EIO;

	if (error == 0) {
		/*
		 * 3rd level bio
		 */
		nbio = push_bio(bio);
		nbio->bio_offset = volume->ondisk->vol_buf_beg +
				   (zone2_offset & HAMMER_OFF_SHORT_MASK);
		hammer_stats_disk_read += bp->b_bufsize;
		vn_strategy(volume->devvp, nbio);
	}
	hammer_rel_volume(volume, 0);
done:
	if (error) {
		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(bio);
	}
	return(error);
}

/*
 * This works similarly to hammer_io_direct_read() except instead of
 * directly reading from the device into the bio we instead indirectly
 * read through the device's buffer cache and then copy the data into
 * the bio.
 *
 * If leaf is non-NULL and validation is enabled, the CRC will be checked.
 *
 * This routine also executes asynchronously.  It allows hammer strategy
 * calls to operate asynchronously when in double_buffer mode (in addition
 * to operating asynchronously when in normal mode).
 */
int
hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
			hammer_btree_leaf_elm_t leaf)
{
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	struct buf *bp;
	int vol_no;
	int error;

	buf_offset = bio->bio_offset;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	/*
	 * The buffer cache may have an aliased buffer (the reblocker can
	 * write them).  If it does we have to sync any dirty data before
	 * we can build our direct-read.  This is a non-critical code path.
	 */
	bp = bio->bio_buf;
	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);

	/*
	 * Resolve to a zone-2 offset.  The conversion just requires
	 * munging the top 4 bits but we want to abstract it anyway
	 * so the blockmap code can verify the zone assignment.
	 */
	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
	if (error)
		goto done;
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);

	/*
	 * Resolve volume and raw-offset for 3rd level bio.  The
	 * offset will be specific to the volume.
	 */
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &error);
	if (error == 0 && zone2_offset >= volume->maxbuf_off)
		error = EIO;

	if (error == 0) {
		/*
		 * Convert to the raw volume->devvp offset and acquire
		 * the buf, issuing async I/O if necessary.
		 */
		buf_offset = volume->ondisk->vol_buf_beg +
			     (zone2_offset & HAMMER_OFF_SHORT_MASK);

		if (leaf && hammer_verify_data) {
			bio->bio_caller_info1.uvalue32 = leaf->data_crc;
			bio->bio_caller_info2.index = 1;
		} else {
			bio->bio_caller_info2.index = 0;
		}
		breadcb(volume->devvp, buf_offset, bp->b_bufsize,
			hammer_indirect_callback, bio);
	}
	hammer_rel_volume(volume, 0);
done:
	if (error) {
		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(bio);
	}
	return(error);
}

/*
 * Indirect callback on completion.  bio/bp specify the device-backed
 * buffer.  bio->bio_caller_info1.ptr holds obio.
 *
 * obio/obp is the original regular file buffer.  obio->bio_caller_info*
 * contains the crc specification.
 *
 * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
 * for calling biodone() on obio.
 */
static void
hammer_indirect_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *obp;
	struct bio *obio;

	/*
	 * If BIO_DONE is already set the device buffer was already
	 * fully valid (B_CACHE).  If it is not set then I/O was issued
	 * and we have to run I/O completion as the last bio.
	 *
	 * Nobody is waiting for our device I/O to complete, we are
	 * responsible for bqrelse()ing it which means we also have to do
	 * the equivalent of biowait() and clear BIO_DONE (which breadcb()
	 * may have set).
	 *
	 * Any preexisting device buffer should match the requested size,
	 * but due to big-block recycling and other factors there is some
	 * fragility there, so we assert that the device buffer covers
	 * the request.
	 */
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(bp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);

	obio = bio->bio_caller_info1.ptr;
	obp = obio->bio_buf;

	if (bp->b_flags & B_ERROR) {
		obp->b_flags |= B_ERROR;
		obp->b_error = bp->b_error;
	} else if (obio->bio_caller_info2.index &&
		   obio->bio_caller_info1.uvalue32 !=
		    crc32(bp->b_data, bp->b_bufsize)) {
		obp->b_flags |= B_ERROR;
		obp->b_error = EIO;
	} else {
		KKASSERT(bp->b_bufsize >= obp->b_bufsize);
		bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
		obp->b_resid = 0;
		obp->b_flags |= B_AGE;
	}
	biodone(obio);
	bqrelse(bp);
}

/*
 * Write a buffer associated with a front-end vnode directly to the
 * disk media.  The bio may be issued asynchronously.
 *
 * The BIO is associated with the specified record and RECG_DIRECT_IO
 * is set.  The record is added to its object.
 */
int
hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
		       hammer_record_t record)
{
	hammer_btree_leaf_elm_t leaf = &record->leaf;
	hammer_off_t buf_offset;
	hammer_off_t zone2_offset;
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	struct buf *bp;
	struct bio *nbio;
	char *ptr;
	int vol_no;
	int error;

	buf_offset = leaf->data_offset;

	KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);

	/*
	 * Issue or execute the I/O.  The new memory record must replace
	 * the old one before the I/O completes, otherwise a reacquisition
	 * of the buffer will load the old media data instead of the new.
	 */
	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
	    leaf->data_len >= HAMMER_BUFSIZE) {
		/*
		 * We are using the vnode's bio to write directly to the
		 * media, any hammer_buffer at the same zone-X offset will
		 * now have stale data.
		 */
		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
		vol_no = HAMMER_VOL_DECODE(zone2_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);

		if (error == 0 && zone2_offset >= volume->maxbuf_off)
			error = EIO;
		if (error == 0) {
			bp = bio->bio_buf;
			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
			/*
			hammer_del_buffers(hmp, buf_offset,
					   zone2_offset, bp->b_bufsize);
			*/

			/*
			 * Second level bio - cached zone2 offset.
			 *
			 * (We can put our bio_done function in either the
			 * 2nd or 3rd level).
			 */
			nbio = push_bio(bio);
			nbio->bio_offset = zone2_offset;
			nbio->bio_done = hammer_io_direct_write_complete;
			nbio->bio_caller_info1.ptr = record;
			record->zone2_offset = zone2_offset;
			record->gflags |= HAMMER_RECG_DIRECT_IO |
					  HAMMER_RECG_DIRECT_INVAL;

			/*
			 * Third level bio - raw offset specific to the
			 * correct volume.
			 */
			zone2_offset &= HAMMER_OFF_SHORT_MASK;
			nbio = push_bio(nbio);
			nbio->bio_offset = volume->ondisk->vol_buf_beg +
					   zone2_offset;
			hammer_stats_disk_write += bp->b_bufsize;
			hammer_ip_replace_bulk(hmp, record);
			vn_strategy(volume->devvp, nbio);
			hammer_io_flush_mark(volume);
		}
		hammer_rel_volume(volume, 0);
	} else {
		/*
		 * Must fit in a standard HAMMER buffer.  In this case all
		 * consumers use the HAMMER buffer system and RECG_DIRECT_IO
		 * does not need to be set-up.
		 */
		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) &
			  ~HAMMER_BUFMASK64) == 0);
		buffer = NULL;
		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
		if (error == 0) {
			bp = bio->bio_buf;
			bp->b_flags |= B_AGE;
			hammer_io_modify(&buffer->io, 1);
			bcopy(bp->b_data, ptr, leaf->data_len);
			hammer_io_modify_done(&buffer->io);
			hammer_rel_buffer(buffer, 0);
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(bio);
		}
	}
	if (error) {
		/*
		 * Major suckage occurred.  Also note:  The record was
		 * never added to the tree so we do not have to worry
		 * about the backend.
		 */
		hdkprintf("failed @ %016llx\n", (long long)leaf->data_offset);
		bp = bio->bio_buf;
		bp->b_resid = 0;
		bp->b_error = EIO;
		bp->b_flags |= B_ERROR;
		biodone(bio);
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
	}
	return(error);
}

/*
 * On completion of the BIO this callback must disconnect
 * it from the hammer_record and chain to the previous bio.
 *
 * An I/O error forces the mount to read-only.  Data buffers
 * are not B_LOCKED like meta-data buffers are, so we have to
 * throw the buffer away to prevent the kernel from retrying.
 *
 * NOTE: MPSAFE callback, only modify fields we have explicit
 *	 access to (the bp and the record->gflags).
 */
static
void
hammer_io_direct_write_complete(struct bio *nbio)
{
	struct bio *obio;
	struct buf *bp;
	hammer_record_t record;
	hammer_mount_t hmp;

	record = nbio->bio_caller_info1.ptr;
	KKASSERT(record != NULL);
	hmp = record->ip->hmp;

	lwkt_gettoken(&hmp->io_token);

	bp = nbio->bio_buf;
	obio = pop_bio(nbio);
	if (bp->b_flags & B_ERROR) {
		lwkt_gettoken(&hmp->fs_token);
		hammer_critical_error(hmp, record->ip, bp->b_error,
				      "while writing bulk data");
		lwkt_reltoken(&hmp->fs_token);
		bp->b_flags |= B_INVAL;
	}
	biodone(obio);

	KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
	if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
		record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
				    HAMMER_RECG_DIRECT_WAIT);
		/* record can disappear once DIRECT_IO flag is cleared */
		wakeup(&record->flags);
	} else {
		record->gflags &= ~HAMMER_RECG_DIRECT_IO;
		/* record can disappear once DIRECT_IO flag is cleared */
	}
	lwkt_reltoken(&hmp->io_token);
}


/*
 * This is called before a record is either committed to the B-Tree
 * or destroyed, to resolve any associated direct-IO.
 *
 * (1) We must wait for any direct-IO related to the record to complete.
 *
 * (2) We must remove any buffer cache aliases for data accessed via
 *     leaf->data_offset or zone2_offset so non-direct-IO consumers
 *     (the mirroring and reblocking code) do not see stale data.
 */
void
hammer_io_direct_wait(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Wait for I/O to complete
	 */
	if (record->gflags & HAMMER_RECG_DIRECT_IO) {
		lwkt_gettoken(&hmp->io_token);
		while (record->gflags & HAMMER_RECG_DIRECT_IO) {
			record->gflags |= HAMMER_RECG_DIRECT_WAIT;
			tsleep(&record->flags, 0, "hmdiow", 0);
		}
		lwkt_reltoken(&hmp->io_token);
	}

	/*
	 * Invalidate any related buffer cache aliases associated with the
	 * backing device.  This is needed because the buffer cache buffer
	 * for file data is associated with the file vnode, not the backing
	 * device vnode.
	 *
	 * XXX I do not think this case can occur any more now that
	 * reservations ensure that all such buffers are removed before
	 * an area can be reused.
	 */
	if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
		KKASSERT(record->leaf.data_offset);
		hammer_del_buffers(hmp, record->leaf.data_offset,
				   record->zone2_offset, record->leaf.data_len,
				   1);
		record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
	}
}

/*
 * This is called to remove the second-level cached zone-2 offset from
 * frontend buffer cache buffers, now stale due to a data relocation.
 * These offsets are generated by cluster_read() via VOP_BMAP, or directly
 * by hammer_vop_strategy_read().
 *
 * This is rather nasty because here we have something like the reblocker
 * scanning the raw B-Tree with no held references on anything, really,
 * other than a shared lock on the B-Tree node, and we have to access the
 * frontend's buffer cache to check for and clean out the association.
 * Specifically, if the reblocker is moving data on the disk, these cached
 * offsets will become invalid.
 *
 * Only data record types associated with the large-data zone are subject
 * to direct-io and need to be checked.
 *
 */
void
hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
{
	struct hammer_inode_info iinfo;
	int zone;

	if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
		return;
	zone = HAMMER_ZONE_DECODE(leaf->data_offset);
	if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
		return;
	iinfo.obj_id = leaf->base.obj_id;
	iinfo.obj_asof = 0;	/* unused */
	iinfo.obj_localization = leaf->base.localization &
				 HAMMER_LOCALIZE_PSEUDOFS_MASK;
	iinfo.u.leaf = leaf;
	hammer_scan_inode_snapshots(hmp, &iinfo,
				    hammer_io_direct_uncache_callback,
				    leaf);
}

static int
hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
{
	hammer_inode_info_t iinfo = data;
	hammer_off_t file_offset;
	struct vnode *vp;
	struct buf *bp;
	int blksize;

	if (ip->vp == NULL)
		return(0);
	file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
	blksize = iinfo->u.leaf->data_len;
	KKASSERT((blksize & HAMMER_BUFMASK) == 0);

	/*
	 * Warning: FINDBLK_TEST returns stable storage but not stable
	 * contents.  It happens to be ok in this case.
	 */
	hammer_ref(&ip->lock);
	if (hammer_get_vnode(ip, &vp) == 0) {
		if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL &&
		    bp->b_bio2.bio_offset != NOOFFSET) {
			bp = getblk(ip->vp, file_offset, blksize, 0, 0);
			bp->b_bio2.bio_offset = NOOFFSET;
			brelse(bp);
		}
		vput(vp);
	}
	hammer_rel_inode(ip, 0);
	return(0);
}


/*
 * This function is called when writes may have occurred on the volume,
 * indicating that the device may be holding cached writes.
 */
static void
hammer_io_flush_mark(hammer_volume_t volume)
{
	atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH);
}

/*
 * This function ensures that the device has flushed any cached writes out.
 */
void
hammer_io_flush_sync(hammer_mount_t hmp)
{
	hammer_volume_t volume;
	struct buf *bp_base = NULL;
	struct buf *bp;

	RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) {
		if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) {
			atomic_clear_int(&volume->vol_flags,
					 HAMMER_VOLF_NEEDFLUSH);
			bp = getpbuf(NULL);
			bp->b_bio1.bio_offset = 0;
			bp->b_bufsize = 0;
			bp->b_bcount = 0;
			bp->b_cmd = BUF_CMD_FLUSH;
			bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
			bp->b_bio1.bio_done = biodone_sync;
			bp->b_bio1.bio_flags |= BIO_SYNC;
			bp_base = bp;
			vn_strategy(volume->devvp, &bp->b_bio1);
		}
	}
	while ((bp = bp_base) != NULL) {
		bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
		biowait(&bp->b_bio1, "hmrFLS");
		relpbuf(bp, NULL);
	}
}

/*
 * Limit the amount of backlog which we allow to build up
 */
void
hammer_io_limit_backlog(hammer_mount_t hmp)
{
	waitrunningbufspace();
}