/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.76 2008/08/29 20:19:08 dillon Exp $
 */
/*
 * Manage HAMMER's on-disk structures.  These routines are primarily
 * responsible for interfacing with the kernel's I/O subsystem and for
 * managing in-memory structures.
 */

#include "hammer.h"
#include <sys/fcntl.h>
#include <sys/nlookup.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
static int hammer_load_node(hammer_transaction_t trans,
			hammer_node_t node, int isnew);
static void _hammer_rel_node(hammer_node_t node, int locked);

static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
{
	if (vol1->vol_no < vol2->vol_no)
		return(-1);
	if (vol1->vol_no > vol2->vol_no)
		return(1);
	return(0);
}

/*
 * hammer_buffer structures are indexed via their zoneX_offset, not
 * their zone2_offset.
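 *
 * Most blockmap-backed zones translate 1:1 to zone-2 buffer offsets, but
 * keying the RB tree on the zone-X offset is what allows routines such as
 * hammer_del_buffers() to locate all potentially conflicting buffers (see
 * the BUFFERS section below).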
 */
static int
hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
{
	if (buf1->zoneX_offset < buf2->zoneX_offset)
		return(-1);
	if (buf1->zoneX_offset > buf2->zoneX_offset)
		return(1);
	return(0);
}

static int
hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
{
	if (node1->node_offset < node2->node_offset)
		return(-1);
	if (node1->node_offset > node2->node_offset)
		return(1);
	return(0);
}

RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
	     hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
	     hammer_buf_rb_compare, hammer_off_t, zoneX_offset);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
	     hammer_nod_rb_compare, hammer_off_t, node_offset);

/************************************************************************
 *				VOLUMES					*
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time; get_volume() will
 * not load a new volume.
 *
 * The passed devvp is vref()'d but not locked.  This function consumes the
 * ref (typically by associating it with the volume structure).
 *
 * Calls made to hammer_load_volume() are single-threaded.
 */
int
hammer_install_volume(struct hammer_mount *hmp, const char *volname,
		      struct vnode *devvp)
{
	struct mount *mp;
	hammer_volume_t volume;
	struct hammer_volume_ondisk *ondisk;
	struct nlookupdata nd;
	struct buf *bp = NULL;
	int error;
	int ronly;
	int setmp = 0;

	mp = hmp->mp;
	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Allocate a volume structure
	 */
	++hammer_count_volumes;
	volume = kmalloc(sizeof(*volume), hmp->m_misc, M_WAITOK|M_ZERO);
	volume->vol_name = kstrdup(volname, hmp->m_misc);
	volume->io.hmp = hmp;	/* bootstrap */
	hammer_io_init(&volume->io, volume, HAMMER_STRUCTURE_VOLUME);
	volume->io.offset = 0LL;
	volume->io.bytes = HAMMER_BUFSIZE;

	/*
	 * Get the device vnode
	 */
	if (devvp == NULL) {
		error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
		if (error == 0)
			error = nlookup(&nd);
		if (error == 0)
			error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
		nlookup_done(&nd);
	} else {
		error = 0;
		volume->devvp = devvp;
	}

	if (error == 0) {
		if (vn_isdisk(volume->devvp, &error)) {
			error = vfs_mountedon(volume->devvp);
		}
	}
	if (error == 0 && vcount(volume->devvp) > 0)
		error = EBUSY;
	if (error == 0) {
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
		if (error == 0) {
			error = VOP_OPEN(volume->devvp,
					 (ronly ? FREAD : FREAD|FWRITE),
					 FSCRED, NULL);
		}
		vn_unlock(volume->devvp);
	}
	if (error) {
		hammer_free_volume(volume);
		return(error);
	}
	volume->devvp->v_rdev->si_mountpoint = mp;
	setmp = 1;

	/*
	 * Extract the volume number from the volume header and do various
	 * sanity checks.
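	 *
	 * The checks below verify the HAMMER volume signature, that the
	 * fsid matches any volumes already installed, and that the volume
	 * number is not a duplicate.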
	 */
	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
	if (error)
		goto late_failure;
	ondisk = (void *)bp->b_data;
	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
		kprintf("hammer_mount: volume %s has an invalid header\n",
			volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}
	volume->vol_no = ondisk->vol_no;
	volume->buffer_base = ondisk->vol_buf_beg;
	volume->vol_flags = ondisk->vol_flags;
	volume->nblocks = ondisk->vol_nblocks;
	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
	volume->maxraw_off = ondisk->vol_buf_end;

	if (RB_EMPTY(&hmp->rb_vols_root)) {
		hmp->fsid = ondisk->vol_fsid;
	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
		kprintf("hammer_mount: volume %s's fsid does not match "
			"other volumes\n", volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}

	/*
	 * Insert the volume structure into the red-black tree.
	 */
	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
			volume->vol_name, volume->vol_no);
		error = EEXIST;
	}

	/*
	 * Set the root volume.  HAMMER special cases the root volume
	 * structure.  We do not hold a ref because this would prevent
	 * related I/O from being flushed.
	 */
	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
		hmp->rootvol = volume;
		hmp->nvolumes = ondisk->vol_count;
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		hmp->mp->mnt_stat.f_blocks += ondisk->vol0_stat_bigblocks *
			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
		hmp->mp->mnt_vstat.f_blocks += ondisk->vol0_stat_bigblocks *
			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
	}
late_failure:
	if (bp)
		brelse(bp);
	if (error) {
		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
		if (setmp)
			volume->devvp->v_rdev->si_mountpoint = NULL;
		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
		hammer_free_volume(volume);
	}
	return (error);
}

/*
 * This is called for each volume when updating the mount point from
 * read-write to read-only or vice-versa.
 */
int
hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused)
{
	if (volume->devvp) {
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		if (volume->io.hmp->ronly) {
			/* do not call vinvalbuf */
			VOP_OPEN(volume->devvp, FREAD, FSCRED, NULL);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
		} else {
			/* do not call vinvalbuf */
			VOP_OPEN(volume->devvp, FREAD|FWRITE, FSCRED, NULL);
			VOP_CLOSE(volume->devvp, FREAD);
		}
		vn_unlock(volume->devvp);
	}
	return(0);
}

/*
 * Unload and free a HAMMER volume.  Must return >= 0 to continue the
 * scan; returns -1 on failure (aborting the scan).
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data __unused)
{
	hammer_mount_t hmp = volume->io.hmp;
	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Clean up the root volume pointer, which is held unlocked in hmp.
	 */
	if (hmp->rootvol == volume)
		hmp->rootvol = NULL;

	/*
	 * We must not flush a dirty buffer to disk on umount.  It should
	 * have already been dealt with by the flusher, or we may be in
	 * catastrophic failure.
	 */
	hammer_io_clear_modify(&volume->io, 1);
	volume->io.waitdep = 1;

	/*
	 * Clean up the persistent ref ioerror might have on the volume
	 */
	if (volume->io.ioerror)
		hammer_io_clear_error_noassert(&volume->io);

	/*
	 * This should release the bp.  Releasing the volume with flush set
	 * implies the interlock is set.
	 */
	hammer_ref_interlock_true(&volume->io.lock);
	hammer_rel_volume(volume, 1);
	KKASSERT(volume->io.bp == NULL);

	/*
	 * There should be no references on the volume, no clusters, and
	 * no super-clusters.
	 */
	KKASSERT(hammer_norefs(&volume->io.lock));

	volume->ondisk = NULL;
	if (volume->devvp) {
		if (volume->devvp->v_rdev &&
		    volume->devvp->v_rdev->si_mountpoint == hmp->mp
		) {
			volume->devvp->v_rdev->si_mountpoint = NULL;
		}
		if (ronly) {
			/*
			 * Make sure we don't sync anything to disk if we
			 * are in read-only mode (1) or critically-errored
			 * (2).  Note that there may be dirty buffers in
			 * normal read-only mode from crash recovery.
			 */
			vinvalbuf(volume->devvp, 0, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD);
		} else {
			/*
			 * Normal termination, save any dirty buffers
			 * (XXX there really shouldn't be any).
			 */
			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
		}
	}

	/*
	 * Destroy the structure
	 */
	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
	hammer_free_volume(volume);
	return(0);
}

static
void
hammer_free_volume(hammer_volume_t volume)
{
	hammer_mount_t hmp = volume->io.hmp;

	if (volume->vol_name) {
		kfree(volume->vol_name, hmp->m_misc);
		volume->vol_name = NULL;
	}
	if (volume->devvp) {
		vrele(volume->devvp);
		volume->devvp = NULL;
	}
	--hammer_count_volumes;
	kfree(volume, hmp->m_misc);
}

/*
 * Get a HAMMER volume.  The volume must already exist.
 */
hammer_volume_t
hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
{
	struct hammer_volume *volume;

	/*
	 * Locate the volume structure
	 */
	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
	if (volume == NULL) {
		*errorp = ENOENT;
		return(NULL);
	}

	/*
	 * Reference the volume, load/check the data on the 0->1 transition.
	 * hammer_load_volume() will dispose of the interlock on return,
	 * and also clean up the ref count on error.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		*errorp = hammer_load_volume(volume);
		if (*errorp)
			volume = NULL;
	} else {
		KKASSERT(volume->ondisk);
		*errorp = 0;
	}
	return(volume);
}

int
hammer_ref_volume(hammer_volume_t volume)
{
	int error;

	/*
	 * Reference the volume and deal with the check condition used to
	 * load its ondisk info.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		error = hammer_load_volume(volume);
	} else {
		KKASSERT(volume->ondisk);
		error = 0;
	}
	return (error);
}

hammer_volume_t
hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
{
	hammer_volume_t volume;

	volume = hmp->rootvol;
	KKASSERT(volume != NULL);

	/*
	 * Reference the volume and deal with the check condition used to
	 * load its ondisk info.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		*errorp = hammer_load_volume(volume);
		if (*errorp)
			volume = NULL;
	} else {
		KKASSERT(volume->ondisk);
		*errorp = 0;
	}
	return (volume);
}

/*
 * Load a volume's on-disk information.  The volume must be referenced and
 * the interlock is held on call.  The interlock will be released on return.
 * The reference will also be released on return if an error occurs.
 */
static int
hammer_load_volume(hammer_volume_t volume)
{
	int error;

	if (volume->ondisk == NULL) {
		error = hammer_io_read(volume->devvp, &volume->io,
				       HAMMER_BUFSIZE);
		if (error == 0) {
			volume->ondisk = (void *)volume->io.bp->b_data;
			hammer_ref_interlock_done(&volume->io.lock);
		} else {
			hammer_rel_volume(volume, 1);
		}
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Release a previously acquired reference on the volume.
 *
 * Volumes are not unloaded from memory during normal operation.
 */
void
hammer_rel_volume(hammer_volume_t volume, int locked)
{
	struct buf *bp;

	if (hammer_rel_interlock(&volume->io.lock, locked)) {
		volume->ondisk = NULL;
		bp = hammer_io_release(&volume->io, locked);
		hammer_rel_interlock_done(&volume->io.lock, locked);
		if (bp)
			brelse(bp);
	}
}

int
hammer_mountcheck_volumes(struct hammer_mount *hmp)
{
	hammer_volume_t vol;
	int i;

	for (i = 0; i < hmp->nvolumes; ++i) {
		vol = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, i);
		if (vol == NULL)
			return(EINVAL);
	}
	return(0);
}

/************************************************************************
 *				BUFFERS					*
 ************************************************************************
 *
 * Manage buffers.  Currently most blockmap-backed zones are direct-mapped
 * to zone-2 buffer offsets, without a translation stage.  However, the
 * hammer_buffer structure is indexed by its zoneX_offset, not its
 * zone2_offset.
 *
 * The proper zone must be maintained throughout the code-base all the way
 * through to the big-block allocator, or routines like hammer_del_buffers()
 * will not be able to locate all potentially conflicting buffers.
 */

/*
 * Helper function returns whether a zone offset can be directly translated
 * to a raw buffer index or not.  Really only the volume and undo zones
 * can't be directly translated.  Volumes are special-cased and undo zones
 * shouldn't be accessed via aliases in read-only mode.
 *
 * This function is ONLY used to detect aliased zones during a read-only
 * mount.
 */
static __inline int
hammer_direct_zone(hammer_off_t buf_offset)
{
	switch(HAMMER_ZONE_DECODE(buf_offset)) {
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
	case HAMMER_ZONE_FREEMAP_INDEX:
	case HAMMER_ZONE_BTREE_INDEX:
	case HAMMER_ZONE_META_INDEX:
	case HAMMER_ZONE_LARGE_DATA_INDEX:
	case HAMMER_ZONE_SMALL_DATA_INDEX:
		return(1);
	default:
		return(0);
	}
	/* NOT REACHED */
}

hammer_buffer_t
hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
		  int bytes, int isnew, int *errorp)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	hammer_off_t zone2_offset;
	hammer_io_type_t iotype;
	int vol_no;
	int zone;

	buf_offset &= ~HAMMER_BUFMASK64;
again:
	/*
	 * Shortcut if the buffer is already cached
	 */
	buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, buf_offset);
	if (buffer) {
		/*
		 * Once refed the ondisk field will not be cleared by
		 * any other action.  Shortcut the operation if the
		 * ondisk structure is valid.
		 */
found_aliased:
		if (hammer_ref_interlock(&buffer->io.lock) == 0) {
			hammer_io_advance(&buffer->io);
			KKASSERT(buffer->ondisk);
			*errorp = 0;
			return(buffer);
		}

		/*
		 * 0->1 transition or deferred 0->1 transition (CHECK),
		 * interlock now held.  Shortcut if ondisk is already
		 * assigned.
		 */
		++hammer_count_refedbufs;
		if (buffer->ondisk) {
			hammer_io_advance(&buffer->io);
			hammer_ref_interlock_done(&buffer->io.lock);
			*errorp = 0;
			return(buffer);
		}

		/*
		 * The buffer is no longer loose if it has a ref, and
		 * cannot become loose once it gains a ref.  Loose
		 * buffers will never be in a modified state.  This should
		 * only occur on the 0->1 transition of refs.
		 *
		 * lose_list can be modified via a biodone() interrupt
		 * so the io_token must be held.
		 */
		if (buffer->io.mod_root == &hmp->lose_root) {
			lwkt_gettoken(&hmp->io_token);
			if (buffer->io.mod_root == &hmp->lose_root) {
				RB_REMOVE(hammer_mod_rb_tree,
					  buffer->io.mod_root, &buffer->io);
				buffer->io.mod_root = NULL;
				KKASSERT(buffer->io.modified == 0);
			}
			lwkt_reltoken(&hmp->io_token);
		}
		goto found;
	} else if (hmp->ronly && hammer_direct_zone(buf_offset)) {
		/*
		 * If this is a read-only mount there could be an alias
		 * in the raw-zone.  If there is we use that buffer instead.
		 *
		 * rw mounts will not have aliases.  Also note when going
		 * from ro -> rw the recovered raw buffers are flushed and
		 * reclaimed, so again there will not be any aliases once
		 * the mount is rw.
		 */
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   (buf_offset & ~HAMMER_OFF_ZONE_MASK) |
				   HAMMER_ZONE_RAW_BUFFER);
		if (buffer) {
			kprintf("HAMMER: recovered aliased %016jx\n",
				(intmax_t)buf_offset);
			goto found_aliased;
		}
	}

	/*
	 * What is the buffer class?
	 */
	zone = HAMMER_ZONE_DECODE(buf_offset);

	switch(zone) {
	case HAMMER_ZONE_LARGE_DATA_INDEX:
	case HAMMER_ZONE_SMALL_DATA_INDEX:
		iotype = HAMMER_STRUCTURE_DATA_BUFFER;
		break;
	case HAMMER_ZONE_UNDO_INDEX:
		iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
		break;
	case HAMMER_ZONE_META_INDEX:
	default:
		/*
		 * NOTE: inode data and directory entries are placed in this
		 *	 zone.
		 *	 inode atime/mtime is updated in-place and thus
		 *	 buffers containing inodes must be synchronized as
		 *	 meta-buffers, same as buffers containing B-Tree info.
		 */
		iotype = HAMMER_STRUCTURE_META_BUFFER;
		break;
	}

	/*
	 * Handle blockmap offset translations
	 */
	if (zone >= HAMMER_ZONE_BTREE_INDEX) {
		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
	} else if (zone == HAMMER_ZONE_UNDO_INDEX) {
		zone2_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
	} else {
		KKASSERT(zone == HAMMER_ZONE_RAW_BUFFER_INDEX);
		zone2_offset = buf_offset;
		*errorp = 0;
	}
	if (*errorp)
		return(NULL);

	/*
	 * NOTE: zone2_offset and maxbuf_off are both full zone-2 offset
	 * specifications.
	 */
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, errorp);
	if (volume == NULL)
		return(NULL);

	KKASSERT(zone2_offset < volume->maxbuf_off);

	/*
	 * Allocate a new buffer structure.  We will check for races later.
	 */
	++hammer_count_buffers;
	buffer = kmalloc(sizeof(*buffer), hmp->m_misc,
			 M_WAITOK|M_ZERO|M_USE_RESERVE);
	buffer->zone2_offset = zone2_offset;
	buffer->zoneX_offset = buf_offset;

	hammer_io_init(&buffer->io, volume, iotype);
	buffer->io.offset = volume->ondisk->vol_buf_beg +
			    (zone2_offset & HAMMER_OFF_SHORT_MASK);
	buffer->io.bytes = bytes;
	TAILQ_INIT(&buffer->clist);
	hammer_ref_interlock_true(&buffer->io.lock);

	/*
	 * Insert the buffer into the RB tree and handle late collisions.
	 */
	if (RB_INSERT(hammer_buf_rb_tree, &hmp->rb_bufs_root, buffer)) {
		hammer_rel_volume(volume, 0);
		buffer->io.volume = NULL;			/* safety */
		if (hammer_rel_interlock(&buffer->io.lock, 1))	/* safety */
			hammer_rel_interlock_done(&buffer->io.lock, 1);
		--hammer_count_buffers;
		kfree(buffer, hmp->m_misc);
		goto again;
	}
	++hammer_count_refedbufs;
found:

	/*
	 * The buffer is referenced and interlocked.  Load the buffer
	 * if necessary.  hammer_load_buffer() deals with the interlock
	 * and, if an error is returned, also deals with the ref.
	 */
	if (buffer->ondisk == NULL) {
		*errorp = hammer_load_buffer(buffer, isnew);
		if (*errorp)
			buffer = NULL;
	} else {
		hammer_io_advance(&buffer->io);
		hammer_ref_interlock_done(&buffer->io.lock);
		*errorp = 0;
	}
	return(buffer);
}

/*
 * This is used by the direct-read code to deal with large-data buffers
 * created by the reblocker and mirror-write code.  The direct-read code
 * bypasses the HAMMER buffer subsystem and so any aliased dirty or write-
 * running hammer buffers must be fully synced to disk before we can issue
 * the direct-read.
 *
 * This code path is not considered critical as only the reblocker and
 * mirror-write code will create large-data buffers via the HAMMER buffer
 * subsystem.  They do that because they operate at the B-Tree level and
 * do not access the vnode/inode structures.
 */
void
hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
{
	hammer_buffer_t buffer;
	int error;

	KKASSERT((base_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	while (bytes > 0) {
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   base_offset);
		if (buffer && (buffer->io.modified || buffer->io.running)) {
			error = hammer_ref_buffer(buffer);
			if (error == 0) {
				hammer_io_wait(&buffer->io);
				if (buffer->io.modified) {
					hammer_io_write_interlock(&buffer->io);
					hammer_io_flush(&buffer->io, 0);
					hammer_io_done_interlock(&buffer->io);
					hammer_io_wait(&buffer->io);
				}
				hammer_rel_buffer(buffer, 0);
			}
		}
		base_offset += HAMMER_BUFSIZE;
		bytes -= HAMMER_BUFSIZE;
	}
}

/*
 * Destroy all buffers covering the specified zoneX offset range.  This
 * is called when the related blockmap layer2 entry is freed or when
 * a direct write bypasses our buffer/buffer-cache subsystem.
 *
 * The buffers may be referenced by the caller itself.  Setting reclaim
 * will cause the buffer to be destroyed when its ref count reaches zero.
 *
 * Return 0 on success, EAGAIN if some buffers could not be destroyed due
 * to additional references held by other threads, or some other (typically
 * fatal) error.
 */
int
hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
		   hammer_off_t zone2_offset, int bytes,
		   int report_conflicts)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	int vol_no;
	int error;
	int ret_error;

	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &ret_error);
	KKASSERT(ret_error == 0);

	while (bytes > 0) {
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   base_offset);
		if (buffer) {
			error = hammer_ref_buffer(buffer);
			if (hammer_debug_general & 0x20000) {
				kprintf("hammer: delbufr %016jx "
					"rerr=%d 1ref=%d\n",
					(intmax_t)buffer->zoneX_offset,
					error,
					hammer_oneref(&buffer->io.lock));
			}
			if (error == 0 && !hammer_oneref(&buffer->io.lock)) {
				error = EAGAIN;
				hammer_rel_buffer(buffer, 0);
			}
			if (error == 0) {
				KKASSERT(buffer->zone2_offset == zone2_offset);
				hammer_io_clear_modify(&buffer->io, 1);
				buffer->io.reclaim = 1;
				buffer->io.waitdep = 1;
				KKASSERT(buffer->io.volume == volume);
				hammer_rel_buffer(buffer, 0);
			}
		} else {
			error = hammer_io_inval(volume, zone2_offset);
		}
		if (error) {
			ret_error = error;
			if (report_conflicts ||
			    (hammer_debug_general & 0x8000)) {
				kprintf("hammer_del_buffers: unable to "
					"invalidate %016llx buffer=%p rep=%d\n",
					(long long)base_offset,
					buffer, report_conflicts);
			}
		}
		base_offset += HAMMER_BUFSIZE;
		zone2_offset += HAMMER_BUFSIZE;
		bytes -= HAMMER_BUFSIZE;
	}
	hammer_rel_volume(volume, 0);
	return (ret_error);
}

/*
 * Given a referenced and interlocked buffer load/validate the data.
 *
 * The buffer interlock will be released on return.  If an error is
 * returned the buffer reference will also be released (and the buffer
 * pointer will thus be stale).
 */
static int
hammer_load_buffer(hammer_buffer_t buffer, int isnew)
{
	hammer_volume_t volume;
	int error;

	/*
	 * Load the buffer's on-disk info
	 */
	volume = buffer->io.volume;

	if (hammer_debug_io & 0x0004) {
		kprintf("load_buffer %016llx %016llx isnew=%d od=%p\n",
			(long long)buffer->zoneX_offset,
			(long long)buffer->zone2_offset,
			isnew, buffer->ondisk);
	}

	if (buffer->ondisk == NULL) {
		/*
		 * Issue the read or generate a new buffer.  When reading
		 * the limit argument controls any read-ahead clustering
		 * hammer_io_read() is allowed to do.
		 *
		 * We cannot read-ahead in the large-data zone and we cannot
		 * cross a largeblock boundary as the next largeblock might
		 * use a different buffer size.
		 */
		if (isnew) {
			error = hammer_io_new(volume->devvp, &buffer->io);
		} else if ((buffer->zoneX_offset & HAMMER_OFF_ZONE_MASK) ==
			   HAMMER_ZONE_LARGE_DATA) {
			error = hammer_io_read(volume->devvp, &buffer->io,
					       buffer->io.bytes);
		} else {
			hammer_off_t limit;

			limit = (buffer->zone2_offset +
				 HAMMER_LARGEBLOCK_MASK64) &
				~HAMMER_LARGEBLOCK_MASK64;
			limit -= buffer->zone2_offset;
			error = hammer_io_read(volume->devvp, &buffer->io,
					       limit);
		}
		if (error == 0)
			buffer->ondisk = (void *)buffer->io.bp->b_data;
	} else if (isnew) {
		error = hammer_io_new(volume->devvp, &buffer->io);
	} else {
		error = 0;
	}
	if (error == 0) {
		hammer_io_advance(&buffer->io);
		hammer_ref_interlock_done(&buffer->io.lock);
	} else {
		hammer_rel_buffer(buffer, 1);
	}
	return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 *	 This routine is only called during unmount or when a volume is
 *	 removed.
 *
 *	 If data != NULL, it specifies a volume whose buffers should
 *	 be unloaded.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data)
{
	struct hammer_volume *volume = (struct hammer_volume *) data;

	/*
	 * If volume != NULL we are only interested in unloading buffers
	 * associated with a particular volume.
	 */
	if (volume != NULL && volume != buffer->io.volume)
		return 0;

	/*
	 * Clean up the persistent ref ioerror might have on the buffer
	 * and acquire a ref.  Expect a 0->1 transition.
	 */
	if (buffer->io.ioerror) {
		hammer_io_clear_error_noassert(&buffer->io);
		--hammer_count_refedbufs;
	}
	hammer_ref_interlock_true(&buffer->io.lock);
	++hammer_count_refedbufs;

	/*
	 * We must not flush a dirty buffer to disk on umount.  It should
	 * have already been dealt with by the flusher, or we may be in
	 * catastrophic failure.
	 *
	 * We must set waitdep to ensure that a running buffer is waited
	 * on and released prior to us trying to unload the volume.
	 */
	hammer_io_clear_modify(&buffer->io, 1);
	hammer_flush_buffer_nodes(buffer);
	buffer->io.waitdep = 1;
	hammer_rel_buffer(buffer, 1);
	return(0);
}

/*
 * Reference a buffer that is either already referenced or via a specially
 * handled pointer (aka cursor->buffer).
 */
int
hammer_ref_buffer(hammer_buffer_t buffer)
{
	hammer_mount_t hmp;
	int error;
	int locked;

	/*
	 * Acquire a ref, plus the buffer will be interlocked on the
	 * 0->1 transition.
	 */
	locked = hammer_ref_interlock(&buffer->io.lock);
	hmp = buffer->io.hmp;

	/*
	 * At this point a biodone() will not touch the buffer other than
	 * incidental bits.  However, lose_list can be modified via
	 * a biodone() interrupt.
	 *
	 * No longer loose.  lose_list requires the io_token.
	 */
	if (buffer->io.mod_root == &hmp->lose_root) {
		lwkt_gettoken(&hmp->io_token);
		if (buffer->io.mod_root == &hmp->lose_root) {
			RB_REMOVE(hammer_mod_rb_tree,
				  buffer->io.mod_root, &buffer->io);
			buffer->io.mod_root = NULL;
		}
		lwkt_reltoken(&hmp->io_token);
	}

	if (locked) {
		++hammer_count_refedbufs;
		error = hammer_load_buffer(buffer, 0);
		/* NOTE: on error the buffer pointer is stale */
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Release a reference on the buffer.  On the 1->0 transition the
 * underlying IO will be released but the data reference is left
 * cached.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int locked)
{
	hammer_volume_t volume;
	hammer_mount_t hmp;
	struct buf *bp = NULL;
	int freeme = 0;

	hmp = buffer->io.hmp;

	if (hammer_rel_interlock(&buffer->io.lock, locked) == 0)
		return;

	/*
	 * hammer_count_refedbufs accounting.  Decrement if we are in
	 * the error path or if CHECK is clear.
	 *
	 * If we are not in the error path and CHECK is set the caller
	 * probably just did a hammer_ref() and didn't account for it,
	 * so we don't account for the loss here.
	 */
	if (locked || (buffer->io.lock.refs & HAMMER_REFS_CHECK) == 0)
		--hammer_count_refedbufs;

	/*
	 * If the caller locked us or the normal release transitioned
	 * from 1->0 (and acquired the lock), attempt to release the
	 * io.  If the caller locked us we tell hammer_io_release()
	 * to flush (which would be the unload or failure path).
	 */
	bp = hammer_io_release(&buffer->io, locked);

	/*
	 * If the buffer has no bp association and no refs we can destroy
	 * it.
	 *
	 * NOTE: It is impossible for any associated B-Tree nodes to have
	 * refs if the buffer has no additional refs.
	 */
	if (buffer->io.bp == NULL && hammer_norefs(&buffer->io.lock)) {
		RB_REMOVE(hammer_buf_rb_tree,
			  &buffer->io.hmp->rb_bufs_root,
			  buffer);
		volume = buffer->io.volume;
		buffer->io.volume = NULL;			/* sanity */
		hammer_rel_volume(volume, 0);
		hammer_io_clear_modlist(&buffer->io);
		hammer_flush_buffer_nodes(buffer);
		KKASSERT(TAILQ_EMPTY(&buffer->clist));
		freeme = 1;
	}

	/*
	 * Cleanup
	 */
	hammer_rel_interlock_done(&buffer->io.lock, locked);
	if (bp)
		brelse(bp);
	if (freeme) {
		--hammer_count_buffers;
		kfree(buffer, hmp->m_misc);
	}
}

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * buf_offset is a conglomeration of the volume number and vol_buf_beg
 * relative buffer offset.  It must also have bit 55 set to be valid.
 * (see hammer_off_t in hammer_disk.h).
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
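 *
 * Typical usage follows the same convention hammer_alloc_data() documents
 * for data_bufferp below: the caller initializes *bufferp to NULL, makes
 * repeated hammer_bread()/hammer_bnew() calls reusing the same bufferp,
 * and releases the final buffer with hammer_rel_buffer() when done.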
 *
 * NOTE: The buffer is indexed via its zoneX_offset but we allow the
 *	 passed cached *bufferp to match against either zoneX or zone2.
 */
static __inline
void *
_hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
	      int *errorp, struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 0, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}

void *
hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset,
	     int *errorp, struct hammer_buffer **bufferp)
{
	return(_hammer_bread(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
}

void *
hammer_bread_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
		 int *errorp, struct hammer_buffer **bufferp)
{
	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
	return(_hammer_bread(hmp, buf_offset, bytes, errorp, bufferp));
}

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * No disk read operation occurs.  The result buffer may contain garbage.
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 *
 * This function marks the buffer dirty but does not increment its
 * modify_refs count.
 */
static __inline
void *
_hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
	     int *errorp, struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 1, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}

void *
hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset,
	    int *errorp, struct hammer_buffer **bufferp)
{
	return(_hammer_bnew(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
}

void *
hammer_bnew_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
		int *errorp, struct hammer_buffer **bufferp)
{
	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
	return(_hammer_bnew(hmp, buf_offset, bytes, errorp, bufferp));
}

/************************************************************************
 *				NODES					*
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer, and will only reference the buffer while
 * the node itself is referenced.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_nodes long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * either the buffer or cluster management code.
 *
 * The caller must pass a referenced cluster on call and will retain
 * ownership of the reference on return.  The node will acquire its own
 * additional references, if necessary.
 */
hammer_node_t
hammer_get_node(hammer_transaction_t trans, hammer_off_t node_offset,
		int isnew, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_node_t node;
	int doload;

	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);

	/*
	 * Locate the structure, allocating one if necessary.
	 */
again:
	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
	if (node == NULL) {
		++hammer_count_nodes;
		node = kmalloc(sizeof(*node), hmp->m_misc, M_WAITOK|M_ZERO|M_USE_RESERVE);
		node->node_offset = node_offset;
		node->hmp = hmp;
		TAILQ_INIT(&node->cursor_list);
		TAILQ_INIT(&node->cache_list);
		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
			--hammer_count_nodes;
			kfree(node, hmp->m_misc);
			goto again;
		}
		doload = hammer_ref_interlock_true(&node->lock);
	} else {
		doload = hammer_ref_interlock(&node->lock);
	}
	if (doload) {
		*errorp = hammer_load_node(trans, node, isnew);
		trans->flags |= HAMMER_TRANSF_DIDIO;
		if (*errorp)
			node = NULL;
	} else {
		KKASSERT(node->ondisk);
		*errorp = 0;
		hammer_io_advance(&node->buffer->io);
	}
	return(node);
}

/*
 * Reference an already-referenced node.  0->1 transitions should assert
 * so we do not have to deal with hammer_ref() setting CHECK.
 */
void
hammer_ref_node(hammer_node_t node)
{
	KKASSERT(hammer_isactive(&node->lock) && node->ondisk != NULL);
	hammer_ref(&node->lock);
}

/*
 * Load a node's on-disk data reference.  Called with the node referenced
 * and interlocked.
 *
 * On return the node interlock will be unlocked.  If a non-zero error code
 * is returned the node will also be dereferenced (and the caller's pointer
 * will be stale).
 */
static int
hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew)
{
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int error;

	error = 0;
	if (node->ondisk == NULL) {
		/*
		 * This is a little confusing but the gist is that
		 * node->buffer determines whether the node is on
		 * the buffer's clist and node->ondisk determines
		 * whether the buffer is referenced.
		 *
		 * We could be racing a buffer release, in which case
		 * node->buffer may become NULL while we are blocked
		 * referencing the buffer.
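		 *
		 * If that association was torn down while we were
		 * blocked, the code below simply re-establishes it
		 * using the buffer we just referenced.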
		 */
		if ((buffer = node->buffer) != NULL) {
			error = hammer_ref_buffer(buffer);
			if (error == 0 && node->buffer == NULL) {
				TAILQ_INSERT_TAIL(&buffer->clist,
						  node, entry);
				node->buffer = buffer;
			}
		} else {
			buf_offset = node->node_offset & ~HAMMER_BUFMASK64;
			buffer = hammer_get_buffer(node->hmp, buf_offset,
						   HAMMER_BUFSIZE, 0, &error);
			if (buffer) {
				KKASSERT(error == 0);
				TAILQ_INSERT_TAIL(&buffer->clist,
						  node, entry);
				node->buffer = buffer;
			}
		}
		if (error)
			goto failed;
		node->ondisk = (void *)((char *)buffer->ondisk +
					(node->node_offset & HAMMER_BUFMASK));

		/*
		 * Check CRC.  NOTE: Neither flag is set and the CRC is not
		 * generated on new B-Tree nodes.
		 */
		if (isnew == 0 &&
		    (node->flags & HAMMER_NODE_CRCANY) == 0) {
			if (hammer_crc_test_btree(node->ondisk) == 0) {
				if (hammer_debug_critical)
					Debugger("CRC FAILED: B-TREE NODE");
				node->flags |= HAMMER_NODE_CRCBAD;
			} else {
				node->flags |= HAMMER_NODE_CRCGOOD;
			}
		}
	}
	if (node->flags & HAMMER_NODE_CRCBAD) {
		if (trans->flags & HAMMER_TRANSF_CRCDOM)
			error = EDOM;
		else
			error = EIO;
	}
failed:
	if (error) {
		_hammer_rel_node(node, 1);
	} else {
		hammer_ref_interlock_done(&node->lock);
	}
	return (error);
}

/*
 * Safely reference a node, interlock against flushes via the IO subsystem.
 */
hammer_node_t
hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache,
		     int *errorp)
{
	hammer_node_t node;
	int doload;

	node = cache->node;
	if (node != NULL) {
		doload = hammer_ref_interlock(&node->lock);
		if (doload) {
			*errorp = hammer_load_node(trans, node, 0);
			if (*errorp)
				node = NULL;
		} else {
			KKASSERT(node->ondisk);
			if (node->flags & HAMMER_NODE_CRCBAD) {
				if (trans->flags & HAMMER_TRANSF_CRCDOM)
					*errorp = EDOM;
				else
					*errorp = EIO;
				_hammer_rel_node(node, 0);
				node = NULL;
			} else {
				*errorp = 0;
			}
		}
	} else {
		*errorp = ENOENT;
	}
	return(node);
}

/*
 * Release a hammer_node.  On the last release the node dereferences
 * its underlying buffer and may or may not be destroyed.
 *
 * If locked is non-zero the passed node has been interlocked by the
 * caller and we are in the failure/unload path, otherwise it has not and
 * we are doing a normal release.
 *
 * This function will dispose of the interlock and the reference.
 * On return the node pointer is stale.
 */
void
_hammer_rel_node(hammer_node_t node, int locked)
{
	hammer_buffer_t buffer;

	/*
	 * Deref the node.  If this isn't the 1->0 transition we're basically
	 * done.  If locked is non-zero this function will just deref the
	 * locked node and return TRUE, otherwise it will deref the node
	 * and either lock and return TRUE on the 1->0 transition or
	 * not lock and return FALSE.
	 */
	if (hammer_rel_interlock(&node->lock, locked) == 0)
		return;

	/*
	 * Either locked was non-zero and we are interlocked, or the
	 * hammer_rel_interlock() call returned non-zero and we are
	 * interlocked.
	 *
	 * The ref-count must still be decremented if locked != 0 so
	 * the cleanup required still varies a bit.
	 *
	 * hammer_flush_node() when called with 1 or 2 will dispose of
	 * the lock and possible ref-count.
	 */
	if (node->ondisk == NULL) {
		hammer_flush_node(node, locked + 1);
		/* node is stale now */
		return;
	}

	/*
	 * Do not disassociate the node from the buffer if it represents
	 * a modified B-Tree node that still needs its crc to be generated.
	 */
	if (node->flags & HAMMER_NODE_NEEDSCRC) {
		hammer_rel_interlock_done(&node->lock, locked);
		return;
	}

	/*
	 * Do final cleanups and then either destroy the node or leave it
	 * passively cached.  The buffer reference is removed regardless.
	 */
	buffer = node->buffer;
	node->ondisk = NULL;

	if ((node->flags & HAMMER_NODE_FLUSH) == 0) {
		/*
		 * Normal release.
		 */
		hammer_rel_interlock_done(&node->lock, locked);
	} else {
		/*
		 * Destroy the node.
		 */
		hammer_flush_node(node, locked + 1);
		/* node is stale */

	}
	hammer_rel_buffer(buffer, 0);
}

void
hammer_rel_node(hammer_node_t node)
{
	_hammer_rel_node(node, 0);
}

/*
 * Free space on-media associated with a B-Tree node.
 */
void
hammer_delete_node(hammer_transaction_t trans, hammer_node_t node)
{
	KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
	node->flags |= HAMMER_NODE_DELETED;
	hammer_blockmap_free(trans, node->node_offset, sizeof(*node->ondisk));
}

/*
 * Passively cache a referenced hammer_node.  The caller may release
 * the node on return.
 */
void
hammer_cache_node(hammer_node_cache_t cache, hammer_node_t node)
{
	/*
	 * If the node doesn't exist, or is being deleted, don't cache it!
	 *
	 * The node can only ever be NULL in the I/O failure path.
	 */
	if (node == NULL || (node->flags & HAMMER_NODE_DELETED))
		return;
	if (cache->node == node)
		return;
	while (cache->node)
		hammer_uncache_node(cache);
	if (node->flags & HAMMER_NODE_DELETED)
		return;
	cache->node = node;
	TAILQ_INSERT_TAIL(&node->cache_list, cache, entry);
}

void
hammer_uncache_node(hammer_node_cache_t cache)
{
	hammer_node_t node;

	if ((node = cache->node) != NULL) {
		TAILQ_REMOVE(&node->cache_list, cache, entry);
		cache->node = NULL;
		if (TAILQ_EMPTY(&node->cache_list))
			hammer_flush_node(node, 0);
	}
}

/*
 * Remove a node's cache references and destroy the node if it has no
 * other references or backing store.
 *
 *	locked == 0	Normal unlocked operation
 *	locked == 1	Call hammer_rel_interlock_done(..., 0);
 *	locked == 2	Call hammer_rel_interlock_done(..., 1);
 *
 * XXX for now this isn't even close to being MPSAFE so the refs check
 *     is sufficient.
 */
void
hammer_flush_node(hammer_node_t node, int locked)
{
	hammer_node_cache_t cache;
	hammer_buffer_t buffer;
	hammer_mount_t hmp = node->hmp;
	int dofree;

	while ((cache = TAILQ_FIRST(&node->cache_list)) != NULL) {
		TAILQ_REMOVE(&node->cache_list, cache, entry);
		cache->node = NULL;
	}

	/*
	 * NOTE: refs is predisposed if another thread is blocking and
	 *	 will be larger than 0 in that case.  We aren't MPSAFE
	 *	 here.
	 */
	if (node->ondisk == NULL && hammer_norefs(&node->lock)) {
		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);
		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
		if ((buffer = node->buffer) != NULL) {
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
		dofree = 1;
	} else {
		dofree = 0;
	}

	/*
	 * Deal with the interlock if locked == 1 or locked == 2.
	 */
	if (locked)
		hammer_rel_interlock_done(&node->lock, locked - 1);

	/*
	 * Destroy if requested
	 */
	if (dofree) {
		--hammer_count_nodes;
		kfree(node, hmp->m_misc);
	}
}

/*
 * Flush passively cached B-Tree nodes associated with this buffer.
 * This is only called when the buffer is about to be destroyed, so
 * none of the nodes should have any references.  The buffer is locked.
 *
 * We may be interlocked with the buffer.
 */
void
hammer_flush_buffer_nodes(hammer_buffer_t buffer)
{
	hammer_node_t node;

	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
		KKASSERT(node->ondisk == NULL);
		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);

		if (hammer_try_interlock_norefs(&node->lock)) {
			hammer_ref(&node->lock);
			node->flags |= HAMMER_NODE_FLUSH;
			_hammer_rel_node(node, 1);
		} else {
			KKASSERT(node->buffer != NULL);
			buffer = node->buffer;
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
	}
}


/************************************************************************
 *				ALLOCATORS				*
 ************************************************************************/

/*
 * Allocate a B-Tree node.
 */
hammer_node_t
hammer_alloc_btree(hammer_transaction_t trans, hammer_off_t hint, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	hammer_node_t node = NULL;
	hammer_off_t node_offset;

	node_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_BTREE_INDEX,
					    sizeof(struct hammer_node_ondisk),
					    hint, errorp);
	if (*errorp == 0) {
		node = hammer_get_node(trans, node_offset, 1, errorp);
		hammer_modify_node_noundo(trans, node);
		bzero(node->ondisk, sizeof(*node->ondisk));
		hammer_modify_node_done(node);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
	return(node);
}

/*
 * Allocate data.  If the address of a data buffer is supplied then
 * any prior non-NULL *data_bufferp will be released and *data_bufferp
 * will be set to the related buffer.  The caller must release it when
 * finally done.  The initial *data_bufferp should be set to NULL by
 * the caller.
 *
 * The caller is responsible for making hammer_modify*() calls on the
 * *data_bufferp.
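 *
 * Inode, directory-entry and other fixed/administrative record types are
 * allocated out of the meta zone.  DATA and DB records go to the small-data
 * zone when they fit in half a buffer and otherwise to the large-data zone
 * with data_len rounded up to a buffer multiple (see the switch below).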
 */
void *
hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
		  u_int16_t rec_type, hammer_off_t *data_offsetp,
		  struct hammer_buffer **data_bufferp,
		  hammer_off_t hint, int *errorp)
{
	void *data;
	int zone;

	/*
	 * Allocate data
	 */
	if (data_len) {
		switch(rec_type) {
		case HAMMER_RECTYPE_INODE:
		case HAMMER_RECTYPE_DIRENTRY:
		case HAMMER_RECTYPE_EXT:
		case HAMMER_RECTYPE_FIX:
		case HAMMER_RECTYPE_PFS:
		case HAMMER_RECTYPE_SNAPSHOT:
		case HAMMER_RECTYPE_CONFIG:
			zone = HAMMER_ZONE_META_INDEX;
			break;
		case HAMMER_RECTYPE_DATA:
		case HAMMER_RECTYPE_DB:
			if (data_len <= HAMMER_BUFSIZE / 2) {
				zone = HAMMER_ZONE_SMALL_DATA_INDEX;
			} else {
				data_len = (data_len + HAMMER_BUFMASK) &
					   ~HAMMER_BUFMASK;
				zone = HAMMER_ZONE_LARGE_DATA_INDEX;
			}
			break;
		default:
			panic("hammer_alloc_data: rec_type %04x unknown",
			      rec_type);
			zone = 0;	/* NOT REACHED */
			break;
		}
		*data_offsetp = hammer_blockmap_alloc(trans, zone, data_len,
						      hint, errorp);
	} else {
		*data_offsetp = 0;
	}
	if (*errorp == 0 && data_bufferp) {
		if (data_len) {
			data = hammer_bread_ext(trans->hmp, *data_offsetp,
						data_len, errorp, data_bufferp);
		} else {
			data = NULL;
		}
	} else {
		data = NULL;
	}
	return(data);
}

/*
 * Sync dirty buffers to the media and clean up any loose ends.
 *
 * These functions do not start the flusher going; they simply
 * queue everything up to the flusher.
 */
static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

int
hammer_queue_inodes_flusher(hammer_mount_t hmp, int waitfor)
{
	struct hammer_sync_info info;

	info.error = 0;
	info.waitfor = waitfor;
	if (waitfor == MNT_WAIT) {
		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS,
			      hammer_sync_scan1, hammer_sync_scan2, &info);
	} else {
		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS|VMSC_NOWAIT,
			      hammer_sync_scan1, hammer_sync_scan2, &info);
	}
	return(info.error);
}

/*
 * Filesystem sync.  If doing a synchronous sync make a second pass on
 * the vnodes in case any were already flushing during the first pass,
 * and activate the flusher twice (the second time brings the UNDO FIFO's
 * start position up to the end position after the first call).
 *
 * If doing a lazy sync make just one pass on the vnode list, ignoring
 * any new vnodes added to the list while the sync is in progress.
 */
int
hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
{
	struct hammer_sync_info info;
	int flags;

	flags = VMSC_GETVP;
	if (waitfor & MNT_LAZY)
		flags |= VMSC_ONEPASS;

	info.error = 0;
	info.waitfor = MNT_NOWAIT;
	vmntvnodescan(hmp->mp, flags | VMSC_NOWAIT,
		      hammer_sync_scan1, hammer_sync_scan2, &info);

	if (info.error == 0 && (waitfor & MNT_WAIT)) {
		info.waitfor = waitfor;
		vmntvnodescan(hmp->mp, flags,
			      hammer_sync_scan1, hammer_sync_scan2, &info);
	}
	if (waitfor == MNT_WAIT) {
		hammer_flusher_sync(hmp);
		hammer_flusher_sync(hmp);
	} else {
		hammer_flusher_async(hmp, NULL);
		hammer_flusher_async(hmp, NULL);
	}
	return(info.error);
}

static int
hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	struct hammer_inode *ip;

	ip = VTOI(vp);
	if (vp->v_type == VNON || ip == NULL ||
	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	     RB_EMPTY(&vp->v_rbdirty_tree))) {
		return(-1);
	}
	return(0);
}

static int
hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	struct hammer_sync_info *info = data;
	struct hammer_inode *ip;
	int error;

	ip = VTOI(vp);
	if (vp->v_type == VNON || vp->v_type == VBAD ||
	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	     RB_EMPTY(&vp->v_rbdirty_tree))) {
		return(0);
	}
	error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
	if (error)
		info->error = error;
	return(0);
}