/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.76 2008/08/29 20:19:08 dillon Exp $
 */
/*
 * Manage HAMMER's on-disk structures.  These routines are primarily
 * responsible for interfacing with the kernel's I/O subsystem and for
 * managing in-memory structures.
 */

#include <sys/nlookup.h>
#include <sys/buf2.h>

#include "hammer.h"

static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
static int hammer_load_node(hammer_transaction_t trans,
			hammer_node_t node, int isnew);
static void _hammer_rel_node(hammer_node_t node, int locked);

static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
{
	if (vol1->vol_no < vol2->vol_no)
		return(-1);
	if (vol1->vol_no > vol2->vol_no)
		return(1);
	return(0);
}

/*
 * hammer_buffer structures are indexed via their zoneX_offset, not
 * their zone2_offset.
 */
static int
hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
{
	if (buf1->zoneX_offset < buf2->zoneX_offset)
		return(-1);
	if (buf1->zoneX_offset > buf2->zoneX_offset)
		return(1);
	return(0);
}

static int
hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
{
	if (node1->node_offset < node2->node_offset)
		return(-1);
	if (node1->node_offset > node2->node_offset)
		return(1);
	return(0);
}

RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
	     hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
	     hammer_buf_rb_compare, hammer_off_t, zoneX_offset);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
	     hammer_nod_rb_compare, hammer_off_t, node_offset);

/************************************************************************
 *				VOLUMES					*
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time or via the
 * hammer volume-add command; hammer_get_volume() will not load a new
 * volume.
 *
 * The passed devvp is vref()'d but not locked.  This function consumes the
 * ref (typically by associating it with the volume structure).
 *
 * Calls made to hammer_load_volume() are single-threaded.
 */
int
hammer_install_volume(hammer_mount_t hmp, const char *volname,
		      struct vnode *devvp, void *data)
{
	struct mount *mp;
	hammer_volume_t volume;
	struct hammer_volume_ondisk *ondisk;
	struct hammer_volume_ondisk *img;
	struct nlookupdata nd;
	struct buf *bp = NULL;
	int error;
	int ronly;
	int setmp = 0;
	int i;

	mp = hmp->mp;
	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Allocate a volume structure
	 */
	++hammer_count_volumes;
	volume = kmalloc(sizeof(*volume), hmp->m_misc, M_WAITOK|M_ZERO);
	volume->vol_name = kstrdup(volname, hmp->m_misc);
	volume->io.hmp = hmp;	/* bootstrap */
	hammer_io_init(&volume->io, volume, HAMMER_STRUCTURE_VOLUME);
	volume->io.offset = 0LL;
	volume->io.bytes = HAMMER_BUFSIZE;

	/*
	 * Get the device vnode
	 */
	if (devvp == NULL) {
		error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
		if (error == 0)
			error = nlookup(&nd);
		if (error == 0)
			error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
		nlookup_done(&nd);
	} else {
		error = 0;
		volume->devvp = devvp;
	}

	if (error == 0) {
		if (vn_isdisk(volume->devvp, &error)) {
			error = vfs_mountedon(volume->devvp);
		}
	}
	if (error == 0 && vcount(volume->devvp) > 0)
		error = EBUSY;
	if (error == 0) {
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
		if (error == 0) {
			error = VOP_OPEN(volume->devvp,
					 (ronly ? FREAD : FREAD|FWRITE),
					 FSCRED, NULL);
		}
		vn_unlock(volume->devvp);
	}
	if (error) {
		hammer_free_volume(volume);
		return(error);
	}
	volume->devvp->v_rdev->si_mountpoint = mp;
	setmp = 1;

	/*
	 * Extract the volume number from the volume header and do various
	 * sanity checks.
	 */
	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
	if (error)
		goto late_failure;
	ondisk = (void *)bp->b_data;

	/*
	 * Initialize the volume header with data if the data is specified.
	 */
	if (ronly == 0 && data) {
		img = (struct hammer_volume_ondisk *)data;
		if (ondisk->vol_signature == HAMMER_FSBUF_VOLUME) {
			hkprintf("Formatting of valid HAMMER volume "
				"%s denied. Erase with dd!\n", volname);
			error = EFTYPE;
			goto late_failure;
		}
		bcopy(img, ondisk, sizeof(*img));
	}

	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
		hkprintf("volume %s has an invalid header\n",
			volume->vol_name);
		for (i = 0; i < (int)sizeof(ondisk->vol_signature); i++) {
			kprintf("%02x", ((char*)&ondisk->vol_signature)[i] & 0xFF);
			if (i != (int)sizeof(ondisk->vol_signature) - 1)
				kprintf(" ");
		}
		kprintf("\n");
		error = EFTYPE;
		goto late_failure;
	}
	volume->vol_no = ondisk->vol_no;
	volume->vol_flags = ondisk->vol_flags;
	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
				ondisk->vol_buf_end - ondisk->vol_buf_beg);

	if (RB_EMPTY(&hmp->rb_vols_root)) {
		hmp->fsid = ondisk->vol_fsid;
	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
		hkprintf("volume %s's fsid does not match other volumes\n",
			volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}

	/*
	 * Insert the volume structure into the red-black tree.
	 */
	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
		hkprintf("volume %s has a duplicate vol_no %d\n",
			volume->vol_name, volume->vol_no);
		error = EEXIST;
	}

	if (error == 0)
		HAMMER_VOLUME_NUMBER_ADD(hmp, volume);

	/*
	 * Set the root volume.  HAMMER special-cases the rootvol structure.
	 * We do not hold a ref because this would prevent related I/O
	 * from being flushed.
	 */
	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
		hmp->rootvol = volume;
		hmp->nvolumes = ondisk->vol_count;
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		hmp->mp->mnt_stat.f_blocks += ondisk->vol0_stat_bigblocks *
					      HAMMER_BUFFERS_PER_BIGBLOCK;
		hmp->mp->mnt_vstat.f_blocks += ondisk->vol0_stat_bigblocks *
					      HAMMER_BUFFERS_PER_BIGBLOCK;
	}
late_failure:
	if (bp)
		brelse(bp);
	if (error) {
		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
		if (setmp)
			volume->devvp->v_rdev->si_mountpoint = NULL;
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE, NULL);
		vn_unlock(volume->devvp);
		hammer_free_volume(volume);
	}
	return (error);
}

/*
 * This is called for each volume when updating the mount point from
 * read-write to read-only or vice versa.
 */
int
hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused)
{
	if (volume->devvp) {
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		if (volume->io.hmp->ronly) {
			/* do not call vinvalbuf */
			VOP_OPEN(volume->devvp, FREAD, FSCRED, NULL);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE, NULL);
		} else {
			/* do not call vinvalbuf */
			VOP_OPEN(volume->devvp, FREAD|FWRITE, FSCRED, NULL);
			VOP_CLOSE(volume->devvp, FREAD, NULL);
		}
		vn_unlock(volume->devvp);
	}
	return(0);
}
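
/*
 * NOTE (assumption about the caller, not verified in this file):
 * hammer_adjust_volume_mode() is expected to be driven once per volume,
 * e.g. via an RB_SCAN over hmp->rb_vols_root from the mount-update path.
 * The open/close dance above simply rebalances the device vnode's
 * FREAD/FWRITE open counts without flushing anything.
 */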

/*
 * Unload and free a HAMMER volume.  Must return >= 0 to continue the
 * scan, so -1 is returned on failure (aborting the scan).
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data)
{
	hammer_mount_t hmp = volume->io.hmp;
	struct buf *bp = NULL;
	struct hammer_volume_ondisk *img;
	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
	int error;

	/*
	 * Clear the volume header with data if the data is specified.
	 */
	if (ronly == 0 && data && volume->devvp) {
		img = (struct hammer_volume_ondisk *)data;
		error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
		if (error || bp->b_bcount < sizeof(*img)) {
			hmkprintf(hmp, "Failed to read volume header: %d\n",
				error);
			brelse(bp);
		} else {
			bcopy(img, bp->b_data, sizeof(*img));
			error = bwrite(bp);
			if (error)
				hmkprintf(hmp, "Failed to clear volume header: %d\n",
					error);
		}
	}

	/*
	 * Clean up the root volume pointer, which is held unlocked in hmp.
	 */
	if (hmp->rootvol == volume)
		hmp->rootvol = NULL;

	/*
	 * We must not flush a dirty buffer to disk on umount.  It should
	 * have already been dealt with by the flusher, or we may be in
	 * catastrophic failure.
	 */
	hammer_io_clear_modify(&volume->io, 1);
	volume->io.waitdep = 1;

	/*
	 * Clean up the persistent ref ioerror might have on the volume
	 */
	if (volume->io.ioerror)
		hammer_io_clear_error_noassert(&volume->io);

	/*
	 * This should release the bp.  Releasing the volume with flush set
	 * implies the interlock is set.
	 */
	hammer_ref_interlock_true(&volume->io.lock);
	hammer_rel_volume(volume, 1);
	KKASSERT(volume->io.bp == NULL);

	/*
	 * There should be no references on the volume.
	 */
	KKASSERT(hammer_norefs(&volume->io.lock));

	volume->ondisk = NULL;
	if (volume->devvp) {
		if (volume->devvp->v_rdev &&
		    volume->devvp->v_rdev->si_mountpoint == hmp->mp) {
			volume->devvp->v_rdev->si_mountpoint = NULL;
		}
		if (ronly) {
			/*
			 * Make sure we don't sync anything to disk if we
			 * are in read-only mode (1) or critically-errored
			 * (2).  Note that there may be dirty buffers in
			 * normal read-only mode from crash recovery.
			 */
			vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
			vinvalbuf(volume->devvp, 0, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD, NULL);
			vn_unlock(volume->devvp);
		} else {
			/*
			 * Normal termination, save any dirty buffers
			 * (XXX there really shouldn't be any).
			 */
			vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE, NULL);
			vn_unlock(volume->devvp);
		}
	}

	/*
	 * Destroy the structure
	 */
	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
	HAMMER_VOLUME_NUMBER_DEL(hmp, volume);
	hammer_free_volume(volume);
	return(0);
}

static
void
hammer_free_volume(hammer_volume_t volume)
{
	hammer_mount_t hmp = volume->io.hmp;

	if (volume->vol_name) {
		kfree(volume->vol_name, hmp->m_misc);
		volume->vol_name = NULL;
	}
	if (volume->devvp) {
		vrele(volume->devvp);
		volume->devvp = NULL;
	}
	--hammer_count_volumes;
	kfree(volume, hmp->m_misc);
}
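
/*
 * Usage sketch (illustrative only, not called anywhere): consumers of
 * the volume API defined below typically bracket access to the ondisk
 * header with a get/rel pair:
 *
 *	int error;
 *	hammer_volume_t vol;
 *
 *	vol = hammer_get_volume(hmp, vol_no, &error);
 *	if (vol) {
 *		... read vol->ondisk fields ...
 *		hammer_rel_volume(vol, 0);
 *	}
 */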

/*
 * Get a HAMMER volume.  The volume must already exist.
 */
hammer_volume_t
hammer_get_volume(hammer_mount_t hmp, int32_t vol_no, int *errorp)
{
	struct hammer_volume *volume;

	/*
	 * Locate the volume structure
	 */
	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
	if (volume == NULL) {
		*errorp = ENOENT;
		return(NULL);
	}

	/*
	 * Reference the volume, load/check the data on the 0->1 transition.
	 * hammer_load_volume() will dispose of the interlock on return,
	 * and also clean up the ref count on error.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		*errorp = hammer_load_volume(volume);
		if (*errorp)
			volume = NULL;
	} else {
		KKASSERT(volume->ondisk);
		*errorp = 0;
	}
	return(volume);
}

int
hammer_ref_volume(hammer_volume_t volume)
{
	int error;

	/*
	 * Reference the volume and deal with the check condition used to
	 * load its ondisk info.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		error = hammer_load_volume(volume);
	} else {
		KKASSERT(volume->ondisk);
		error = 0;
	}
	return (error);
}

/*
 * May be called without fs_token
 */
hammer_volume_t
hammer_get_root_volume(hammer_mount_t hmp, int *errorp)
{
	hammer_volume_t volume;

	volume = hmp->rootvol;
	KKASSERT(volume != NULL);

	/*
	 * Reference the volume and deal with the check condition used to
	 * load its ondisk info.
	 */
	if (hammer_ref_interlock(&volume->io.lock)) {
		lwkt_gettoken(&volume->io.hmp->fs_token);
		*errorp = hammer_load_volume(volume);
		lwkt_reltoken(&volume->io.hmp->fs_token);
		if (*errorp)
			volume = NULL;
	} else {
		KKASSERT(volume->ondisk);
		*errorp = 0;
	}
	return (volume);
}

/*
 * Load a volume's on-disk information.  The volume must be referenced and
 * the interlock is held on call.  The interlock will be released on return.
 * The reference will also be released on return if an error occurs.
 */
static int
hammer_load_volume(hammer_volume_t volume)
{
	int error;

	if (volume->ondisk == NULL) {
		error = hammer_io_read(volume->devvp, &volume->io,
				       HAMMER_BUFSIZE);
		if (error == 0) {
			volume->ondisk = (void *)volume->io.bp->b_data;
			hammer_ref_interlock_done(&volume->io.lock);
		} else {
			hammer_rel_volume(volume, 1);
		}
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Release a previously acquired reference on the volume.
 *
 * Volumes are not unloaded from memory during normal operation.
 *
 * May be called without fs_token
 */
void
hammer_rel_volume(hammer_volume_t volume, int locked)
{
	struct buf *bp;

	if (hammer_rel_interlock(&volume->io.lock, locked)) {
		lwkt_gettoken(&volume->io.hmp->fs_token);
		volume->ondisk = NULL;
		bp = hammer_io_release(&volume->io, locked);
		lwkt_reltoken(&volume->io.hmp->fs_token);
		hammer_rel_interlock_done(&volume->io.lock, locked);
		if (bp)
			brelse(bp);
	}
}

int
hammer_mountcheck_volumes(hammer_mount_t hmp)
{
	hammer_volume_t vol;
	int i;

	HAMMER_VOLUME_NUMBER_FOREACH(hmp, i) {
		vol = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, i);
		if (vol == NULL)
			return(EINVAL);
	}
	return(0);
}

int
hammer_get_installed_volumes(hammer_mount_t hmp)
{
	int i, ret = 0;

	HAMMER_VOLUME_NUMBER_FOREACH(hmp, i)
		ret++;
	return(ret);
}

/************************************************************************
 *				BUFFERS					*
 ************************************************************************
 *
 * Manage buffers.  Currently most blockmap-backed zones are direct-mapped
 * to zone-2 buffer offsets, without a translation stage.  However, the
 * hammer_buffer structure is indexed by its zoneX_offset, not its
 * zone2_offset.
 *
 * The proper zone must be maintained throughout the code-base all the way
 * through to the big-block allocator, or routines like hammer_del_buffers()
 * will not be able to locate all potentially conflicting buffers.
 */
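
/*
 * Background sketch (summary only; see hammer_disk.h for the definitive
 * encoding): a hammer_off_t packs the zone index into its top 4 bits and
 * the volume number into the next 8 bits, with the remainder being the
 * offset proper.  HAMMER_ZONE_DECODE() and HAMMER_VOL_DECODE() used
 * below rely on that layout.
 */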

/*
 * Helper function returns whether a zone offset can be directly translated
 * to a raw buffer index or not.  Really only the volume and undo zones
 * can't be directly translated.  Volumes are special-cased and undo zones
 * shouldn't be alias-accessed in read-only mode.
 *
 * This function is ONLY used to detect aliased zones during a read-only
 * mount.
 */
static __inline int
hammer_direct_zone(hammer_off_t buf_offset)
{
	switch(HAMMER_ZONE_DECODE(buf_offset)) {
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
	case HAMMER_ZONE_FREEMAP_INDEX:
	case HAMMER_ZONE_BTREE_INDEX:
	case HAMMER_ZONE_META_INDEX:
	case HAMMER_ZONE_LARGE_DATA_INDEX:
	case HAMMER_ZONE_SMALL_DATA_INDEX:
		return(1);
	default:
		return(0);
	}
	/* NOT REACHED */
}

hammer_buffer_t
hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
		  int bytes, int isnew, int *errorp)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	hammer_off_t zone2_offset;
	hammer_io_type_t iotype;
	int vol_no;
	int zone;

	buf_offset &= ~HAMMER_BUFMASK64;
again:
	/*
	 * Shortcut if the buffer is already cached
	 */
	buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, buf_offset);
	if (buffer) {
		/*
		 * Once refed the ondisk field will not be cleared by
		 * any other action.  Shortcut the operation if the
		 * ondisk structure is valid.
		 */
found_aliased:
		if (hammer_ref_interlock(&buffer->io.lock) == 0) {
			hammer_io_advance(&buffer->io);
			KKASSERT(buffer->ondisk);
			*errorp = 0;
			return(buffer);
		}

		/*
		 * 0->1 transition or deferred 0->1 transition (CHECK),
		 * interlock now held.  Shortcut if ondisk is already
		 * assigned.
		 */
		atomic_add_int(&hammer_count_refedbufs, 1);
		if (buffer->ondisk) {
			hammer_io_advance(&buffer->io);
			hammer_ref_interlock_done(&buffer->io.lock);
			*errorp = 0;
			return(buffer);
		}

		/*
		 * The buffer is no longer loose if it has a ref, and
		 * cannot become loose once it gains a ref.  Loose
		 * buffers will never be in a modified state.  This should
		 * only occur on the 0->1 transition of refs.
		 *
		 * lose_root can be modified via a biodone() interrupt
		 * so the io_token must be held.
		 */
		if (buffer->io.mod_root == &hmp->lose_root) {
			lwkt_gettoken(&hmp->io_token);
			if (buffer->io.mod_root == &hmp->lose_root) {
				RB_REMOVE(hammer_mod_rb_tree,
					  buffer->io.mod_root, &buffer->io);
				buffer->io.mod_root = NULL;
				KKASSERT(buffer->io.modified == 0);
			}
			lwkt_reltoken(&hmp->io_token);
		}
		goto found;
	} else if (hmp->ronly && hammer_direct_zone(buf_offset)) {
		/*
		 * If this is a read-only mount there could be an alias
		 * in the raw-zone.  If there is we use that buffer instead.
		 *
		 * rw mounts will not have aliases.  Also note when going
		 * from ro -> rw the recovered raw buffers are flushed and
		 * reclaimed, so again there will not be any aliases once
		 * the mount is rw.
		 */
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   hammer_xlate_to_zone2(buf_offset));
		if (buffer) {
			if (hammer_debug_general & 0x0001) {
				hkrateprintf(&hmp->kdiag,
					     "recovered aliased %016jx\n",
					     (intmax_t)buf_offset);
			}
			goto found_aliased;
		}
	}

	/*
	 * What is the buffer class?
	 */
	zone = HAMMER_ZONE_DECODE(buf_offset);

	switch(zone) {
	case HAMMER_ZONE_LARGE_DATA_INDEX:
	case HAMMER_ZONE_SMALL_DATA_INDEX:
		iotype = HAMMER_STRUCTURE_DATA_BUFFER;
		break;
	case HAMMER_ZONE_UNDO_INDEX:
		iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
		break;
	case HAMMER_ZONE_META_INDEX:
	default:
		/*
		 * NOTE: inode data and directory entries are placed in this
		 * zone.  inode atime/mtime is updated in-place and thus
		 * buffers containing inodes must be synchronized as
		 * meta-buffers, same as buffers containing B-Tree info.
		 */
		iotype = HAMMER_STRUCTURE_META_BUFFER;
		break;
	}

	/*
	 * Handle blockmap offset translations
	 */
	if (zone >= HAMMER_ZONE2_MAPPED_INDEX) {
		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
	} else if (zone == HAMMER_ZONE_UNDO_INDEX) {
		zone2_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
	} else {
		KKASSERT(zone == HAMMER_ZONE_RAW_BUFFER_INDEX);
		zone2_offset = buf_offset;
		*errorp = 0;
	}
	if (*errorp)
		return(NULL);

	/*
	 * NOTE: zone2_offset and maxbuf_off are both full zone-2 offset
	 * specifications.
	 */
	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_RAW_BUFFER);
	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, errorp);
	if (volume == NULL)
		return(NULL);

	KKASSERT(zone2_offset < volume->maxbuf_off);

	/*
	 * Allocate a new buffer structure.  We will check for races later.
	 */
	++hammer_count_buffers;
	buffer = kmalloc(sizeof(*buffer), hmp->m_misc,
			 M_WAITOK|M_ZERO|M_USE_RESERVE);
	buffer->zone2_offset = zone2_offset;
	buffer->zoneX_offset = buf_offset;

	hammer_io_init(&buffer->io, volume, iotype);
	buffer->io.offset = volume->ondisk->vol_buf_beg +
			    (zone2_offset & HAMMER_OFF_SHORT_MASK);
	buffer->io.bytes = bytes;
	TAILQ_INIT(&buffer->clist);
	hammer_ref_interlock_true(&buffer->io.lock);

	/*
	 * Insert the buffer into the RB tree and handle late collisions.
	 */
	if (RB_INSERT(hammer_buf_rb_tree, &hmp->rb_bufs_root, buffer)) {
		hammer_rel_volume(volume, 0);
		buffer->io.volume = NULL;			/* safety */
		if (hammer_rel_interlock(&buffer->io.lock, 1))	/* safety */
			hammer_rel_interlock_done(&buffer->io.lock, 1);
		--hammer_count_buffers;
		kfree(buffer, hmp->m_misc);
		goto again;
	}
	atomic_add_int(&hammer_count_refedbufs, 1);
found:

	/*
	 * The buffer is referenced and interlocked.  Load the buffer
	 * if necessary.  hammer_load_buffer() deals with the interlock
	 * and, if an error is returned, also deals with the ref.
	 */
	if (buffer->ondisk == NULL) {
		*errorp = hammer_load_buffer(buffer, isnew);
		if (*errorp)
			buffer = NULL;
	} else {
		hammer_io_advance(&buffer->io);
		hammer_ref_interlock_done(&buffer->io.lock);
		*errorp = 0;
	}
	return(buffer);
}
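
/*
 * NOTE (summary): hammer_get_buffer() returns a referenced buffer with
 * buffer->ondisk valid, or NULL with *errorp set.  Callers drop the ref
 * with hammer_rel_buffer(buffer, 0) when done; the hammer_bread() and
 * hammer_bnew() wrappers further below follow exactly this pattern.
 */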

/*
 * This is used by the direct-read code to deal with large-data buffers
 * created by the reblocker and mirror-write code.  The direct-read code
 * bypasses the HAMMER buffer subsystem and so any aliased dirty or
 * write-running hammer buffers must be fully synced to disk before we
 * can issue the direct-read.
 *
 * This code path is not considered critical as only the reblocker and
 * mirror-write code will create large-data buffers via the HAMMER buffer
 * subsystem.  They do that because they operate at the B-Tree level and
 * do not access the vnode/inode structures.
 */
void
hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
{
	hammer_buffer_t buffer;
	int error;

	KKASSERT((base_offset & HAMMER_OFF_ZONE_MASK) ==
		 HAMMER_ZONE_LARGE_DATA);

	while (bytes > 0) {
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   base_offset);
		if (buffer && (buffer->io.modified || buffer->io.running)) {
			error = hammer_ref_buffer(buffer);
			if (error == 0) {
				hammer_io_wait(&buffer->io);
				if (buffer->io.modified) {
					hammer_io_write_interlock(&buffer->io);
					hammer_io_flush(&buffer->io, 0);
					hammer_io_done_interlock(&buffer->io);
					hammer_io_wait(&buffer->io);
				}
				hammer_rel_buffer(buffer, 0);
			}
		}
		base_offset += HAMMER_BUFSIZE;
		bytes -= HAMMER_BUFSIZE;
	}
}

/*
 * Destroy all buffers covering the specified zoneX offset range.  This
 * is called when the related blockmap layer2 entry is freed or when
 * a direct write bypasses our buffer/buffer-cache subsystem.
 *
 * The buffers may be referenced by the caller itself.  Setting reclaim
 * will cause the buffer to be destroyed when its ref count reaches zero.
 *
 * Return 0 on success, EAGAIN if some buffers could not be destroyed due
 * to additional references held by other threads, or some other (typically
 * fatal) error.
 */
int
hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
		   hammer_off_t zone2_offset, int bytes,
		   int report_conflicts)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	int vol_no;
	int error;
	int ret_error;

	vol_no = HAMMER_VOL_DECODE(zone2_offset);
	volume = hammer_get_volume(hmp, vol_no, &ret_error);
	KKASSERT(ret_error == 0);

	while (bytes > 0) {
		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
				   base_offset);
		if (buffer) {
			error = hammer_ref_buffer(buffer);
			if (hammer_debug_general & 0x20000) {
				hkprintf("delbufr %016jx rerr=%d 1ref=%d\n",
					(intmax_t)buffer->zoneX_offset,
					error,
					hammer_oneref(&buffer->io.lock));
			}
			if (error == 0 && !hammer_oneref(&buffer->io.lock)) {
				error = EAGAIN;
				hammer_rel_buffer(buffer, 0);
			}
			if (error == 0) {
				KKASSERT(buffer->zone2_offset == zone2_offset);
				hammer_io_clear_modify(&buffer->io, 1);
				buffer->io.reclaim = 1;
				buffer->io.waitdep = 1;
				KKASSERT(buffer->io.volume == volume);
				hammer_rel_buffer(buffer, 0);
			}
		} else {
			error = hammer_io_inval(volume, zone2_offset);
		}
		if (error) {
			ret_error = error;
			if (report_conflicts ||
			    (hammer_debug_general & 0x8000)) {
				krateprintf(&hmp->kdiag,
					"hammer_del_buffers: unable to "
					"invalidate %016llx buffer=%p "
					"rep=%d lkrefs=%08x\n",
					(long long)base_offset,
					buffer, report_conflicts,
					(buffer ? buffer->io.lock.refs : -1));
			}
		}
		base_offset += HAMMER_BUFSIZE;
		zone2_offset += HAMMER_BUFSIZE;
		bytes -= HAMMER_BUFSIZE;
	}
	hammer_rel_volume(volume, 0);
	return (ret_error);
}

/*
 * Given a referenced and interlocked buffer load/validate the data.
 *
 * The buffer interlock will be released on return.  If an error is
 * returned the buffer reference will also be released (and the buffer
 * pointer will thus be stale).
 */
static int
hammer_load_buffer(hammer_buffer_t buffer, int isnew)
{
	hammer_volume_t volume;
	int error;

	/*
	 * Load the buffer's on-disk info
	 */
	volume = buffer->io.volume;

	if (hammer_debug_io & 0x0004) {
		hdkprintf("load_buffer %016llx %016llx isnew=%d od=%p\n",
			(long long)buffer->zoneX_offset,
			(long long)buffer->zone2_offset,
			isnew, buffer->ondisk);
	}

	if (buffer->ondisk == NULL) {
		/*
		 * Issue the read or generate a new buffer.  When reading
		 * the limit argument controls any read-ahead clustering
		 * hammer_io_read() is allowed to do.
		 *
		 * We cannot read-ahead in the large-data zone and we cannot
		 * cross a big-block boundary as the next big-block might
		 * use a different buffer size.
		 */
		if (isnew) {
			error = hammer_io_new(volume->devvp, &buffer->io);
		} else if ((buffer->zoneX_offset & HAMMER_OFF_ZONE_MASK) ==
			   HAMMER_ZONE_LARGE_DATA) {
			error = hammer_io_read(volume->devvp, &buffer->io,
					       buffer->io.bytes);
		} else {
			hammer_off_t limit;

			limit = (buffer->zone2_offset +
				 HAMMER_BIGBLOCK_MASK64) &
				~HAMMER_BIGBLOCK_MASK64;
			limit -= buffer->zone2_offset;
			error = hammer_io_read(volume->devvp, &buffer->io,
					       limit);
		}
		if (error == 0)
			buffer->ondisk = (void *)buffer->io.bp->b_data;
	} else if (isnew) {
		error = hammer_io_new(volume->devvp, &buffer->io);
	} else {
		error = 0;
	}
	if (error == 0) {
		hammer_io_advance(&buffer->io);
		hammer_ref_interlock_done(&buffer->io.lock);
	} else {
		hammer_rel_buffer(buffer, 1);
	}
	return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 * This routine is only called during unmount or when a volume is
 * removed.
 *
 * If data != NULL, it specifies a volume whose buffers should
 * be unloaded.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data)
{
	struct hammer_volume *volume = (struct hammer_volume *) data;

	/*
	 * If volume != NULL we are only interested in unloading buffers
	 * associated with a particular volume.
	 */
	if (volume != NULL && volume != buffer->io.volume)
		return 0;

	/*
	 * Clean up the persistent ref ioerror might have on the buffer
	 * and acquire a ref.  Expect a 0->1 transition.
	 */
	if (buffer->io.ioerror) {
		hammer_io_clear_error_noassert(&buffer->io);
		atomic_add_int(&hammer_count_refedbufs, -1);
	}
	hammer_ref_interlock_true(&buffer->io.lock);
	atomic_add_int(&hammer_count_refedbufs, 1);

	/*
	 * We must not flush a dirty buffer to disk on umount.  It should
	 * have already been dealt with by the flusher, or we may be in
	 * catastrophic failure.
	 *
	 * We must set waitdep to ensure that a running buffer is waited
	 * on and released prior to us trying to unload the volume.
	 */
	hammer_io_clear_modify(&buffer->io, 1);
	hammer_flush_buffer_nodes(buffer);
	buffer->io.waitdep = 1;
	hammer_rel_buffer(buffer, 1);
	return(0);
}

/*
 * Reference a buffer that is either already referenced or via a specially
 * handled pointer (aka cursor->buffer).
 */
int
hammer_ref_buffer(hammer_buffer_t buffer)
{
	hammer_mount_t hmp;
	int error;
	int locked;

	/*
	 * Acquire a ref, plus the buffer will be interlocked on the
	 * 0->1 transition.
	 */
	locked = hammer_ref_interlock(&buffer->io.lock);
	hmp = buffer->io.hmp;

	/*
	 * At this point a biodone() will not touch the buffer other than
	 * incidental bits.  However, lose_root can be modified via
	 * a biodone() interrupt.
	 *
	 * No longer loose.  lose_root requires the io_token.
	 */
	if (buffer->io.mod_root == &hmp->lose_root) {
		lwkt_gettoken(&hmp->io_token);
		if (buffer->io.mod_root == &hmp->lose_root) {
			RB_REMOVE(hammer_mod_rb_tree,
				  buffer->io.mod_root, &buffer->io);
			buffer->io.mod_root = NULL;
		}
		lwkt_reltoken(&hmp->io_token);
	}

	if (locked) {
		atomic_add_int(&hammer_count_refedbufs, 1);
		error = hammer_load_buffer(buffer, 0);
		/* NOTE: on error the buffer pointer is stale */
	} else {
		error = 0;
	}
	return(error);
}

/*
 * Release a reference on the buffer.  On the 1->0 transition the
 * underlying IO will be released but the data reference is left
 * cached.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int locked)
{
	hammer_volume_t volume;
	hammer_mount_t hmp;
	struct buf *bp = NULL;
	int freeme = 0;

	hmp = buffer->io.hmp;

	if (hammer_rel_interlock(&buffer->io.lock, locked) == 0)
		return;

	/*
	 * hammer_count_refedbufs accounting.  Decrement if we are in
	 * the error path or if CHECK is clear.
	 *
	 * If we are not in the error path and CHECK is set the caller
	 * probably just did a hammer_ref() and didn't account for it,
	 * so we don't account for the loss here.
	 */
	if (locked || (buffer->io.lock.refs & HAMMER_REFS_CHECK) == 0)
		atomic_add_int(&hammer_count_refedbufs, -1);

	/*
	 * If the caller locked us, or the normal release transitioned
	 * from 1->0 (and acquired the lock), attempt to release the
	 * io.  If the caller locked us we tell hammer_io_release()
	 * to flush (which would be the unload or failure path).
	 */
	bp = hammer_io_release(&buffer->io, locked);

	/*
	 * If the buffer has no bp association and no refs we can destroy
	 * it.
	 *
	 * NOTE: It is impossible for any associated B-Tree nodes to have
	 * refs if the buffer has no additional refs.
	 */
	if (buffer->io.bp == NULL && hammer_norefs(&buffer->io.lock)) {
		RB_REMOVE(hammer_buf_rb_tree,
			  &buffer->io.hmp->rb_bufs_root,
			  buffer);
		volume = buffer->io.volume;
		buffer->io.volume = NULL;	/* sanity */
		hammer_rel_volume(volume, 0);
		hammer_io_clear_modlist(&buffer->io);
		hammer_flush_buffer_nodes(buffer);
		KKASSERT(TAILQ_EMPTY(&buffer->clist));
		freeme = 1;
	}

	/*
	 * Cleanup
	 */
	hammer_rel_interlock_done(&buffer->io.lock, locked);
	if (bp)
		brelse(bp);
	if (freeme) {
		--hammer_count_buffers;
		kfree(buffer, hmp->m_misc);
	}
}

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * buf_offset is a conglomeration of the volume number and vol_buf_beg
 * relative buffer offset.  It must also have bit 55 set to be valid.
 * (see hammer_off_t in hammer_disk.h).
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 *
 * NOTE: The buffer is indexed via its zoneX_offset but we allow the
 * passed cached *bufferp to match against either zoneX or zone2.
 */
static __inline
void *
_hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
	      int *errorp, struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 0, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}

void *
hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset,
	     int *errorp, struct hammer_buffer **bufferp)
{
	return(_hammer_bread(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
}

void *
hammer_bread_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
		 int *errorp, struct hammer_buffer **bufferp)
{
	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
	return(_hammer_bread(hmp, buf_offset, bytes, errorp, bufferp));
}
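
/*
 * Usage sketch (illustrative only): callers keep a cached buffer pointer
 * across consecutive hammer_bread() calls so that accesses falling into
 * the same HAMMER_BUFSIZE-sized buffer avoid a lookup, and release it
 * when finished:
 *
 *	hammer_buffer_t buffer = NULL;
 *	int error;
 *	void *data;
 *
 *	data = hammer_bread(hmp, buf_offset, &error, &buffer);
 *	if (data) {
 *		... read from data ...
 *	}
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 */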

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * No disk read operation occurs.  The result buffer may contain garbage.
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 *
 * This function marks the buffer dirty but does not increment its
 * modify_refs count.
 */
static __inline
void *
_hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
	     int *errorp, struct hammer_buffer **bufferp)
{
	hammer_buffer_t buffer;
	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);

	buffer = *bufferp;
	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
			       buffer->zoneX_offset != buf_offset)) {
		if (buffer)
			hammer_rel_buffer(buffer, 0);
		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 1, errorp);
		*bufferp = buffer;
	} else {
		*errorp = 0;
	}

	/*
	 * Return a pointer to the buffer data.
	 */
	if (buffer == NULL)
		return(NULL);
	else
		return((char *)buffer->ondisk + xoff);
}

void *
hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset,
	    int *errorp, struct hammer_buffer **bufferp)
{
	return(_hammer_bnew(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
}

void *
hammer_bnew_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
		int *errorp, struct hammer_buffer **bufferp)
{
	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
	return(_hammer_bnew(hmp, buf_offset, bytes, errorp, bufferp));
}

/************************************************************************
 *				NODES					*
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer, and will only reference the buffer while
 * the node itself is referenced.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_nodes long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * the B-Tree.
 */
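
/*
 * Usage sketch (illustrative only): node access mirrors the buffer API;
 * a successful hammer_get_node() must be paired with hammer_rel_node():
 *
 *	int error;
 *	hammer_node_t node;
 *
 *	node = hammer_get_node(trans, node_offset, 0, &error);
 *	if (node) {
 *		... examine node->ondisk ...
 *		hammer_rel_node(node);
 *	}
 */
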
hammer_node_t
hammer_get_node(hammer_transaction_t trans, hammer_off_t node_offset,
		int isnew, int *errorp)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_node_t node;
	int doload;

	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);

	/*
	 * Locate the structure, allocating one if necessary.
	 */
again:
	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
	if (node == NULL) {
		++hammer_count_nodes;
		node = kmalloc(sizeof(*node), hmp->m_misc, M_WAITOK|M_ZERO|M_USE_RESERVE);
		node->node_offset = node_offset;
		node->hmp = hmp;
		TAILQ_INIT(&node->cursor_list);
		TAILQ_INIT(&node->cache_list);
		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
			--hammer_count_nodes;
			kfree(node, hmp->m_misc);
			goto again;
		}
		doload = hammer_ref_interlock_true(&node->lock);
	} else {
		doload = hammer_ref_interlock(&node->lock);
	}
	if (doload) {
		*errorp = hammer_load_node(trans, node, isnew);
		trans->flags |= HAMMER_TRANSF_DIDIO;
		if (*errorp)
			node = NULL;
	} else {
		KKASSERT(node->ondisk);
		*errorp = 0;
		hammer_io_advance(&node->buffer->io);
	}
	return(node);
}

/*
 * Reference an already-referenced node.  0->1 transitions should assert
 * so we do not have to deal with hammer_ref() setting CHECK.
 */
void
hammer_ref_node(hammer_node_t node)
{
	KKASSERT(hammer_isactive(&node->lock) && node->ondisk != NULL);
	hammer_ref(&node->lock);
}

/*
 * Load a node's on-disk data reference.  Called with the node referenced
 * and interlocked.
 *
 * On return the node interlock will be unlocked.  If a non-zero error code
 * is returned the node will also be dereferenced (and the caller's pointer
 * will be stale).
 */
static int
hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew)
{
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int error;

	error = 0;
	if (node->ondisk == NULL) {
		/*
		 * This is a little confusing but the gist is that
		 * node->buffer determines whether the node is on
		 * the buffer's clist and node->ondisk determines
		 * whether the buffer is referenced.
		 *
		 * We could be racing a buffer release, in which case
		 * node->buffer may become NULL while we are blocked
		 * referencing the buffer.
		 */
		if ((buffer = node->buffer) != NULL) {
			error = hammer_ref_buffer(buffer);
			if (error == 0 && node->buffer == NULL) {
				TAILQ_INSERT_TAIL(&buffer->clist, node, entry);
				node->buffer = buffer;
			}
		} else {
			buf_offset = node->node_offset & ~HAMMER_BUFMASK64;
			buffer = hammer_get_buffer(node->hmp, buf_offset,
						   HAMMER_BUFSIZE, 0, &error);
			if (buffer) {
				KKASSERT(error == 0);
				TAILQ_INSERT_TAIL(&buffer->clist, node, entry);
				node->buffer = buffer;
			}
		}
		if (error)
			goto failed;
		node->ondisk = (void *)((char *)buffer->ondisk +
					(node->node_offset & HAMMER_BUFMASK));

		/*
		 * Check CRC.  NOTE: Neither flag is set and the CRC is not
		 * generated on new B-Tree nodes.
		 */
		if (isnew == 0 &&
		    (node->flags & HAMMER_NODE_CRCANY) == 0) {
			if (hammer_crc_test_btree(node->ondisk) == 0) {
				hdkprintf("CRC B-TREE NODE @ %016llx/%lu FAILED\n",
					(long long)node->node_offset,
					sizeof(*node->ondisk));
				if (hammer_debug_critical)
					Debugger("CRC FAILED: B-TREE NODE");
				node->flags |= HAMMER_NODE_CRCBAD;
			} else {
				node->flags |= HAMMER_NODE_CRCGOOD;
			}
		}
	}
	if (node->flags & HAMMER_NODE_CRCBAD) {
		if (trans->flags & HAMMER_TRANSF_CRCDOM)
			error = EDOM;
		else
			error = EIO;
	}
failed:
	if (error) {
		_hammer_rel_node(node, 1);
	} else {
		hammer_ref_interlock_done(&node->lock);
	}
	return (error);
}

/*
 * Safely reference a node, interlock against flushes via the IO subsystem.
 */
hammer_node_t
hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache,
		     int *errorp)
{
	hammer_node_t node;
	int doload;

	node = cache->node;
	if (node != NULL) {
		doload = hammer_ref_interlock(&node->lock);
		if (doload) {
			*errorp = hammer_load_node(trans, node, 0);
			if (*errorp)
				node = NULL;
		} else {
			KKASSERT(node->ondisk);
			if (node->flags & HAMMER_NODE_CRCBAD) {
				if (trans->flags & HAMMER_TRANSF_CRCDOM)
					*errorp = EDOM;
				else
					*errorp = EIO;
				_hammer_rel_node(node, 0);
				node = NULL;
			} else {
				*errorp = 0;
			}
		}
	} else {
		*errorp = ENOENT;
	}
	return(node);
}

/*
 * Release a hammer_node.  On the last release the node dereferences
 * its underlying buffer and may or may not be destroyed.
 *
 * If locked is non-zero the passed node has been interlocked by the
 * caller and we are in the failure/unload path, otherwise it has not and
 * we are doing a normal release.
 *
 * This function will dispose of the interlock and the reference.
 * On return the node pointer is stale.
 */
void
_hammer_rel_node(hammer_node_t node, int locked)
{
	hammer_buffer_t buffer;

	/*
	 * Deref the node.  If this isn't the 1->0 transition we're basically
	 * done.  If locked is non-zero this function will just deref the
	 * locked node and return 1, otherwise it will deref the locked
	 * node and either lock and return 1 on the 1->0 transition or
	 * not lock and return 0.
	 */
	if (hammer_rel_interlock(&node->lock, locked) == 0)
		return;

	/*
	 * Either locked was non-zero and we are interlocked, or the
	 * hammer_rel_interlock() call returned non-zero and we are
	 * interlocked.
	 *
	 * The ref-count must still be decremented if locked != 0 so
	 * the cleanup required still varies a bit.
	 *
	 * hammer_flush_node() when called with 1 or 2 will dispose of
	 * the lock and possible ref-count.
	 */
	if (node->ondisk == NULL) {
		hammer_flush_node(node, locked + 1);
		/* node is stale now */
		return;
	}

	/*
	 * Do not disassociate the node from the buffer if it represents
	 * a modified B-Tree node that still needs its crc to be generated.
	 */
	if (node->flags & HAMMER_NODE_NEEDSCRC) {
		hammer_rel_interlock_done(&node->lock, locked);
		return;
	}

	/*
	 * Do final cleanups and then either destroy the node or leave it
	 * passively cached.  The buffer reference is removed regardless.
	 */
	buffer = node->buffer;
	node->ondisk = NULL;

	if ((node->flags & HAMMER_NODE_FLUSH) == 0) {
		/*
		 * Normal release.
		 */
		hammer_rel_interlock_done(&node->lock, locked);
	} else {
		/*
		 * Destroy the node.
		 */
		hammer_flush_node(node, locked + 1);
		/* node is stale */

	}
	hammer_rel_buffer(buffer, 0);
}

void
hammer_rel_node(hammer_node_t node)
{
	_hammer_rel_node(node, 0);
}

/*
 * Free space on-media associated with a B-Tree node.
 */
void
hammer_delete_node(hammer_transaction_t trans, hammer_node_t node)
{
	KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
	node->flags |= HAMMER_NODE_DELETED;
	hammer_blockmap_free(trans, node->node_offset, sizeof(*node->ondisk));
}

/*
 * Passively cache a referenced hammer_node.  The caller may release
 * the node on return.
 */
void
hammer_cache_node(hammer_node_cache_t cache, hammer_node_t node)
{
	/*
	 * If the node doesn't exist, or is being deleted, don't cache it!
	 *
	 * The node can only ever be NULL in the I/O failure path.
	 */
	if (node == NULL || (node->flags & HAMMER_NODE_DELETED))
		return;
	if (cache->node == node)
		return;
	while (cache->node)
		hammer_uncache_node(cache);
	if (node->flags & HAMMER_NODE_DELETED)
		return;
	cache->node = node;
	TAILQ_INSERT_TAIL(&node->cache_list, cache, entry);
}

void
hammer_uncache_node(hammer_node_cache_t cache)
{
	hammer_node_t node;

	if ((node = cache->node) != NULL) {
		TAILQ_REMOVE(&node->cache_list, cache, entry);
		cache->node = NULL;
		if (TAILQ_EMPTY(&node->cache_list))
			hammer_flush_node(node, 0);
	}
}

/*
 * Remove a node's cache references and destroy the node if it has no
 * other references or backing store.
 *
 *	locked == 0	Normal unlocked operation
 *	locked == 1	Call hammer_rel_interlock_done(..., 0);
 *	locked == 2	Call hammer_rel_interlock_done(..., 1);
 *
 * XXX for now this isn't even close to being MPSAFE so the refs check
 *     is sufficient.
 */
void
hammer_flush_node(hammer_node_t node, int locked)
{
	hammer_node_cache_t cache;
	hammer_buffer_t buffer;
	hammer_mount_t hmp = node->hmp;
	int dofree;

	while ((cache = TAILQ_FIRST(&node->cache_list)) != NULL) {
		TAILQ_REMOVE(&node->cache_list, cache, entry);
		cache->node = NULL;
	}

	/*
	 * NOTE: refs is predisposed if another thread is blocking and
	 * will be larger than 0 in that case.  We aren't MPSAFE here.
	 */
	if (node->ondisk == NULL && hammer_norefs(&node->lock)) {
		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);
		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
		if ((buffer = node->buffer) != NULL) {
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
		dofree = 1;
	} else {
		dofree = 0;
	}

	/*
	 * Deal with the interlock if locked == 1 or locked == 2.
	 */
	if (locked)
		hammer_rel_interlock_done(&node->lock, locked - 1);

	/*
	 * Destroy if requested
	 */
	if (dofree) {
		--hammer_count_nodes;
		kfree(node, hmp->m_misc);
	}
}

/*
 * Flush passively cached B-Tree nodes associated with this buffer.
 * This is only called when the buffer is about to be destroyed, so
 * none of the nodes should have any references.  The buffer is locked.
 *
 * We may be interlocked with the buffer.
 */
void
hammer_flush_buffer_nodes(hammer_buffer_t buffer)
{
	hammer_node_t node;

	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
		KKASSERT(node->ondisk == NULL);
		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);

		if (hammer_try_interlock_norefs(&node->lock)) {
			hammer_ref(&node->lock);
			node->flags |= HAMMER_NODE_FLUSH;
			_hammer_rel_node(node, 1);
		} else {
			KKASSERT(node->buffer != NULL);
			buffer = node->buffer;
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
	}
}


/************************************************************************
 *				ALLOCATORS				*
 ************************************************************************/

/*
 * Allocate a B-Tree node.
 */
hammer_node_t
hammer_alloc_btree(hammer_transaction_t trans, hammer_off_t hint, int *errorp)
{
	hammer_buffer_t buffer = NULL;
	hammer_node_t node = NULL;
	hammer_off_t node_offset;

	node_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_BTREE_INDEX,
					    sizeof(struct hammer_node_ondisk),
					    hint, errorp);
	if (*errorp == 0) {
		node = hammer_get_node(trans, node_offset, 1, errorp);
		hammer_modify_node_noundo(trans, node);
		bzero(node->ondisk, sizeof(*node->ondisk));
		hammer_modify_node_done(node);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
	return(node);
}
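
/*
 * NOTE (worked example; the arithmetic assumes HAMMER_BUFSIZE is 16KB):
 * in hammer_alloc_data() below, allocations landing in the large-data
 * zone are rounded up to a HAMMER_BUFSIZE multiple via
 * (data_len + HAMMER_BUFMASK) & ~HAMMER_BUFMASK, so e.g. a 20000-byte
 * record would reserve 32768 bytes.
 */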

/*
 * Allocate data.  If the address of a data buffer is supplied then
 * any prior non-NULL *data_bufferp will be released and *data_bufferp
 * will be set to the related buffer.  The caller must release it when
 * finally done.  The initial *data_bufferp should be set to NULL by
 * the caller.
 *
 * The caller is responsible for making hammer_modify*() calls on the
 * *data_bufferp.
 */
void *
hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
		  u_int16_t rec_type, hammer_off_t *data_offsetp,
		  struct hammer_buffer **data_bufferp,
		  hammer_off_t hint, int *errorp)
{
	void *data;
	int zone;

	/*
	 * Allocate data
	 */
	if (data_len) {
		switch(rec_type) {
		case HAMMER_RECTYPE_INODE:
		case HAMMER_RECTYPE_DIRENTRY:
		case HAMMER_RECTYPE_EXT:
		case HAMMER_RECTYPE_FIX:
		case HAMMER_RECTYPE_PFS:
		case HAMMER_RECTYPE_SNAPSHOT:
		case HAMMER_RECTYPE_CONFIG:
			zone = HAMMER_ZONE_META_INDEX;
			break;
		case HAMMER_RECTYPE_DATA:
		case HAMMER_RECTYPE_DB:
			/*
			 * Only mirror-write comes here.
			 */
			zone = hammer_data_zone_index(data_len);
			if (zone == HAMMER_ZONE_LARGE_DATA_INDEX) {
				/* round up */
				data_len = (data_len + HAMMER_BUFMASK) &
					   ~HAMMER_BUFMASK;
			}
			break;
		default:
			hpanic("rec_type %04x unknown", rec_type);
			zone = 0;	/* NOT REACHED */
			break;
		}
		*data_offsetp = hammer_blockmap_alloc(trans, zone, data_len,
						      hint, errorp);
	} else {
		*data_offsetp = 0;
	}
	if (*errorp == 0 && data_bufferp) {
		if (data_len) {
			data = hammer_bread_ext(trans->hmp, *data_offsetp,
						data_len, errorp, data_bufferp);
		} else {
			data = NULL;
		}
	} else {
		data = NULL;
	}
	return(data);
}

/*
 * Sync dirty buffers to the media and clean-up any loose ends.
 *
 * These functions do not start the flusher going, they simply
 * queue everything up to the flusher.
 */
static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

int
hammer_queue_inodes_flusher(hammer_mount_t hmp, int waitfor)
{
	struct hammer_sync_info info;

	info.error = 0;
	info.waitfor = waitfor;
	if (waitfor == MNT_WAIT) {
		vsyncscan(hmp->mp, VMSC_GETVP | VMSC_ONEPASS,
			  hammer_sync_scan2, &info);
	} else {
		vsyncscan(hmp->mp, VMSC_GETVP | VMSC_ONEPASS | VMSC_NOWAIT,
			  hammer_sync_scan2, &info);
	}
	return(info.error);
}

/*
 * Filesystem sync.  If doing a synchronous sync make a second pass on
 * the vnodes in case any were already flushing during the first pass,
 * and activate the flusher twice (the second time brings the UNDO FIFO's
 * start position up to the end position after the first call).
 *
 * If doing a lazy sync make just one pass on the vnode list, ignoring
 * any new vnodes added to the list while the sync is in progress.
 */
int
hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
{
	struct hammer_sync_info info;
	int flags;

	flags = VMSC_GETVP;
	if (waitfor & MNT_LAZY)
		flags |= VMSC_ONEPASS;

	info.error = 0;
	info.waitfor = MNT_NOWAIT;
	vsyncscan(hmp->mp, flags | VMSC_NOWAIT, hammer_sync_scan2, &info);

	if (info.error == 0 && (waitfor & MNT_WAIT)) {
		info.waitfor = waitfor;
		vsyncscan(hmp->mp, flags, hammer_sync_scan2, &info);
	}
	if (waitfor == MNT_WAIT) {
		hammer_flusher_sync(hmp);
		hammer_flusher_sync(hmp);
	} else {
		hammer_flusher_async(hmp, NULL);
		hammer_flusher_async(hmp, NULL);
	}
	return(info.error);
}

static int
hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	struct hammer_sync_info *info = data;
	struct hammer_inode *ip;
	int error;

	ip = VTOI(vp);
	if (ip == NULL)
		return(0);
	if (vp->v_type == VNON || vp->v_type == VBAD) {
		vclrisdirty(vp);
		return(0);
	}
	if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	    RB_EMPTY(&vp->v_rbdirty_tree)) {
		vclrisdirty(vp);
		return(0);
	}
	error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
	if (error)
		info->error = error;
	return(0);
}