1 /* 2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem handles direct and indirect block searches, recursions, 37 * creation, and deletion. 
Chains of blockrefs are tracked and modifications 38 * are flagged for propagation... eventually all the way back to the volume 39 * header. Any chain except the volume header can be flushed to disk at 40 * any time... none of it matters until the volume header is dealt with 41 * (which is not here, see hammer2_vfsops.c for the volume header disk 42 * sequencing). 43 * 44 * Serialized flushes are not handled here, see hammer2_flush.c. This module 45 * can essentially work on the current version of data, which can be in memory 46 * as well as on-disk due to the above. However, we are responsible for 47 * making a copy of the state when a modified chain is part of a flush 48 * and we attempt to modify it again before the flush gets to it. In that 49 * situation we create an allocated copy of the state that the flush can 50 * deal with. If a chain undergoing deletion is part of a flush it is 51 * marked DELETED and its bref index is kept intact for the flush, but the 52 * chain is thereafter ignored by this module's because it is no longer 53 * current. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/uuid.h> 61 62 #include "hammer2.h" 63 64 static int hammer2_indirect_optimize; /* XXX SYSCTL */ 65 66 static hammer2_chain_t *hammer2_chain_create_indirect( 67 hammer2_mount_t *hmp, hammer2_chain_t *parent, 68 hammer2_key_t key, int keybits, 69 int *errorp); 70 71 /* 72 * We use a red-black tree to guarantee safe lookups under shared locks. 73 */ 74 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 75 76 int 77 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 78 { 79 return(chain2->index - chain1->index); 80 } 81 82 /* 83 * Recursively mark the parent chain elements so flushes can find 84 * modified elements. 
 * Stop when we hit a chain already flagged
 * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
 * in chain itself.
 *
 * SUBMODIFIED is not set on the chain passed in.
 *
 * The chain->cst.spin lock can be held to stabilize the chain->parent
 * pointer.  The first parent is stabilized by virtue of chain being
 * fully locked.
 *
 * NOTE(review): the SUBMODIFIED short-circuit is only tested on the
 *		 immediate parent; once the loop is entered it walks all
 *		 the way to the root unconditionally — confirm intended.
 */
void
hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	parent = chain->parent;
	if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
		/*
		 * Walk upward flagging each ancestor.  Spinlocks are
		 * chained bottom-up: the grandparent's spinlock is taken
		 * before the parent's is released so ->parent stays
		 * stable at each step.
		 */
		spin_lock(&parent->cst.spin);
		for (;;) {
			atomic_set_int(&parent->flags,
				       HAMMER2_CHAIN_SUBMODIFIED);
			if ((chain = parent->parent) == NULL)
				break;
			spin_lock(&chain->cst.spin);	/* upward interlock */
			spin_unlock(&parent->cst.spin);
			parent = chain;
		}
		spin_unlock(&parent->cst.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  The chain element is locked exclusively and refs is set to 1.
 * Media data (data) and meta-structure (u) pointers are left NULL.
 *
 * This essentially allocates a system memory structure representing one
 * of the media structure types, including inodes.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.  Only the listed
	 * bref types may be instantiated here; the volume header is
	 * embedded in the mount and must never go through this path.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
		/* fallthrough into default is unreachable (panic) */
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Only set bref_flush if the bref has a real media offset, otherwise
	 * the caller has to wait for the chain to be modified/block-allocated
	 * before a blockref can be synchronized with its (future) parent.
	 */
	chain->bref = *bref;
	if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
		chain->bref_flush = *bref;
	chain->index = -1;		/* not yet assigned to a parent slot */
	chain->refs = 1;
	chain->bytes = bytes;
	ccms_cst_init(&chain->cst, chain);
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	return (chain);
}

/*
 * Deallocate a chain (the step before freeing it).  Remove the chain from
 * its parent's tree.
 *
 * Caller must hold the parent and the chain exclusively locked, and
 * chain->refs must be 0.
 *
 * This function unlocks, removes, and destroys chain, and will recursively
 * destroy any sub-chains under chain (whose refs must also be 0 at this
 * point).
 *
 * parent can be NULL.
 */
static void
hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *child;

	KKASSERT(chain->refs == 0);
	KKASSERT(chain->flushing == 0);
	KKASSERT((chain->flags &
		  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * If the sub-tree is not empty all the elements on it must have
	 * 0 refs and be deallocatable.
	 */
	while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
		ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, child);
	}

	/*
	 * If the DELETED flag is not set the chain must be removed from
	 * its parent's tree.
	 *
	 * WARNING! chain->cst.spin must be held when chain->parent is
	 *	    modified, even though we own the full blown lock,
	 *	    to deal with setsubmod and rename races.
	 */
	if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		spin_lock(&chain->cst.spin);	/* shouldn't be needed */
		parent = chain->parent;
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		chain->parent = NULL;
		spin_unlock(&chain->cst.spin);
	}
	hammer2_chain_free(hmp, chain);
}

/*
 * Free a disconnected chain element.  Embedded inode data (if any) is
 * released; all other types must already have a NULL data pointer.
 */
void
hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/* volume data is embedded in the mount, just unhook it */
		chain->data = NULL;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		if (chain->data) {
			kfree(chain->data, hmp->minode);
			chain->data = NULL;
		}
		break;
	default:
		KKASSERT(chain->data == NULL);
		break;
	}

	KKASSERT(chain->bp == NULL);

	ccms_thread_unlock(&chain->cst);
	KKASSERT(chain->cst.count == 0);
	KKASSERT(chain->cst.upgrade == 0);

	kfree(chain, hmp->mchain);
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * The parent chain must be locked shared or exclusive or otherwise be
 * stable and already have a reference.
 */
void
hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		/* NOTE(review): refs is u_int, this assert is always true */
		KKASSERT(chain->refs >= 0);
		cpu_ccfence();
		if (refs == 0) {
			/*
			 * 0 -> 1 transition must bump the refs on the parent
			 * too.  The caller has stabilized the parent.
			 */
			if (atomic_cmpset_int(&chain->refs, 0, 1)) {
				chain = chain->parent;
				KKASSERT(chain == NULL || chain->refs > 0);
			}
			/* retry or continue along the parent chain */
		} else {
			/*
			 * N -> N+1
			 */
			if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
				break;
			/* retry */
		}
	}
}

/*
 * Drop the callers reference to the chain element.  If the ref count
 * reaches zero we attempt to recursively drop the parent.
 *
 * MOVED and MODIFIED elements hold additional references so it should not
 * be possible for the count on a modified element to drop to 0.
 *
 * The chain element must NOT be locked by the caller on the 1->0 transition.
 *
 * The parent might or might not be locked by the caller.  If we are unable
 * to lock the parent on the 1->0 transition the destruction of the chain
 * will be deferred but we still recurse upward and drop the ref on the
 * parent (see the lastdrop() function)
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
					       hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);
		if (refs == 1) {
			/*
			 * (1) lastdrop successfully drops the chain to 0
			 *     refs and may or may not have destroyed it.
			 *     lastdrop will return the parent so we can
			 *     recursively drop the implied ref from the
			 *     1->0 transition.
			 *
			 * (2) lastdrop fails to transition refs from 1 to 0
			 *     and returns the same chain, we retry.
			 */
			chain = hammer2_chain_lastdrop(hmp, chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
				/*
				 * Succeeded, count did not reach zero so
				 * cut out of the loop.
				 */
				break;
			}
			/* retry the same chain */
		}
	}
}

/*
 * Handle SMP races during the last drop.
 * We must obtain a lock on chain->parent to stabilize the last pointer
 * reference to chain (if applicable).  This reference does not have a
 * parallel ref count, that is idle chains in the topology can have a
 * ref count of 0.
 *
 * The 1->0 transition implies a ref on the parent.
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Stabilize chain->parent with the chain cst's spinlock.
	 * (parent can be NULL here).
	 *
	 * cst.spin locks are allowed to be nested bottom-up (the reverse
	 * of the normal top-down for full-blown cst locks), so this also
	 * allows us to attempt to obtain the parent's cst lock non-blocking
	 * (which must acquire the parent's spinlock unconditionally) while
	 * we are still holding the chain's spinlock.
	 */
	spin_lock(&chain->cst.spin);
	parent = chain->parent;

	/*
	 * If chain->flushing is non-zero we cannot deallocate the chain
	 * here.  The flushing field will be serialized for the inline
	 * unlock made by the flusher itself and we don't care about races
	 * in any other situation because the thread lock on the parent
	 * will fail in other situations.
	 *
	 * If we have a non-NULL parent but cannot acquire its thread
	 * lock, we also cannot deallocate the chain.
	 *
	 * In either case we still attempt the 1->0 ref transition; the
	 * chain simply survives (deferred destruction) when it succeeds.
	 */
	if (chain->flushing ||
	    (parent && ccms_thread_lock_nonblock(&parent->cst,
						 CCMS_STATE_EXCLUSIVE))) {
		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
			spin_unlock(&chain->cst.spin);	/* success */
			return(parent);
		} else {
			spin_unlock(&chain->cst.spin);	/* failure */
			return(chain);
		}
	}
	spin_unlock(&chain->cst.spin);

	/*
	 * With the parent now held we control the last pointer reference
	 * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
	 * from 1->0 we raced a refs change and must retry at chain.
	 */
	if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
		/* failure */
		if (parent)
			ccms_thread_unlock(&parent->cst);
		return(chain);
	}

	/*
	 * Ok, we succeeded.  We now own the implied ref on the parent
	 * associated with the 1->0 transition of the child.  It should not
	 * be possible for ANYTHING to access the child now, as we own the
	 * lock on the parent, so we should be able to safely lock the
	 * child and destroy it.
	 */
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
	hammer2_chain_dealloc(hmp, chain);

	/*
	 * We want to return parent with its implied ref to the caller
	 * to recurse and drop the parent.
	 */
	if (parent)
		ccms_thread_unlock(&parent->cst);
	return (parent);
}

/*
 * Ref and lock a chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * Returns 0 on success or an error code if the data could not be acquired.
 * The chain element is locked either way.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 */
int
hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
{
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_off_t peof;
	ccms_state_t ostate;
	size_t boff;
	size_t bbytes;
	int error;
	char *bdata;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	hammer2_chain_ref(hmp, chain);
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?  All early returns below leave
	 * the chain referenced and locked, per the function contract.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&chain->cst);
	if (chain->data) {
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
	peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
	boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
	KKASSERT(pbase != 0);

	/*
	 * The getblk() optimization (no read I/O) can only be used on
	 * newly created elements if the physical block size matches
	 * the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == bbytes) {
		chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
		error = 0;
	} else if (hammer2_cluster_enable) {
		error = cluster_read(hmp->devvp, peof, pbase, bbytes,
				     HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
				     &chain->bp);
	} else {
		error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
	}

	if (error) {
		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
			(intmax_t)pbase, error);
		bqrelse(chain->bp);
		chain->bp = NULL;
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (error);
	}

	/*
	 * Zero the data area if the chain is in the INITIAL-create state.
	 * Mark the buffer for bdwrite().
	 */
	bdata = (char *)chain->bp->b_data + boff;
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		bzero(bdata, chain->bytes);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
#if 0
		/* NOT YET */
		KKASSERT(pbase == 0);
		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
		bcopy(bdata, &hmp->voldata, chain->bytes);
		chain->data = (void *)&hmp->voldata;
		bqrelse(chain->bp);
		chain->bp = NULL;
#endif
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from bp to embedded buffer, do not retain the
		 * device buffer.
		 */
		KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
		chain->data = kmalloc(sizeof(chain->data->ipdata),
				      hmp->minode, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->ipdata, chain->bytes);
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Make sure the bp is not specifically owned by this thread before
	 * restoring to a possibly shared lock, so another hammer2 thread
	 * can release it.
	 */
	if (chain->bp)
		BUF_KERNPROC(chain->bp);
	ccms_thread_lock_restore(&chain->cst, ostate);
	return (0);
}

/*
 * Unlock and deref a chain element.
 *
 * On the last lock release any non-embedded data (chain->bp) will be
 * retired.
 */
void
hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	long *counterp;

	/*
	 * Release the CST lock but with a special 1->0 transition case.
	 *
	 * Returns non-zero if lock references remain.  When zero is
	 * returned the last lock reference is retained and any shared
	 * lock is upgraded to an exclusive lock for final disposition.
	 */
	if (ccms_thread_unlock_zero(&chain->cst)) {
		KKASSERT(chain->refs > 1);
		atomic_add_int(&chain->refs, -1);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_drop(hmp, chain);
		return;
	}

	/*
	 * Statistics.  Dirty buffers are counted either as async-flush
	 * (IOFLUSH) or normal writes, bucketed by bref type.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		++*counterp;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		++*counterp;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
	 *	 is possible.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock(&chain->cst);
	hammer2_chain_drop(hmp, chain);
}

/*
 * Resize the chain's physical storage allocation.  Chains can be resized
 * smaller without reallocating the storage.  Resizing larger will reallocate
 * the storage.
 *
 * Must be passed a locked chain.
 *
 * If you want the resize code to copy the data to the new block then the
 * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
 *
 * If the caller already holds a logical buffer containing the data and
 * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
 * operation will then not copy the data.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.
 *
 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
 */
void
hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp = ip->hmp;
	struct buf *nbp;
	hammer2_off_t pbase;
	size_t obytes;
	size_t nbytes;
	size_t bbytes;
	int boff;
	char *bdata;
	int error;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 * (nradix is the log2 of the new byte size).
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					      HAMMER2_CHAIN_MODIFY_TID);
		hammer2_chain_ref(hmp, chain);
	} else {
		hammer2_freemap_free(hmp, chain->bref.data_off,
				     chain->bref.type);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
						     nbytes);
	chain->bytes = nbytes;
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * Only copy the data if resolved, otherwise the caller is
	 * responsible.
	 */
	if (chain->bp) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DATA);
		KKASSERT(chain != &hmp->vchain);	/* safety */

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (nbytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy out to the new block; zero-fill any growth beyond
		 * the old size.
		 */
		if (nbytes < obytes) {
			bcopy(chain->data, bdata, nbytes);
		} else {
			bcopy(chain->data, bdata, obytes);
			bzero(bdata + obytes, nbytes - obytes);
		}

		/*
		 * NOTE: The INITIAL state of the chain is left intact.
		 *	 We depend on hammer2_chain_modify() to do the
		 *	 right thing.
		 *
		 * NOTE: We set B_NOCACHE to throw away the previous bp and
		 *	 any VM backing store, even if it was dirty.
		 *	 Otherwise we run the risk of a logical/device
		 *	 conflict on reallocation.
		 */
		chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
		brelse(chain->bp);
		chain->bp = nbp;
		chain->data = (void *)bdata;
		hammer2_chain_modify(hmp, chain, 0);
	}

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_parent_setsubmod(hmp, chain);
}

/*
 * Convert a locked chain that was retrieved read-only to read-write.
 *
 * If not already marked modified a new physical block will be allocated
 * and assigned to the bref.
 *
 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 *		     level or the COW operation will not work.
 *
 * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
 *		     run the data through the device buffers.
 */
void
hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
{
	struct buf *nbp;
	int error;
	hammer2_off_t pbase;
	size_t bbytes;
	size_t boff;
	void *bdata;

	/*
	 * Tells flush that modify_tid must be updated, otherwise only
	 * mirror_tid is updated.  This is the default.
	 */
	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);

	/*
	 * If the chain is already marked MODIFIED we can just return.
	 *
	 * However, it is possible that a prior lock/modify sequence
	 * retired the buffer.  During this lock/modify sequence MODIFIED
	 * may still be set but the buffer could wind up clean.  Since
	 * the caller is going to modify the buffer further we have to
	 * be sure that DIRTYBP is set again.
	 */
	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
		    chain->bp == NULL) {
			goto skip1;
		}
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		return;
	}

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	hammer2_chain_ref(hmp, chain);

	/*
	 * We must allocate the copy-on-write block.
	 *
	 * If the data is embedded no other action is required.
	 *
	 * If the data is not embedded we acquire and clear the
	 * new block.  If chain->data is not NULL we then do the
	 * copy-on-write.  chain->data will then be repointed to the new
	 * buffer and the old buffer will be released.
	 *
	 * For newly created elements with no prior allocation we go
	 * through the copy-on-write steps except without the copying part.
	 */
	if (chain != &hmp->vchain) {
		if ((hammer2_debug & 0x0001) &&
		    (chain->bref.data_off & HAMMER2_OFF_MASK)) {
			kprintf("Replace %d\n", chain->bytes);
		}
		chain->bref.data_off =
			hammer2_freemap_alloc(hmp, chain->bref.type,
					      chain->bytes);
		/* XXX failed allocation */
	}

	/*
	 * If data instantiation is optional and the chain has no current
	 * data association (typical for DATA and newly-created INDIRECT
	 * elements), don't instantiate the buffer now.
	 */
	if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
		goto skip2;

skip1:
	/*
	 * Setting the DIRTYBP flag will cause the buffer to be dirtied or
	 * written-out on unlock.  This bit is independent of the MODIFIED
	 * bit because the chain may still need meta-data adjustments done
	 * by virtue of MODIFIED for its parent, and the buffer can be
	 * flushed out (possibly multiple times) by the OS before that.
	 *
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * a zero-fill buffer has been instantiated.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * We currently should never instantiate a device buffer for a
	 * file data chain.  (We definitely can for a freemap chain).
	 */
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);

	/*
	 * Execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->bp == NULL);
		break;
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Perform the copy-on-write operation
		 */
		KKASSERT(chain != &hmp->vchain);	/* safety */
		/*
		 * The device buffer may be larger than the allocation size.
		 */
		if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
			bbytes = HAMMER2_MINIOSIZE;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (chain->bytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy or zero-fill on write depending on whether
		 * chain->data exists or not.
		 */
		if (chain->data) {
			bcopy(chain->data, bdata, chain->bytes);
			KKASSERT(chain->bp != NULL);
		} else {
			bzero(bdata, chain->bytes);
		}
		if (chain->bp) {
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		}
		chain->bp = nbp;
		chain->data = bdata;
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
		hammer2_chain_parent_setsubmod(hmp, chain);
}

/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/* MODIFIED_AUX flags the volume header without a full chain lock */
	hammer2_voldata_lock(hmp);
	atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
	hammer2_voldata_unlock(hmp);
}

/*
 * Locate an in-memory chain.  The parent must be locked.  The in-memory
 * chain is returned or NULL if no in-memory chain is present.
 *
 * NOTE: A chain on-media might exist for this index when NULL is returned.
 */
hammer2_chain_t *
hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
{
	hammer2_chain_t dummy;	/* stack key for the RB lookup */
	hammer2_chain_t *chain;

	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	return (chain);
}

/*
 * Return a locked chain structure with all associated data acquired.
 *
 * Caller must lock the parent on call, the returned child will be locked.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		  int index, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_chain_t dummy;
	int how;
	ccms_state_t ostate;

	/*
	 * Figure out how to lock.  MAYBE can be used to optimize
	 * the initial-create state for indirect blocks.
	 */
	if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
		how = HAMMER2_RESOLVE_NEVER;
	else
		how = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how |= HAMMER2_RESOLVE_SHARED;

	/*
	 * First see if we have a (possibly modified) chain element cached
	 * for this (parent, index).  Acquire the data if necessary.
	 *
	 * If chain->data is non-NULL the chain should already be marked
	 * modified.
	 */
	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		return(chain);
	}

	/*
	 * Upgrade our thread lock and handle any race that may have
	 * occurred.  Leave the lock upgraded for the rest of the get.
	 * We have to do this because we will be modifying the chain
	 * structure.
	 */
	ostate = ccms_thread_lock_upgrade(&parent->cst);
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		/* lost the race; another thread inserted it first */
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		ccms_thread_lock_restore(&parent->cst, ostate);
		return(chain);
	}

	/*
	 * The get function must always succeed, panic if there's no
	 * data to index.
	 */
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		ccms_thread_lock_restore(&parent->cst, ostate);
		panic("hammer2_chain_get: Missing bref(1)");
		/* NOT REACHED */
	}

	/*
	 * Otherwise lookup the bref and issue I/O (switch on the parent)
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &parent->data->ipdata.u.blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		KKASSERT(parent->data != NULL);
		KKASSERT(index >= 0 &&
			 index < parent->bytes / sizeof(hammer2_blockref_t));
		bref = &parent->data->npdata.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.sroot_blockset.blockref[index];
		break;
	default:
		bref = NULL;
		panic("hammer2_chain_get: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	if (bref->type == 0) {
		panic("hammer2_chain_get: Missing bref(2)");
		/* NOT REACHED */
	}

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.
	 *
	 * The locking operation we do later will issue I/O to read it.
	 */
	chain = hammer2_chain_alloc(hmp, bref);

	/*
	 * Link the chain into its parent.  Caller is expected to hold an
	 * exclusive lock on the parent.
	 */
	chain->parent = parent;
	chain->index = index;
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
		panic("hammer2_chain_link: collision");
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	KKASSERT(parent->refs > 0);
	atomic_add_int(&parent->refs, 1);	/* for red-black entry */
	ccms_thread_lock_restore(&parent->cst, ostate);

	/*
	 * Our new chain structure has already been referenced and locked
	 * but the lock code handles the I/O so call it to resolve the data.
	 * Then release one of our two exclusive locks.
	 *
	 * If NOLOCK is set the release will release the one-and-only lock.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_chain_lock(hmp, chain, how);	/* recursive lock */
		hammer2_chain_drop(hmp, chain);		/* excess ref */
	}
	ccms_thread_unlock(&chain->cst);	/* from alloc */

	return (chain);
}

/*
 * Locate any key between key_beg and key_end inclusive.  (*parentp)
 * typically points to an inode but can also point to a related indirect
 * block and this function will recurse upwards and find the inode again.
 *
 * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
 *	     WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
 *	     WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
 *
 * (*parentp) must be exclusively locked and referenced and can be an inode
 * or an existing indirect block within the inode.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.  The new (*parentp) will be locked and referenced and the old
 * will be unlocked and dereferenced (no change if they are both the same).
 *
 * The matching chain will be returned exclusively locked and referenced.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;	/* media blockref array, or NULL */
	hammer2_blockref_t *bref;	/* blockref being considered */
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 */
	parent = *parentp;
	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		/*
		 * Hand-over-hand upward: keep a ref on the old parent so
		 * it cannot be destroyed while we lock the new one.
		 */
		hammer2_chain_ref(hmp, parent);		/* ref old parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
		parent = parent->parent;
						/* lock new parent */
		hammer2_chain_lock(hmp, parent, how_maybe);
		hammer2_chain_drop(hmp, *parentp);	/* drop old parent */
		*parentp = parent;			/* new parent */
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(hmp, parent);
			else
				hammer2_chain_lock(hmp, parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.
	 *
	 * NOTE!  Deleted elements are effectively invisible.  A deleted
	 *	  element covers (makes invisible) any original media
	 *	  data.
	 */
	bref = NULL;
	for (i = 0; i < count; ++i) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			/* in-memory chain shadows the media bref */
			if (tmp->flags & HAMMER2_CHAIN_DELETED)
				continue;
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		if (key_beg == key_end)
			return (NULL);
		return (hammer2_chain_next(hmp, parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_drop(hmp, chain);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_unlock(hmp, chain);
		}
		goto again;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		   hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.  If not NULL
		 * the passed-in chain may or may not be locked, based on
		 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
		 * or a prior next).
		 */
		i = chain->index + 1;
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(hmp, chain);
		else
			hammer2_chain_unlock(hmp, chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		hammer2_chain_t *nparent;

		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			return (NULL);

		/*
		 * Hand-over-hand to the next parent, same protocol as in
		 * hammer2_chain_lookup()'s upward recursion.
		 */
		i = parent->index + 1;
		nparent = parent->parent;
		hammer2_chain_ref(hmp, nparent);	/* ref new parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
					/* lock new parent */
		hammer2_chain_lock(hmp, nparent, how_maybe);
		hammer2_chain_drop(hmp, nparent);	/* drop excess ref */
		*parentp = parent = nparent;
	}

again2:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_next: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	KKASSERT(i <= count);

	/*
	 * Look for the key.  If we are unable to find a match and an exact
	 * match was requested we return NULL.  If a range was requested we
	 * run hammer2_chain_next() to iterate.
	 *
	 * NOTE!  Deleted elements are effectively invisible.  A deleted
	 *	  element covers (makes invisible) any original media
	 *	  data.
	 */
	bref = NULL;
	while (i < count) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			if (tmp->flags & HAMMER2_CHAIN_DELETED) {
				++i;
				continue;
			}
			bref = &tmp->bref;
		} else if (base == NULL || base[i].type == 0) {
			++i;
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
		++i;
	}

	/*
	 * If we couldn't find a match recurse up a parent to continue the
	 * search.
	 */
	if (i == count)
		goto again;

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		chain = NULL;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_drop(hmp, parent);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_unlock(hmp, parent);
		}
		i = 0;
		goto again2;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it RELATIVE TO (PARENT).
 *
 * (parent) is typically either an inode or an indirect block, acquired
 * as a side effect of issuing a prior failed lookup.  parent
 * must be locked and held.  Do not pass the inode chain to this function
 * unless that is the chain returned by the failed lookup.
 *
 * Non-indirect types will automatically allocate indirect blocks as required
 * if the new item does not fit in the current (parent).
 *
 * Indirect types will move a portion of the existing blockref array in
 * (parent) into the new indirect type and then use one of the free slots
 * to emplace the new indirect type.
 *
 * A new locked, referenced chain element is returned of the specified type.
1650 * The element may or may not have a data area associated with it: 1651 * 1652 * VOLUME not allowed here 1653 * INODE kmalloc()'d data area is set up 1654 * INDIRECT not allowed here 1655 * DATA no data area will be set-up (caller is expected 1656 * to have logical buffers, we don't want to alias 1657 * the data onto device buffers!). 1658 * 1659 * Requires an exclusively locked parent. 1660 */ 1661 hammer2_chain_t * 1662 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent, 1663 hammer2_chain_t *chain, 1664 hammer2_key_t key, int keybits, int type, size_t bytes, 1665 int *errorp) 1666 { 1667 hammer2_blockref_t dummy; 1668 hammer2_blockref_t *base; 1669 hammer2_chain_t dummy_chain; 1670 int unlock_parent = 0; 1671 int allocated = 0; 1672 int count; 1673 int i; 1674 1675 KKASSERT(ccms_thread_lock_owned(&parent->cst)); 1676 *errorp = 0; 1677 1678 if (chain == NULL) { 1679 /* 1680 * First allocate media space and construct the dummy bref, 1681 * then allocate the in-memory chain structure. 1682 */ 1683 bzero(&dummy, sizeof(dummy)); 1684 dummy.type = type; 1685 dummy.key = key; 1686 dummy.keybits = keybits; 1687 dummy.data_off = hammer2_allocsize(bytes); 1688 dummy.methods = parent->bref.methods; 1689 chain = hammer2_chain_alloc(hmp, &dummy); 1690 allocated = 1; 1691 1692 /* 1693 * We do NOT set INITIAL here (yet). INITIAL is only 1694 * used for indirect blocks. 1695 * 1696 * Recalculate bytes to reflect the actual media block 1697 * allocation. 
1698 */ 1699 bytes = (hammer2_off_t)1 << 1700 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 1701 chain->bytes = bytes; 1702 1703 switch(type) { 1704 case HAMMER2_BREF_TYPE_VOLUME: 1705 panic("hammer2_chain_create: called with volume type"); 1706 break; 1707 case HAMMER2_BREF_TYPE_INODE: 1708 KKASSERT(bytes == HAMMER2_INODE_BYTES); 1709 chain->data = kmalloc(sizeof(chain->data->ipdata), 1710 hmp->minode, M_WAITOK | M_ZERO); 1711 break; 1712 case HAMMER2_BREF_TYPE_INDIRECT: 1713 panic("hammer2_chain_create: cannot be used to" 1714 "create indirect block"); 1715 break; 1716 case HAMMER2_BREF_TYPE_FREEMAP_ROOT: 1717 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1718 panic("hammer2_chain_create: cannot be used to" 1719 "create freemap root or node"); 1720 break; 1721 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1722 case HAMMER2_BREF_TYPE_DATA: 1723 default: 1724 /* leave chain->data NULL */ 1725 KKASSERT(chain->data == NULL); 1726 break; 1727 } 1728 } else { 1729 /* 1730 * Potentially update the chain's key/keybits. 
1731 */ 1732 chain->bref.key = key; 1733 chain->bref.keybits = keybits; 1734 } 1735 1736 again: 1737 /* 1738 * Locate a free blockref in the parent's array 1739 */ 1740 switch(parent->bref.type) { 1741 case HAMMER2_BREF_TYPE_INODE: 1742 KKASSERT((parent->data->ipdata.op_flags & 1743 HAMMER2_OPFLAG_DIRECTDATA) == 0); 1744 KKASSERT(parent->data != NULL); 1745 base = &parent->data->ipdata.u.blockset.blockref[0]; 1746 count = HAMMER2_SET_COUNT; 1747 break; 1748 case HAMMER2_BREF_TYPE_INDIRECT: 1749 case HAMMER2_BREF_TYPE_FREEMAP_ROOT: 1750 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1751 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1752 base = NULL; 1753 } else { 1754 KKASSERT(parent->data != NULL); 1755 base = &parent->data->npdata.blockref[0]; 1756 } 1757 count = parent->bytes / sizeof(hammer2_blockref_t); 1758 break; 1759 case HAMMER2_BREF_TYPE_VOLUME: 1760 KKASSERT(parent->data != NULL); 1761 base = &hmp->voldata.sroot_blockset.blockref[0]; 1762 count = HAMMER2_SET_COUNT; 1763 break; 1764 default: 1765 panic("hammer2_chain_create: unrecognized blockref type: %d", 1766 parent->bref.type); 1767 count = 0; 1768 break; 1769 } 1770 1771 /* 1772 * Scan for an unallocated bref, also skipping any slots occupied 1773 * by in-memory chain elements that may not yet have been updated 1774 * in the parent's bref array. 1775 */ 1776 bzero(&dummy_chain, sizeof(dummy_chain)); 1777 for (i = 0; i < count; ++i) { 1778 if (base == NULL) { 1779 dummy_chain.index = i; 1780 if (RB_FIND(hammer2_chain_tree, 1781 &parent->rbhead, &dummy_chain) == NULL) { 1782 break; 1783 } 1784 } else if (base[i].type == 0) { 1785 dummy_chain.index = i; 1786 if (RB_FIND(hammer2_chain_tree, 1787 &parent->rbhead, &dummy_chain) == NULL) { 1788 break; 1789 } 1790 } 1791 } 1792 1793 /* 1794 * If no free blockref could be found we must create an indirect 1795 * block and move a number of blockrefs into it. With the parent 1796 * locked we can safely lock each child in order to move it without 1797 * causing a deadlock. 
1798 * 1799 * This may return the new indirect block or the old parent depending 1800 * on where the key falls. NULL is returned on error. The most 1801 * typical error is EAGAIN (flush conflict during chain move). 1802 */ 1803 if (i == count) { 1804 hammer2_chain_t *nparent; 1805 1806 nparent = hammer2_chain_create_indirect(hmp, parent, 1807 key, keybits, 1808 errorp); 1809 if (nparent == NULL) { 1810 if (allocated) 1811 hammer2_chain_free(hmp, chain); 1812 chain = NULL; 1813 goto done; 1814 } 1815 if (parent != nparent) { 1816 if (unlock_parent) 1817 hammer2_chain_unlock(hmp, parent); 1818 parent = nparent; 1819 unlock_parent = 1; 1820 } 1821 goto again; 1822 } 1823 1824 /* 1825 * Link the chain into its parent. Later on we will have to set 1826 * the MOVED bit in situations where we don't mark the new chain 1827 * as being modified. 1828 */ 1829 if (chain->parent != NULL) 1830 panic("hammer2: hammer2_chain_create: chain already connected"); 1831 KKASSERT(chain->parent == NULL); 1832 chain->parent = parent; 1833 chain->index = i; 1834 if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain)) 1835 panic("hammer2_chain_link: collision"); 1836 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 1837 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 1838 KKASSERT(parent->refs > 0); 1839 atomic_add_int(&parent->refs, 1); 1840 1841 /* 1842 * (allocated) indicates that this is a newly-created chain element 1843 * rather than a renamed chain element. In this situation we want 1844 * to place the chain element in the MODIFIED state. 1845 * 1846 * The data area will be set up as follows: 1847 * 1848 * VOLUME not allowed here. 1849 * 1850 * INODE embedded data are will be set-up. 1851 * 1852 * INDIRECT not allowed here. 1853 * 1854 * DATA no data area will be set-up (caller is expected 1855 * to have logical buffers, we don't want to alias 1856 * the data onto device buffers!). 
1857 */ 1858 if (allocated) { 1859 switch(chain->bref.type) { 1860 case HAMMER2_BREF_TYPE_DATA: 1861 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1862 hammer2_chain_modify(hmp, chain, 1863 HAMMER2_MODIFY_OPTDATA); 1864 break; 1865 case HAMMER2_BREF_TYPE_INDIRECT: 1866 case HAMMER2_BREF_TYPE_FREEMAP_ROOT: 1867 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1868 /* not supported in this function */ 1869 panic("hammer2_chain_create: bad type"); 1870 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1871 hammer2_chain_modify(hmp, chain, 1872 HAMMER2_MODIFY_OPTDATA); 1873 break; 1874 default: 1875 hammer2_chain_modify(hmp, chain, 0); 1876 break; 1877 } 1878 } else { 1879 /* 1880 * When reconnecting inodes we have to call setsubmod() 1881 * to ensure that its state propagates up the newly 1882 * connected parent. 1883 * 1884 * Make sure MOVED is set but do not update bref_flush. If 1885 * the chain is undergoing modification bref_flush will be 1886 * updated when it gets flushed. If it is not then the 1887 * bref may not have been flushed yet and we do not want to 1888 * set MODIFIED here as this could result in unnecessary 1889 * reallocations. 1890 */ 1891 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 1892 hammer2_chain_ref(hmp, chain); 1893 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 1894 } 1895 hammer2_chain_parent_setsubmod(hmp, chain); 1896 } 1897 1898 done: 1899 if (unlock_parent) 1900 hammer2_chain_unlock(hmp, parent); 1901 return (chain); 1902 } 1903 1904 /* 1905 * Create an indirect block that covers one or more of the elements in the 1906 * current parent. Either returns the existing parent with no locking or 1907 * ref changes or returns the new indirect block locked and referenced 1908 * and leaving the original parent lock/ref intact as well. 1909 * 1910 * If an error occurs, NULL is returned and *errorp is set to the error. 1911 * EAGAIN can be returned to indicate a flush collision which requires the 1912 * caller to retry. 
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect block only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 */
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      int *errorp)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_chain_t dummy;		/* scratch: RB_FIND key, then bref template */
	hammer2_key_t key = create_key;
	int keybits = create_bits;
	int locount = 0;		/* # of existing keys in the low half */
	int hicount = 0;		/* # of existing keys in the high half */
	int count;			/* blockref slots in the parent */
	int nbytes;			/* size of the new indirect block */
	int i;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 *
	 * The parent must be exclusively locked by the caller; we also
	 * mark it modified up-front (OPTDATA) so its media data, if any,
	 * remains stable while we move entries out of it.
	 */
	KKASSERT(ccms_thread_lock_owned(&parent->cst));
	*errorp = 0;

	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * INITIAL chains have no media data yet, so there is no
		 * bref array to scan; only in-memory children exist.
		 */
		base = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	} else {
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			base = &parent->data->npdata.blockref[0];
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create_indirect: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	}

	/*
	 * Scan for an unallocated bref, also skipping any slots occupied
	 * by in-memory chain elements which may not yet have been updated
	 * in the parent's bref array.
	 *
	 * The scan widens (key, keybits) until the range covers every
	 * existing key plus the key being created, and tallies how many
	 * existing keys fall in each half of that final range.
	 */
	bzero(&dummy, sizeof(dummy));
	for (i = 0; i < count; ++i) {
		int nkeybits;

		dummy.index = i;
		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
		if (chain) {
			/*
			 * NOTE! CHAIN_DELETED elements have to be adjusted
			 *	 too, they cannot be ignored.
			 */
			bref = &chain->bref;
		} else if (base && base[i].type) {
			bref = &base[i];
		} else {
			continue;
		}

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits)
			nkeybits = bref->keybits;
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vise-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * higher half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;
	}

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max)
	 */
	--keybits;

	/*
	 * Select whichever half contains the most elements.  Theoretically
	 * we can select either side as long as it contains at least one
	 * element (in order to ensure that a free slot is present to hold
	 * the indirect block).
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);
	if (hammer2_indirect_optimize) {
		/*
		 * Insert node for least number of keys, this will arrange
		 * the first few blocks of a large file or the first few
		 * inodes in a directory with fewer indirect blocks when
		 * created linearly.
		 */
		if (hicount < locount && hicount != 0)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	} else {
		/*
		 * Insert node for most number of keys, best for heavily
		 * fragmented files.
		 */
		if (hicount > locount)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	}

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
		nbytes = HAMMER2_IND_BYTES_MIN;
	else
		nbytes = HAMMER2_IND_BYTES_MAX;
	if (nbytes < count * sizeof(hammer2_blockref_t))
		nbytes = count * sizeof(hammer2_blockref_t);

	/*
	 * Ok, create our new indirect block.  dummy is reused here as a
	 * bref template for the allocation (its .index field was already
	 * consumed by the RB_FIND scans above).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
		break;
	default:
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
		break;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_allocsize(nbytes);
	dummy.bref.methods = parent->bref.methods;
	ichain = hammer2_chain_alloc(hmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 */
	for (i = 0; i < count; ++i) {
		/*
		 * For keying purposes access the bref from the media or
		 * from our in-memory cache.  In cases where the in-memory
		 * cache overrides the media the keyrefs will be the same
		 * anyway so we can avoid checking the cache when the media
		 * has a key.
		 */
		dummy.index = i;
		chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
		if (chain) {
			/*
			 * NOTE! CHAIN_DELETED elements have to be adjusted
			 *	 too, they cannot be ignored.
			 */
			bref = &chain->bref;
		} else if (base && base[i].type) {
			bref = &base[i];
		} else {
			/*
			 * Empty slot: remember the first one found as the
			 * insertion index for ichain itself.
			 * (NOTE(review): relies on hammer2_chain_alloc()
			 * initializing ichain->index < 0 — confirm.)
			 */
			if (ichain->index < 0)
				ichain->index = i;
			continue;
		}

		/*
		 * Skip keys not in the chosen half (low or high), only bit
		 * (keybits - 1) needs to be compared but for safety we
		 * will compare all msb bits plus that bit again.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			continue;
		}

		/*
		 * This element is being moved from the parent, its slot
		 * is available for our new indirect block.
		 */
		if (ichain->index < 0)
			ichain->index = i;

		/*
		 * Load the new indirect block by acquiring or allocating
		 * the related chain entries, then simply move them to the
		 * new parent (ichain).  We cannot move chains which are
		 * undergoing flushing and will break out of the loop in
		 * that case.
		 *
		 * When adjusting the parent/child relationship we must
		 * set the MOVED bit but we do NOT update bref_flush
		 * because otherwise we might synchronize a bref that has
		 * not yet been flushed.  We depend on chain's bref_flush
		 * either being correct or the chain being in a MODIFIED
		 * state.
		 *
		 * We do not want to set MODIFIED here as this would result
		 * in unnecessary reallocations.
		 *
		 * We must still set SUBMODIFIED in the parent but we do
		 * that after the loop.
		 *
		 * WARNING! chain->cst.spin must be held when chain->parent is
		 *	    modified, even though we own the full blown lock,
		 *	    to deal with setsubmod and rename races.
		 */
		chain = hammer2_chain_get(hmp, parent, i,
					  HAMMER2_LOOKUP_NODATA);
		if (chain->flushing) {
			hammer2_chain_unlock(hmp, chain);
			break;
		}

		spin_lock(&chain->cst.spin);
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
			panic("hammer2_chain_create_indirect: collision");
		chain->parent = ichain;
		spin_unlock(&chain->cst.spin);

		if (base)
			bzero(&base[i], sizeof(base[i]));
		/* transfer the rbtree ref from parent to ichain */
		atomic_add_int(&parent->refs, -1);
		atomic_add_int(&ichain->refs, 1);
		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
			hammer2_chain_ref(hmp, chain);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
		}
		hammer2_chain_unlock(hmp, chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
	}

	/*
	 * If we hit a chain that is undergoing flushing we're screwed and
	 * we have to undo the whole mess.  Since ichain has not been linked
	 * in yet, the moved chains are not reachable and will not have been
	 * disposed of.
	 *
	 * WARNING! This code is pretty hairy because the flusher is sitting
	 *	    on the parent processing one of the children that we
	 *	    haven't yet moved, and will do a RB_NEXT loop on that
	 *	    child.  So the children we're moving back have to be
	 *	    returned to the same place in the iteration that they
	 *	    were removed from.
	 */
	if (i != count) {
		kprintf("hammer2_chain_create_indirect: EAGAIN\n");
		*errorp = EAGAIN;
		while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
			hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
			KKASSERT(chain->flushing == 0);
			RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
			if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
				panic("hammer2_chain_create_indirect: collision");
			/*
			 * NOTE(review): chain->parent is written here without
			 * holding chain->cst.spin, unlike the forward loop
			 * above whose WARNING requires the spinlock for
			 * parent-pointer changes — confirm this path cannot
			 * race setsubmod/rename.
			 */
			chain->parent = parent;
			atomic_add_int(&parent->refs, 1);
			atomic_add_int(&ichain->refs, -1);
			/* MOVED bit might have been inherited, cannot undo */
			hammer2_chain_unlock(hmp, chain);
		}
		hammer2_chain_free(hmp, ichain);
		return(NULL);
	}

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set MOVED here because we mark ichain modified
	 * down below (so the normal modified -> flush -> set-moved sequence
	 * applies).
	 */
	KKASSERT(ichain->index >= 0);
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
		panic("hammer2_chain_create_indirect: ichain insertion");
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
	ichain->parent = parent;
	atomic_add_int(&parent->refs, 1);

	/*
	 * Mark the new indirect block modified after insertion, which
	 * will propagate up through parent all the way to the root and
	 * also allocate the physical block in ichain for our caller,
	 * and assign ichain->data to a pre-zero'd space (because there
	 * is not prior data to copy into it).
	 *
	 * We have to set SUBMODIFIED in ichain's flags manually so the
	 * flusher knows it has to recurse through it to get to all of
	 * our moved blocks, then call setsubmod() to set the bit
	 * recursively.
	 */
	hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
	hammer2_chain_parent_setsubmod(hmp, ichain);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);

	/*
	 * Figure out what to return.
	 */
	if (create_bits > keybits) {
		/*
		 * Key being created is way outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(hmp, ichain);
	} else if (~(((hammer2_key_t)1 << keybits) - 1) &
		   (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(hmp, ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Physically delete the specified chain element.  Note that inodes with
 * open descriptors should not be deleted (as with other filesystems) until
 * the last open descriptor is closed.
 *
 * This routine will remove the chain element from its parent and potentially
 * also recurse upward and delete indirect blocks which become empty as a
 * side effect.
 *
 * The caller must pass a pointer to the chain's parent, also locked and
 * referenced.  (*parentp) will be modified in a manner similar to a lookup
 * or iteration when indirect blocks are also deleted as a side effect.
 *
 * Must be called with an exclusively locked parent and chain.  parent and
 * chain are both left locked on return.
 *
 * XXX This currently does not adhere to the MOVED flag protocol in that
 *     the removal is immediately indicated in the parent's blockref[]
 *     array.
 */
void
hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		     hammer2_chain_t *chain, int retain)
{
	hammer2_blockref_t *base;	/* parent's blockref array, or NULL */
	int count;			/* number of slots in that array */

	if (chain->parent != parent)
		panic("hammer2_chain_delete: parent mismatch");
	KKASSERT(ccms_thread_lock_owned(&parent->cst));

	/*
	 * Mark the parent modified so our base[] pointer remains valid
	 * while we move entries.  For the optimized indirect block
	 * case mark the parent moved instead.
	 *
	 * Calculate the blockref reference in the parent
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
					     HAMMER2_MODIFY_NO_MODIFY_TID);
		/* INITIAL parents have no media data to scrub */
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata.blockref[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_delete: unrecognized blockref type: %d",
		      parent->bref.type);
		count = 0;
		break;
	}
	KKASSERT(chain->index >= 0 && chain->index < count);

	/*
	 * We may not be able to immediately disconnect the chain if a
	 * flush is in progress.  If retain is non-zero we MUST disconnect
	 * the chain now and callers are responsible for making sure that
	 * flushing is zero.
	 */
	spin_lock(&chain->cst.spin);
	if ((retain || chain->flushing == 0) &&
	    (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
		/* scrub the media blockref, then unhook from the parent */
		if (base)
			bzero(&base[chain->index], sizeof(*base));
		KKASSERT(chain->flushing == 0);
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		atomic_add_int(&parent->refs, -1);	/* for red-black entry */
		chain->index = -1;
		chain->parent = NULL;
	}
	spin_unlock(&chain->cst.spin);

	/*
	 * Cumulative adjustments must be propagated to the parent inode
	 * when deleting and synchronized to ip.  This occurs even if we
	 * cannot detach the chain from its parent.
	 *
	 * NOTE: We do not propagate ip->delta_*count to the parent because
	 *	 these represent adjustments that have not yet been
	 *	 propagated upward, so we don't need to remove them from
	 *	 the parent.
	 *
	 * Clear the pointer to the parent inode.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
	    chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		/* XXX - intentionally unimplemented in this revision */
	}

	/*
	 * If retain is 0 the deletion is permanent.  Because the chain is
	 * no longer connected to the topology a flush will have no
	 * visibility into it.  We must dispose of the references related
	 * to the MODIFIED and MOVED flags, otherwise the ref count will
	 * never transition to 0.
	 *
	 * If retain is non-zero the deleted element is likely an inode
	 * which the vnops frontend will mark DESTROYED and flush.  In that
	 * situation we must retain the flags for any open file descriptors
	 * on the (removed) inode.  The final close will destroy the
	 * disconnected chain.
	 */
	if (retain == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(hmp, chain);
		}
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(hmp, chain);
		}
	}

	/*
	 * The chain is still likely referenced, possibly even by a vnode
	 * (if an inode), so defer further action until the chain gets
	 * dropped.
	 */
}

/*
 * Briefly sleep (one tick) to let a flush on the chain make progress.
 * Callers poll by retrying their operation after this returns; there is
 * no wakeup channel pairing — the timeout alone ends the sleep.
 */
void
hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}