1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/kern_syscall.h> 61 #include <sys/uuid.h> 62 63 #include <crypto/sha2/sha2.h> 64 65 #include "hammer2.h" 66 67 static hammer2_chain_t *hammer2_chain_create_indirect( 68 hammer2_chain_t *parent, 69 hammer2_key_t key, int keybits, 70 hammer2_tid_t mtid, int for_type, int *errorp); 71 static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain); 72 static hammer2_chain_t *hammer2_combined_find( 73 hammer2_chain_t *parent, 74 hammer2_blockref_t *base, int count, 75 hammer2_key_t *key_nextp, 76 hammer2_key_t key_beg, hammer2_key_t key_end, 77 hammer2_blockref_t **bresp); 78 79 /* 80 * Basic RBTree for chains (core->rbtree and core->dbtree). Chains cannot 81 * overlap in the RB trees. Deleted chains are moved from rbtree to either 82 * dbtree or to dbq. 83 * 84 * Chains in delete-duplicate sequences can always iterate through core_entry 85 * to locate the live version of the chain. 
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB-tree comparison function ordering chains by key range.
 *
 * Returns -1 if chain1 lies entirely to the left of chain2, +1 if it lies
 * entirely to the right, and 0 when the two key ranges overlap.  Overlaps
 * are not supposed to happen; counting an overlap as a match catches such
 * software issues early.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Each chain covers the inclusive key range
	 * [key, key + 2^keybits - 1].
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)		/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)		/* fully to the right */
		return(1);
	return(0);			/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.
 *
 * Volume headers and freemaps legitimately keep an embedded data pointer,
 * so those two bref types are exempt from the chain->data check; all
 * types must have chain->dio == NULL.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_assert_no_data: chain %p still has data", chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher needs to be able to
 * do flushes of subdirectory chains or single files so it does a top-down
 * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
 * or UPDATE chains and flushes back up the chain to the volume root.
 *
 * This routine sets ONFLUSH upward until it hits the volume root.  For
 * simplicity we ignore PFSROOT boundaries whose rules can be complex.
 * Extra ONFLUSH flagging doesn't hurt the filesystem.
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if ((parent = chain->parent) == NULL)
				break;
			/*
			 * Hand-over-hand: take the parent's shared spinlock
			 * before releasing the child's so chain->parent
			 * stays stable across the transition.
			 */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;
	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * The ONLRU flag is double-checked: first unlocked as a
		 * cheap filter, then again under lru_spin before the list
		 * manipulation, since another cpu may race the removal.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
	}
#if 0
	kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002
#define HAMMER2_CHAIN_INSERT_RACE	0x0004

static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race.  The caller sampled
	 * parent->core.generation earlier; a mismatch means the parent's
	 * chain set changed underneath it and the insert must be retried.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain.  Collisions are not expected; the KASSERT catches
	 * key-range overlaps early.
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursely drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	if (hammer2_debug & 0x200000)
		Debugger("drop");
#if 0
	kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif

	KKASSERT(chain->refs > 0);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/*
			 * Potential last drop: lastdrop needs the mutex.
			 * On try-lock failure loop and re-sample refs.
			 * lastdrop may return a parent chain requiring a
			 * recursive drop, which we handle iteratively.
			 */
			if (mtx_lock_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain);
			/* retry the same chain */
		} else {
			/* simple ref count decrement via CAS */
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}

/*
 * Unhold a held and probably not-locked chain, ensure that the data is
 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
 * lock and then simply unlocking the chain.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last hold, just decrement */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (mtx_lock_ex_try(&chain->lock) == 0) {
			/* last hold: unlock path disposes of the data */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
	hammer2_chain_drop(chain);
}

/*
 * Handles the (potential) last drop of chain->refs from 1->0.
Called with 453 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 454 * possible against refs and lockcnt. We must dispose of the mutex on chain. 455 * 456 * This function returns an unlocked chain for recursive drop or NULL. It 457 * can return the same chain if it determines it has raced another ref. 458 * 459 * -- 460 * 461 * When two chains need to be recursively dropped we use the chain we 462 * would otherwise free to placehold the additional chain. It's a bit 463 * convoluted but we can't just recurse without potentially blowing out 464 * the kernel stack. 465 * 466 * The chain cannot be freed if it has any children. 467 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 468 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 469 * Any dedup registration can remain intact. 470 * 471 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 472 */ 473 static 474 hammer2_chain_t * 475 hammer2_chain_lastdrop(hammer2_chain_t *chain) 476 { 477 hammer2_pfs_t *pmp; 478 hammer2_dev_t *hmp; 479 hammer2_chain_t *parent; 480 hammer2_chain_t *rdrop; 481 #if 0 482 hammer2_io_t *dio; 483 #endif 484 485 #if 0 486 /* 487 * On last drop if there is no parent and data_off is good (at 488 * least does not represent the volume root), the modified chain 489 * is probably going to be destroyed. We have to make sure that 490 * the data area is not registered for dedup. 491 * 492 * XXX removed. In fact, we do not have to make sure that the 493 * data area is not registered for dedup. The data area 494 * can, in fact, still be used for dedup because it is 495 * still allocated in the freemap and the underlying I/O 496 * will still be flushed. 
497 */ 498 if (chain->parent == NULL && 499 (chain->flags & HAMMER2_CHAIN_MODIFIED) && 500 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 501 hmp = chain->hmp; 502 hammer2_io_dedup_delete(hmp, chain->bref.type, 503 chain->bref.data_off, chain->bytes); 504 } 505 #endif 506 /* 507 * We need chain's spinlock to interlock the sub-tree test. 508 * We already have chain's mutex, protecting chain->parent. 509 * 510 * Remember that chain->refs can be in flux. 511 */ 512 hammer2_spin_ex(&chain->core.spin); 513 514 if ((parent = chain->parent) != NULL) { 515 /* 516 * If the chain has a parent the UPDATE bit prevents scrapping 517 * as the chain is needed to properly flush the parent. Try 518 * to complete the 1->0 transition and return NULL. Retry 519 * (return chain) if we are unable to complete the 1->0 520 * transition, else return NULL (nothing more to do). 521 * 522 * If the chain has a parent the MODIFIED bit prevents 523 * scrapping. 524 * 525 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 526 */ 527 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 528 HAMMER2_CHAIN_MODIFIED)) { 529 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 530 hammer2_spin_unex(&chain->core.spin); 531 #if 0 532 dio = hammer2_chain_drop_data(chain, 0); 533 if (dio) 534 hammer2_io_bqrelse(&dio); 535 #endif 536 hammer2_chain_assert_no_data(chain); 537 hammer2_mtx_unlock(&chain->lock); 538 chain = NULL; 539 } else { 540 hammer2_spin_unex(&chain->core.spin); 541 hammer2_mtx_unlock(&chain->lock); 542 } 543 return (chain); 544 } 545 /* spinlock still held */ 546 } else { 547 /* 548 * The chain has no parent and can be flagged for destruction. 549 * Since it has no parent, UPDATE can also be cleared. 550 */ 551 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 552 if (chain->flags & HAMMER2_CHAIN_UPDATE) 553 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 554 555 /* 556 * If the chain has children we must still flush the chain. 
557 * Any dedup is already handled by the underlying DIO, so 558 * we do not have to specifically flush it here. 559 * 560 * In the case where it has children, the DESTROY flag test 561 * in the flush code will prevent unnecessary flushes of 562 * MODIFIED chains that are not flagged DEDUP so don't worry 563 * about that here. 564 */ 565 if (chain->core.chain_count) { 566 /* 567 * Put on flushq (should ensure refs > 1), retry 568 * the drop. 569 */ 570 hammer2_spin_unex(&chain->core.spin); 571 hammer2_delayed_flush(chain); 572 hammer2_mtx_unlock(&chain->lock); 573 574 return(chain); /* retry drop */ 575 } 576 577 /* 578 * Otherwise we can scrap the MODIFIED bit if it is set, 579 * and continue along the freeing path. 580 * 581 * Be sure to clean-out any dedup bits. Without a parent 582 * this chain will no longer be visible to the flush code. 583 * Easy check data_off to avoid the volume root. 584 */ 585 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 586 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 587 atomic_add_long(&hammer2_count_modified_chains, -1); 588 if (chain->pmp) 589 hammer2_pfs_memory_wakeup(chain->pmp); 590 } 591 /* spinlock still held */ 592 } 593 594 /* spinlock still held */ 595 #if 0 596 dio = NULL; 597 #endif 598 599 /* 600 * If any children exist we must leave the chain intact with refs == 0. 601 * They exist because chains are retained below us which have refs or 602 * may require flushing. 603 * 604 * Retry (return chain) if we fail to transition the refs to 0, else 605 * return NULL indication nothing more to do. 606 * 607 * Chains with children are NOT put on the LRU list. 
608 */ 609 if (chain->core.chain_count) { 610 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 611 hammer2_spin_unex(&chain->core.spin); 612 hammer2_chain_assert_no_data(chain); 613 hammer2_mtx_unlock(&chain->lock); 614 chain = NULL; 615 } else { 616 hammer2_spin_unex(&chain->core.spin); 617 hammer2_mtx_unlock(&chain->lock); 618 } 619 return (chain); 620 } 621 /* spinlock still held */ 622 /* no chains left under us */ 623 624 /* 625 * chain->core has no children left so no accessors can get to our 626 * chain from there. Now we have to lock the parent core to interlock 627 * remaining possible accessors that might bump chain's refs before 628 * we can safely drop chain's refs with intent to free the chain. 629 */ 630 hmp = chain->hmp; 631 pmp = chain->pmp; /* can be NULL */ 632 rdrop = NULL; 633 634 parent = chain->parent; 635 636 /* 637 * WARNING! chain's spin lock is still held here, and other spinlocks 638 * will be acquired and released in the code below. We 639 * cannot be making fancy procedure calls! 640 */ 641 642 /* 643 * We can cache the chain if it is associated with a pmp 644 * and not flagged as being destroyed or requesting a full 645 * release. In this situation the chain is not removed 646 * from its parent, i.e. it can still be looked up. 647 * 648 * We intentionally do not cache DATA chains because these 649 * were likely used to load data into the logical buffer cache 650 * and will not be accessed again for some time. 651 */ 652 if ((chain->flags & 653 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 654 chain->pmp && 655 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 656 if (parent) 657 hammer2_spin_ex(&parent->core.spin); 658 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 659 /* 660 * 1->0 transition failed, retry. Do not drop 661 * the chain's data yet! 
662 */ 663 if (parent) 664 hammer2_spin_unex(&parent->core.spin); 665 hammer2_spin_unex(&chain->core.spin); 666 hammer2_mtx_unlock(&chain->lock); 667 668 return(chain); 669 } 670 671 /* 672 * Success 673 */ 674 #if 0 675 dio = hammer2_chain_drop_data(chain, 1); 676 #endif 677 hammer2_chain_assert_no_data(chain); 678 679 KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0); 680 hammer2_spin_ex(&pmp->lru_spin); 681 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 682 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 683 684 /* 685 * If we are over the LRU limit we need to drop something. 686 */ 687 if (pmp->lru_count > HAMMER2_LRU_LIMIT) { 688 rdrop = TAILQ_FIRST(&pmp->lru_list); 689 atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU); 690 TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node); 691 atomic_add_int(&rdrop->refs, 1); 692 atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE); 693 } else { 694 atomic_add_int(&pmp->lru_count, 1); 695 } 696 hammer2_spin_unex(&pmp->lru_spin); 697 if (parent) { 698 hammer2_spin_unex(&parent->core.spin); 699 parent = NULL; /* safety */ 700 } 701 hammer2_spin_unex(&chain->core.spin); 702 hammer2_mtx_unlock(&chain->lock); 703 #if 0 704 if (dio) 705 hammer2_io_bqrelse(&dio); 706 #endif 707 708 return rdrop; 709 /* NOT REACHED */ 710 } 711 712 /* 713 * Spinlock the parent and try to drop the last ref on chain. 714 * On success determine if we should dispose of the chain 715 * (remove the chain from its parent, etc). 716 * 717 * (normal core locks are top-down recursive but we define 718 * core spinlocks as bottom-up recursive, so this is safe). 
719 */ 720 if (parent) { 721 hammer2_spin_ex(&parent->core.spin); 722 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 723 #if 0 724 /* XXX remove, don't try to drop data on fail */ 725 hammer2_spin_unex(&parent->core.spin); 726 dio = hammer2_chain_drop_data(chain, 0); 727 hammer2_spin_unex(&chain->core.spin); 728 if (dio) 729 hammer2_io_bqrelse(&dio); 730 #endif 731 /* 732 * 1->0 transition failed, retry. 733 */ 734 hammer2_spin_unex(&parent->core.spin); 735 hammer2_spin_unex(&chain->core.spin); 736 hammer2_mtx_unlock(&chain->lock); 737 738 return(chain); 739 } 740 741 /* 742 * 1->0 transition successful, parent spin held to prevent 743 * new lookups, chain spinlock held to protect parent field. 744 * Remove chain from the parent. 745 * 746 * If the chain is being removed from the parent's btree but 747 * is not bmapped, we have to adjust live_count downward. If 748 * it is bmapped then the blockref is retained in the parent 749 * as is its associated live_count. This case can occur when 750 * a chain added to the topology is unable to flush and is 751 * then later deleted. 752 */ 753 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 754 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 755 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 756 atomic_add_int(&parent->core.live_count, -1); 757 } 758 RB_REMOVE(hammer2_chain_tree, 759 &parent->core.rbtree, chain); 760 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 761 --parent->core.chain_count; 762 chain->parent = NULL; 763 } 764 765 /* 766 * If our chain was the last chain in the parent's core the 767 * core is now empty and its parent might have to be 768 * re-dropped if it has 0 refs. 
769 */ 770 if (parent->core.chain_count == 0) { 771 rdrop = parent; 772 atomic_add_int(&rdrop->refs, 1); 773 /* 774 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 775 rdrop = NULL; 776 */ 777 } 778 hammer2_spin_unex(&parent->core.spin); 779 parent = NULL; /* safety */ 780 /* FALL THROUGH */ 781 } else { 782 /* 783 * No-parent case. 784 */ 785 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 786 /* 787 * 1->0 transition failed, retry. 788 */ 789 hammer2_spin_unex(&parent->core.spin); 790 hammer2_spin_unex(&chain->core.spin); 791 hammer2_mtx_unlock(&chain->lock); 792 793 return(chain); 794 } 795 } 796 797 /* 798 * Successful 1->0 transition, no parent, no children... no way for 799 * anyone to ref this chain any more. We can clean-up and free it. 800 * 801 * We still have the core spinlock, and core's chain_count is 0. 802 * Any parent spinlock is gone. 803 */ 804 hammer2_spin_unex(&chain->core.spin); 805 hammer2_chain_assert_no_data(chain); 806 hammer2_mtx_unlock(&chain->lock); 807 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 808 chain->core.chain_count == 0); 809 810 /* 811 * All locks are gone, no pointers remain to the chain, finish 812 * freeing it. 813 */ 814 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 815 HAMMER2_CHAIN_MODIFIED)) == 0); 816 #if 0 817 dio = hammer2_chain_drop_data(chain, 1); 818 if (dio) 819 hammer2_io_bqrelse(&dio); 820 #endif 821 822 /* 823 * Once chain resources are gone we can use the now dead chain 824 * structure to placehold what might otherwise require a recursive 825 * drop, because we have potentially two things to drop and can only 826 * return one directly. 827 */ 828 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 829 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 830 chain->hmp = NULL; 831 kfree(chain, hmp->mchain); 832 } 833 834 /* 835 * Possible chaining loop when parent re-drop needed. 836 */ 837 return(rdrop); 838 } 839 840 /* 841 * On last lock release. 
 */
static hammer2_io_t *
hammer2_chain_drop_data(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;

	/*
	 * Disassociate the device buffer, if any, and return it to the
	 * caller for disposal.  Volume headers and freemaps keep embedded
	 * data; every other type must have a NULL data pointer by now.
	 */
	if ((dio = chain->dio) != NULL) {
		chain->dio = NULL;
		chain->data = NULL;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			break;
		default:
			if (chain->data != NULL) {
				/* release spinlock before panicking */
				hammer2_spin_unex(&chain->core.spin);
				panic("chain data not null: "
				      "chain %p bref %016jx.%02x "
				      "refs %d parent %p dio %p data %p",
				      chain, chain->bref.data_off,
				      chain->bref.type, chain->refs,
				      chain->parent,
				      chain->dio, chain->data);
			}
			KKASSERT(chain->data == NULL);
			break;
		}
	}
	return dio;
}

/*
 * Lock a referenced chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
void
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	KKASSERT(chain->refs > 0);
	atomic_add_int(&chain->lockcnt, 1);

	/*
	 * Get the appropriate lock.  If LOCKAGAIN is flagged with SHARED
	 * the caller expects a shared lock to already be present and we
	 * are giving it another ref.  This case must importantly not block
	 * if there is a pending exclusive lock request.
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
			hammer2_mtx_sh_again(&chain->lock);
		} else {
			hammer2_mtx_sh(&chain->lock);
		}
	} else {
		hammer2_mtx_ex(&chain->lock);
	}
	++curthread->td_tracker;

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return;

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expects the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

#if 0
/*
 * Obtains a second shared lock on the chain, does not account the second
 * shared lock as being owned by the current thread.
 *
 * Caller must already own a shared lock on this chain.
 *
 * The lock function is required to obtain the second shared lock without
 * blocking on pending exclusive requests.
 */
void
hammer2_chain_push_shared_lock(hammer2_chain_t *chain)
{
	hammer2_mtx_sh_again(&chain->lock);
	atomic_add_int(&chain->lockcnt, 1);
	/* do not count in td_tracker for this thread */
}

/*
 * Accounts for a shared lock that was pushed to us as being owned by our
 * thread.
 */
void
hammer2_chain_pull_shared_lock(hammer2_chain_t *chain)
{
	++curthread->td_tracker;
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	if (chain->data)
		return;
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * If another thread owns IOINPROG we flag IOSIGNAL (requesting a
	 * wakeup) and sleep via the tsleep_interlock/tsleep protocol, then
	 * retry.  Otherwise we take ownership of IOINPROG ourselves.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data)
		goto done;

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(&chain->bref, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted versus the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * Clear INITIAL.  In this case we used io_new() and the buffer has
	 * been zero'd and marked dirty.
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/*
		 * Check integrity once; cache the result in TESTEDGOOD so
		 * subsequent locks do not re-run the check.
		 */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	--curthread->td_tracker;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap in between the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
1334 */ 1335 static 1336 hammer2_blockref_t * 1337 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1338 { 1339 hammer2_blockref_t *base; 1340 int count; 1341 1342 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1343 base = NULL; 1344 1345 switch(parent->bref.type) { 1346 case HAMMER2_BREF_TYPE_INODE: 1347 count = HAMMER2_SET_COUNT; 1348 break; 1349 case HAMMER2_BREF_TYPE_INDIRECT: 1350 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1351 count = parent->bytes / sizeof(hammer2_blockref_t); 1352 break; 1353 case HAMMER2_BREF_TYPE_VOLUME: 1354 count = HAMMER2_SET_COUNT; 1355 break; 1356 case HAMMER2_BREF_TYPE_FREEMAP: 1357 count = HAMMER2_SET_COUNT; 1358 break; 1359 default: 1360 panic("hammer2_chain_create_indirect: " 1361 "unrecognized blockref type: %d", 1362 parent->bref.type); 1363 count = 0; 1364 break; 1365 } 1366 } else { 1367 switch(parent->bref.type) { 1368 case HAMMER2_BREF_TYPE_INODE: 1369 base = &parent->data->ipdata.u.blockset.blockref[0]; 1370 count = HAMMER2_SET_COUNT; 1371 break; 1372 case HAMMER2_BREF_TYPE_INDIRECT: 1373 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1374 base = &parent->data->npdata[0]; 1375 count = parent->bytes / sizeof(hammer2_blockref_t); 1376 break; 1377 case HAMMER2_BREF_TYPE_VOLUME: 1378 base = &parent->data->voldata. 1379 sroot_blockset.blockref[0]; 1380 count = HAMMER2_SET_COUNT; 1381 break; 1382 case HAMMER2_BREF_TYPE_FREEMAP: 1383 base = &parent->data->blkset.blockref[0]; 1384 count = HAMMER2_SET_COUNT; 1385 break; 1386 default: 1387 panic("hammer2_chain_create_indirect: " 1388 "unrecognized blockref type: %d", 1389 parent->bref.type); 1390 count = 0; 1391 break; 1392 } 1393 } 1394 *countp = count; 1395 1396 return base; 1397 } 1398 1399 /* 1400 * This counts the number of live blockrefs in a block array and 1401 * also calculates the point at which all remaining blockrefs are empty. 1402 * This routine can only be called on a live chain. 1403 * 1404 * Caller holds the chain locked, but possibly with a shared lock. 
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards to find the first non-empty
			 * blockref; everything past it is empty.
			 */
			while (--count >= 0) {
				if (base[count].type)
					break;
			}
			chain->core.live_zero = count + 1;
			while (count >= 0) {
				if (base[count].type)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volume root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	obref = chain->bref;
	KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		     newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				chain->bytes = 1 << (dedup_off &
						     HAMMER2_OFF_MASK_RADIX);
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp)
					hammer2_pfs_memory_wakeup(chain->pmp);
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

	/*
	 * Short-cut data blocks which the caller does not need an actual
	 * data reference to (aka OPTDATA), as long as the chain does not
	 * already have a data pointer to the data.  This generally means
	 * that the modifications are being done via the logical buffer cache.
	 * The INITIAL flag relates only to the device data buffer and thus
	 * remains unchanged in this situation.
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		KKASSERT(chain->dio == NULL);
		goto skip2;
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(&chain->bref, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidentally
		 * modify the old buffer's contents and corrupt the filesystem.
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 */
		if (chain->dio)
			hammer2_io_bqrelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		if (dedup_off == 0)
			hammer2_io_setdirty(dio);
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	return (chain->error);
}

/*
 * Modify the chain associated with an inode.  Marks the inode modified
 * via hammer2_inode_modify() and then runs hammer2_chain_modify() on the
 * chain, returning its error code.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration. *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING! Must be called with child's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* iteration limit, updated */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
			  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparator: returns <0 / >0 to prune subtrees entirely below /
 * above the search range, 0 when the child's key range overlaps it.
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback: track the nearest (lowest-keyed) overlapping child in
 * info->best and clamp info->key_next to the next possible boundary.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (child_end == 0 indicates wrap-around of the 64-bit key space
	 * and is excluded from the clamp).
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held but not locked.  The caller must lock it to crc-check and
 * dereference its data, and should check chain->error after locking
 * before assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.  Returns the parent ref'd and
 * locked (resolved ALWAYS, shared if HAMMER2_LOOKUP_SHARED is requested).
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

/*
 * Release the ref and lock obtained by hammer2_chain_lookup_init().
 * NULL parent is tolerated.
 */
void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int how)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.
	 *
	 * Safe access to chain->parent requires chain's core spinlock.
	 */
again:
	hammer2_spin_ex(&chain->core.spin);
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_getparent: no parent");
	}
	hammer2_chain_ref(parent);
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_lock(parent, how);
	hammer2_chain_lock(chain, how);

	/*
	 * Parent relinking races are quite common.  We have to get it right
	 * or we will blow up the block table.  If the parent changed while
	 * chain was unlocked, drop the stale parent and retry.
	 */
	if (chain->parent != parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		goto again;
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int how)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.
	 *
	 * Safe access to chain->parent requires chain's core spinlock.
	 */
	chain = *chainp;
again:
	hammer2_spin_ex(&chain->core.spin);
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		/* NOTE(review): message says getparent; likely copy-paste */
		panic("hammer2_chain_getparent: no parent");
	}
	hammer2_chain_ref(parent);
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_lock(parent, how);

	/*
	 * Parent relinking races are quite common.  We have to get it right
	 * or we will blow up the block table.  Re-lock chain before retrying
	 * since the retry path expects it locked.
	 */
	if (chain->parent != parent) {
		hammer2_chain_lock(chain, how);
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		goto again;
	}
	hammer2_chain_drop(chain);
	*chainp = parent;

	return parent;
}

/*
 * Locate the first chain whose key range overlaps (key_beg, key_end)
 * inclusive.  (*parentp) typically points to an inode but can also point to
 * a related indirect block and this function will recurse upwards and find
 * the inode again.
 *
 * This function unconditionally sets *errorp, replacing any previous value.
 *
 * (*parentp) must be exclusively locked and referenced and can be an inode
 * or an existing indirect block within the inode.  If (*parent) is errored
 * out, this function will not attempt to recurse the radix tree and
 * will return NULL along with an appropriate *errorp.  If NULL is returned
 * and *errorp is 0, the requested lookup could not be located.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.  The new (*parentp) will be locked and referenced and the old
 * will be unlocked and dereferenced (no change if they are both the same).
 * This is particularly important if the caller wishes to insert a new chain,
 * (*parentp) will be set properly even if NULL is returned, as long as no
 * error occurred.
 *
 * The matching chain will be returned exclusively locked.  If NOLOCK is
 * requested the chain will be returned only referenced.  Note that the
 * parent chain must always be locked shared or exclusive, matching the
 * HAMMER2_LOOKUP_SHARED flag.  We can conceivably lock it SHARED temporarily
 * when NOLOCK is specified but that complicates matters if *parentp must
 * inherit the chain.
 *
 * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL
 * data pointer or can otherwise be in flux.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;

	/*
	 * Translate LOOKUP flags into chain lock resolve modes.  'how' is
	 * used for leaf chains, 'how_maybe' for indirect/freemap nodes we
	 * may recurse through, 'how_always' for forced data resolution.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:

	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
				hammer2_chain_lock(parent,
						   how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent: if the parent's key range
		 * exactly matches the request, return the parent itself.
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}

		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				/* debugging aid: hang rather than crash */
				kprintf("parent->data is NULL %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		/* debugging aid: hang for examination; panic unreachable */
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search of the in-memory RB tree and the media blockref
	 * array.  The generation is sampled under the spinlock so a later
	 * hammer2_chain_get() can detect insertion races.
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.  When instantiating
	 * from the media bref we must copy it first (the spinlock is
	 * released before chain_get) and re-verify the copy afterwards to
	 * detect a concurrent modification of the block table.
	 */
	if (chain == NULL) {
		bcopy = *bref;
		hammer2_spin_unex(&parent->core.spin);
		chain = hammer2_chain_get(parent, generation,
					  &bcopy);
		if (chain == NULL) {
			/*
			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
				parent, key_beg, key_end);
			*/
			goto again;
		}
		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
			hammer2_chain_drop(chain);
			goto again;
		}
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_lock(chain, how_maybe);
	} else {
		hammer2_chain_lock(chain, how);
	}
	KKASSERT(chain->parent == parent);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		goto again;
	}
done:
	/*
	 * All done, return the chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_unlock(chain);
	}
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if ((flags & (HAMMER2_LOOKUP_NOLOCK |
			      HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
			hammer2_chain_unlock(chain);
		}
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.
 * A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow
	 * (overflow to 0 indicates iteration past the top of the key space,
	 * i.e. EOF).
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain.  Re-verify the
			 * bref after chain_get to detect a concurrent
			 * modification of the block table (the spinlock
			 * was released).
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, bref);
			if (chain == NULL) {
				kprintf("retry scan parent %p keys %016jx\n",
					parent, key);
				goto again;
			}
			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
				hammer2_chain_drop(chain);
				chain = NULL;
				goto again;
			}
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain)
		hammer2_chain_lock(chain, how);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).
The function will 3039 * adjust (*parentp) as necessary, create or connect the chain, and 3040 * return an exclusively locked chain in *chainp. 3041 * 3042 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3043 * and will be reassigned. 3044 * 3045 * NOTE: returns HAMMER_ERROR_* flags 3046 */ 3047 int 3048 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3049 hammer2_pfs_t *pmp, int methods, 3050 hammer2_key_t key, int keybits, int type, size_t bytes, 3051 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3052 { 3053 hammer2_dev_t *hmp; 3054 hammer2_chain_t *chain; 3055 hammer2_chain_t *parent; 3056 hammer2_blockref_t *base; 3057 hammer2_blockref_t dummy; 3058 int allocated = 0; 3059 int error = 0; 3060 int count; 3061 int maxloops = 300000; 3062 3063 /* 3064 * Topology may be crossing a PFS boundary. 3065 */ 3066 parent = *parentp; 3067 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3068 KKASSERT(parent->error == 0); 3069 hmp = parent->hmp; 3070 chain = *chainp; 3071 3072 if (chain == NULL) { 3073 /* 3074 * First allocate media space and construct the dummy bref, 3075 * then allocate the in-memory chain structure. Set the 3076 * INITIAL flag for fresh chains which do not have embedded 3077 * data. 3078 * 3079 * XXX for now set the check mode of the child based on 3080 * the parent or, if the parent is an inode, the 3081 * specification in the inode. 3082 */ 3083 bzero(&dummy, sizeof(dummy)); 3084 dummy.type = type; 3085 dummy.key = key; 3086 dummy.keybits = keybits; 3087 dummy.data_off = hammer2_getradix(bytes); 3088 3089 /* 3090 * Inherit methods from parent by default. Primarily used 3091 * for BREF_TYPE_DATA. Non-data types *must* be set to 3092 * a non-NONE check algorithm. 
3093 */ 3094 if (methods == -1) 3095 dummy.methods = parent->bref.methods; 3096 else 3097 dummy.methods = (uint8_t)methods; 3098 3099 if (type != HAMMER2_BREF_TYPE_DATA && 3100 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3101 dummy.methods |= 3102 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3103 } 3104 3105 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3106 3107 /* 3108 * Lock the chain manually, chain_lock will load the chain 3109 * which we do NOT want to do. (note: chain->refs is set 3110 * to 1 by chain_alloc() for us, but lockcnt is not). 3111 */ 3112 chain->lockcnt = 1; 3113 hammer2_mtx_ex(&chain->lock); 3114 allocated = 1; 3115 ++curthread->td_tracker; 3116 3117 /* 3118 * Set INITIAL to optimize I/O. The flag will generally be 3119 * processed when we call hammer2_chain_modify(). 3120 * 3121 * Recalculate bytes to reflect the actual media block 3122 * allocation. Handle special case radix 0 == 0 bytes. 3123 */ 3124 bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3125 if (bytes) 3126 bytes = (hammer2_off_t)1 << bytes; 3127 chain->bytes = bytes; 3128 3129 switch(type) { 3130 case HAMMER2_BREF_TYPE_VOLUME: 3131 case HAMMER2_BREF_TYPE_FREEMAP: 3132 panic("hammer2_chain_create: called with volume type"); 3133 break; 3134 case HAMMER2_BREF_TYPE_INDIRECT: 3135 panic("hammer2_chain_create: cannot be used to" 3136 "create indirect block"); 3137 break; 3138 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3139 panic("hammer2_chain_create: cannot be used to" 3140 "create freemap root or node"); 3141 break; 3142 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3143 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3144 /* fall through */ 3145 case HAMMER2_BREF_TYPE_DIRENT: 3146 case HAMMER2_BREF_TYPE_INODE: 3147 case HAMMER2_BREF_TYPE_DATA: 3148 default: 3149 /* 3150 * leave chain->data NULL, set INITIAL 3151 */ 3152 KKASSERT(chain->data == NULL); 3153 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3154 break; 3155 } 3156 } else { 3157 /* 3158 * We are 
reattaching a previously deleted chain, possibly 3159 * under a new parent and possibly with a new key/keybits. 3160 * The chain does not have to be in a modified state. The 3161 * UPDATE flag will be set later on in this routine. 3162 * 3163 * Do NOT mess with the current state of the INITIAL flag. 3164 */ 3165 chain->bref.key = key; 3166 chain->bref.keybits = keybits; 3167 if (chain->flags & HAMMER2_CHAIN_DELETED) 3168 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3169 KKASSERT(chain->parent == NULL); 3170 } 3171 if (flags & HAMMER2_INSERT_PFSROOT) 3172 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3173 else 3174 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT; 3175 3176 /* 3177 * Calculate how many entries we have in the blockref array and 3178 * determine if an indirect block is required. 3179 */ 3180 again: 3181 if (--maxloops == 0) 3182 panic("hammer2_chain_create: maxloops"); 3183 3184 switch(parent->bref.type) { 3185 case HAMMER2_BREF_TYPE_INODE: 3186 if ((parent->data->ipdata.meta.op_flags & 3187 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3188 kprintf("hammer2: parent set for direct-data! 
" 3189 "pkey=%016jx ckey=%016jx\n", 3190 parent->bref.key, 3191 chain->bref.key); 3192 } 3193 KKASSERT((parent->data->ipdata.meta.op_flags & 3194 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3195 KKASSERT(parent->data != NULL); 3196 base = &parent->data->ipdata.u.blockset.blockref[0]; 3197 count = HAMMER2_SET_COUNT; 3198 break; 3199 case HAMMER2_BREF_TYPE_INDIRECT: 3200 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3201 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3202 base = NULL; 3203 else 3204 base = &parent->data->npdata[0]; 3205 count = parent->bytes / sizeof(hammer2_blockref_t); 3206 break; 3207 case HAMMER2_BREF_TYPE_VOLUME: 3208 KKASSERT(parent->data != NULL); 3209 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3210 count = HAMMER2_SET_COUNT; 3211 break; 3212 case HAMMER2_BREF_TYPE_FREEMAP: 3213 KKASSERT(parent->data != NULL); 3214 base = &parent->data->blkset.blockref[0]; 3215 count = HAMMER2_SET_COUNT; 3216 break; 3217 default: 3218 panic("hammer2_chain_create: unrecognized blockref type: %d", 3219 parent->bref.type); 3220 base = NULL; 3221 count = 0; 3222 break; 3223 } 3224 3225 /* 3226 * Make sure we've counted the brefs 3227 */ 3228 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3229 hammer2_chain_countbrefs(parent, base, count); 3230 3231 KASSERT(parent->core.live_count >= 0 && 3232 parent->core.live_count <= count, 3233 ("bad live_count %d/%d (%02x, %d)", 3234 parent->core.live_count, count, 3235 parent->bref.type, parent->bytes)); 3236 3237 /* 3238 * If no free blockref could be found we must create an indirect 3239 * block and move a number of blockrefs into it. With the parent 3240 * locked we can safely lock each child in order to delete+duplicate 3241 * it without causing a deadlock. 3242 * 3243 * This may return the new indirect block or the old parent depending 3244 * on where the key falls. NULL is returned on error. 
3245 */ 3246 if (parent->core.live_count == count) { 3247 hammer2_chain_t *nparent; 3248 3249 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3250 mtid, type, &error); 3251 if (nparent == NULL) { 3252 if (allocated) 3253 hammer2_chain_drop(chain); 3254 chain = NULL; 3255 goto done; 3256 } 3257 if (parent != nparent) { 3258 hammer2_chain_unlock(parent); 3259 hammer2_chain_drop(parent); 3260 parent = *parentp = nparent; 3261 } 3262 goto again; 3263 } 3264 3265 if (chain->flags & HAMMER2_CHAIN_DELETED) 3266 kprintf("Inserting deleted chain @%016jx\n", 3267 chain->bref.key); 3268 3269 /* 3270 * Link the chain into its parent. 3271 */ 3272 if (chain->parent != NULL) 3273 panic("hammer2: hammer2_chain_create: chain already connected"); 3274 KKASSERT(chain->parent == NULL); 3275 KKASSERT(parent->core.live_count < count); 3276 hammer2_chain_insert(parent, chain, 3277 HAMMER2_CHAIN_INSERT_SPIN | 3278 HAMMER2_CHAIN_INSERT_LIVE, 3279 0); 3280 3281 if (allocated) { 3282 /* 3283 * Mark the newly created chain modified. This will cause 3284 * UPDATE to be set and process the INITIAL flag. 3285 * 3286 * Device buffers are not instantiated for DATA elements 3287 * as these are handled by logical buffers. 3288 * 3289 * Indirect and freemap node indirect blocks are handled 3290 * by hammer2_chain_create_indirect() and not by this 3291 * function. 3292 * 3293 * Data for all other bref types is expected to be 3294 * instantiated (INODE, LEAF). 3295 */ 3296 switch(chain->bref.type) { 3297 case HAMMER2_BREF_TYPE_DATA: 3298 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3299 case HAMMER2_BREF_TYPE_DIRENT: 3300 case HAMMER2_BREF_TYPE_INODE: 3301 error = hammer2_chain_modify(chain, mtid, dedup_off, 3302 HAMMER2_MODIFY_OPTDATA); 3303 break; 3304 default: 3305 /* 3306 * Remaining types are not supported by this function. 3307 * In particular, INDIRECT and LEAF_NODE types are 3308 * handled by create_indirect(). 
3309 */ 3310 panic("hammer2_chain_create: bad type: %d", 3311 chain->bref.type); 3312 /* NOT REACHED */ 3313 break; 3314 } 3315 } else { 3316 /* 3317 * When reconnecting a chain we must set UPDATE and 3318 * setflush so the flush recognizes that it must update 3319 * the bref in the parent. 3320 */ 3321 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3322 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3323 } 3324 3325 /* 3326 * We must setflush(parent) to ensure that it recurses through to 3327 * chain. setflush(chain) might not work because ONFLUSH is possibly 3328 * already set in the chain (so it won't recurse up to set it in the 3329 * parent). 3330 */ 3331 hammer2_chain_setflush(parent); 3332 3333 done: 3334 *chainp = chain; 3335 3336 return (error); 3337 } 3338 3339 /* 3340 * Move the chain from its old parent to a new parent. The chain must have 3341 * already been deleted or already disconnected (or never associated) with 3342 * a parent. The chain is reassociated with the new parent and the deleted 3343 * flag will be cleared (no longer deleted). The chain's modification state 3344 * is not altered. 3345 * 3346 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3347 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3348 * FULL. This typically means that the caller is creating the chain after 3349 * doing a hammer2_chain_lookup(). 3350 * 3351 * A non-NULL bref is typically passed when key and keybits must be overridden. 3352 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 3353 * from a passed-in bref and uses the old chain's bref for everything else. 3354 * 3355 * Neither (parent) or (chain) can be errored. 3356 * 3357 * If (parent) is non-NULL then the chain is inserted under the parent. 
3358 * 3359 * If (parent) is NULL then the newly duplicated chain is not inserted 3360 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3361 * passing into hammer2_chain_create() after this function returns). 3362 * 3363 * WARNING! This function calls create which means it can insert indirect 3364 * blocks. This can cause other unrelated chains in the parent to 3365 * be moved to a newly inserted indirect block in addition to the 3366 * specific chain. 3367 */ 3368 void 3369 hammer2_chain_rename(hammer2_blockref_t *bref, 3370 hammer2_chain_t **parentp, hammer2_chain_t *chain, 3371 hammer2_tid_t mtid, int flags) 3372 { 3373 hammer2_dev_t *hmp; 3374 hammer2_chain_t *parent; 3375 size_t bytes; 3376 3377 /* 3378 * WARNING! We should never resolve DATA to device buffers 3379 * (XXX allow it if the caller did?), and since 3380 * we currently do not have the logical buffer cache 3381 * buffer in-hand to fix its cached physical offset 3382 * we also force the modify code to not COW it. XXX 3383 */ 3384 hmp = chain->hmp; 3385 KKASSERT(chain->parent == NULL); 3386 KKASSERT(chain->error == 0); 3387 3388 /* 3389 * Now create a duplicate of the chain structure, associating 3390 * it with the same core, making it the same size, pointing it 3391 * to the same bref (the same media block). 3392 * 3393 * NOTE: Handle special radix == 0 case (means 0 bytes). 3394 */ 3395 if (bref == NULL) 3396 bref = &chain->bref; 3397 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3398 if (bytes) 3399 bytes = (hammer2_off_t)1 << bytes; 3400 3401 /* 3402 * If parent is not NULL the duplicated chain will be entered under 3403 * the parent and the UPDATE bit set to tell flush to update 3404 * the blockref. 3405 * 3406 * We must setflush(parent) to ensure that it recurses through to 3407 * chain. setflush(chain) might not work because ONFLUSH is possibly 3408 * already set in the chain (so it won't recurse up to set it in the 3409 * parent). 
3410 * 3411 * Having both chains locked is extremely important for atomicy. 3412 */ 3413 if (parentp && (parent = *parentp) != NULL) { 3414 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3415 KKASSERT(parent->refs > 0); 3416 KKASSERT(parent->error == 0); 3417 3418 hammer2_chain_create(parentp, &chain, 3419 chain->pmp, HAMMER2_METH_DEFAULT, 3420 bref->key, bref->keybits, bref->type, 3421 chain->bytes, mtid, 0, flags); 3422 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3423 hammer2_chain_setflush(*parentp); 3424 } 3425 } 3426 3427 /* 3428 * Helper function for deleting chains. 3429 * 3430 * The chain is removed from the live view (the RBTREE) as well as the parent's 3431 * blockmap. Both chain and its parent must be locked. 3432 * 3433 * parent may not be errored. chain can be errored. 3434 */ 3435 static int 3436 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain, 3437 hammer2_tid_t mtid, int flags) 3438 { 3439 hammer2_dev_t *hmp; 3440 int error = 0; 3441 3442 KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED | 3443 HAMMER2_CHAIN_FICTITIOUS)) == 0); 3444 KKASSERT(chain->parent == parent); 3445 hmp = chain->hmp; 3446 3447 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 3448 /* 3449 * Chain is blockmapped, so there must be a parent. 3450 * Atomically remove the chain from the parent and remove 3451 * the blockmap entry. The parent must be set modified 3452 * to remove the blockmap entry. 
3453 */ 3454 hammer2_blockref_t *base; 3455 int count; 3456 3457 KKASSERT(parent != NULL); 3458 KKASSERT(parent->error == 0); 3459 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3460 error = hammer2_chain_modify(parent, mtid, 0, 0); 3461 if (error) 3462 goto done; 3463 3464 /* 3465 * Calculate blockmap pointer 3466 */ 3467 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 3468 hammer2_spin_ex(&chain->core.spin); 3469 hammer2_spin_ex(&parent->core.spin); 3470 3471 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3472 atomic_add_int(&parent->core.live_count, -1); 3473 ++parent->core.generation; 3474 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3475 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3476 --parent->core.chain_count; 3477 chain->parent = NULL; 3478 3479 switch(parent->bref.type) { 3480 case HAMMER2_BREF_TYPE_INODE: 3481 /* 3482 * Access the inode's block array. However, there 3483 * is no block array if the inode is flagged 3484 * DIRECTDATA. 3485 */ 3486 if (parent->data && 3487 (parent->data->ipdata.meta.op_flags & 3488 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 3489 base = 3490 &parent->data->ipdata.u.blockset.blockref[0]; 3491 } else { 3492 base = NULL; 3493 } 3494 count = HAMMER2_SET_COUNT; 3495 break; 3496 case HAMMER2_BREF_TYPE_INDIRECT: 3497 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3498 if (parent->data) 3499 base = &parent->data->npdata[0]; 3500 else 3501 base = NULL; 3502 count = parent->bytes / sizeof(hammer2_blockref_t); 3503 break; 3504 case HAMMER2_BREF_TYPE_VOLUME: 3505 base = &parent->data->voldata. 3506 sroot_blockset.blockref[0]; 3507 count = HAMMER2_SET_COUNT; 3508 break; 3509 case HAMMER2_BREF_TYPE_FREEMAP: 3510 base = &parent->data->blkset.blockref[0]; 3511 count = HAMMER2_SET_COUNT; 3512 break; 3513 default: 3514 base = NULL; 3515 count = 0; 3516 panic("hammer2_flush_pass2: " 3517 "unrecognized blockref type: %d", 3518 parent->bref.type); 3519 } 3520 3521 /* 3522 * delete blockmapped chain from its parent. 
3523 * 3524 * The parent is not affected by any statistics in chain 3525 * which are pending synchronization. That is, there is 3526 * nothing to undo in the parent since they have not yet 3527 * been incorporated into the parent. 3528 * 3529 * The parent is affected by statistics stored in inodes. 3530 * Those have already been synchronized, so they must be 3531 * undone. XXX split update possible w/delete in middle? 3532 */ 3533 if (base) { 3534 hammer2_base_delete(parent, base, count, chain); 3535 } 3536 hammer2_spin_unex(&parent->core.spin); 3537 hammer2_spin_unex(&chain->core.spin); 3538 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 3539 /* 3540 * Chain is not blockmapped but a parent is present. 3541 * Atomically remove the chain from the parent. There is 3542 * no blockmap entry to remove. 3543 * 3544 * Because chain was associated with a parent but not 3545 * synchronized, the chain's *_count_up fields contain 3546 * inode adjustment statistics which must be undone. 3547 */ 3548 hammer2_spin_ex(&chain->core.spin); 3549 hammer2_spin_ex(&parent->core.spin); 3550 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3551 atomic_add_int(&parent->core.live_count, -1); 3552 ++parent->core.generation; 3553 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3554 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3555 --parent->core.chain_count; 3556 chain->parent = NULL; 3557 hammer2_spin_unex(&parent->core.spin); 3558 hammer2_spin_unex(&chain->core.spin); 3559 } else { 3560 /* 3561 * Chain is not blockmapped and has no parent. This 3562 * is a degenerate case. 3563 */ 3564 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3565 } 3566 done: 3567 return error; 3568 } 3569 3570 /* 3571 * Create an indirect block that covers one or more of the elements in the 3572 * current parent. 
Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_chain_t dummy;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;		/* diagnostic only, reported on panic */
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * dummy used in later chain allocation (no longer used for lookups).
	 */
	bzero(&dummy, sizeof(dummy));

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * of the volume.  Because freemap blocks are taken from the reserved
	 * areas of the volume our goal is efficiency (fewer levels) and not
	 * so much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_getradix(nbytes);
	/* inherit the check method from the parent, disable compression */
	dummy.bref.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			bcopy = *bref;
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bcopy);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
				/* bref changed under us, retry this key */
				reason = 2;
				hammer2_chain_drop(chain);
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			kprintf("hammer2_chain_create_indirect "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 */
		error = hammer2_chain_delete(parent, chain, mtid, 0);
		KKASSERT(error == 0);
		hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (~(((hammer2_key_t)1 << keybits) - 1) &
	    (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;		/* start wider than any real radix */

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest radix seen, lowest key at that radix */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int locount;		/* initialized but not otherwise used here */
	int hicount;		/* initialized but not otherwise used here */
	int maxloops = 300000;	/* sanity bound on the scan loop below */

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;		/* start wide, narrow to smallest radix seen */

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest radix; on a radix tie, the lowest key.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)	/* key space wrapped, scan done */
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.  nradix is the number of additional key
	 * bits the new indirect block covers, derived from its blockref
	 * capacity (ncount must be one of the three supported sizes).
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;	/* not reached; quiet the compiler */
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;	/* sanity bound on the scan loop below */

	/*
	 * Shortcut if the parent is the inode.
	 * In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)	/* key space wrapped, scan done */
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/*
	 * Place the new indirect block on whichever side holds more of
	 * the scanned keys.
	 */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 *
 * NOTE(review): this #else branch is a dead, textually identical duplicate
 * of the live hammer2_chain_indkey_dir() above (compiled out by the
 * "#if 1").  It appears to be retained for reference only.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)	/* key space wrapped, scan done */
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and removes the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
4587 * 4588 * This function is NOT recursive. Any entity already pushed into the 4589 * chain (such as an inode) may still need visibility into its contents, 4590 * as well as the ability to read and modify the contents. For example, 4591 * for an unlinked file which is still open. 4592 * 4593 * Also note that the flusher is responsible for cleaning up empty 4594 * indirect blocks. 4595 */ 4596 int 4597 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 4598 hammer2_tid_t mtid, int flags) 4599 { 4600 int error = 0; 4601 4602 KKASSERT(hammer2_mtx_owned(&chain->lock)); 4603 4604 /* 4605 * Nothing to do if already marked. 4606 * 4607 * We need the spinlock on the core whos RBTREE contains chain 4608 * to protect against races. 4609 */ 4610 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 4611 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 4612 chain->parent == parent); 4613 error = _hammer2_chain_delete_helper(parent, chain, 4614 mtid, flags); 4615 } 4616 4617 /* 4618 * Permanent deletions mark the chain as destroyed. 4619 */ 4620 if (error == 0) { 4621 if (flags & HAMMER2_DELETE_PERMANENT) 4622 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 4623 hammer2_chain_setflush(chain); 4624 } 4625 4626 return error; 4627 } 4628 4629 /* 4630 * Returns the index of the nearest element in the blockref array >= elm. 4631 * Returns (count) if no element could be found. 4632 * 4633 * Sets *key_nextp to the next key for loop purposes but does not modify 4634 * it if the next key would be higher than the current value of *key_nextp. 4635 * Note that *key_nexp can overflow to 0, which should be tested by the 4636 * caller. 4637 * 4638 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4639 * held through the operation. 
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chains already have their cores counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;	/* first index past last live elm */
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached hint to a position at or
	 * before key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Clip *key_nextp to the end of the matched
			 * element's key range (which may wrap to 0).
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain matched the search, NULL is returned and the
 * media-backed element (if any) is returned via a non-NULL *bresp.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
4753 */ 4754 *key_nextp = key_end + 1; 4755 i = hammer2_base_find(parent, base, count, key_nextp, 4756 key_beg, key_end); 4757 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 4758 4759 /* 4760 * Neither matched 4761 */ 4762 if (i == count && chain == NULL) { 4763 *bresp = NULL; 4764 return(NULL); 4765 } 4766 4767 /* 4768 * Only chain matched. 4769 */ 4770 if (i == count) { 4771 bref = &chain->bref; 4772 goto found; 4773 } 4774 4775 /* 4776 * Only blockref matched. 4777 */ 4778 if (chain == NULL) { 4779 bref = &base[i]; 4780 goto found; 4781 } 4782 4783 /* 4784 * Both in-memory and blockref matched, select the nearer element. 4785 * 4786 * If both are flush with the left-hand side or both are the 4787 * same distance away, select the chain. In this situation the 4788 * chain must have been loaded from the matching blockmap. 4789 */ 4790 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 4791 chain->bref.key == base[i].key) { 4792 KKASSERT(chain->bref.key == base[i].key); 4793 bref = &chain->bref; 4794 goto found; 4795 } 4796 4797 /* 4798 * Select the nearer key 4799 */ 4800 if (chain->bref.key < base[i].key) { 4801 bref = &chain->bref; 4802 } else { 4803 bref = &base[i]; 4804 chain = NULL; 4805 } 4806 4807 /* 4808 * If the bref is out of bounds we've exhausted our search. 4809 */ 4810 found: 4811 if (bref->key > key_end) { 4812 *bresp = NULL; 4813 chain = NULL; 4814 } else { 4815 *bresp = bref; 4816 } 4817 return(chain); 4818 } 4819 4820 /* 4821 * Locate the specified block array element and delete it. The element 4822 * must exist. 4823 * 4824 * The spin lock on the related chain must be held. 4825 * 4826 * NOTE: live_count was adjusted when the chain was deleted, so it does not 4827 * need to be adjusted when we commit the media change. 
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 * re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == 0 ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		parent->bref.embed.stats.inode_count -= 1;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
		parent->bref.embed.stats.data_count -=
			scan->embed.stats.data_count;
		parent->bref.embed.stats.inode_count -=
			scan->embed.stats.inode_count;
		break;
	default:
		break;
	}

	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Pull live_zero back over any trailing empty slots.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == 0)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* which insertion path was taken (for debugging) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 * re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats for the inserted element.
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		parent->bref.embed.stats.inode_count += 1;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
		parent->bref.embed.stats.data_count +=
			elm->embed.stats.data_count;
		parent->bref.embed.stats.inode_count +=
			elm->embed.stats.inode_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live element is the cheap path.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	/* last key covered by the new element; overlap is a fatal error */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements to open a hole at the insertion point.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == 0) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == 0) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no key
	 * overlaps after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 *
 * NOTE(review): this entire section is compiled out (#if 0).
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * qsort comparator: empty (type == 0) elements sort to the end,
 * live elements sort ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == 0) {
		if (bref2->type == 0)
			return(0);
		return(1);
	} else if (bref2->type == 0) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}

/*
 * Read-only accessor for the chain's media data.  The data must already
 * be resolved (asserted).
 */
const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Writable accessor for the chain's media data.  The data must already
 * be resolved (asserted).
 */
hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 *
 * The check method is selected by the check bits encoded in
 * chain->bref.methods.  Also clears BREF_FLAG_ZERO since the chain now
 * carries real (checked) data.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * SHA-256 folded down to 192 bits: the last two
			 * 64-bit words are xor'd and the first 192 bits
			 * are stored.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Verify the chain's media data against its stored check code.
 *
 * Returns non-zero (pass) when the data matches, when the check method
 * is NONE/DISABLED/unknown, or when BREF_FLAG_ZERO indicates the block
 * is all-zero.  Returns 0 on a check failure (details are kprintf'd).
 */
int
hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
{
	uint32_t check32;
	uint64_t check64;
	int r;

	if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO)
		return 1;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		r = 1;
		break;
	case HAMMER2_CHECK_DISABLED:
		r = 1;
		break;
	case HAMMER2_CHECK_ISCSI32:
		check32 = hammer2_icrc32(bdata, chain->bytes);
		r = (chain->bref.check.iscsi32.value == check32);
		if (r == 0) {
			kprintf("chain %016jx.%02x meth=%02x CHECK FAIL "
				"(flags=%08x, bref/data %08x/%08x)\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.methods,
				chain->flags,
				chain->bref.check.iscsi32.value,
				check32);
		}
		hammer2_check_icrc32 += chain->bytes;
		break;
	case HAMMER2_CHECK_XXHASH64:
		check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		r = (chain->bref.check.xxhash64.value == check64);
		if (r == 0) {
			kprintf("chain %016jx.%02x key=%016jx "
				"meth=%02x CHECK FAIL "
				"(flags=%08x, bref/data %016jx/%016jx)\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.key,
				chain->bref.methods,
				chain->flags,
				chain->bref.check.xxhash64.value,
				check64);
		}
		hammer2_check_xxhash64 += chain->bytes;
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/* must match the folding in hammer2_chain_setcheck */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			if (bcmp(u.digest,
				 chain->bref.check.sha192.data,
				 sizeof(chain->bref.check.sha192.data)) == 0) {
				r = 1;
			} else {
				r = 0;
				kprintf("chain %016jx.%02x meth=%02x "
					"CHECK FAIL\n",
					chain->bref.data_off,
					chain->bref.type,
					chain->bref.methods);
			}
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		r = (chain->bref.check.freemap.icrc32 ==
		     hammer2_icrc32(bdata, chain->bytes));
		if (r == 0) {
			kprintf("chain %016jx.%02x meth=%02x "
				"CHECK FAIL\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.methods);
			kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
				chain->bref.check.freemap.icrc32,
				hammer2_icrc32(bdata, chain->bytes),
				chain->bytes);
			if (chain->dio)
				kprintf("dio %p buf %016jx,%d bdata %p/%p\n",
					chain->dio, chain->dio->bp->b_loffset,
					chain->dio->bp->b_bufsize, bdata,
					chain->dio->bp->b_data);
		}

		break;
	default:
		/*
		 * NOTE(review): message says "setcheck"; looks like a
		 * copy/paste artifact in the format string (runtime string
		 * left unchanged here).
		 */
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		r = 1;
		break;
	}
	return r;
}

/*
 * Acquire the chain and parent representing the specified inode for the
 * device at the specified cluster index.
 *
 * The flags passed in are LOOKUP flags, not RESOLVE flags.
 *
 * If we are unable to locate the hardlink, INVAL is returned and *chainp
 * will be NULL.  *parentp may still be set error or not, or NULL if the
 * parent itself could not be resolved.
 *
 * Caller must pass-in a valid or NULL *parentp or *chainp.  The passed-in
 * *parentp and *chainp will be unlocked if not NULL.
 */
int
hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
			 int clindex, int flags,
			 hammer2_chain_t **parentp, hammer2_chain_t **chainp)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *rchain;
	hammer2_key_t key_dummy;
	int resolve_flags;
	int error;

	resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ?
			HAMMER2_RESOLVE_SHARED : 0;

	/*
	 * Caller expects us to replace these.
	 */
	if (*chainp) {
		hammer2_chain_unlock(*chainp);
		hammer2_chain_drop(*chainp);
		*chainp = NULL;
	}
	if (*parentp) {
		hammer2_chain_unlock(*parentp);
		hammer2_chain_drop(*parentp);
		*parentp = NULL;
	}

	/*
	 * Inodes hang off of the iroot (bit 63 is clear, differentiating
	 * inodes from root directory entries in the key lookup).
5362 */ 5363 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 5364 rchain = NULL; 5365 if (parent) { 5366 rchain = hammer2_chain_lookup(&parent, &key_dummy, 5367 inum, inum, 5368 &error, flags); 5369 } else { 5370 error = HAMMER2_ERROR_EIO; 5371 } 5372 *parentp = parent; 5373 *chainp = rchain; 5374 5375 return error; 5376 } 5377 5378 /* 5379 * Used by the bulkscan code to snapshot the synchronized storage for 5380 * a volume, allowing it to be scanned concurrently against normal 5381 * operation. 5382 */ 5383 hammer2_chain_t * 5384 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 5385 { 5386 hammer2_chain_t *copy; 5387 5388 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 5389 copy->data = kmalloc(sizeof(copy->data->voldata), 5390 hmp->mchain, 5391 M_WAITOK | M_ZERO); 5392 hammer2_voldata_lock(hmp); 5393 copy->data->voldata = hmp->volsync; 5394 hammer2_voldata_unlock(hmp); 5395 5396 return copy; 5397 } 5398 5399 void 5400 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 5401 { 5402 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 5403 KKASSERT(copy->data); 5404 kfree(copy->data, copy->hmp->mchain); 5405 copy->data = NULL; 5406 atomic_add_long(&hammer2_chain_allocs, -1); 5407 hammer2_chain_drop(copy); 5408 } 5409 5410 /* 5411 * Create a snapshot of the specified (chain) with the specified label. 5412 * The originating hammer2_inode must be exclusively locked for 5413 * safety. The device's bulklk should be held by the caller. The caller 5414 * is responsible for synchronizing the filesystem to storage before 5415 * taking the snapshot. 
 */
int
hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp,
		       hammer2_tid_t mtid)
{
	hammer2_dev_t *hmp;
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_chain_t *nchain;
	hammer2_inode_t *nip;
	size_t name_len;
	hammer2_key_t lhc;
	struct vattr vat;
#if 0
	uuid_t opfs_clid;
#endif
	int error;

	/* note: pmp here is the ioctl argument structure, not a hammer2_pfs_t */
	kprintf("snapshot %s\n", pmp->name);

	name_len = strlen(pmp->name);
	/* NOTE(review): lhc is computed but never used below - dead store? */
	lhc = hammer2_dirhash(pmp->name, name_len);

	/*
	 * Get the clid
	 */
	ripdata = &chain->data->ipdata;
#if 0
	opfs_clid = ripdata->meta.pfs_clid;
#endif
	hmp = chain->hmp;

	/*
	 * Create the snapshot directory under the super-root
	 *
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 *
	 * Copy the (flushed) blockref array.  Theoretically we could use
	 * chain_duplicate() but it becomes difficult to disentangle
	 * the shared core so for now just brute-force it.
	 */
	VATTR_NULL(&vat);
	vat.va_type = VDIR;
	vat.va_mode = 0755;
	/*
	 * chain must be unlocked across the inode create to avoid a lock
	 * order reversal; it is re-locked immediately afterwards.
	 * NOTE(review): ripdata was captured before this unlock and is
	 * dereferenced again below after the relock - looks like it could
	 * go stale if chain->data is replaced in the window; confirm.
	 */
	hammer2_chain_unlock(chain);
	nip = hammer2_inode_create(hmp->spmp->iroot, hmp->spmp->iroot,
				   &vat, proc0.p_ucred,
				   pmp->name, name_len, 0,
				   1, 0, 0,
				   HAMMER2_INSERT_PFSROOT, &error);
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);

	if (nip) {
		hammer2_inode_modify(nip);
		nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS);
		error = hammer2_chain_modify(nchain, mtid, 0, 0);
		KKASSERT(error == 0);
		wipdata = &nchain->data->ipdata;

		/* mark the new inode as a MASTER pfs of SNAPSHOT subtype */
		nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER;
		nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT;
		nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT;
		kern_uuidgen(&nip->meta.pfs_fsid, 1);

#if 0
		/*
		 * Give the snapshot its own private cluster id.  As a
		 * snapshot no further synchronization with the original
		 * cluster will be done.
		 */
		if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
			nip->meta.pfs_clid = opfs_clid;
		else
			kern_uuidgen(&nip->meta.pfs_clid, 1);
#endif
		/* currently always a fresh private cluster id */
		kern_uuidgen(&nip->meta.pfs_clid, 1);
		nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;

		/* XXX hack blockset copy */
		/* XXX doesn't work with real cluster */
		wipdata->meta = nip->meta;
		wipdata->u.blockset = ripdata->u.blockset;

		hammer2_flush(nchain, 1);
		/* flush must not have replaced nchain's media data */
		KKASSERT(wipdata == &nchain->data->ipdata);
		hammer2_pfsalloc(nchain, wipdata, nchain->bref.modify_tid, 0);

		hammer2_chain_unlock(nchain);
		hammer2_chain_drop(nchain);
		hammer2_inode_chain_sync(nip);
		hammer2_inode_unlock(nip);
		hammer2_inode_run_sideq(hmp->spmp);
	}
	return (error);
}

/*
 * Returns non-zero if the chain (INODE or DIRENT) matches the
 * filename.
5518 */ 5519 int 5520 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 5521 size_t name_len) 5522 { 5523 const hammer2_inode_data_t *ripdata; 5524 const hammer2_dirent_head_t *den; 5525 5526 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5527 ripdata = &chain->data->ipdata; 5528 if (ripdata->meta.name_len == name_len && 5529 bcmp(ripdata->filename, name, name_len) == 0) { 5530 return 1; 5531 } 5532 } 5533 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 5534 chain->bref.embed.dirent.namlen == name_len) { 5535 den = &chain->bref.embed.dirent; 5536 if (name_len > sizeof(chain->bref.check.buf) && 5537 bcmp(chain->data->buf, name, name_len) == 0) { 5538 return 1; 5539 } 5540 if (name_len <= sizeof(chain->bref.check.buf) && 5541 bcmp(chain->bref.check.buf, name, name_len) == 0) { 5542 return 1; 5543 } 5544 } 5545 return 0; 5546 } 5547