1 /* 2 * Copyright (c) 2011-2019 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

/* Forward declarations for helpers defined later in this file. */
static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static void hammer2_chain_rename_obref(hammer2_chain_t **parentp,
		hammer2_chain_t *chain, hammer2_tid_t mtid,
		int flags, hammer2_blockref_t *obref);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

/*
 * There are many degenerate situations where an extreme rate of
 * console output can occur from warnings and errors.  Make sure this output
 * does not impede operations.
 */
static struct krate krate_h2chk = { .freq = 5 };
static struct krate krate_h2me = { .freq = 1 };
static struct krate krate_h2em = { .freq = 1 };

/*
 * Basic RBTree for chains (core.rbtree).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB-tree comparator for chains.  Returns -1 if chain1's key range lies
 * entirely below chain2's, +1 if entirely above, and 0 on any overlap.
 * Overlaps are not supposed to happen; counting an overlap as a match
 * catches such software issues early.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Compute the inclusive key range [beg, end] covered by each chain:
	 * (1 << keybits) keys starting at bref.key.
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)		/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)		/* fully to the right */
		return(1);
	return(0);			/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.  The volume
 * header and freemap chains are exempt because their data is embedded.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_chain_assert_no_data: chain %p still has data",
		    chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher operates using a
 * top-down recursion based on the ONFLUSH flag.  It locates MODIFIED and
 * UPDATE chains, flushes them, and updates blocks back to the volume root.
 *
 * This routine sets the ONFLUSH flag upward from the triggering chain until
 * it hits an inode root or the volume root.
 * Inode chains serve as inflection
 * points, requiring the flusher to bridge across trees.  Inodes include
 * regular inodes, PFS roots (pmp->iroot), and the media super root
 * (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			/* stop at an inode inflection point */
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/*
			 * Hand-over-hand: acquire the parent's shared
			 * spinlock before releasing the child's so the
			 * parent linkage stays stable during the walk.
			 */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	/* detect the 0->1 transition (chain was sitting at refs == 0) */
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if it encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* acquire parent core spinlock */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* adjust parent live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* fail on generation mismatch */

static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		 chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
392 */ 393 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain, 394 int depth); 395 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp); 396 397 void 398 hammer2_chain_drop(hammer2_chain_t *chain) 399 { 400 u_int refs; 401 402 if (hammer2_debug & 0x200000) 403 Debugger("drop"); 404 405 KKASSERT(chain->refs > 0); 406 407 while (chain) { 408 refs = chain->refs; 409 cpu_ccfence(); 410 KKASSERT(refs > 0); 411 412 if (refs == 1) { 413 if (hammer2_mtx_ex_try(&chain->lock) == 0) 414 chain = hammer2_chain_lastdrop(chain, 0); 415 /* retry the same chain, or chain from lastdrop */ 416 } else { 417 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 418 break; 419 /* retry the same chain */ 420 } 421 cpu_pause(); 422 } 423 } 424 425 /* 426 * Unhold a held and probably not-locked chain, ensure that the data is 427 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 428 * lock and then simply unlocking the chain. 429 */ 430 void 431 hammer2_chain_unhold(hammer2_chain_t *chain) 432 { 433 u_int lockcnt; 434 int iter = 0; 435 436 for (;;) { 437 lockcnt = chain->lockcnt; 438 cpu_ccfence(); 439 if (lockcnt > 1) { 440 if (atomic_cmpset_int(&chain->lockcnt, 441 lockcnt, lockcnt - 1)) { 442 break; 443 } 444 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 445 hammer2_chain_unlock(chain); 446 break; 447 } else { 448 /* 449 * This situation can easily occur on SMP due to 450 * the gap inbetween the 1->0 transition and the 451 * final unlock. We cannot safely block on the 452 * mutex because lockcnt might go above 1. 453 * 454 * XXX Sleep for one tick if it takes too long. 
455 */ 456 if (++iter > 1000) { 457 if (iter > 1000 + hz) { 458 kprintf("hammer2: h2race1 %p\n", chain); 459 iter = 1000; 460 } 461 tsleep(&iter, 0, "h2race1", 1); 462 } 463 cpu_pause(); 464 } 465 } 466 } 467 468 void 469 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 470 { 471 hammer2_chain_unhold(chain); 472 hammer2_chain_drop(chain); 473 } 474 475 void 476 hammer2_chain_rehold(hammer2_chain_t *chain) 477 { 478 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 479 atomic_add_int(&chain->lockcnt, 1); 480 hammer2_chain_unlock(chain); 481 } 482 483 /* 484 * Handles the (potential) last drop of chain->refs from 1->0. Called with 485 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 486 * possible against refs and lockcnt. We must dispose of the mutex on chain. 487 * 488 * This function returns an unlocked chain for recursive drop or NULL. It 489 * can return the same chain if it determines it has raced another ref. 490 * 491 * -- 492 * 493 * When two chains need to be recursively dropped we use the chain we 494 * would otherwise free to placehold the additional chain. It's a bit 495 * convoluted but we can't just recurse without potentially blowing out 496 * the kernel stack. 497 * 498 * The chain cannot be freed if it has any children. 499 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 500 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 501 * Any dedup registration can remain intact. 502 * 503 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 504 */ 505 static 506 hammer2_chain_t * 507 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 508 { 509 hammer2_pfs_t *pmp; 510 hammer2_dev_t *hmp; 511 hammer2_chain_t *parent; 512 hammer2_chain_t *rdrop; 513 514 /* 515 * We need chain's spinlock to interlock the sub-tree test. 516 * We already have chain's mutex, protecting chain->parent. 517 * 518 * Remember that chain->refs can be in flux. 
519 */ 520 hammer2_spin_ex(&chain->core.spin); 521 522 if (chain->parent != NULL) { 523 /* 524 * If the chain has a parent the UPDATE bit prevents scrapping 525 * as the chain is needed to properly flush the parent. Try 526 * to complete the 1->0 transition and return NULL. Retry 527 * (return chain) if we are unable to complete the 1->0 528 * transition, else return NULL (nothing more to do). 529 * 530 * If the chain has a parent the MODIFIED bit prevents 531 * scrapping. 532 * 533 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 534 */ 535 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 536 HAMMER2_CHAIN_MODIFIED)) { 537 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 538 hammer2_spin_unex(&chain->core.spin); 539 hammer2_chain_assert_no_data(chain); 540 hammer2_mtx_unlock(&chain->lock); 541 chain = NULL; 542 } else { 543 hammer2_spin_unex(&chain->core.spin); 544 hammer2_mtx_unlock(&chain->lock); 545 } 546 return (chain); 547 } 548 /* spinlock still held */ 549 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 550 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 551 /* 552 * Retain the static vchain and fchain. Clear bits that 553 * are not relevant. Do not clear the MODIFIED bit, 554 * and certainly do not put it on the delayed-flush queue. 555 */ 556 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 557 } else { 558 /* 559 * The chain has no parent and can be flagged for destruction. 560 * Since it has no parent, UPDATE can also be cleared. 561 */ 562 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 563 if (chain->flags & HAMMER2_CHAIN_UPDATE) 564 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 565 566 /* 567 * If the chain has children we must propagate the DESTROY 568 * flag downward and rip the disconnected topology apart. 569 * This is accomplished by calling hammer2_flush() on the 570 * chain. 571 * 572 * Any dedup is already handled by the underlying DIO, so 573 * we do not have to specifically flush it here. 
574 */ 575 if (chain->core.chain_count) { 576 hammer2_spin_unex(&chain->core.spin); 577 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 578 HAMMER2_FLUSH_ALL); 579 hammer2_mtx_unlock(&chain->lock); 580 581 return(chain); /* retry drop */ 582 } 583 584 /* 585 * Otherwise we can scrap the MODIFIED bit if it is set, 586 * and continue along the freeing path. 587 * 588 * Be sure to clean-out any dedup bits. Without a parent 589 * this chain will no longer be visible to the flush code. 590 * Easy check data_off to avoid the volume root. 591 */ 592 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 593 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 594 atomic_add_long(&hammer2_count_modified_chains, -1); 595 if (chain->pmp) 596 hammer2_pfs_memory_wakeup(chain->pmp, -1); 597 } 598 /* spinlock still held */ 599 } 600 601 /* spinlock still held */ 602 603 /* 604 * If any children exist we must leave the chain intact with refs == 0. 605 * They exist because chains are retained below us which have refs or 606 * may require flushing. 607 * 608 * Retry (return chain) if we fail to transition the refs to 0, else 609 * return NULL indication nothing more to do. 610 * 611 * Chains with children are NOT put on the LRU list. 612 */ 613 if (chain->core.chain_count) { 614 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 615 hammer2_spin_unex(&chain->core.spin); 616 hammer2_chain_assert_no_data(chain); 617 hammer2_mtx_unlock(&chain->lock); 618 chain = NULL; 619 } else { 620 hammer2_spin_unex(&chain->core.spin); 621 hammer2_mtx_unlock(&chain->lock); 622 } 623 return (chain); 624 } 625 /* spinlock still held */ 626 /* no chains left under us */ 627 628 /* 629 * chain->core has no children left so no accessors can get to our 630 * chain from there. Now we have to lock the parent core to interlock 631 * remaining possible accessors that might bump chain's refs before 632 * we can safely drop chain's refs with intent to free the chain. 
633 */ 634 hmp = chain->hmp; 635 pmp = chain->pmp; /* can be NULL */ 636 rdrop = NULL; 637 638 parent = chain->parent; 639 640 /* 641 * WARNING! chain's spin lock is still held here, and other spinlocks 642 * will be acquired and released in the code below. We 643 * cannot be making fancy procedure calls! 644 */ 645 646 /* 647 * We can cache the chain if it is associated with a pmp 648 * and not flagged as being destroyed or requesting a full 649 * release. In this situation the chain is not removed 650 * from its parent, i.e. it can still be looked up. 651 * 652 * We intentionally do not cache DATA chains because these 653 * were likely used to load data into the logical buffer cache 654 * and will not be accessed again for some time. 655 */ 656 if ((chain->flags & 657 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 658 chain->pmp && 659 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 660 if (parent) 661 hammer2_spin_ex(&parent->core.spin); 662 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 663 /* 664 * 1->0 transition failed, retry. Do not drop 665 * the chain's data yet! 666 */ 667 if (parent) 668 hammer2_spin_unex(&parent->core.spin); 669 hammer2_spin_unex(&chain->core.spin); 670 hammer2_mtx_unlock(&chain->lock); 671 672 return(chain); 673 } 674 675 /* 676 * Success 677 */ 678 hammer2_chain_assert_no_data(chain); 679 680 /* 681 * Make sure we are on the LRU list, clean up excessive 682 * LRU entries. We can only really drop one but there might 683 * be other entries that we can remove from the lru_list 684 * without dropping. 685 * 686 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 687 * chain->core.spin AND pmp->lru_spin are held, but 688 * can be safely cleared only holding pmp->lru_spin. 
689 */ 690 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 691 hammer2_spin_ex(&pmp->lru_spin); 692 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 693 atomic_set_int(&chain->flags, 694 HAMMER2_CHAIN_ONLRU); 695 TAILQ_INSERT_TAIL(&pmp->lru_list, 696 chain, lru_node); 697 atomic_add_int(&pmp->lru_count, 1); 698 } 699 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 700 depth = 1; /* disable lru_list flush */ 701 hammer2_spin_unex(&pmp->lru_spin); 702 } else { 703 /* disable lru flush */ 704 depth = 1; 705 } 706 707 if (parent) { 708 hammer2_spin_unex(&parent->core.spin); 709 parent = NULL; /* safety */ 710 } 711 hammer2_spin_unex(&chain->core.spin); 712 hammer2_mtx_unlock(&chain->lock); 713 714 /* 715 * lru_list hysteresis (see above for depth overrides). 716 * Note that depth also prevents excessive lastdrop recursion. 717 */ 718 if (depth == 0) 719 hammer2_chain_lru_flush(pmp); 720 721 return NULL; 722 /* NOT REACHED */ 723 } 724 725 /* 726 * Make sure we are not on the LRU list. 727 */ 728 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 729 hammer2_spin_ex(&pmp->lru_spin); 730 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 731 atomic_add_int(&pmp->lru_count, -1); 732 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 733 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 734 } 735 hammer2_spin_unex(&pmp->lru_spin); 736 } 737 738 /* 739 * Spinlock the parent and try to drop the last ref on chain. 740 * On success determine if we should dispose of the chain 741 * (remove the chain from its parent, etc). 742 * 743 * (normal core locks are top-down recursive but we define 744 * core spinlocks as bottom-up recursive, so this is safe). 745 */ 746 if (parent) { 747 hammer2_spin_ex(&parent->core.spin); 748 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 749 /* 750 * 1->0 transition failed, retry. 
751 */ 752 hammer2_spin_unex(&parent->core.spin); 753 hammer2_spin_unex(&chain->core.spin); 754 hammer2_mtx_unlock(&chain->lock); 755 756 return(chain); 757 } 758 759 /* 760 * 1->0 transition successful, parent spin held to prevent 761 * new lookups, chain spinlock held to protect parent field. 762 * Remove chain from the parent. 763 * 764 * If the chain is being removed from the parent's btree but 765 * is not bmapped, we have to adjust live_count downward. If 766 * it is bmapped then the blockref is retained in the parent 767 * as is its associated live_count. This case can occur when 768 * a chain added to the topology is unable to flush and is 769 * then later deleted. 770 */ 771 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 772 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 773 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 774 atomic_add_int(&parent->core.live_count, -1); 775 } 776 RB_REMOVE(hammer2_chain_tree, 777 &parent->core.rbtree, chain); 778 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 779 --parent->core.chain_count; 780 chain->parent = NULL; 781 } 782 783 /* 784 * If our chain was the last chain in the parent's core the 785 * core is now empty and its parent might have to be 786 * re-dropped if it has 0 refs. 787 */ 788 if (parent->core.chain_count == 0) { 789 rdrop = parent; 790 atomic_add_int(&rdrop->refs, 1); 791 /* 792 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 793 rdrop = NULL; 794 */ 795 } 796 hammer2_spin_unex(&parent->core.spin); 797 parent = NULL; /* safety */ 798 /* FALL THROUGH */ 799 } else { 800 /* 801 * No-parent case. 802 */ 803 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 804 /* 805 * 1->0 transition failed, retry. 806 */ 807 hammer2_spin_unex(&parent->core.spin); 808 hammer2_spin_unex(&chain->core.spin); 809 hammer2_mtx_unlock(&chain->lock); 810 811 return(chain); 812 } 813 } 814 815 /* 816 * Successful 1->0 transition, no parent, no children... no way for 817 * anyone to ref this chain any more. 
We can clean-up and free it. 818 * 819 * We still have the core spinlock, and core's chain_count is 0. 820 * Any parent spinlock is gone. 821 */ 822 hammer2_spin_unex(&chain->core.spin); 823 hammer2_chain_assert_no_data(chain); 824 hammer2_mtx_unlock(&chain->lock); 825 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 826 chain->core.chain_count == 0); 827 828 /* 829 * All locks are gone, no pointers remain to the chain, finish 830 * freeing it. 831 */ 832 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 833 HAMMER2_CHAIN_MODIFIED)) == 0); 834 835 /* 836 * Once chain resources are gone we can use the now dead chain 837 * structure to placehold what might otherwise require a recursive 838 * drop, because we have potentially two things to drop and can only 839 * return one directly. 840 */ 841 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 842 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 843 chain->hmp = NULL; 844 kfree(chain, hmp->mchain); 845 } 846 847 /* 848 * Possible chaining loop when parent re-drop needed. 849 */ 850 return(rdrop); 851 } 852 853 /* 854 * Heuristical flush of the LRU, try to reduce the number of entries 855 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 856 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 857 */ 858 static 859 void 860 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 861 { 862 hammer2_chain_t *chain; 863 864 again: 865 chain = NULL; 866 hammer2_spin_ex(&pmp->lru_spin); 867 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 868 /* 869 * Pick a chain off the lru_list, just recycle it quickly 870 * if LRUHINT is set (the chain was ref'd but left on 871 * the lru_list, so cycle to the end). 
872 */ 873 chain = TAILQ_FIRST(&pmp->lru_list); 874 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 875 876 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 877 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 878 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 879 chain = NULL; 880 continue; 881 } 882 883 /* 884 * Ok, we are off the LRU. We must adjust refs before we 885 * can safely clear the ONLRU flag. 886 */ 887 atomic_add_int(&pmp->lru_count, -1); 888 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 889 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 890 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 891 break; 892 } 893 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 894 chain = NULL; 895 } 896 hammer2_spin_unex(&pmp->lru_spin); 897 if (chain == NULL) 898 return; 899 900 /* 901 * If we picked a chain off the lru list we may be able to lastdrop 902 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 903 */ 904 while (chain) { 905 u_int refs; 906 907 refs = chain->refs; 908 cpu_ccfence(); 909 KKASSERT(refs > 0); 910 911 if (refs == 1) { 912 if (hammer2_mtx_ex_try(&chain->lock) == 0) 913 chain = hammer2_chain_lastdrop(chain, 1); 914 /* retry the same chain, or chain from lastdrop */ 915 } else { 916 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 917 break; 918 /* retry the same chain */ 919 } 920 cpu_pause(); 921 } 922 goto again; 923 } 924 925 /* 926 * On last lock release. 
927 */ 928 static hammer2_io_t * 929 hammer2_chain_drop_data(hammer2_chain_t *chain) 930 { 931 hammer2_io_t *dio; 932 933 if ((dio = chain->dio) != NULL) { 934 chain->dio = NULL; 935 chain->data = NULL; 936 } else { 937 switch(chain->bref.type) { 938 case HAMMER2_BREF_TYPE_VOLUME: 939 case HAMMER2_BREF_TYPE_FREEMAP: 940 break; 941 default: 942 if (chain->data != NULL) { 943 hammer2_spin_unex(&chain->core.spin); 944 panic("chain data not null: " 945 "chain %p bref %016jx.%02x " 946 "refs %d parent %p dio %p data %p", 947 chain, chain->bref.data_off, 948 chain->bref.type, chain->refs, 949 chain->parent, 950 chain->dio, chain->data); 951 } 952 KKASSERT(chain->data == NULL); 953 break; 954 } 955 } 956 return dio; 957 } 958 959 /* 960 * Lock a referenced chain element, acquiring its data with I/O if necessary, 961 * and specify how you would like the data to be resolved. 962 * 963 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 964 * 965 * The lock is allowed to recurse, multiple locking ops will aggregate 966 * the requested resolve types. Once data is assigned it will not be 967 * removed until the last unlock. 968 * 969 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 970 * (typically used to avoid device/logical buffer 971 * aliasing for data) 972 * 973 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 974 * the INITIAL-create state (indirect blocks only). 975 * 976 * Do not resolve data elements for DATA chains. 977 * (typically used to avoid device/logical buffer 978 * aliasing for data) 979 * 980 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 981 * 982 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 983 * it will be locked exclusive. 984 * 985 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 986 * the lock fails, EAGAIN is returned. 987 * 988 * NOTE: Embedded elements (volume header, inodes) are always resolved 989 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * Only one thread at a time performs the I/O; other threads set
	 * IOSIGNAL and sleep on chain->flags until woken by the winner.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(&chain->bref, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted versus the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			/* fresh I/O occurred; force a re-check below */
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * Clear INITIAL.  In this case we used io_new() and the buffer has
	 * been zero'd and marked dirty.
	 *
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap in between the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * Returns NULL (with *countp still set) when the chain is in the INITIAL
 * state and has no media-backed blockref array yet.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
 */
static
hammer2_blockref_t *
hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
{
	hammer2_blockref_t *base;
	int count;

	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		base = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_base_and_count: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	} else {
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			base = &parent->data->npdata[0];
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_base_and_count: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
	}
	*countp = count;

	return base;
}

/*
 * This counts the number of live blockrefs in a block array and
 * also calculates the point at which all remaining blockrefs are empty.
 * This routine can only be called on a live chain.
 *
 * Caller holds the chain locked, but possibly with a shared lock.  We
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards to locate the first index (from
			 * the end) containing a live blockref.  live_zero
			 * is the index just past it; everything at or
			 * beyond live_zero is empty.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			/*
			 * Accumulate the number of live blockrefs below
			 * live_zero into live_count (which may already
			 * hold adjustments from creates/deletes).
			 */
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volume root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size.
	 * nradix == 0 means "resize to 0 bytes" (disassociate storage).
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
1666 */ 1667 int 1668 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1669 hammer2_off_t dedup_off, int flags) 1670 { 1671 hammer2_blockref_t obref; 1672 hammer2_dev_t *hmp; 1673 hammer2_io_t *dio; 1674 int error; 1675 int wasinitial; 1676 int setmodified; 1677 int setupdate; 1678 int newmod; 1679 char *bdata; 1680 1681 hmp = chain->hmp; 1682 obref = chain->bref; 1683 KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0); 1684 KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE); 1685 1686 /* 1687 * Data is not optional for freemap chains (we must always be sure 1688 * to copy the data on COW storage allocations). 1689 */ 1690 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1691 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1692 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1693 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1694 } 1695 1696 /* 1697 * Data must be resolved if already assigned, unless explicitly 1698 * flagged otherwise. If we cannot safety load the data the 1699 * modification fails and we return early. 1700 */ 1701 if (chain->data == NULL && chain->bytes != 0 && 1702 (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1703 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1704 hammer2_chain_load_data(chain); 1705 if (chain->error) 1706 return (chain->error); 1707 } 1708 error = 0; 1709 1710 /* 1711 * Set MODIFIED to indicate that the chain has been modified. A new 1712 * allocation is required when modifying a chain. 1713 * 1714 * Set UPDATE to ensure that the blockref is updated in the parent. 1715 * 1716 * If MODIFIED is already set determine if we can reuse the assigned 1717 * data block or if we need a new data block. 1718 */ 1719 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1720 /* 1721 * Must set modified bit. 
1722 */ 1723 atomic_add_long(&hammer2_count_modified_chains, 1); 1724 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1725 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1726 setmodified = 1; 1727 1728 /* 1729 * We may be able to avoid a copy-on-write if the chain's 1730 * check mode is set to NONE and the chain's current 1731 * modify_tid is beyond the last explicit snapshot tid. 1732 * 1733 * This implements HAMMER2's overwrite-in-place feature. 1734 * 1735 * NOTE! This data-block cannot be used as a de-duplication 1736 * source when the check mode is set to NONE. 1737 */ 1738 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1739 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) && 1740 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1741 (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 && 1742 HAMMER2_DEC_CHECK(chain->bref.methods) == 1743 HAMMER2_CHECK_NONE && 1744 chain->pmp && 1745 chain->bref.modify_tid > 1746 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1747 /* 1748 * Sector overwrite allowed. 1749 */ 1750 newmod = 0; 1751 } else if ((hmp->hflags & HMNT2_EMERG) && 1752 chain->pmp && 1753 chain->bref.modify_tid > 1754 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1755 /* 1756 * If in emergency delete mode then do a modify-in- 1757 * place on any chain type belonging to the PFS as 1758 * long as it doesn't mess up a snapshot. We might 1759 * be forced to do this anyway a little further down 1760 * in the code if the allocation fails. 1761 * 1762 * Also note that in emergency mode, these modify-in- 1763 * place operations are NOT SAFE. A storage failure, 1764 * power failure, or panic can corrupt the filesystem. 1765 */ 1766 newmod = 0; 1767 } else { 1768 /* 1769 * Sector overwrite not allowed, must copy-on-write. 1770 */ 1771 newmod = 1; 1772 } 1773 } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) { 1774 /* 1775 * If the modified chain was registered for dedup we need 1776 * a new allocation. This only happens for delayed-flush 1777 * chains (i.e. 
which run through the front-end buffer 1778 * cache). 1779 */ 1780 newmod = 1; 1781 setmodified = 0; 1782 } else { 1783 /* 1784 * Already flagged modified, no new allocation is needed. 1785 */ 1786 newmod = 0; 1787 setmodified = 0; 1788 } 1789 1790 /* 1791 * Flag parent update required. 1792 */ 1793 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1794 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1795 setupdate = 1; 1796 } else { 1797 setupdate = 0; 1798 } 1799 1800 /* 1801 * The XOP code returns held but unlocked focus chains. This 1802 * prevents the chain from being destroyed but does not prevent 1803 * it from being modified. diolk is used to interlock modifications 1804 * against XOP frontend accesses to the focus. 1805 * 1806 * This allows us to theoretically avoid deadlocking the frontend 1807 * if one of the backends lock up by not formally locking the 1808 * focused chain in the frontend. In addition, the synchronization 1809 * code relies on this mechanism to avoid deadlocking concurrent 1810 * synchronization threads. 1811 */ 1812 lockmgr(&chain->diolk, LK_EXCLUSIVE); 1813 1814 /* 1815 * The modification or re-modification requires an allocation and 1816 * possible COW. If an error occurs, the previous content and data 1817 * reference is retained and the modification fails. 1818 * 1819 * If dedup_off is non-zero, the caller is requesting a deduplication 1820 * rather than a modification. The MODIFIED bit is not set and the 1821 * data offset is set to the deduplication offset. The data cannot 1822 * be modified. 1823 * 1824 * NOTE: The dedup offset is allowed to be in a partially free state 1825 * and we must be sure to reset it to a fully allocated state 1826 * to force two bulkfree passes to free it again. 1827 * 1828 * NOTE: Only applicable when chain->bytes != 0. 1829 * 1830 * XXX can a chain already be marked MODIFIED without a data 1831 * assignment? If not, assert here instead of testing the case. 
1832 */ 1833 if (chain != &hmp->vchain && chain != &hmp->fchain && 1834 chain->bytes) { 1835 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1836 newmod 1837 ) { 1838 /* 1839 * NOTE: We do not have to remove the dedup 1840 * registration because the area is still 1841 * allocated and the underlying DIO will 1842 * still be flushed. 1843 */ 1844 if (dedup_off) { 1845 chain->bref.data_off = dedup_off; 1846 chain->bytes = 1 << (dedup_off & 1847 HAMMER2_OFF_MASK_RADIX); 1848 chain->error = 0; 1849 atomic_clear_int(&chain->flags, 1850 HAMMER2_CHAIN_MODIFIED); 1851 atomic_add_long(&hammer2_count_modified_chains, 1852 -1); 1853 if (chain->pmp) { 1854 hammer2_pfs_memory_wakeup( 1855 chain->pmp, -1); 1856 } 1857 hammer2_freemap_adjust(hmp, &chain->bref, 1858 HAMMER2_FREEMAP_DORECOVER); 1859 atomic_set_int(&chain->flags, 1860 HAMMER2_CHAIN_DEDUPABLE); 1861 } else { 1862 error = hammer2_freemap_alloc(chain, 1863 chain->bytes); 1864 atomic_clear_int(&chain->flags, 1865 HAMMER2_CHAIN_DEDUPABLE); 1866 1867 /* 1868 * If we are unable to allocate a new block 1869 * but we are in emergency mode, issue a 1870 * warning to the console and reuse the same 1871 * block. 1872 * 1873 * We behave as if the allocation were 1874 * successful. 1875 * 1876 * THIS IS IMPORTANT: These modifications 1877 * are virtually guaranteed to corrupt any 1878 * snapshots related to this filesystem. 1879 */ 1880 if (error && (hmp->hflags & HMNT2_EMERG)) { 1881 error = 0; 1882 chain->bref.flags |= 1883 HAMMER2_BREF_FLAG_EMERG_MIP; 1884 1885 krateprintf(&krate_h2em, 1886 "hammer2: Emergency Mode WARNING: " 1887 "Operation will likely corrupt " 1888 "related snapshot: " 1889 "%016jx.%02x key=%016jx\n", 1890 chain->bref.data_off, 1891 chain->bref.type, 1892 chain->bref.key); 1893 } else if (error == 0) { 1894 chain->bref.flags &= 1895 ~HAMMER2_BREF_FLAG_EMERG_MIP; 1896 } 1897 } 1898 } 1899 } 1900 1901 /* 1902 * Stop here if error. We have to undo any flag bits we might 1903 * have set above. 
1904 */ 1905 if (error) { 1906 if (setmodified) { 1907 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1908 atomic_add_long(&hammer2_count_modified_chains, -1); 1909 if (chain->pmp) 1910 hammer2_pfs_memory_wakeup(chain->pmp, -1); 1911 } 1912 if (setupdate) { 1913 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1914 } 1915 lockmgr(&chain->diolk, LK_RELEASE); 1916 1917 return error; 1918 } 1919 1920 /* 1921 * Update mirror_tid and modify_tid. modify_tid is only updated 1922 * if not passed as zero (during flushes, parent propagation passes 1923 * the value 0). 1924 * 1925 * NOTE: chain->pmp could be the device spmp. 1926 */ 1927 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1928 if (mtid) 1929 chain->bref.modify_tid = mtid; 1930 1931 /* 1932 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1933 * requires updating as well as to tell the delete code that the 1934 * chain's blockref might not exactly match (in terms of physical size 1935 * or block offset) the one in the parent's blocktable. The base key 1936 * of course will still match. 1937 */ 1938 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1939 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1940 1941 /* 1942 * Short-cut data blocks which the caller does not need an actual 1943 * data reference to (aka OPTDATA), as long as the chain does not 1944 * already have a data pointer to the data. This generally means 1945 * that the modifications are being done via the logical buffer cache. 1946 * The INITIAL flag relates only to the device data buffer and thus 1947 * remains unchange in this situation. 1948 * 1949 * This code also handles bytes == 0 (most dirents). 
1950 */ 1951 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1952 (flags & HAMMER2_MODIFY_OPTDATA) && 1953 chain->data == NULL) { 1954 KKASSERT(chain->dio == NULL); 1955 goto skip2; 1956 } 1957 1958 /* 1959 * Clearing the INITIAL flag (for indirect blocks) indicates that 1960 * we've processed the uninitialized storage allocation. 1961 * 1962 * If this flag is already clear we are likely in a copy-on-write 1963 * situation but we have to be sure NOT to bzero the storage if 1964 * no data is present. 1965 */ 1966 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1967 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1968 wasinitial = 1; 1969 } else { 1970 wasinitial = 0; 1971 } 1972 1973 /* 1974 * Instantiate data buffer and possibly execute COW operation 1975 */ 1976 switch(chain->bref.type) { 1977 case HAMMER2_BREF_TYPE_VOLUME: 1978 case HAMMER2_BREF_TYPE_FREEMAP: 1979 /* 1980 * The data is embedded, no copy-on-write operation is 1981 * needed. 1982 */ 1983 KKASSERT(chain->dio == NULL); 1984 break; 1985 case HAMMER2_BREF_TYPE_DIRENT: 1986 /* 1987 * The data might be fully embedded. 1988 */ 1989 if (chain->bytes == 0) { 1990 KKASSERT(chain->dio == NULL); 1991 break; 1992 } 1993 /* fall through */ 1994 case HAMMER2_BREF_TYPE_INODE: 1995 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1996 case HAMMER2_BREF_TYPE_DATA: 1997 case HAMMER2_BREF_TYPE_INDIRECT: 1998 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1999 /* 2000 * Perform the copy-on-write operation 2001 * 2002 * zero-fill or copy-on-write depending on whether 2003 * chain->data exists or not and set the dirty state for 2004 * the new buffer. hammer2_io_new() will handle the 2005 * zero-fill. 2006 * 2007 * If a dedup_off was supplied this is an existing block 2008 * and no COW, copy, or further modification is required. 
2009 */ 2010 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 2011 2012 if (wasinitial && dedup_off == 0) { 2013 error = hammer2_io_new(hmp, chain->bref.type, 2014 chain->bref.data_off, 2015 chain->bytes, &dio); 2016 } else { 2017 error = hammer2_io_bread(hmp, chain->bref.type, 2018 chain->bref.data_off, 2019 chain->bytes, &dio); 2020 } 2021 hammer2_adjreadcounter(&chain->bref, chain->bytes); 2022 2023 /* 2024 * If an I/O error occurs make sure callers cannot accidently 2025 * modify the old buffer's contents and corrupt the filesystem. 2026 * 2027 * NOTE: hammer2_io_data() call issues bkvasync() 2028 */ 2029 if (error) { 2030 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 2031 hmp); 2032 chain->error = HAMMER2_ERROR_EIO; 2033 hammer2_io_brelse(&dio); 2034 hammer2_io_brelse(&chain->dio); 2035 chain->data = NULL; 2036 break; 2037 } 2038 chain->error = 0; 2039 bdata = hammer2_io_data(dio, chain->bref.data_off); 2040 2041 if (chain->data) { 2042 /* 2043 * COW (unless a dedup). 2044 */ 2045 KKASSERT(chain->dio != NULL); 2046 if (chain->data != (void *)bdata && dedup_off == 0) { 2047 bcopy(chain->data, bdata, chain->bytes); 2048 } 2049 } else if (wasinitial == 0) { 2050 /* 2051 * We have a problem. We were asked to COW but 2052 * we don't have any data to COW with! 2053 */ 2054 panic("hammer2_chain_modify: having a COW %p\n", 2055 chain); 2056 } 2057 2058 /* 2059 * Retire the old buffer, replace with the new. Dirty or 2060 * redirty the new buffer. 2061 * 2062 * WARNING! The system buffer cache may have already flushed 2063 * the buffer, so we must be sure to [re]dirty it 2064 * for further modification. 2065 * 2066 * If dedup_off was supplied, the caller is not 2067 * expected to make any further modification to the 2068 * buffer. 2069 * 2070 * WARNING! hammer2_get_gdata() assumes dio never transitions 2071 * through NULL in order to optimize away unnecessary 2072 * diolk operations. 
	 */
	{
		hammer2_io_t *tio;

		/*
		 * Release the old dio (if any) only after the new one is
		 * in place so the dio field never transitions through NULL
		 * (see WARNING above re: hammer2_get_gdata()).
		 */
		if ((tio = chain->dio) != NULL)
			hammer2_io_bqrelse(&tio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		if (dedup_off == 0)
			hammer2_io_setdirty(dio);
	}
	break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}

skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.
 *
 * Marks the inode itself modified (so its meta-data will be synchronized
 * back to the chain), then runs the normal chain modification path on
 * (chain).  Returns the hammer2_chain_modify() error code.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume header chain (vchain) MODIFIED.  The modified-chain
 * counter and pmp memory accounting are only adjusted on the 0->1
 * transition of the flag.
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.
 * *key_nextp is only set if the iteration value is less than the current
 * value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING!  Must be called with child's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best match so far */
	hammer2_key_t		key_beg;	/* requested range (incl) */
	hammer2_key_t		key_end;
	hammer2_key_t		key_next;	/* iteration aid (output) */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN range-compare: -1 if the child lies entirely below the requested
 * range, +1 if entirely above, 0 if it overlaps (callback will run).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits)
		    - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback for each child overlapping the requested range.
 * Tracks the nearest-keyed child in info->best and truncates
 * info->key_next so the caller knows where the next iteration begins.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root gets a NULL pmp here; otherwise the new chain
	 * inherits the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* lost the insertion race, dispose of our chain */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.
 *
 * _init returns (parent) referenced and locked (shared or exclusive
 * depending on HAMMER2_LOOKUP_SHARED); _done releases that lock+ref.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.
 * Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * (NONBLOCK avoids the lock-order issue entirely when it works).
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/* lock parent first, then re-lock chain (lock order) */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.
 * We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		/* fast path, no lock-order reversal needed */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent.
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		/* reptrack migrated while we blocked, chase the new parent */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
2549 * No children so remain as-of when this is called so we can test 2550 * core.reptrack without holding the spin-lock. 2551 * 2552 * Used whenever the caller intends to permanently delete chains related 2553 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE), 2554 * where the chains underneath the node being deleted are given a new parent 2555 * above the node being deleted. 2556 */ 2557 static 2558 void 2559 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2560 { 2561 struct hammer2_reptrack *reptrack; 2562 2563 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2564 while (chain->core.reptrack) { 2565 hammer2_spin_ex(&parent->core.spin); 2566 hammer2_spin_ex(&chain->core.spin); 2567 reptrack = chain->core.reptrack; 2568 if (reptrack == NULL) { 2569 hammer2_spin_unex(&chain->core.spin); 2570 hammer2_spin_unex(&parent->core.spin); 2571 break; 2572 } 2573 hammer2_spin_ex(&reptrack->spin); 2574 chain->core.reptrack = reptrack->next; 2575 reptrack->chain = parent; 2576 reptrack->next = parent->core.reptrack; 2577 parent->core.reptrack = reptrack; 2578 hammer2_chain_ref(parent); /* reptrack */ 2579 2580 hammer2_spin_unex(&chain->core.spin); 2581 hammer2_spin_unex(&parent->core.spin); 2582 kprintf("hammer2: debug repchange %p %p->%p\n", 2583 reptrack, chain, parent); 2584 hammer2_chain_drop(chain); /* reptrack */ 2585 } 2586 } 2587 2588 /* 2589 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2590 * (*parentp) typically points to an inode but can also point to a related 2591 * indirect block and this function will recurse upwards and find the inode 2592 * or the nearest undeleted indirect block covering the key range. 2593 * 2594 * This function unconditionally sets *errorp, replacing any previous value. 
 *
 * (*parentp) must be exclusive or shared locked (depending on flags) and
 * referenced and can be an inode or an existing indirect block within the
 * inode.
 *
 * If (*parent) is errored out, this function will not attempt to recurse
 * the radix tree and will return NULL along with an appropriate *errorp.
 * If NULL is returned and *errorp is 0, the requested lookup could not be
 * located.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.
 *
 * The new (*parentp) will be locked shared or exclusive (depending on flags),
 * and referenced, and the old will be unlocked and dereferenced (no change
 * if they are both the same).  This is particularly important if the caller
 * wishes to insert a new chain, (*parentp) will be set properly even if NULL
 * is returned, as long as no error occurred.
 *
 * The matching chain will be returned locked according to flags.
 *
 * --
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the HAMMER2_RESOLVE_*
	 * modes used for the various lock acquisitions below.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		/* chain was moved or modified while we blocked, retry */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE!  A chain->error must be tested by the caller upon return.
	 *	  *errorp is only set based on issues which occur while
	 *	  trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.
 * This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* resume just past the previous chain's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	/* NOTE(review): hmp is not used further in this function */
	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		/* key wrapped to 0: the keyspace is exhausted */
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/*
	 * Snapshot the parent's generation while still holding its
	 * spinlock; the revalidation below catches racing changes made
	 * after the spinlock is released.
	 */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	/*
	 * Revalidate the chain after acquiring its lock: the copied bref
	 * and the chain's parent must still match, otherwise the chain
	 * raced an update and the scan must be retried.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).  The function will
 * adjust (*parentp) as necessary, create or connect the chain, and
 * return an exclusively locked chain in *chainp.
 *
 * When creating a PFSROOT inode under the super-root, pmp is typically NULL
 * and will be reassigned.
 *
 * NOTE: returns HAMMER2_ERROR_* flags
 */
int
hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
		     hammer2_key_t key, int keybits, int type, size_t bytes,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	hammer2_blockref_t *base;
	hammer2_blockref_t dummy;
	int allocated = 0;
	int error = 0;
	int count;
	int maxloops = 300000;

	/*
	 * Topology may be crossing a PFS boundary.
	 */
	parent = *parentp;
	if (parent) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->error == 0);
		hmp = parent->hmp;
	}
	chain = *chainp;

	if (chain == NULL) {
		/*
		 * First allocate media space and construct the dummy bref,
		 * then allocate the in-memory chain structure.  Set the
		 * INITIAL flag for fresh chains which do not have embedded
		 * data.
		 *
		 * XXX for now set the check mode of the child based on
		 *     the parent or, if the parent is an inode, the
		 *     specification in the inode.
		 */
		bzero(&dummy, sizeof(dummy));
		dummy.type = type;
		dummy.key = key;
		dummy.keybits = keybits;
		dummy.data_off = hammer2_getradix(bytes);

		/*
		 * Inherit methods from parent by default.  Primarily used
		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
		 * a non-NONE check algorithm.
		 */
		if (methods == -1)
			dummy.methods = parent->bref.methods;
		else
			dummy.methods = (uint8_t)methods;

		if (type != HAMMER2_BREF_TYPE_DATA &&
		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
			dummy.methods |=
				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
		}

		chain = hammer2_chain_alloc(hmp, pmp, &dummy);

		/*
		 * Lock the chain manually, chain_lock will load the chain
		 * which we do NOT want to do.  (note: chain->refs is set
		 * to 1 by chain_alloc() for us, but lockcnt is not).
		 */
		chain->lockcnt = 1;
		hammer2_mtx_ex(&chain->lock);
		allocated = 1;

		/*
		 * Set INITIAL to optimize I/O.  The flag will generally be
		 * processed when we call hammer2_chain_modify().
		 *
		 * Recalculate bytes to reflect the actual media block
		 * allocation.  Handle special case radix 0 == 0 bytes.
		 *
		 * NOTE(review): the (hammer2_off_t) cast keeps the shift
		 *		 64-bit wide before storing into the size_t
		 *		 'bytes' — presumably size_t is 64-bit on
		 *		 the supported platforms; confirm.
		 */
		bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
		if (bytes)
			bytes = (hammer2_off_t)1 << bytes;
		chain->bytes = bytes;

		switch(type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			panic("hammer2_chain_create: called with volume type");
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			panic("hammer2_chain_create: cannot be used to"
			      "create indirect block");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			panic("hammer2_chain_create: cannot be used to"
			      "create freemap root or node");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			KKASSERT(bytes == sizeof(chain->data->bmdata));
			/* fall through */
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_DATA:
		default:
			/*
			 * leave chain->data NULL, set INITIAL
			 */
			KKASSERT(chain->data == NULL);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
			break;
		}
	} else {
		/*
		 * We are reattaching a previously deleted chain, possibly
		 * under a new parent and possibly with a new key/keybits.
		 * The chain does not have to be in a modified state.  The
		 * UPDATE flag will be set later on in this routine.
		 *
		 * Do NOT mess with the current state of the INITIAL flag.
		 */
		chain->bref.key = key;
		chain->bref.keybits = keybits;
		if (chain->flags & HAMMER2_CHAIN_DELETED)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		KKASSERT(chain->parent == NULL);
	}

	/*
	 * Set the appropriate bref flag if requested.
	 *
	 * NOTE! Callers can call this function to move chains without
	 *	 knowing about special flags, so don't clear bref flags
	 *	 here!
	 */
	if (flags & HAMMER2_INSERT_PFSROOT)
		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;

	if (parent == NULL)
		goto skip;

	/*
	 * Calculate how many entries we have in the blockref array and
	 * determine if an indirect block is required when inserting into
	 * the parent.
	 */
again:
	if (--maxloops == 0)
		panic("hammer2_chain_create: maxloops");

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * NOTE(review): parent->data is dereferenced here before
		 *		 the KKASSERT(parent->data != NULL) below;
		 *		 a NULL parent->data would fault first.
		 */
		if ((parent->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
			kprintf("hammer2: parent set for direct-data! "
				"pkey=%016jx ckey=%016jx\n",
				parent->bref.key,
				chain->bref.key);
	        }
		KKASSERT((parent->data->ipdata.meta.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
		KKASSERT(parent->data != NULL);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(parent->data != NULL);
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(parent->data != NULL);
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_create: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;
		count = 0;
		break;
	}

	/*
	 * Make sure we've counted the brefs
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	KASSERT(parent->core.live_count >= 0 &&
		parent->core.live_count <= count,
		("bad live_count %d/%d (%02x, %d)",
		 parent->core.live_count, count,
		 parent->bref.type, parent->bytes));

	/*
	 * If no free blockref could be found we must create an indirect
	 * block and move a number of blockrefs into it.  With the parent
	 * locked we can safely lock each child in order to delete+duplicate
	 * it without causing a deadlock.
	 *
	 * This may return the new indirect block or the old parent depending
	 * on where the key falls.  NULL is returned on error.
	 */
	if (parent->core.live_count == count) {
		hammer2_chain_t *nparent;

		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);

		nparent = hammer2_chain_create_indirect(parent, key, keybits,
							mtid, type, &error);
		if (nparent == NULL) {
			if (allocated)
				hammer2_chain_drop(chain);
			chain = NULL;
			goto done;
		}
		if (parent != nparent) {
			hammer2_chain_unlock(parent);
			hammer2_chain_drop(parent);
			parent = *parentp = nparent;
		}
		goto again;
	}

	/*
	 * fall through if parent, or skip to here if no parent.
	 */
skip:
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		kprintf("Inserting deleted chain @%016jx\n",
			chain->bref.key);

	/*
	 * Link the chain into its parent.
	 */
	if (chain->parent != NULL)
		panic("hammer2: hammer2_chain_create: chain already connected");
	KKASSERT(chain->parent == NULL);
	if (parent) {
		KKASSERT(parent->core.live_count < count);
		hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_LIVE,
				     0);
	}

	if (allocated) {
		/*
		 * Mark the newly created chain modified.  This will cause
		 * UPDATE to be set and process the INITIAL flag.
		 *
		 * Device buffers are not instantiated for DATA elements
		 * as these are handled by logical buffers.
		 *
		 * Indirect and freemap node indirect blocks are handled
		 * by hammer2_chain_create_indirect() and not by this
		 * function.
		 *
		 * Data for all other bref types is expected to be
		 * instantiated (INODE, LEAF).
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
			error = hammer2_chain_modify(chain, mtid, dedup_off,
						     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			/*
			 * Remaining types are not supported by this function.
			 * In particular, INDIRECT and LEAF_NODE types are
			 * handled by create_indirect().
			 */
			panic("hammer2_chain_create: bad type: %d",
			      chain->bref.type);
			/* NOT REACHED */
			break;
		}
	} else {
		/*
		 * When reconnecting a chain we must set UPDATE and
		 * setflush so the flush recognizes that it must update
		 * the bref in the parent.
		 */
		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 */
	if (parent)
		hammer2_chain_setflush(parent);

done:
	*chainp = chain;

	return (error);
}

/*
 * Move the chain from its old parent to a new parent.  The chain must have
 * already been deleted or already disconnected (or never associated) with
 * a parent.  The chain is reassociated with the new parent and the deleted
 * flag will be cleared (no longer deleted).  The chain's modification state
 * is not altered.
3672 * 3673 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3674 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3675 * FULL. This typically means that the caller is creating the chain after 3676 * doing a hammer2_chain_lookup(). 3677 * 3678 * Neither (parent) or (chain) can be errored. 3679 * 3680 * If (parent) is non-NULL then the chain is inserted under the parent. 3681 * 3682 * If (parent) is NULL then the newly duplicated chain is not inserted 3683 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3684 * passing into hammer2_chain_create() after this function returns). 3685 * 3686 * WARNING! This function calls create which means it can insert indirect 3687 * blocks. This can cause other unrelated chains in the parent to 3688 * be moved to a newly inserted indirect block in addition to the 3689 * specific chain. 3690 */ 3691 void 3692 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3693 hammer2_tid_t mtid, int flags) 3694 { 3695 hammer2_blockref_t *bref; 3696 hammer2_dev_t *hmp; 3697 hammer2_chain_t *parent; 3698 size_t bytes; 3699 3700 /* 3701 * WARNING! We should never resolve DATA to device buffers 3702 * (XXX allow it if the caller did?), and since 3703 * we currently do not have the logical buffer cache 3704 * buffer in-hand to fix its cached physical offset 3705 * we also force the modify code to not COW it. XXX 3706 * 3707 * NOTE! We allow error'd chains to be renamed. The bref itself 3708 * is good and can be renamed. The content, however, may 3709 * be inaccessible. 3710 */ 3711 hmp = chain->hmp; 3712 KKASSERT(chain->parent == NULL); 3713 /*KKASSERT(chain->error == 0); allow */ 3714 3715 /* 3716 * Now create a duplicate of the chain structure, associating 3717 * it with the same core, making it the same size, pointing it 3718 * to the same bref (the same media block). 3719 * 3720 * NOTE: Handle special radix == 0 case (means 0 bytes). 
3721 */ 3722 bref = &chain->bref; 3723 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3724 if (bytes) 3725 bytes = (hammer2_off_t)1 << bytes; 3726 3727 /* 3728 * If parent is not NULL the duplicated chain will be entered under 3729 * the parent and the UPDATE bit set to tell flush to update 3730 * the blockref. 3731 * 3732 * We must setflush(parent) to ensure that it recurses through to 3733 * chain. setflush(chain) might not work because ONFLUSH is possibly 3734 * already set in the chain (so it won't recurse up to set it in the 3735 * parent). 3736 * 3737 * Having both chains locked is extremely important for atomicy. 3738 */ 3739 if (parentp && (parent = *parentp) != NULL) { 3740 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3741 KKASSERT(parent->refs > 0); 3742 KKASSERT(parent->error == 0); 3743 3744 hammer2_chain_create(parentp, &chain, NULL, chain->pmp, 3745 HAMMER2_METH_DEFAULT, 3746 bref->key, bref->keybits, bref->type, 3747 chain->bytes, mtid, 0, flags); 3748 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3749 hammer2_chain_setflush(*parentp); 3750 } 3751 } 3752 3753 /* 3754 * This works in tandem with delete_obref() to install a blockref in 3755 * (typically) an indirect block that is associated with the chain being 3756 * moved to *parentp. 3757 * 3758 * The reason we need this function is that the caller needs to maintain 3759 * the blockref as it was, and not generate a new blockref for what might 3760 * be a modified chain. Otherwise stuff will leak into the flush that 3761 * the flush code's FLUSH_INODE_STOP flag is unable to catch. 3762 * 3763 * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and 3764 * CHAIN_UPDATE. We must set BMAPUPD if the bref does not match, and 3765 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if 3766 * it does. Otherwise we can end up in a situation where H2 is unable to 3767 * clean up the in-memory chain topology. 
3768 * 3769 * The reason for this is that flushes do not generally flush through 3770 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq 3771 * or sideq to properly flush and dispose of the related inode chain's flags. 3772 * Situations where the inode is not actually modified by the frontend, 3773 * but where we have to move the related chains around as we insert or cleanup 3774 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory 3775 * inode chain that does not have a hammer2_inode_t associated with it. 3776 */ 3777 void 3778 hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3779 hammer2_tid_t mtid, int flags, 3780 hammer2_blockref_t *obref) 3781 { 3782 hammer2_chain_rename(parentp, chain, mtid, flags); 3783 3784 if (obref->type != HAMMER2_BREF_TYPE_EMPTY) { 3785 hammer2_blockref_t *tbase; 3786 int tcount; 3787 3788 KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0); 3789 hammer2_chain_modify(*parentp, mtid, 0, 0); 3790 tbase = hammer2_chain_base_and_count(*parentp, &tcount); 3791 hammer2_base_insert(*parentp, tbase, tcount, chain, obref); 3792 if (bcmp(obref, &chain->bref, sizeof(chain->bref))) { 3793 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD | 3794 HAMMER2_CHAIN_UPDATE); 3795 } else { 3796 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3797 } 3798 } 3799 } 3800 3801 /* 3802 * Helper function for deleting chains. 3803 * 3804 * The chain is removed from the live view (the RBTREE) as well as the parent's 3805 * blockmap. Both chain and its parent must be locked. 3806 * 3807 * parent may not be errored. chain can be errored. 
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	hammer2_dev_t *hmp;
	int error = 0;

	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
				  HAMMER2_CHAIN_FICTITIOUS)) == 0);
	KKASSERT(chain->parent == parent);
	/* NOTE(review): hmp is not used further in this helper */
	hmp = chain->hmp;

	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 *
		 * Spinlock order: chain's core spin is acquired before the
		 * parent's core spin (both removal paths below do the same).
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
/*
 * Create a new indirect block under (parent) covering (create_key,
 * create_bits) and move the matching brefs/chains from the parent into it.
 *
 * Returns the chain to use as the parent for the pending insertion: the
 * new indirect block if (create_key, create_bits) falls inside its range,
 * otherwise the original parent.  Returns NULL with *errorp set on failure.
 */
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * of the volume.  Because freemap blocks are taken from the reserved
	 * areas of the volume our goal is efficiency (fewer levels) and not
	 * so much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);
	/* inherit the parent's check method, but never compress */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key,
			       (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bsave = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a change to (sub) while unlocked, retry */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest (key, keybits) seen so far */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest (key, keybits) seen so far */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 *
	 * NOTE(review): this clamps nradix alone, not (keybits + nradix);
	 * looks like the intent is to cap the returned radix at 63 —
	 * confirm against callers if keybits can be non-zero here.
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE.
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key,
				  (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half with the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half with the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, NULL);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
	 *
	 * NOTE: We do not setflush the chain unless the deletion is
	 *	 permanent, since the deletion of a chain does not actually
	 *	 require it to be flushed.
	 */
	if (error == 0) {
		if (flags & HAMMER2_DELETE_PERMANENT) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
			hammer2_chain_setflush(chain);
		}
	}

	return error;
}

/*
 * Same as hammer2_chain_delete() but also returns the deleted chain's
 * blockref in *obref.  *obref->type is pre-set to HAMMER2_BREF_TYPE_EMPTY
 * and is only filled in by the delete helper when the chain was not
 * already marked DELETED.
 */
static int
hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	obref->type = HAMMER2_BREF_TYPE_EMPTY;
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, obref);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
	 *
	 * NOTE: We do not setflush the chain unless the deletion is
	 *	 permanent, since the deletion of a chain does not actually
	 *	 require it to be flushed.
	 */
	if (error == 0) {
		if (flags & HAMMER2_DELETE_PERMANENT) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
			hammer2_chain_setflush(chain);
		}
	}

	return error;
}

/*
 * Returns the index of the nearest element in the blockref array >= elm.
 * Returns (count) if no element could be found.
 *
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * WARNING!
 *	Must be called with parent's spinlock held.  Spinlock remains
 *	held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chains already have their cores counted
	 * (COUNTEDBREFS set) so live_zero below is valid and we can
	 * optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 *
	 * cache_index is read unlocked; a stale value only costs us a
	 * longer scan, never an incorrect result.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards until we land on or before an element whose
	 * key does not exceed key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	       scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			/* past live_zero, nothing live remains */
			i = count;
		} else {
			/*
			 * Tighten *key_nextp to just past this element,
			 * but never loosen it.
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain matched but the block array did, NULL is
 * returned and a non-NULL bref is set in *bresp.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	default:
		break;
	}

	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* debug tag recorded in the validate panic below */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats (mirror of the decrements in hammer2_base_delete).
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		/*
		 * NOTE(review): message names hammer2_chain_lookup but this
		 * is hammer2_base_sort (copy-paste); code is compiled out
		 * under #if 0 so the string is left untouched here.
		 */
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	/* Short bounded sleep; used as a polling backoff by callers */
	tsleep(chain, 0, "chnflw", 1);
}

/* Read-only accessor for a chain's resolved media data */
const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/* Writable accessor for a chain's resolved media data */
hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * 192-bit check field: fold the last 64 bits of the
			 * SHA-256 digest into the third quadword so the
			 * whole digest contributes to the stored 192 bits.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Characterize a failed check code and try to trace back to the inode.
 */
static void
hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
				  int bits)
{
	hammer2_chain_t *lchain;
	hammer2_chain_t *ochain;
	int did;

	/* krateprintf returns 0 when rate-limited; skip the detail dump */
	did = krateprintf(&krate_h2chk,
			  "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
			  "(flags=%08x, bref/data ",
			  chain->bref.data_off,
			  chain->bref.type,
			  hammer2_bref_type_str(&chain->bref),
			  chain->bref.methods,
			  chain->flags);
	if (did == 0)
		return;

	if (bits == 32) {
		kprintf("%08x/%08x)\n",
			chain->bref.check.iscsi32.value,
			(uint32_t)check);
	} else {
		kprintf("%016jx/%016jx)\n",
			chain->bref.check.xxhash64.value,
			check);
	}

	/*
	 * Run up the chains to try to find the governing inode so we
	 * can report it.
	 *
	 * XXX This error reporting is not really MPSAFE
	 */
	ochain = chain;
	lchain = chain;
	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
		lchain = chain;
		chain = chain->parent;
	}

	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
		/*
		 * NOTE(review): bref.key is a 64-bit hammer2_key_t printed
		 * with %ld; assumes an LP64 kernel — confirm.
		 */
		kprintf(" Resides at/in inode %ld\n",
			chain->bref.key);
	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		kprintf(" Resides in inode index - CRITICAL!!!\n");
	} else {
		kprintf(" Resides in root index - CRITICAL!!!\n");
	}
	if (ochain->hmp) {
		const char *pfsname = "UNKNOWN";
		int i;

		if (ochain->pmp) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
				    ochain->pmp->pfs_names[i]) {
					pfsname = ochain->pmp->pfs_names[i];
					break;
				}
			}
		}
		kprintf(" In pfs %s on device %s\n",
			pfsname, ochain->hmp->devrepname);
	}
}

/*
 * Returns non-zero on success, 0 on failure.
5935 */ 5936 int 5937 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5938 { 5939 uint32_t check32; 5940 uint64_t check64; 5941 int r; 5942 5943 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 5944 return 1; 5945 5946 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5947 case HAMMER2_CHECK_NONE: 5948 r = 1; 5949 break; 5950 case HAMMER2_CHECK_DISABLED: 5951 r = 1; 5952 break; 5953 case HAMMER2_CHECK_ISCSI32: 5954 check32 = hammer2_icrc32(bdata, chain->bytes); 5955 r = (chain->bref.check.iscsi32.value == check32); 5956 if (r == 0) { 5957 hammer2_characterize_failed_chain(chain, check32, 32); 5958 } 5959 hammer2_process_icrc32 += chain->bytes; 5960 break; 5961 case HAMMER2_CHECK_XXHASH64: 5962 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5963 r = (chain->bref.check.xxhash64.value == check64); 5964 if (r == 0) { 5965 hammer2_characterize_failed_chain(chain, check64, 64); 5966 } 5967 hammer2_process_xxhash64 += chain->bytes; 5968 break; 5969 case HAMMER2_CHECK_SHA192: 5970 { 5971 SHA256_CTX hash_ctx; 5972 union { 5973 uint8_t digest[SHA256_DIGEST_LENGTH]; 5974 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5975 } u; 5976 5977 SHA256_Init(&hash_ctx); 5978 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5979 SHA256_Final(u.digest, &hash_ctx); 5980 u.digest64[2] ^= u.digest64[3]; 5981 if (bcmp(u.digest, 5982 chain->bref.check.sha192.data, 5983 sizeof(chain->bref.check.sha192.data)) == 0) { 5984 r = 1; 5985 } else { 5986 r = 0; 5987 krateprintf(&krate_h2chk, 5988 "chain %016jx.%02x meth=%02x " 5989 "CHECK FAIL\n", 5990 chain->bref.data_off, 5991 chain->bref.type, 5992 chain->bref.methods); 5993 } 5994 } 5995 break; 5996 case HAMMER2_CHECK_FREEMAP: 5997 r = (chain->bref.check.freemap.icrc32 == 5998 hammer2_icrc32(bdata, chain->bytes)); 5999 if (r == 0) { 6000 int did; 6001 6002 did = krateprintf(&krate_h2chk, 6003 "chain %016jx.%02x meth=%02x " 6004 "CHECK FAIL\n", 6005 chain->bref.data_off, 6006 chain->bref.type, 6007 chain->bref.methods); 6008 if (did) 
{ 6009 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 6010 chain->bref.check.freemap.icrc32, 6011 hammer2_icrc32(bdata, chain->bytes), 6012 chain->bytes); 6013 if (chain->dio) { 6014 kprintf("dio %p buf %016jx,%d " 6015 "bdata %p/%p\n", 6016 chain->dio, 6017 chain->dio->bp->b_loffset, 6018 chain->dio->bp->b_bufsize, 6019 bdata, 6020 chain->dio->bp->b_data); 6021 } 6022 } 6023 } 6024 break; 6025 default: 6026 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 6027 chain->bref.methods); 6028 r = 1; 6029 break; 6030 } 6031 return r; 6032 } 6033 6034 /* 6035 * Acquire the chain and parent representing the specified inode for the 6036 * device at the specified cluster index. 6037 * 6038 * The flags passed in are LOOKUP flags, not RESOLVE flags. 6039 * 6040 * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and 6041 * *chainp will be NULL. *parentp may still be set error or not, or NULL 6042 * if the parent itself could not be resolved. 6043 * 6044 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 6045 * They will be unlocked and released by this function. The *parentp and 6046 * *chainp representing the located inode are returned locked. 6047 */ 6048 int 6049 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 6050 int clindex, int flags, 6051 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 6052 { 6053 hammer2_chain_t *parent; 6054 hammer2_chain_t *rchain; 6055 hammer2_key_t key_dummy; 6056 hammer2_inode_t *ip; 6057 int resolve_flags; 6058 int error; 6059 6060 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 6061 HAMMER2_RESOLVE_SHARED : 0; 6062 6063 /* 6064 * Caller expects us to replace these. 
6065 */ 6066 if (*chainp) { 6067 hammer2_chain_unlock(*chainp); 6068 hammer2_chain_drop(*chainp); 6069 *chainp = NULL; 6070 } 6071 if (*parentp) { 6072 hammer2_chain_unlock(*parentp); 6073 hammer2_chain_drop(*parentp); 6074 *parentp = NULL; 6075 } 6076 6077 /* 6078 * Be very careful, this is a backend function and we CANNOT 6079 * lock any frontend inode structure we find. But we have to 6080 * look the inode up this way first in case it exists but is 6081 * detached from the radix tree. 6082 */ 6083 ip = hammer2_inode_lookup(pmp, inum); 6084 if (ip) { 6085 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 6086 parentp, 6087 resolve_flags); 6088 hammer2_inode_drop(ip); 6089 if (*chainp) 6090 return 0; 6091 hammer2_chain_unlock(*chainp); 6092 hammer2_chain_drop(*chainp); 6093 *chainp = NULL; 6094 if (*parentp) { 6095 hammer2_chain_unlock(*parentp); 6096 hammer2_chain_drop(*parentp); 6097 *parentp = NULL; 6098 } 6099 } 6100 6101 /* 6102 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6103 * inodes from root directory entries in the key lookup). 6104 */ 6105 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6106 rchain = NULL; 6107 if (parent) { 6108 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6109 inum, inum, 6110 &error, flags); 6111 } else { 6112 error = HAMMER2_ERROR_EIO; 6113 } 6114 *parentp = parent; 6115 *chainp = rchain; 6116 6117 return error; 6118 } 6119 6120 /* 6121 * Used by the bulkscan code to snapshot the synchronized storage for 6122 * a volume, allowing it to be scanned concurrently against normal 6123 * operation. 
6124 */ 6125 hammer2_chain_t * 6126 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6127 { 6128 hammer2_chain_t *copy; 6129 6130 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6131 copy->data = kmalloc(sizeof(copy->data->voldata), 6132 hmp->mchain, 6133 M_WAITOK | M_ZERO); 6134 hammer2_voldata_lock(hmp); 6135 copy->data->voldata = hmp->volsync; 6136 hammer2_voldata_unlock(hmp); 6137 6138 return copy; 6139 } 6140 6141 void 6142 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6143 { 6144 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6145 KKASSERT(copy->data); 6146 kfree(copy->data, copy->hmp->mchain); 6147 copy->data = NULL; 6148 atomic_add_long(&hammer2_chain_allocs, -1); 6149 hammer2_chain_drop(copy); 6150 } 6151 6152 /* 6153 * Returns non-zero if the chain (INODE or DIRENT) matches the 6154 * filename. 6155 */ 6156 int 6157 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6158 size_t name_len) 6159 { 6160 const hammer2_inode_data_t *ripdata; 6161 const hammer2_dirent_head_t *den; 6162 6163 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6164 ripdata = &chain->data->ipdata; 6165 if (ripdata->meta.name_len == name_len && 6166 bcmp(ripdata->filename, name, name_len) == 0) { 6167 return 1; 6168 } 6169 } 6170 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6171 chain->bref.embed.dirent.namlen == name_len) { 6172 den = &chain->bref.embed.dirent; 6173 if (name_len > sizeof(chain->bref.check.buf) && 6174 bcmp(chain->data->buf, name, name_len) == 0) { 6175 return 1; 6176 } 6177 if (name_len <= sizeof(chain->bref.check.buf) && 6178 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6179 return 1; 6180 } 6181 } 6182 return 0; 6183 } 6184