/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This subsystem implements most of the core support functions for
 * the hammer2_chain structure.
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

/*
 * Forward declarations for static helpers defined later in this file.
 */
static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static void hammer2_chain_rename_obref(hammer2_chain_t **parentp,
		hammer2_chain_t *chain, hammer2_tid_t mtid,
		int flags, hammer2_blockref_t *obref);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

/*
 * There are many degenerate situations where an extreme rate of
console 88 * output can occur from warnings and errors. Make sure this output does 89 * not impede operations. 90 */ 91 static struct krate krate_h2chk = { .freq = 5 }; 92 static struct krate krate_h2me = { .freq = 1 }; 93 static struct krate krate_h2em = { .freq = 1 }; 94 95 /* 96 * Basic RBTree for chains (core.rbtree). 97 */ 98 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 99 100 int 101 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 102 { 103 hammer2_key_t c1_beg; 104 hammer2_key_t c1_end; 105 hammer2_key_t c2_beg; 106 hammer2_key_t c2_end; 107 108 /* 109 * Compare chains. Overlaps are not supposed to happen and catch 110 * any software issues early we count overlaps as a match. 111 */ 112 c1_beg = chain1->bref.key; 113 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 114 c2_beg = chain2->bref.key; 115 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 116 117 if (c1_end < c2_beg) /* fully to the left */ 118 return(-1); 119 if (c1_beg > c2_end) /* fully to the right */ 120 return(1); 121 return(0); /* overlap (must not cross edge boundary) */ 122 } 123 124 /* 125 * Assert that a chain has no media data associated with it. 126 */ 127 static __inline void 128 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 129 { 130 KKASSERT(chain->dio == NULL); 131 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 132 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 133 chain->data) { 134 panic("hammer2_chain_assert_no_data: chain %p still has data", 135 chain); 136 } 137 } 138 139 /* 140 * Make a chain visible to the flusher. The flusher operates using a top-down 141 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains, 142 * flushes them, and updates blocks back to the volume root. 143 * 144 * This routine sets the ONFLUSH flag upward from the triggering chain until 145 * it hits an inode root or the volume root. 
 * Inode chains serve as inflection points, requiring the flusher to
 * bridge across trees.  Inodes include regular inodes, PFS roots
 * (pmp->iroot), and the media super root (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Walk upward setting ONFLUSH.  The walk stops at the first chain
	 * that already has ONFLUSH set (the path above it is then already
	 * marked), at an inode boundary, or at the topology root.
	 * Spinlocks are taken shared, hand-over-hand child-to-parent.
	 */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * 0->1 transition.  Just flag that the chain was used and
		 * should be recycled if the LRU scan encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002
#define HAMMER2_CHAIN_INSERT_RACE	0x0004

static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race.  If the parent's
	 * generation changed since the caller sampled it, another insert
	 * or removal raced us and the caller must re-resolve.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
				int depth);
static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	if (hammer2_debug & 0x200000)
		Debugger("drop");

	KKASSERT(chain->refs > 0);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/*
			 * Possible 1->0 transition.  lastdrop either
			 * disposes of the chain (returning NULL) or
			 * returns a chain to retry / recurse on.
			 */
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 0);
			/* retry the same chain, or chain from lastdrop */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}

/*
 * Unhold a held and probably not-locked chain, ensure that the data is
 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
 * lock and then simply unlocking the chain.
 */
void
hammer2_chain_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (hammer2_mtx_ex_try(&chain->lock) == 0) {
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
}

/*
 * Drop the data-persistence hold and then drop the ref.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	hammer2_chain_unhold(chain);
	hammer2_chain_drop(chain);
}

/*
 * Re-establish a data-persistence hold by cycling a shared lock, which
 * bumps lockcnt without letting it transition through 0.
 */
void
hammer2_chain_rehold(hammer2_chain_t *chain)
{
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Handles the (potential) last drop of chain->refs from 1->0.  Called with
 * the mutex exclusively locked, refs == 1, and lockcnt 0.  SMP races are
 * possible against refs and lockcnt.  We must dispose of the mutex on chain.
 *
 * This function returns an unlocked chain for recursive drop or NULL.  It
 * can return the same chain if it determines it has raced another ref.
 *
 * --
 *
 * When two chains need to be recursively dropped we use the chain we
 * would otherwise free to placehold the additional chain.  It's a bit
 * convoluted but we can't just recurse without potentially blowing out
 * the kernel stack.
 *
 * The chain cannot be freed if it has any children.
 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
 * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
 * Any dedup registration can remain intact.
 *
 * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
{
	hammer2_pfs_t *pmp;
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *rdrop;

	/*
	 * We need chain's spinlock to interlock the sub-tree test.
	 * We already have chain's mutex, protecting chain->parent.
	 *
	 * Remember that chain->refs can be in flux.
	 */
	hammer2_spin_ex(&chain->core.spin);

	if (chain->parent != NULL) {
		/*
		 * If the chain has a parent the UPDATE bit prevents scrapping
		 * as the chain is needed to properly flush the parent.  Try
		 * to complete the 1->0 transition and return NULL.  Retry
		 * (return chain) if we are unable to complete the 1->0
		 * transition, else return NULL (nothing more to do).
		 *
		 * If the chain has a parent the MODIFIED bit prevents
		 * scrapping.
		 *
		 * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
		 */
		if (chain->flags & (HAMMER2_CHAIN_UPDATE |
				    HAMMER2_CHAIN_MODIFIED)) {
			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
				hammer2_spin_unex(&chain->core.spin);
				hammer2_chain_assert_no_data(chain);
				hammer2_mtx_unlock(&chain->lock);
				chain = NULL;
			} else {
				hammer2_spin_unex(&chain->core.spin);
				hammer2_mtx_unlock(&chain->lock);
			}
			return (chain);
		}
		/* spinlock still held */
	} else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
		   chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) {
		/*
		 * Retain the static vchain and fchain.  Clear bits that
		 * are not relevant.  Do not clear the MODIFIED bit,
		 * and certainly do not put it on the delayed-flush queue.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	} else {
		/*
		 * The chain has no parent and can be flagged for destruction.
		 * Since it has no parent, UPDATE can also be cleared.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
		if (chain->flags & HAMMER2_CHAIN_UPDATE)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);

		/*
		 * If the chain has children we must propagate the DESTROY
		 * flag downward and rip the disconnected topology apart.
		 * This is accomplished by calling hammer2_flush() on the
		 * chain.
		 *
		 * Any dedup is already handled by the underlying DIO, so
		 * we do not have to specifically flush it here.
		 */
		if (chain->core.chain_count) {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
				      HAMMER2_FLUSH_ALL);
			hammer2_mtx_unlock(&chain->lock);

			return(chain);	/* retry drop */
		}

		/*
		 * Otherwise we can scrap the MODIFIED bit if it is set,
		 * and continue along the freeing path.
		 *
		 * Be sure to clean-out any dedup bits.  Without a parent
		 * this chain will no longer be visible to the flush code.
		 * Easy check data_off to avoid the volume root.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp);
		}
		/* spinlock still held */
	}

	/* spinlock still held */

	/*
	 * If any children exist we must leave the chain intact with refs == 0.
	 * They exist because chains are retained below us which have refs or
	 * may require flushing.
	 *
	 * Retry (return chain) if we fail to transition the refs to 0, else
	 * return NULL indicating nothing more to do.
	 *
	 * Chains with children are NOT put on the LRU list.
	 */
	if (chain->core.chain_count) {
		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_assert_no_data(chain);
			hammer2_mtx_unlock(&chain->lock);
			chain = NULL;
		} else {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_mtx_unlock(&chain->lock);
		}
		return (chain);
	}
	/* spinlock still held */
	/* no chains left under us */

	/*
	 * chain->core has no children left so no accessors can get to our
	 * chain from there.  Now we have to lock the parent core to interlock
	 * remaining possible accessors that might bump chain's refs before
	 * we can safely drop chain's refs with intent to free the chain.
	 */
	hmp = chain->hmp;
	pmp = chain->pmp;	/* can be NULL */
	rdrop = NULL;

	parent = chain->parent;

	/*
	 * WARNING! chain's spin lock is still held here, and other spinlocks
	 *	    will be acquired and released in the code below.  We
	 *	    cannot be making fancy procedure calls!
	 */

	/*
	 * We can cache the chain if it is associated with a pmp
	 * and not flagged as being destroyed or requesting a full
	 * release.  In this situation the chain is not removed
	 * from its parent, i.e. it can still be looked up.
	 *
	 * We intentionally do not cache DATA chains because these
	 * were likely used to load data into the logical buffer cache
	 * and will not be accessed again for some time.
	 */
	if ((chain->flags &
	     (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 &&
	    chain->pmp &&
	    chain->bref.type != HAMMER2_BREF_TYPE_DATA) {
		if (parent)
			hammer2_spin_ex(&parent->core.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/*
			 * 1->0 transition failed, retry.  Do not drop
			 * the chain's data yet!
			 */
			if (parent)
				hammer2_spin_unex(&parent->core.spin);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_mtx_unlock(&chain->lock);

			return(chain);
		}

		/*
		 * Success
		 */
		hammer2_chain_assert_no_data(chain);

		/*
		 * Make sure we are on the LRU list, clean up excessive
		 * LRU entries.  We can only really drop one but there might
		 * be other entries that we can remove from the lru_list
		 * without dropping.
		 *
		 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when
		 *	 chain->core.spin AND pmp->lru_spin are held, but
		 *	 can be safely cleared only holding pmp->lru_spin.
		 */
		if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
			hammer2_spin_ex(&pmp->lru_spin);
			if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_ONLRU);
				TAILQ_INSERT_TAIL(&pmp->lru_list,
						  chain, lru_node);
				atomic_add_int(&pmp->lru_count, 1);
			}
			if (pmp->lru_count < HAMMER2_LRU_LIMIT)
				depth = 1;	/* disable lru_list flush */
			hammer2_spin_unex(&pmp->lru_spin);
		} else {
			/* disable lru flush */
			depth = 1;
		}

		if (parent) {
			hammer2_spin_unex(&parent->core.spin);
			parent = NULL;	/* safety */
		}
		hammer2_spin_unex(&chain->core.spin);
		hammer2_mtx_unlock(&chain->lock);

		/*
		 * lru_list hysteresis (see above for depth overrides).
		 * Note that depth also prevents excessive lastdrop recursion.
		 */
		if (depth == 0)
			hammer2_chain_lru_flush(pmp);

		return NULL;
		/* NOT REACHED */
	}

	/*
	 * Make sure we are not on the LRU list.
	 */
	if (chain->flags & HAMMER2_CHAIN_ONLRU) {
		hammer2_spin_ex(&pmp->lru_spin);
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			atomic_add_int(&pmp->lru_count, -1);
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
			TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
		}
		hammer2_spin_unex(&pmp->lru_spin);
	}

	/*
	 * Spinlock the parent and try to drop the last ref on chain.
	 * On success determine if we should dispose of the chain
	 * (remove the chain from its parent, etc).
	 *
	 * (normal core locks are top-down recursive but we define
	 * core spinlocks as bottom-up recursive, so this is safe).
	 */
	if (parent) {
		hammer2_spin_ex(&parent->core.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/*
			 * 1->0 transition failed, retry.
751 */ 752 hammer2_spin_unex(&parent->core.spin); 753 hammer2_spin_unex(&chain->core.spin); 754 hammer2_mtx_unlock(&chain->lock); 755 756 return(chain); 757 } 758 759 /* 760 * 1->0 transition successful, parent spin held to prevent 761 * new lookups, chain spinlock held to protect parent field. 762 * Remove chain from the parent. 763 * 764 * If the chain is being removed from the parent's btree but 765 * is not bmapped, we have to adjust live_count downward. If 766 * it is bmapped then the blockref is retained in the parent 767 * as is its associated live_count. This case can occur when 768 * a chain added to the topology is unable to flush and is 769 * then later deleted. 770 */ 771 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 772 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 773 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 774 atomic_add_int(&parent->core.live_count, -1); 775 } 776 RB_REMOVE(hammer2_chain_tree, 777 &parent->core.rbtree, chain); 778 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 779 --parent->core.chain_count; 780 chain->parent = NULL; 781 } 782 783 /* 784 * If our chain was the last chain in the parent's core the 785 * core is now empty and its parent might have to be 786 * re-dropped if it has 0 refs. 787 */ 788 if (parent->core.chain_count == 0) { 789 rdrop = parent; 790 atomic_add_int(&rdrop->refs, 1); 791 /* 792 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 793 rdrop = NULL; 794 */ 795 } 796 hammer2_spin_unex(&parent->core.spin); 797 parent = NULL; /* safety */ 798 /* FALL THROUGH */ 799 } else { 800 /* 801 * No-parent case. 802 */ 803 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 804 /* 805 * 1->0 transition failed, retry. 806 */ 807 hammer2_spin_unex(&parent->core.spin); 808 hammer2_spin_unex(&chain->core.spin); 809 hammer2_mtx_unlock(&chain->lock); 810 811 return(chain); 812 } 813 } 814 815 /* 816 * Successful 1->0 transition, no parent, no children... no way for 817 * anyone to ref this chain any more. 
	 * We can clean-up and free it.
	 *
	 * We still have the core spinlock, and core's chain_count is 0.
	 * Any parent spinlock is gone.
	 */
	hammer2_spin_unex(&chain->core.spin);
	hammer2_chain_assert_no_data(chain);
	hammer2_mtx_unlock(&chain->lock);
	KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
		 chain->core.chain_count == 0);

	/*
	 * All locks are gone, no pointers remain to the chain, finish
	 * freeing it.
	 */
	KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
				  HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * Once chain resources are gone we can use the now dead chain
	 * structure to placehold what might otherwise require a recursive
	 * drop, because we have potentially two things to drop and can only
	 * return one directly.
	 */
	if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
		chain->hmp = NULL;
		kfree(chain, hmp->mchain);
	}

	/*
	 * Possible chaining loop when parent re-drop needed.
	 */
	return(rdrop);
}

/*
 * Heuristical flush of the LRU, try to reduce the number of entries
 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3).  This procedure is called
 * only when lru_count exceeds HAMMER2_LRU_LIMIT.
 */
static
void
hammer2_chain_lru_flush(hammer2_pfs_t *pmp)
{
	hammer2_chain_t *chain;

again:
	chain = NULL;
	hammer2_spin_ex(&pmp->lru_spin);
	while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) {
		/*
		 * Pick a chain off the lru_list, just recycle it quickly
		 * if LRUHINT is set (the chain was ref'd but left on
		 * the lru_list, so cycle to the end).
		 */
		chain = TAILQ_FIRST(&pmp->lru_list);
		TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);

		if (chain->flags & HAMMER2_CHAIN_LRUHINT) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);
			TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node);
			chain = NULL;
			continue;
		}

		/*
		 * Ok, we are off the LRU.  We must adjust refs before we
		 * can safely clear the ONLRU flag.
		 */
		atomic_add_int(&pmp->lru_count, -1);
		if (atomic_cmpset_int(&chain->refs, 0, 1)) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
			break;
		}
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
		chain = NULL;
	}
	hammer2_spin_unex(&pmp->lru_spin);
	if (chain == NULL)
		return;

	/*
	 * If we picked a chain off the lru list we may be able to lastdrop
	 * it.  Use a depth of 1 to prevent excessive lastdrop recursion.
	 */
	while (chain) {
		u_int refs;

		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 1);
			/* retry the same chain, or chain from lastdrop */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
	goto again;
}

/*
 * On last lock release.
 * Detaches chain->dio/chain->data and returns the dio (if any) for the
 * caller to dispose of.  Chains without a dio must have NULL data unless
 * they are the volume header or freemap (whose data is embedded).
 */
static hammer2_io_t *
hammer2_chain_drop_data(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;

	if ((dio = chain->dio) != NULL) {
		chain->dio = NULL;
		chain->data = NULL;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			break;
		default:
			if (chain->data != NULL) {
				hammer2_spin_unex(&chain->core.spin);
				panic("chain data not null: "
				      "chain %p bref %016jx.%02x "
				      "refs %d parent %p dio %p data %p",
				      chain, chain->bref.data_off,
				      chain->bref.type, chain->refs,
				      chain->parent,
				      chain->dio, chain->data);
			}
			KKASSERT(chain->data == NULL);
			break;
		}
	}
	return dio;
}

/*
 * Lock a referenced chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking.  If
 *			   the lock fails, EAGAIN is returned.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
		++curthread->td_tracker;
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
		++curthread->td_tracker;
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
1081 */ 1082 switch(how & HAMMER2_RESOLVE_MASK) { 1083 case HAMMER2_RESOLVE_NEVER: 1084 return 0; 1085 case HAMMER2_RESOLVE_MAYBE: 1086 if (chain->flags & HAMMER2_CHAIN_INITIAL) 1087 return 0; 1088 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 1089 return 0; 1090 #if 0 1091 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 1092 return 0; 1093 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 1094 return 0; 1095 #endif 1096 /* fall through */ 1097 case HAMMER2_RESOLVE_ALWAYS: 1098 default: 1099 break; 1100 } 1101 1102 /* 1103 * Caller requires data 1104 */ 1105 hammer2_chain_load_data(chain); 1106 1107 return 0; 1108 } 1109 1110 /* 1111 * Lock the chain, retain the hold, and drop the data persistence count. 1112 * The data should remain valid because we never transitioned lockcnt 1113 * through 0. 1114 */ 1115 void 1116 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how) 1117 { 1118 hammer2_chain_lock(chain, how); 1119 atomic_add_int(&chain->lockcnt, -1); 1120 } 1121 1122 #if 0 1123 /* 1124 * Downgrade an exclusive chain lock to a shared chain lock. 1125 * 1126 * NOTE: There is no upgrade equivalent due to the ease of 1127 * deadlocks in that direction. 1128 */ 1129 void 1130 hammer2_chain_lock_downgrade(hammer2_chain_t *chain) 1131 { 1132 hammer2_mtx_downgrade(&chain->lock); 1133 } 1134 #endif 1135 1136 /* 1137 * Issue I/O and install chain->data. Caller must hold a chain lock, lock 1138 * may be of any type. 1139 * 1140 * Once chain->data is set it cannot be disposed of until all locks are 1141 * released. 1142 * 1143 * Make sure the data is synchronized to the current cpu. 1144 */ 1145 void 1146 hammer2_chain_load_data(hammer2_chain_t *chain) 1147 { 1148 hammer2_blockref_t *bref; 1149 hammer2_dev_t *hmp; 1150 hammer2_io_t *dio; 1151 char *bdata; 1152 int error; 1153 1154 /* 1155 * Degenerate case, data already present, or chain has no media 1156 * reference to load. 
1157 */ 1158 KKASSERT(chain->lock.mtx_lock & MTX_MASK); 1159 if (chain->data) { 1160 if (chain->dio) 1161 hammer2_io_bkvasync(chain->dio); 1162 return; 1163 } 1164 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 1165 return; 1166 1167 hmp = chain->hmp; 1168 KKASSERT(hmp != NULL); 1169 1170 /* 1171 * Gain the IOINPROG bit, interlocked block. 1172 */ 1173 for (;;) { 1174 u_int oflags; 1175 u_int nflags; 1176 1177 oflags = chain->flags; 1178 cpu_ccfence(); 1179 if (oflags & HAMMER2_CHAIN_IOINPROG) { 1180 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 1181 tsleep_interlock(&chain->flags, 0); 1182 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1183 tsleep(&chain->flags, PINTERLOCKED, 1184 "h2iocw", 0); 1185 } 1186 /* retry */ 1187 } else { 1188 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 1189 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1190 break; 1191 } 1192 /* retry */ 1193 } 1194 } 1195 1196 /* 1197 * We own CHAIN_IOINPROG 1198 * 1199 * Degenerate case if we raced another load. 1200 */ 1201 if (chain->data) { 1202 if (chain->dio) 1203 hammer2_io_bkvasync(chain->dio); 1204 goto done; 1205 } 1206 1207 /* 1208 * We must resolve to a device buffer, either by issuing I/O or 1209 * by creating a zero-fill element. We do not mark the buffer 1210 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1211 * API must still be used to do that). 1212 * 1213 * The device buffer is variable-sized in powers of 2 down 1214 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1215 * chunk always contains buffers of the same size. (XXX) 1216 * 1217 * The minimum physical IO size may be larger than the variable 1218 * block size. 1219 */ 1220 bref = &chain->bref; 1221 1222 /* 1223 * The getblk() optimization can only be used on newly created 1224 * elements if the physical block size matches the request. 
1225 */ 1226 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1227 error = hammer2_io_new(hmp, bref->type, 1228 bref->data_off, chain->bytes, 1229 &chain->dio); 1230 } else { 1231 error = hammer2_io_bread(hmp, bref->type, 1232 bref->data_off, chain->bytes, 1233 &chain->dio); 1234 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1235 } 1236 if (error) { 1237 chain->error = HAMMER2_ERROR_EIO; 1238 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 1239 (intmax_t)bref->data_off, error); 1240 hammer2_io_bqrelse(&chain->dio); 1241 goto done; 1242 } 1243 chain->error = 0; 1244 1245 /* 1246 * This isn't perfect and can be ignored on OSs which do not have 1247 * an indication as to whether a buffer is coming from cache or 1248 * if I/O was actually issued for the read. TESTEDGOOD will work 1249 * pretty well without the B_IOISSUED logic because chains are 1250 * cached, but in that situation (without B_IOISSUED) it will not 1251 * detect whether a re-read via I/O is corrupted verses the original 1252 * read. 1253 * 1254 * We can't re-run the CRC on every fresh lock. That would be 1255 * insanely expensive. 1256 * 1257 * If the underlying kernel buffer covers the entire chain we can 1258 * use the B_IOISSUED indication to determine if we have to re-run 1259 * the CRC on chain data for chains that managed to stay cached 1260 * across the kernel disposal of the original buffer. 1261 */ 1262 if ((dio = chain->dio) != NULL && dio->bp) { 1263 struct buf *bp = dio->bp; 1264 1265 if (dio->psize == chain->bytes && 1266 (bp->b_flags & B_IOISSUED)) { 1267 atomic_clear_int(&chain->flags, 1268 HAMMER2_CHAIN_TESTEDGOOD); 1269 bp->b_flags &= ~B_IOISSUED; 1270 } 1271 } 1272 1273 /* 1274 * NOTE: A locked chain's data cannot be modified without first 1275 * calling hammer2_chain_modify(). 1276 */ 1277 1278 /* 1279 * Clear INITIAL. In this case we used io_new() and the buffer has 1280 * been zero'd and marked dirty. 
1281 * 1282 * NOTE: hammer2_io_data() call issues bkvasync() 1283 */ 1284 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1285 1286 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1287 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1288 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 1289 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1290 /* 1291 * check data not currently synchronized due to 1292 * modification. XXX assumes data stays in the buffer 1293 * cache, which might not be true (need biodep on flush 1294 * to calculate crc? or simple crc?). 1295 */ 1296 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1297 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1298 chain->error = HAMMER2_ERROR_CHECK; 1299 } else { 1300 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1301 } 1302 } 1303 1304 /* 1305 * Setup the data pointer, either pointing it to an embedded data 1306 * structure and copying the data from the buffer, or pointing it 1307 * into the buffer. 1308 * 1309 * The buffer is not retained when copying to an embedded data 1310 * structure in order to avoid potential deadlocks or recursions 1311 * on the same physical buffer. 1312 * 1313 * WARNING! Other threads can start using the data the instant we 1314 * set chain->data non-NULL. 1315 */ 1316 switch (bref->type) { 1317 case HAMMER2_BREF_TYPE_VOLUME: 1318 case HAMMER2_BREF_TYPE_FREEMAP: 1319 /* 1320 * Copy data from bp to embedded buffer 1321 */ 1322 panic("hammer2_chain_load_data: unresolved volume header"); 1323 break; 1324 case HAMMER2_BREF_TYPE_DIRENT: 1325 KKASSERT(chain->bytes != 0); 1326 /* fall through */ 1327 case HAMMER2_BREF_TYPE_INODE: 1328 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1329 case HAMMER2_BREF_TYPE_INDIRECT: 1330 case HAMMER2_BREF_TYPE_DATA: 1331 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1332 default: 1333 /* 1334 * Point data at the device buffer and leave dio intact. 
1335 */ 1336 chain->data = (void *)bdata; 1337 break; 1338 } 1339 1340 /* 1341 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 1342 */ 1343 done: 1344 for (;;) { 1345 u_int oflags; 1346 u_int nflags; 1347 1348 oflags = chain->flags; 1349 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1350 HAMMER2_CHAIN_IOSIGNAL); 1351 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1352 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1353 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1354 wakeup(&chain->flags); 1355 break; 1356 } 1357 } 1358 } 1359 1360 /* 1361 * Unlock and deref a chain element. 1362 * 1363 * Remember that the presence of children under chain prevent the chain's 1364 * destruction but do not add additional references, so the dio will still 1365 * be dropped. 1366 */ 1367 void 1368 hammer2_chain_unlock(hammer2_chain_t *chain) 1369 { 1370 hammer2_io_t *dio; 1371 u_int lockcnt; 1372 int iter = 0; 1373 1374 --curthread->td_tracker; 1375 1376 /* 1377 * If multiple locks are present (or being attempted) on this 1378 * particular chain we can just unlock, drop refs, and return. 1379 * 1380 * Otherwise fall-through on the 1->0 transition. 1381 */ 1382 for (;;) { 1383 lockcnt = chain->lockcnt; 1384 KKASSERT(lockcnt > 0); 1385 cpu_ccfence(); 1386 if (lockcnt > 1) { 1387 if (atomic_cmpset_int(&chain->lockcnt, 1388 lockcnt, lockcnt - 1)) { 1389 hammer2_mtx_unlock(&chain->lock); 1390 return; 1391 } 1392 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) { 1393 /* while holding the mutex exclusively */ 1394 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1395 break; 1396 } else { 1397 /* 1398 * This situation can easily occur on SMP due to 1399 * the gap inbetween the 1->0 transition and the 1400 * final unlock. We cannot safely block on the 1401 * mutex because lockcnt might go above 1. 1402 * 1403 * XXX Sleep for one tick if it takes too long. 
1404 */ 1405 if (++iter > 1000) { 1406 if (iter > 1000 + hz) { 1407 kprintf("hammer2: h2race2 %p\n", chain); 1408 iter = 1000; 1409 } 1410 tsleep(&iter, 0, "h2race2", 1); 1411 } 1412 cpu_pause(); 1413 } 1414 /* retry */ 1415 } 1416 1417 /* 1418 * Last unlock / mutex upgraded to exclusive. Drop the data 1419 * reference. 1420 */ 1421 dio = hammer2_chain_drop_data(chain); 1422 if (dio) 1423 hammer2_io_bqrelse(&dio); 1424 hammer2_mtx_unlock(&chain->lock); 1425 } 1426 1427 /* 1428 * Unlock and hold chain data intact 1429 */ 1430 void 1431 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1432 { 1433 atomic_add_int(&chain->lockcnt, 1); 1434 hammer2_chain_unlock(chain); 1435 } 1436 1437 /* 1438 * Helper to obtain the blockref[] array base and count for a chain. 1439 * 1440 * XXX Not widely used yet, various use cases need to be validated and 1441 * converted to use this function. 1442 */ 1443 static 1444 hammer2_blockref_t * 1445 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1446 { 1447 hammer2_blockref_t *base; 1448 int count; 1449 1450 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1451 base = NULL; 1452 1453 switch(parent->bref.type) { 1454 case HAMMER2_BREF_TYPE_INODE: 1455 count = HAMMER2_SET_COUNT; 1456 break; 1457 case HAMMER2_BREF_TYPE_INDIRECT: 1458 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1459 count = parent->bytes / sizeof(hammer2_blockref_t); 1460 break; 1461 case HAMMER2_BREF_TYPE_VOLUME: 1462 count = HAMMER2_SET_COUNT; 1463 break; 1464 case HAMMER2_BREF_TYPE_FREEMAP: 1465 count = HAMMER2_SET_COUNT; 1466 break; 1467 default: 1468 panic("hammer2_chain_base_and_count: " 1469 "unrecognized blockref type: %d", 1470 parent->bref.type); 1471 count = 0; 1472 break; 1473 } 1474 } else { 1475 switch(parent->bref.type) { 1476 case HAMMER2_BREF_TYPE_INODE: 1477 base = &parent->data->ipdata.u.blockset.blockref[0]; 1478 count = HAMMER2_SET_COUNT; 1479 break; 1480 case HAMMER2_BREF_TYPE_INDIRECT: 1481 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1482 base = 
&parent->data->npdata[0]; 1483 count = parent->bytes / sizeof(hammer2_blockref_t); 1484 break; 1485 case HAMMER2_BREF_TYPE_VOLUME: 1486 base = &parent->data->voldata. 1487 sroot_blockset.blockref[0]; 1488 count = HAMMER2_SET_COUNT; 1489 break; 1490 case HAMMER2_BREF_TYPE_FREEMAP: 1491 base = &parent->data->blkset.blockref[0]; 1492 count = HAMMER2_SET_COUNT; 1493 break; 1494 default: 1495 panic("hammer2_chain_base_and_count: " 1496 "unrecognized blockref type: %d", 1497 parent->bref.type); 1498 count = 0; 1499 break; 1500 } 1501 } 1502 *countp = count; 1503 1504 return base; 1505 } 1506 1507 /* 1508 * This counts the number of live blockrefs in a block array and 1509 * also calculates the point at which all remaining blockrefs are empty. 1510 * This routine can only be called on a live chain. 1511 * 1512 * Caller holds the chain locked, but possibly with a shared lock. We 1513 * must use an exclusive spinlock to prevent corruption. 1514 * 1515 * NOTE: Flag is not set until after the count is complete, allowing 1516 * callers to test the flag without holding the spinlock. 1517 * 1518 * NOTE: If base is NULL the related chain is still in the INITIAL 1519 * state and there are no blockrefs to count. 1520 * 1521 * NOTE: live_count may already have some counts accumulated due to 1522 * creation and deletion and could even be initially negative. 
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * First scan backwards to locate live_zero (one
			 * past the last non-empty blockref), then count
			 * the live entries below it.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 *
	 * NOTE(review): (1U << nradix) assumes nradix < 32; confirm the
	 *		 valid nradix range, a size_t cast may otherwise be
	 *		 needed here.
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
1670 */ 1671 int 1672 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1673 hammer2_off_t dedup_off, int flags) 1674 { 1675 hammer2_blockref_t obref; 1676 hammer2_dev_t *hmp; 1677 hammer2_io_t *dio; 1678 int error; 1679 int wasinitial; 1680 int setmodified; 1681 int setupdate; 1682 int newmod; 1683 char *bdata; 1684 1685 hmp = chain->hmp; 1686 obref = chain->bref; 1687 KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0); 1688 KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE); 1689 1690 /* 1691 * Data is not optional for freemap chains (we must always be sure 1692 * to copy the data on COW storage allocations). 1693 */ 1694 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1695 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1696 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1697 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1698 } 1699 1700 /* 1701 * Data must be resolved if already assigned, unless explicitly 1702 * flagged otherwise. If we cannot safety load the data the 1703 * modification fails and we return early. 1704 */ 1705 if (chain->data == NULL && chain->bytes != 0 && 1706 (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1707 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1708 hammer2_chain_load_data(chain); 1709 if (chain->error) 1710 return (chain->error); 1711 } 1712 error = 0; 1713 1714 /* 1715 * Set MODIFIED to indicate that the chain has been modified. A new 1716 * allocation is required when modifying a chain. 1717 * 1718 * Set UPDATE to ensure that the blockref is updated in the parent. 1719 * 1720 * If MODIFIED is already set determine if we can reuse the assigned 1721 * data block or if we need a new data block. 1722 */ 1723 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1724 /* 1725 * Must set modified bit. 
1726 */ 1727 atomic_add_long(&hammer2_count_modified_chains, 1); 1728 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1729 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1730 setmodified = 1; 1731 1732 /* 1733 * We may be able to avoid a copy-on-write if the chain's 1734 * check mode is set to NONE and the chain's current 1735 * modify_tid is beyond the last explicit snapshot tid. 1736 * 1737 * This implements HAMMER2's overwrite-in-place feature. 1738 * 1739 * NOTE! This data-block cannot be used as a de-duplication 1740 * source when the check mode is set to NONE. 1741 */ 1742 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1743 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) && 1744 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1745 (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 && 1746 HAMMER2_DEC_CHECK(chain->bref.methods) == 1747 HAMMER2_CHECK_NONE && 1748 chain->pmp && 1749 chain->bref.modify_tid > 1750 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1751 /* 1752 * Sector overwrite allowed. 1753 */ 1754 newmod = 0; 1755 } else if ((hmp->hflags & HMNT2_EMERG) && 1756 chain->pmp && 1757 chain->bref.modify_tid > 1758 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1759 /* 1760 * If in emergency delete mode then do a modify-in- 1761 * place on any chain type belonging to the PFS as 1762 * long as it doesn't mess up a snapshot. We might 1763 * be forced to do this anyway a little further down 1764 * in the code if the allocation fails. 1765 * 1766 * Also note that in emergency mode, these modify-in- 1767 * place operations are NOT SAFE. A storage failure, 1768 * power failure, or panic can corrupt the filesystem. 1769 */ 1770 newmod = 0; 1771 } else { 1772 /* 1773 * Sector overwrite not allowed, must copy-on-write. 1774 */ 1775 newmod = 1; 1776 } 1777 } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) { 1778 /* 1779 * If the modified chain was registered for dedup we need 1780 * a new allocation. This only happens for delayed-flush 1781 * chains (i.e. 
which run through the front-end buffer 1782 * cache). 1783 */ 1784 newmod = 1; 1785 setmodified = 0; 1786 } else { 1787 /* 1788 * Already flagged modified, no new allocation is needed. 1789 */ 1790 newmod = 0; 1791 setmodified = 0; 1792 } 1793 1794 /* 1795 * Flag parent update required. 1796 */ 1797 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1798 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1799 setupdate = 1; 1800 } else { 1801 setupdate = 0; 1802 } 1803 1804 /* 1805 * The XOP code returns held but unlocked focus chains. This 1806 * prevents the chain from being destroyed but does not prevent 1807 * it from being modified. diolk is used to interlock modifications 1808 * against XOP frontend accesses to the focus. 1809 * 1810 * This allows us to theoretically avoid deadlocking the frontend 1811 * if one of the backends lock up by not formally locking the 1812 * focused chain in the frontend. In addition, the synchronization 1813 * code relies on this mechanism to avoid deadlocking concurrent 1814 * synchronization threads. 1815 */ 1816 lockmgr(&chain->diolk, LK_EXCLUSIVE); 1817 1818 /* 1819 * The modification or re-modification requires an allocation and 1820 * possible COW. If an error occurs, the previous content and data 1821 * reference is retained and the modification fails. 1822 * 1823 * If dedup_off is non-zero, the caller is requesting a deduplication 1824 * rather than a modification. The MODIFIED bit is not set and the 1825 * data offset is set to the deduplication offset. The data cannot 1826 * be modified. 1827 * 1828 * NOTE: The dedup offset is allowed to be in a partially free state 1829 * and we must be sure to reset it to a fully allocated state 1830 * to force two bulkfree passes to free it again. 1831 * 1832 * NOTE: Only applicable when chain->bytes != 0. 1833 * 1834 * XXX can a chain already be marked MODIFIED without a data 1835 * assignment? If not, assert here instead of testing the case. 
1836 */ 1837 if (chain != &hmp->vchain && chain != &hmp->fchain && 1838 chain->bytes) { 1839 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1840 newmod 1841 ) { 1842 /* 1843 * NOTE: We do not have to remove the dedup 1844 * registration because the area is still 1845 * allocated and the underlying DIO will 1846 * still be flushed. 1847 */ 1848 if (dedup_off) { 1849 chain->bref.data_off = dedup_off; 1850 chain->bytes = 1 << (dedup_off & 1851 HAMMER2_OFF_MASK_RADIX); 1852 chain->error = 0; 1853 atomic_clear_int(&chain->flags, 1854 HAMMER2_CHAIN_MODIFIED); 1855 atomic_add_long(&hammer2_count_modified_chains, 1856 -1); 1857 if (chain->pmp) 1858 hammer2_pfs_memory_wakeup(chain->pmp); 1859 hammer2_freemap_adjust(hmp, &chain->bref, 1860 HAMMER2_FREEMAP_DORECOVER); 1861 atomic_set_int(&chain->flags, 1862 HAMMER2_CHAIN_DEDUPABLE); 1863 } else { 1864 error = hammer2_freemap_alloc(chain, 1865 chain->bytes); 1866 atomic_clear_int(&chain->flags, 1867 HAMMER2_CHAIN_DEDUPABLE); 1868 1869 /* 1870 * If we are unable to allocate a new block 1871 * but we are in emergency mode, issue a 1872 * warning to the console and reuse the same 1873 * block. 1874 * 1875 * We behave as if the allocation were 1876 * successful. 1877 * 1878 * THIS IS IMPORTANT: These modifications 1879 * are virtually guaranteed to corrupt any 1880 * snapshots related to this filesystem. 1881 */ 1882 if (error && (hmp->hflags & HMNT2_EMERG)) { 1883 error = 0; 1884 chain->bref.flags |= 1885 HAMMER2_BREF_FLAG_EMERG_MIP; 1886 1887 krateprintf(&krate_h2em, 1888 "hammer2: Emergency Mode WARNING: " 1889 "Operation will likely corrupt " 1890 "related snapshot: " 1891 "%016jx.%02x key=%016jx\n", 1892 chain->bref.data_off, 1893 chain->bref.type, 1894 chain->bref.key); 1895 } else if (error == 0) { 1896 chain->bref.flags &= 1897 ~HAMMER2_BREF_FLAG_EMERG_MIP; 1898 } 1899 } 1900 } 1901 } 1902 1903 /* 1904 * Stop here if error. We have to undo any flag bits we might 1905 * have set above. 
1906 */ 1907 if (error) { 1908 if (setmodified) { 1909 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1910 atomic_add_long(&hammer2_count_modified_chains, -1); 1911 if (chain->pmp) 1912 hammer2_pfs_memory_wakeup(chain->pmp); 1913 } 1914 if (setupdate) { 1915 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1916 } 1917 lockmgr(&chain->diolk, LK_RELEASE); 1918 1919 return error; 1920 } 1921 1922 /* 1923 * Update mirror_tid and modify_tid. modify_tid is only updated 1924 * if not passed as zero (during flushes, parent propagation passes 1925 * the value 0). 1926 * 1927 * NOTE: chain->pmp could be the device spmp. 1928 */ 1929 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1930 if (mtid) 1931 chain->bref.modify_tid = mtid; 1932 1933 /* 1934 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1935 * requires updating as well as to tell the delete code that the 1936 * chain's blockref might not exactly match (in terms of physical size 1937 * or block offset) the one in the parent's blocktable. The base key 1938 * of course will still match. 1939 */ 1940 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1941 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1942 1943 /* 1944 * Short-cut data blocks which the caller does not need an actual 1945 * data reference to (aka OPTDATA), as long as the chain does not 1946 * already have a data pointer to the data. This generally means 1947 * that the modifications are being done via the logical buffer cache. 1948 * The INITIAL flag relates only to the device data buffer and thus 1949 * remains unchange in this situation. 1950 * 1951 * This code also handles bytes == 0 (most dirents). 
1952 */ 1953 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1954 (flags & HAMMER2_MODIFY_OPTDATA) && 1955 chain->data == NULL) { 1956 KKASSERT(chain->dio == NULL); 1957 goto skip2; 1958 } 1959 1960 /* 1961 * Clearing the INITIAL flag (for indirect blocks) indicates that 1962 * we've processed the uninitialized storage allocation. 1963 * 1964 * If this flag is already clear we are likely in a copy-on-write 1965 * situation but we have to be sure NOT to bzero the storage if 1966 * no data is present. 1967 */ 1968 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1969 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1970 wasinitial = 1; 1971 } else { 1972 wasinitial = 0; 1973 } 1974 1975 /* 1976 * Instantiate data buffer and possibly execute COW operation 1977 */ 1978 switch(chain->bref.type) { 1979 case HAMMER2_BREF_TYPE_VOLUME: 1980 case HAMMER2_BREF_TYPE_FREEMAP: 1981 /* 1982 * The data is embedded, no copy-on-write operation is 1983 * needed. 1984 */ 1985 KKASSERT(chain->dio == NULL); 1986 break; 1987 case HAMMER2_BREF_TYPE_DIRENT: 1988 /* 1989 * The data might be fully embedded. 1990 */ 1991 if (chain->bytes == 0) { 1992 KKASSERT(chain->dio == NULL); 1993 break; 1994 } 1995 /* fall through */ 1996 case HAMMER2_BREF_TYPE_INODE: 1997 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1998 case HAMMER2_BREF_TYPE_DATA: 1999 case HAMMER2_BREF_TYPE_INDIRECT: 2000 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2001 /* 2002 * Perform the copy-on-write operation 2003 * 2004 * zero-fill or copy-on-write depending on whether 2005 * chain->data exists or not and set the dirty state for 2006 * the new buffer. hammer2_io_new() will handle the 2007 * zero-fill. 2008 * 2009 * If a dedup_off was supplied this is an existing block 2010 * and no COW, copy, or further modification is required. 
2011 */ 2012 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 2013 2014 if (wasinitial && dedup_off == 0) { 2015 error = hammer2_io_new(hmp, chain->bref.type, 2016 chain->bref.data_off, 2017 chain->bytes, &dio); 2018 } else { 2019 error = hammer2_io_bread(hmp, chain->bref.type, 2020 chain->bref.data_off, 2021 chain->bytes, &dio); 2022 } 2023 hammer2_adjreadcounter(&chain->bref, chain->bytes); 2024 2025 /* 2026 * If an I/O error occurs make sure callers cannot accidently 2027 * modify the old buffer's contents and corrupt the filesystem. 2028 * 2029 * NOTE: hammer2_io_data() call issues bkvasync() 2030 */ 2031 if (error) { 2032 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 2033 hmp); 2034 chain->error = HAMMER2_ERROR_EIO; 2035 hammer2_io_brelse(&dio); 2036 hammer2_io_brelse(&chain->dio); 2037 chain->data = NULL; 2038 break; 2039 } 2040 chain->error = 0; 2041 bdata = hammer2_io_data(dio, chain->bref.data_off); 2042 2043 if (chain->data) { 2044 /* 2045 * COW (unless a dedup). 2046 */ 2047 KKASSERT(chain->dio != NULL); 2048 if (chain->data != (void *)bdata && dedup_off == 0) { 2049 bcopy(chain->data, bdata, chain->bytes); 2050 } 2051 } else if (wasinitial == 0) { 2052 /* 2053 * We have a problem. We were asked to COW but 2054 * we don't have any data to COW with! 2055 */ 2056 panic("hammer2_chain_modify: having a COW %p\n", 2057 chain); 2058 } 2059 2060 /* 2061 * Retire the old buffer, replace with the new. Dirty or 2062 * redirty the new buffer. 2063 * 2064 * WARNING! The system buffer cache may have already flushed 2065 * the buffer, so we must be sure to [re]dirty it 2066 * for further modification. 2067 * 2068 * If dedup_off was supplied, the caller is not 2069 * expected to make any further modification to the 2070 * buffer. 2071 * 2072 * WARNING! hammer2_get_gdata() assumes dio never transitions 2073 * through NULL in order to optimize away unnecessary 2074 * diolk operations. 
		 */
		{
			hammer2_io_t *tio;

			/*
			 * Swap in the new buffer.  The old dio (if any) is
			 * bqrelse'd only after the new one is in hand.  The
			 * new buffer is [re]dirtied unless this was a dedup
			 * hit, in which case the block already holds final
			 * data and must not be modified further.
			 */
			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.
 *
 * Marks the inode modified and then runs the normal chain modification
 * path on (chain) with a zero dedup offset.  Returns the chain's error
 * code (0 on success).
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume header chain (vchain) MODIFIED.  Only the 0->1
 * transition bumps the global modified-chain counter and charges the
 * PFS memory accounting, so repeated calls are cheap and idempotent.
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.
 * *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING!  Must be called with child's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best (lowest-key) match */
	hammer2_key_t		key_beg;	/* requested range, inclusive */
	hammer2_key_t		key_end;
	hammer2_key_t		key_next;	/* running iteration limit */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN range comparator.  Returns -1 if the child's key range lies
 * entirely below the requested range, +1 if entirely above, and 0 on
 * any overlap (which causes the callback to run for that child).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback.  Tracks the overlapping child with the lowest key in
 * info->best and clips info->key_next so the caller knows where the next
 * iteration must begin.  Sibling chains whose bases collide at or below
 * key_beg represent an illegal topology overlap and assert.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root is allocated with a NULL pmp since it is part of
	 * the super-root topology (see WARNING above).
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain held and locked (per (how)), or NULL if an
	 * insertion race occurred.  (The chain lock acquired above is only
	 * released on the race path.)
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API
 *
 * lookup_init refs and locks (parent), shared or exclusive per flags,
 * for use as the initial parent of a lookup iteration.  lookup_done
 * undoes it and tolerates a NULL parent.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.
 * Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first:
	 * a non-blocking lock attempt avoids having to give up chain's
	 * lock at all.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Release chain, lock parent first (proper top-down lock
		 * order), then relock chain and verify the linkage held.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.
 * We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): this unlocks chain's core spinlock on the
		 * panic path, implying a caller context that holds it —
		 * TODO confirm against callers.
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;

		/*
		 * The parent was deleted while we blocked; our reptrack was
		 * migrated to the new parent.  Retarget and retry.
		 */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
2551 * No children so remain as-of when this is called so we can test 2552 * core.reptrack without holding the spin-lock. 2553 * 2554 * Used whenever the caller intends to permanently delete chains related 2555 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE), 2556 * where the chains underneath the node being deleted are given a new parent 2557 * above the node being deleted. 2558 */ 2559 static 2560 void 2561 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2562 { 2563 struct hammer2_reptrack *reptrack; 2564 2565 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2566 while (chain->core.reptrack) { 2567 hammer2_spin_ex(&parent->core.spin); 2568 hammer2_spin_ex(&chain->core.spin); 2569 reptrack = chain->core.reptrack; 2570 if (reptrack == NULL) { 2571 hammer2_spin_unex(&chain->core.spin); 2572 hammer2_spin_unex(&parent->core.spin); 2573 break; 2574 } 2575 hammer2_spin_ex(&reptrack->spin); 2576 chain->core.reptrack = reptrack->next; 2577 reptrack->chain = parent; 2578 reptrack->next = parent->core.reptrack; 2579 parent->core.reptrack = reptrack; 2580 hammer2_chain_ref(parent); /* reptrack */ 2581 2582 hammer2_spin_unex(&chain->core.spin); 2583 hammer2_spin_unex(&parent->core.spin); 2584 kprintf("hammer2: debug repchange %p %p->%p\n", 2585 reptrack, chain, parent); 2586 hammer2_chain_drop(chain); /* reptrack */ 2587 } 2588 } 2589 2590 /* 2591 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2592 * (*parentp) typically points to an inode but can also point to a related 2593 * indirect block and this function will recurse upwards and find the inode 2594 * or the nearest undeleted indirect block covering the key range. 2595 * 2596 * This function unconditionally sets *errorp, replacing any previous value. 
 *
 * (*parentp) must be exclusive or shared locked (depending on flags) and
 * referenced and can be an inode or an existing indirect block within the
 * inode.
 *
 * If (*parent) is errored out, this function will not attempt to recurse
 * the radix tree and will return NULL along with an appropriate *errorp.
 * If NULL is returned and *errorp is 0, the requested lookup could not be
 * located.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.
 *
 * The new (*parentp) will be locked shared or exclusive (depending on flags),
 * and referenced, and the old will be unlocked and dereferenced (no change
 * if they are both the same).  This is particularly important if the caller
 * wishes to insert a new chain, (*parentp) will be set properly even if NULL
 * is returned, as long as no error occurred.
 *
 * The matching chain will be returned locked according to flags.
 *
 * --
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate LOOKUP_* flags into the RESOLVE_* modes used for
	 * locking the parent and any returned chain.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}

		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				/* should not happen; hang for debugging */
				kprintf("parent->data is NULL %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	bcopy = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bcopy, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bcopy, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	/*
	 * Re-validate against the saved blockref; a mismatch means the
	 * chain was ripped out or replaced while we transitioned locks.
	 */
	if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* continue just past the previous chain's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	/* NOTE(review): hmp is assigned but not referenced below — verify */
	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			/* key wrapped: the previous element was the last */
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}

	/*
	 * Retry if the chain raced a modification (bref no longer matches)
	 * or was ripped out from under this parent while we dropped the
	 * spinlock to lock it.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).  The function will
 * adjust (*parentp) as necessary, create or connect the chain, and
 * return an exclusively locked chain in *chainp.
 *
 * When creating a PFSROOT inode under the super-root, pmp is typically NULL
 * and will be reassigned.
 *
 * NOTE: returns HAMMER_ERROR_* flags
 */
int
hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
		     hammer2_key_t key, int keybits, int type, size_t bytes,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	hammer2_blockref_t *base;
	hammer2_blockref_t dummy;
	int allocated = 0;
	int error = 0;
	int count;
	int maxloops = 300000;

	/*
	 * Topology may be crossing a PFS boundary.
	 */
	parent = *parentp;
	if (parent) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->error == 0);
		hmp = parent->hmp;
	}
	chain = *chainp;

	if (chain == NULL) {
		/*
		 * First allocate media space and construct the dummy bref,
		 * then allocate the in-memory chain structure.  Set the
		 * INITIAL flag for fresh chains which do not have embedded
		 * data.
		 *
		 * XXX for now set the check mode of the child based on
		 *     the parent or, if the parent is an inode, the
		 *     specification in the inode.
		 */
		bzero(&dummy, sizeof(dummy));
		dummy.type = type;
		dummy.key = key;
		dummy.keybits = keybits;
		dummy.data_off = hammer2_getradix(bytes);

		/*
		 * Inherit methods from parent by default.  Primarily used
		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
		 * a non-NONE check algorithm.
		 */
		if (methods == -1)
			dummy.methods = parent->bref.methods;
		else
			dummy.methods = (uint8_t)methods;

		if (type != HAMMER2_BREF_TYPE_DATA &&
		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
			dummy.methods |=
				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
		}

		chain = hammer2_chain_alloc(hmp, pmp, &dummy);

		/*
		 * Lock the chain manually, chain_lock will load the chain
		 * which we do NOT want to do.  (note: chain->refs is set
		 * to 1 by chain_alloc() for us, but lockcnt is not).
		 */
		chain->lockcnt = 1;
		hammer2_mtx_ex(&chain->lock);
		allocated = 1;
		++curthread->td_tracker;

		/*
		 * Set INITIAL to optimize I/O.  The flag will generally be
		 * processed when we call hammer2_chain_modify().
		 *
		 * Recalculate bytes to reflect the actual media block
		 * allocation.  Handle special case radix 0 == 0 bytes.
		 */
		bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
		if (bytes)
			bytes = (hammer2_off_t)1 << bytes;
		chain->bytes = bytes;

		switch(type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			panic("hammer2_chain_create: called with volume type");
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			panic("hammer2_chain_create: cannot be used to"
			      "create indirect block");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			panic("hammer2_chain_create: cannot be used to"
			      "create freemap root or node");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			KKASSERT(bytes == sizeof(chain->data->bmdata));
			/* fall through */
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_DATA:
		default:
			/*
			 * leave chain->data NULL, set INITIAL
			 */
			KKASSERT(chain->data == NULL);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
			break;
		}
	} else {
		/*
		 * We are reattaching a previously deleted chain, possibly
		 * under a new parent and possibly with a new key/keybits.
		 * The chain does not have to be in a modified state.  The
		 * UPDATE flag will be set later on in this routine.
		 *
		 * Do NOT mess with the current state of the INITIAL flag.
		 */
		chain->bref.key = key;
		chain->bref.keybits = keybits;
		if (chain->flags & HAMMER2_CHAIN_DELETED)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		KKASSERT(chain->parent == NULL);
	}

	/*
	 * Set the appropriate bref flag if requested.
	 *
	 * NOTE! Callers can call this function to move chains without
	 *	 knowing about special flags, so don't clear bref flags
	 *	 here!
	 */
	if (flags & HAMMER2_INSERT_PFSROOT)
		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;

	if (parent == NULL)
		goto skip;

	/*
	 * Calculate how many entries we have in the blockref array and
	 * determine if an indirect block is required when inserting into
	 * the parent.
	 */
again:
	if (--maxloops == 0)
		panic("hammer2_chain_create: maxloops");

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		if ((parent->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
			kprintf("hammer2: parent set for direct-data! "
				"pkey=%016jx ckey=%016jx\n",
				parent->bref.key,
				chain->bref.key);
		}
		KKASSERT((parent->data->ipdata.meta.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
		KKASSERT(parent->data != NULL);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(parent->data != NULL);
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(parent->data != NULL);
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_create: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;
		count = 0;
		break;
	}

	/*
	 * Make sure we've counted the brefs
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	KASSERT(parent->core.live_count >= 0 &&
		parent->core.live_count <= count,
		("bad live_count %d/%d (%02x, %d)",
		 parent->core.live_count, count,
		 parent->bref.type, parent->bytes));

	/*
	 * If no free blockref could be found we must create an indirect
	 * block and move a number of blockrefs into it.  With the parent
	 * locked we can safely lock each child in order to delete+duplicate
	 * it without causing a deadlock.
	 *
	 * This may return the new indirect block or the old parent depending
	 * on where the key falls.  NULL is returned on error.
	 */
	if (parent->core.live_count == count) {
		hammer2_chain_t *nparent;

		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);

		nparent = hammer2_chain_create_indirect(parent, key, keybits,
							mtid, type, &error);
		if (nparent == NULL) {
			if (allocated)
				hammer2_chain_drop(chain);
			chain = NULL;
			goto done;
		}
		if (parent != nparent) {
			hammer2_chain_unlock(parent);
			hammer2_chain_drop(parent);
			parent = *parentp = nparent;
		}
		goto again;
	}

	/*
	 * fall through if parent, or skip to here if no parent.
	 */
skip:
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		kprintf("Inserting deleted chain @%016jx\n",
			chain->bref.key);

	/*
	 * Link the chain into its parent.
	 */
	if (chain->parent != NULL)
		panic("hammer2: hammer2_chain_create: chain already connected");
	KKASSERT(chain->parent == NULL);
	if (parent) {
		KKASSERT(parent->core.live_count < count);
		hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_LIVE,
				     0);
	}

	if (allocated) {
		/*
		 * Mark the newly created chain modified.  This will cause
		 * UPDATE to be set and process the INITIAL flag.
		 *
		 * Device buffers are not instantiated for DATA elements
		 * as these are handled by logical buffers.
		 *
		 * Indirect and freemap node indirect blocks are handled
		 * by hammer2_chain_create_indirect() and not by this
		 * function.
		 *
		 * Data for all other bref types is expected to be
		 * instantiated (INODE, LEAF).
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
			error = hammer2_chain_modify(chain, mtid, dedup_off,
						     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			/*
			 * Remaining types are not supported by this function.
			 * In particular, INDIRECT and LEAF_NODE types are
			 * handled by create_indirect().
			 */
			panic("hammer2_chain_create: bad type: %d",
			      chain->bref.type);
			/* NOT REACHED */
			break;
		}
	} else {
		/*
		 * When reconnecting a chain we must set UPDATE and
		 * setflush so the flush recognizes that it must update
		 * the bref in the parent.
		 */
		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 */
	if (parent)
		hammer2_chain_setflush(parent);

done:
	*chainp = chain;

	return (error);
}

/*
 * Move the chain from its old parent to a new parent.  The chain must have
 * already been deleted or already disconnected (or never associated) with
 * a parent.  The chain is reassociated with the new parent and the deleted
 * flag will be cleared (no longer deleted).  The chain's modification state
 * is not altered.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * Neither (parent) or (chain) can be errored.
 *
 * If (parent) is non-NULL then the chain is inserted under the parent.
 *
 * If (parent) is NULL then the newly duplicated chain is not inserted
 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
 * passing into hammer2_chain_create() after this function returns).
 *
 * WARNING! This function calls create which means it can insert indirect
 *	    blocks.  This can cause other unrelated chains in the parent to
 *	    be moved to a newly inserted indirect block in addition to the
 *	    specific chain.
 */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	size_t bytes;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * NOTE!  We allow error'd chains to be renamed.  The bref itself
	 *	  is good and can be renamed.  The content, however, may
	 *	  be inaccessible.
	 */
	hmp = chain->hmp;
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow */

	/*
	 * Now create a duplicate of the chain structure, associating
	 * it with the same core, making it the same size, pointing it
	 * to the same bref (the same media block).
	 *
	 * NOTE: Handle special radix == 0 case (means 0 bytes).
	 *
	 * NOTE(review): the local 'bytes' computed here is not read again
	 *		 below (chain->bytes is passed to chain_create
	 *		 instead) — possibly vestigial; verify.
	 */
	bref = &chain->bref;
	bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	if (bytes)
		bytes = (hammer2_off_t)1 << bytes;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}

/*
 * This works in tandem with delete_obref() to install a blockref in
 * (typically) an indirect block that is associated with the chain being
 * moved to *parentp.
 *
 * The reason we need this function is that the caller needs to maintain
 * the blockref as it was, and not generate a new blockref for what might
 * be a modified chain.  Otherwise stuff will leak into the flush that
 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
 *
 * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and
 * CHAIN_UPDATE.  We must set BMAPUPD if the bref does not match, and
 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
 * it does.  Otherwise we can end up in a situation where H2 is unable to
 * clean up the in-memory chain topology.
 *
 * The reason for this is that flushes do not generally flush through
 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
 * or sideq to properly flush and dispose of the related inode chain's flags.
 * Situations where the inode is not actually modified by the frontend,
 * but where we have to move the related chains around as we insert or cleanup
 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
 * inode chain that does not have a hammer2_inode_t associated with it.
 */
void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	/* Move the chain under (*parentp) first. */
	hammer2_chain_rename(parentp, chain, mtid, flags);

	/*
	 * If the caller supplied a prior blockref, re-install it in the
	 * new parent's blockmap verbatim rather than letting the flush
	 * generate a fresh one.
	 */
	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0);
		/*
		 * NOTE(review): hammer2_chain_modify()'s return value is
		 * not checked here (it is checked in the delete helper) —
		 * verify this is intentional.
		 */
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/* installed bref differs from live bref */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	hammer2_dev_t *hmp;
	int error = 0;

	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
				  HAMMER2_CHAIN_FICTITIOUS)) == 0);
	KKASSERT(chain->parent == parent);
	/* NOTE(review): hmp is assigned but not referenced below — verify */
	hmp = chain->hmp;

	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow three rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp is set to HAMMER2_ERROR_* flags on failure (and the
 *	 function returns NULL); it is 0 on success.
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 *
	 * NOTE(review): the call below passes 0 (i.e. no MODIFY_OPTDATA);
	 *	 the OPTDATA wording above appears to predate that change.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
			hammer2_spin_unex(&parent->core.spin);
			panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			      reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bcopy = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bcopy,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 * (note reversed logic for this one)
		 */
		if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
					"RETRY (%p,%p)->%p %08x\n",
					parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bcopy);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bcopy);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise it's in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats first so synchronized children are not counted twice */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* obtain a locked+referenced sub chain for this blockref */
		bcopy = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bcopy,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				/* lost an insertion race, retry */
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a delete or reparent, retry */
		if (bcmp(&bcopy, &sub->bref, sizeof(bcopy)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		/* move the child from chain into chain's parent */
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bcopy);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bcopy);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest radix seen, lowest key at that radix */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest radix seen, lowest key at that radix */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE.
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half with the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
 * We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 *
 * (disabled alternate implementation, see the #if 1 version above)
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).
We want 5095 * everything to be inside in these cases so shift it all to 5096 * the low or high side depending on the new high bit. 5097 */ 5098 while (((hammer2_key_t)1 << keybits) < ncount) { 5099 ++keybits; 5100 if (key & ((hammer2_key_t)1 << keybits)) { 5101 hicount += locount; 5102 locount = 0; 5103 } else { 5104 locount += hicount; 5105 hicount = 0; 5106 } 5107 } 5108 5109 if (hicount > locount) 5110 key |= (hammer2_key_t)1 << keybits; 5111 else 5112 key &= ~(hammer2_key_t)1 << keybits; 5113 5114 *keyp = key; 5115 5116 return (keybits); 5117 } 5118 5119 #endif 5120 5121 /* 5122 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 5123 * it exists. 5124 * 5125 * Both parent and chain must be locked exclusively. 5126 * 5127 * This function will modify the parent if the blockref requires removal 5128 * from the parent's block table. 5129 * 5130 * This function is NOT recursive. Any entity already pushed into the 5131 * chain (such as an inode) may still need visibility into its contents, 5132 * as well as the ability to read and modify the contents. For example, 5133 * for an unlinked file which is still open. 5134 * 5135 * Also note that the flusher is responsible for cleaning up empty 5136 * indirect blocks. 5137 */ 5138 int 5139 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 5140 hammer2_tid_t mtid, int flags) 5141 { 5142 int error = 0; 5143 5144 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5145 5146 /* 5147 * Nothing to do if already marked. 5148 * 5149 * We need the spinlock on the core whos RBTREE contains chain 5150 * to protect against races. 5151 */ 5152 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5153 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5154 chain->parent == parent); 5155 error = _hammer2_chain_delete_helper(parent, chain, 5156 mtid, flags, NULL); 5157 } 5158 5159 /* 5160 * Permanent deletions mark the chain as destroyed. 
5161 * 5162 * NOTE: We do not setflush the chain unless the deletion is 5163 * permanent, since the deletion of a chain does not actually 5164 * require it to be flushed. 5165 */ 5166 if (error == 0) { 5167 if (flags & HAMMER2_DELETE_PERMANENT) { 5168 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5169 hammer2_chain_setflush(chain); 5170 } 5171 } 5172 5173 return error; 5174 } 5175 5176 static int 5177 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5178 hammer2_tid_t mtid, int flags, 5179 hammer2_blockref_t *obref) 5180 { 5181 int error = 0; 5182 5183 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5184 5185 /* 5186 * Nothing to do if already marked. 5187 * 5188 * We need the spinlock on the core whos RBTREE contains chain 5189 * to protect against races. 5190 */ 5191 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5192 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5193 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5194 chain->parent == parent); 5195 error = _hammer2_chain_delete_helper(parent, chain, 5196 mtid, flags, obref); 5197 } 5198 5199 /* 5200 * Permanent deletions mark the chain as destroyed. 5201 * 5202 * NOTE: We do not setflush the chain unless the deletion is 5203 * permanent, since the deletion of a chain does not actually 5204 * require it to be flushed. 5205 */ 5206 if (error == 0) { 5207 if (flags & HAMMER2_DELETE_PERMANENT) { 5208 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5209 hammer2_chain_setflush(chain); 5210 } 5211 } 5212 5213 return error; 5214 } 5215 5216 /* 5217 * Returns the index of the nearest element in the blockref array >= elm. 5218 * Returns (count) if no element could be found. 5219 * 5220 * Sets *key_nextp to the next key for loop purposes but does not modify 5221 * it if the next key would be higher than the current value of *key_nextp. 5222 * Note that *key_nexp can overflow to 0, which should be tested by the 5223 * caller. 5224 * 5225 * WARNING! 
 *	    Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that the live chain's brefs have already been counted
	 * (COUNTEDBREFS set) so we can optimize operations using
	 * core.live_zero below.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Advance *key_nextp past this element unless that
			 * would move it backwards or scan_end wrapped to 0.
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found, a non-NULL media bref may still
 * be returned in *bresp (with the returned chain NULL).
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through (to the empty default) */
	default:
		break;
	}

	/* Report the removed blockref to the caller if requested */
	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * If we deleted the last live element, back live_zero up over
	 * any trailing EMPTY slots.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats and zero the entry
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		/* leaf_count saturates at HAMMER2_BLOCKREF_LEAF_MAX */
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live element is the common case.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements over to open up slot i for the insert.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no key
	 * overlaps after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/*
	 * Locate the chain's blockref array and element count based on
	 * the bref type.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}

/* Read-only accessor for the chain's media data (must be resolved) */
const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/* Writable accessor for the chain's media data (must be resolved) */
hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
5803 */ 5804 void 5805 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5806 { 5807 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 5808 5809 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5810 case HAMMER2_CHECK_NONE: 5811 break; 5812 case HAMMER2_CHECK_DISABLED: 5813 break; 5814 case HAMMER2_CHECK_ISCSI32: 5815 chain->bref.check.iscsi32.value = 5816 hammer2_icrc32(bdata, chain->bytes); 5817 break; 5818 case HAMMER2_CHECK_XXHASH64: 5819 chain->bref.check.xxhash64.value = 5820 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5821 break; 5822 case HAMMER2_CHECK_SHA192: 5823 { 5824 SHA256_CTX hash_ctx; 5825 union { 5826 uint8_t digest[SHA256_DIGEST_LENGTH]; 5827 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5828 } u; 5829 5830 SHA256_Init(&hash_ctx); 5831 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5832 SHA256_Final(u.digest, &hash_ctx); 5833 u.digest64[2] ^= u.digest64[3]; 5834 bcopy(u.digest, 5835 chain->bref.check.sha192.data, 5836 sizeof(chain->bref.check.sha192.data)); 5837 } 5838 break; 5839 case HAMMER2_CHECK_FREEMAP: 5840 chain->bref.check.freemap.icrc32 = 5841 hammer2_icrc32(bdata, chain->bytes); 5842 break; 5843 default: 5844 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5845 chain->bref.methods); 5846 break; 5847 } 5848 } 5849 5850 /* 5851 * Characterize a failed check code and try to trace back to the inode. 
 */
static void
hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
				  int bits)
{
	hammer2_chain_t *lchain;
	hammer2_chain_t *ochain;
	int did;

	/*
	 * did == 0 presumably means the message was rate-limited by
	 * krateprintf; skip the detail output in that case so the
	 * follow-up kprintf()s stay paired with the header line.
	 */
	did = krateprintf(&krate_h2chk,
			  "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
			  "(flags=%08x, bref/data ",
			  chain->bref.data_off,
			  chain->bref.type,
			  hammer2_bref_type_str(&chain->bref),
			  chain->bref.methods,
			  chain->flags);
	if (did == 0)
		return;

	/* bits selects the width of the stored/computed check values */
	if (bits == 32) {
		kprintf("%08x/%08x)\n",
			chain->bref.check.iscsi32.value,
			(uint32_t)check);
	} else {
		kprintf("%016jx/%016jx)\n",
			chain->bref.check.xxhash64.value,
			check);
	}

	/*
	 * Run up the chains to try to find the governing inode so we
	 * can report it.
	 *
	 * XXX This error reporting is not really MPSAFE
	 */
	ochain = chain;
	lchain = chain;
	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
		lchain = chain;
		chain = chain->parent;
	}

	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
		kprintf(" Resides at/in inode %ld\n",
			chain->bref.key);
	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		kprintf(" Resides in inode index - CRITICAL!!!\n");
	} else {
		kprintf(" Resides in root index - CRITICAL!!!\n");
	}
	if (ochain->hmp) {
		const char *pfsname = "UNKNOWN";
		int i;

		/* Map the device back to a PFS name for the report */
		if (ochain->pmp) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
				    ochain->pmp->pfs_names[i]) {
					pfsname = ochain->pmp->pfs_names[i];
					break;
				}
			}
		}
		kprintf(" In pfs %s on device %s\n",
			pfsname, ochain->hmp->devrepname);
	}
}

/*
 * Returns non-zero on success, 0 on failure.
5925 */ 5926 int 5927 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5928 { 5929 uint32_t check32; 5930 uint64_t check64; 5931 int r; 5932 5933 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 5934 return 1; 5935 5936 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5937 case HAMMER2_CHECK_NONE: 5938 r = 1; 5939 break; 5940 case HAMMER2_CHECK_DISABLED: 5941 r = 1; 5942 break; 5943 case HAMMER2_CHECK_ISCSI32: 5944 check32 = hammer2_icrc32(bdata, chain->bytes); 5945 r = (chain->bref.check.iscsi32.value == check32); 5946 if (r == 0) { 5947 hammer2_characterize_failed_chain(chain, check32, 32); 5948 } 5949 hammer2_process_icrc32 += chain->bytes; 5950 break; 5951 case HAMMER2_CHECK_XXHASH64: 5952 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5953 r = (chain->bref.check.xxhash64.value == check64); 5954 if (r == 0) { 5955 hammer2_characterize_failed_chain(chain, check64, 64); 5956 } 5957 hammer2_process_xxhash64 += chain->bytes; 5958 break; 5959 case HAMMER2_CHECK_SHA192: 5960 { 5961 SHA256_CTX hash_ctx; 5962 union { 5963 uint8_t digest[SHA256_DIGEST_LENGTH]; 5964 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5965 } u; 5966 5967 SHA256_Init(&hash_ctx); 5968 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5969 SHA256_Final(u.digest, &hash_ctx); 5970 u.digest64[2] ^= u.digest64[3]; 5971 if (bcmp(u.digest, 5972 chain->bref.check.sha192.data, 5973 sizeof(chain->bref.check.sha192.data)) == 0) { 5974 r = 1; 5975 } else { 5976 r = 0; 5977 krateprintf(&krate_h2chk, 5978 "chain %016jx.%02x meth=%02x " 5979 "CHECK FAIL\n", 5980 chain->bref.data_off, 5981 chain->bref.type, 5982 chain->bref.methods); 5983 } 5984 } 5985 break; 5986 case HAMMER2_CHECK_FREEMAP: 5987 r = (chain->bref.check.freemap.icrc32 == 5988 hammer2_icrc32(bdata, chain->bytes)); 5989 if (r == 0) { 5990 int did; 5991 5992 did = krateprintf(&krate_h2chk, 5993 "chain %016jx.%02x meth=%02x " 5994 "CHECK FAIL\n", 5995 chain->bref.data_off, 5996 chain->bref.type, 5997 chain->bref.methods); 5998 if (did) 
{ 5999 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 6000 chain->bref.check.freemap.icrc32, 6001 hammer2_icrc32(bdata, chain->bytes), 6002 chain->bytes); 6003 if (chain->dio) { 6004 kprintf("dio %p buf %016jx,%d " 6005 "bdata %p/%p\n", 6006 chain->dio, 6007 chain->dio->bp->b_loffset, 6008 chain->dio->bp->b_bufsize, 6009 bdata, 6010 chain->dio->bp->b_data); 6011 } 6012 } 6013 } 6014 break; 6015 default: 6016 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 6017 chain->bref.methods); 6018 r = 1; 6019 break; 6020 } 6021 return r; 6022 } 6023 6024 /* 6025 * Acquire the chain and parent representing the specified inode for the 6026 * device at the specified cluster index. 6027 * 6028 * The flags passed in are LOOKUP flags, not RESOLVE flags. 6029 * 6030 * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and 6031 * *chainp will be NULL. *parentp may still be set error or not, or NULL 6032 * if the parent itself could not be resolved. 6033 * 6034 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 6035 * They will be unlocked and released by this function. The *parentp and 6036 * *chainp representing the located inode are returned locked. 6037 */ 6038 int 6039 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 6040 int clindex, int flags, 6041 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 6042 { 6043 hammer2_chain_t *parent; 6044 hammer2_chain_t *rchain; 6045 hammer2_key_t key_dummy; 6046 hammer2_inode_t *ip; 6047 int resolve_flags; 6048 int error; 6049 6050 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 6051 HAMMER2_RESOLVE_SHARED : 0; 6052 6053 /* 6054 * Caller expects us to replace these. 
6055 */ 6056 if (*chainp) { 6057 hammer2_chain_unlock(*chainp); 6058 hammer2_chain_drop(*chainp); 6059 *chainp = NULL; 6060 } 6061 if (*parentp) { 6062 hammer2_chain_unlock(*parentp); 6063 hammer2_chain_drop(*parentp); 6064 *parentp = NULL; 6065 } 6066 6067 /* 6068 * Be very careful, this is a backend function and we CANNOT 6069 * lock any frontend inode structure we find. But we have to 6070 * look the inode up this way first in case it exists but is 6071 * detached from the radix tree. 6072 */ 6073 ip = hammer2_inode_lookup(pmp, inum); 6074 if (ip) { 6075 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 6076 parentp, 6077 resolve_flags); 6078 hammer2_inode_drop(ip); 6079 if (*chainp) 6080 return 0; 6081 hammer2_chain_unlock(*chainp); 6082 hammer2_chain_drop(*chainp); 6083 *chainp = NULL; 6084 if (*parentp) { 6085 hammer2_chain_unlock(*parentp); 6086 hammer2_chain_drop(*parentp); 6087 *parentp = NULL; 6088 } 6089 } 6090 6091 /* 6092 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6093 * inodes from root directory entries in the key lookup). 6094 */ 6095 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6096 rchain = NULL; 6097 if (parent) { 6098 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6099 inum, inum, 6100 &error, flags); 6101 } else { 6102 error = HAMMER2_ERROR_EIO; 6103 } 6104 *parentp = parent; 6105 *chainp = rchain; 6106 6107 return error; 6108 } 6109 6110 /* 6111 * Used by the bulkscan code to snapshot the synchronized storage for 6112 * a volume, allowing it to be scanned concurrently against normal 6113 * operation. 
 */
hammer2_chain_t *
hammer2_chain_bulksnap(hammer2_dev_t *hmp)
{
	hammer2_chain_t *copy;

	/*
	 * Allocate a throw-away chain for the volume header and give it
	 * a private copy of the synchronized volume data (volsync), taken
	 * under the voldata lock.
	 */
	copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
	copy->data = kmalloc(sizeof(copy->data->voldata),
			     hmp->mchain,
			     M_WAITOK | M_ZERO);
	hammer2_voldata_lock(hmp);
	copy->data->voldata = hmp->volsync;
	hammer2_voldata_unlock(hmp);

	return copy;
}

/*
 * Release a snapshot chain previously returned by hammer2_chain_bulksnap().
 * Frees the private volume data copy and drops the chain.
 */
void
hammer2_chain_bulkdrop(hammer2_chain_t *copy)
{
	KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
	KKASSERT(copy->data);
	kfree(copy->data, copy->hmp->mchain);
	copy->data = NULL;
	atomic_add_long(&hammer2_chain_allocs, -1);
	hammer2_chain_drop(copy);
}

/*
 * Returns non-zero if the chain (INODE or DIRENT) matches the
 * filename.
 */
int
hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
			  size_t name_len)
{
	const hammer2_inode_data_t *ripdata;
	const hammer2_dirent_head_t *den;

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		ripdata = &chain->data->ipdata;
		if (ripdata->meta.name_len == name_len &&
		    bcmp(ripdata->filename, name, name_len) == 0) {
			return 1;
		}
	}
	if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
	    chain->bref.embed.dirent.namlen == name_len) {
		den = &chain->bref.embed.dirent;
		/*
		 * Names too long to embed in bref.check.buf are stored in
		 * the chain's data block; short names are embedded.
		 * NOTE(review): 'den' is assigned but not otherwise read
		 * here — confirm whether it is intentionally vestigial.
		 */
		if (name_len > sizeof(chain->bref.check.buf) &&
		    bcmp(chain->data->buf, name, name_len) == 0) {
			return 1;
		}
		if (name_len <= sizeof(chain->bref.check.buf) &&
		    bcmp(chain->bref.check.buf, name, name_len) == 0) {
			return 1;
		}
	}
	return 0;
}