1 /* 2 * Copyright (c) 2011-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static void hammer2_chain_rename_obref(hammer2_chain_t **parentp,
		hammer2_chain_t *chain, hammer2_tid_t mtid,
		int flags, hammer2_blockref_t *obref);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

/*
 * There are many degenerate situations where an extreme rate of
console
 * output can occur from warnings and errors.  Make sure this output does
 * not impede operations.
 */
static struct krate krate_h2chk = { .freq = 5 };	/* check warnings */
static struct krate krate_h2me = { .freq = 1 };
static struct krate krate_h2em = { .freq = 1 };

/*
 * Basic RBTree for chains (core.rbtree).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RBTree comparison function for chains, keyed on the (key, keybits)
 * range covered by each chain's blockref.  Returns -1 if chain1's range
 * lies entirely below chain2's, +1 if entirely above, and 0 on overlap.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Compare chains.  Overlaps are not supposed to happen, and to
	 * catch any software issues early we count overlaps as a match.
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)	/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)	/* fully to the right */
		return(1);
	return(0);		/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.  The volume
 * header and freemap roots are exempt because their data is embedded and
 * not backed by a dio.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_chain_assert_no_data: chain %p still has data",
		      chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher operates using a top-down
 * recursion based on the ONFLUSH flag.  It locates MODIFIED and UPDATE chains,
 * flushes them, and updates blocks back to the volume root.
 *
 * This routine sets the ONFLUSH flag upward from the triggering chain until
 * it hits an inode root or the volume root.
Inode chains serve as inflection
 * points, requiring the flusher to bridge across trees.  Inodes include
 * regular inodes, PFS roots (pmp->iroot), and the media super root
 * (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Walk towards the root setting ONFLUSH, stopping at the first
	 * chain that already has it set, at an inode, or at the root.
	 * The shared spinlocks are taken hand-over-hand (parent acquired
	 * before chain is released) so the parent linkage stays stable.
	 */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	/*
	 * On the 0->1 transition only flag the chain as recently used;
	 * the LRU scan recycles flagged chains to the tail instead of
	 * dropping them (see hammer2_chain_lru_flush()).
	 */
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if it encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002
#define HAMMER2_CHAIN_INSERT_RACE	0x0004

/*
 * Insert (chain) under (parent), optionally taking the parent's core
 * spinlock (INSERT_SPIN), bumping the live count (INSERT_LIVE), and
 * failing with HAMMER2_ERROR_EAGAIN if the parent's generation does not
 * match (INSERT_RACE).  Returns 0 on success.
 */
static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
392 */ 393 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain, 394 int depth); 395 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp); 396 397 void 398 hammer2_chain_drop(hammer2_chain_t *chain) 399 { 400 u_int refs; 401 402 if (hammer2_debug & 0x200000) 403 Debugger("drop"); 404 405 KKASSERT(chain->refs > 0); 406 407 while (chain) { 408 refs = chain->refs; 409 cpu_ccfence(); 410 KKASSERT(refs > 0); 411 412 if (refs == 1) { 413 if (hammer2_mtx_ex_try(&chain->lock) == 0) 414 chain = hammer2_chain_lastdrop(chain, 0); 415 /* retry the same chain, or chain from lastdrop */ 416 } else { 417 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 418 break; 419 /* retry the same chain */ 420 } 421 cpu_pause(); 422 } 423 } 424 425 /* 426 * Unhold a held and probably not-locked chain, ensure that the data is 427 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 428 * lock and then simply unlocking the chain. 429 */ 430 void 431 hammer2_chain_unhold(hammer2_chain_t *chain) 432 { 433 u_int lockcnt; 434 int iter = 0; 435 436 for (;;) { 437 lockcnt = chain->lockcnt; 438 cpu_ccfence(); 439 if (lockcnt > 1) { 440 if (atomic_cmpset_int(&chain->lockcnt, 441 lockcnt, lockcnt - 1)) { 442 break; 443 } 444 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 445 hammer2_chain_unlock(chain); 446 break; 447 } else { 448 /* 449 * This situation can easily occur on SMP due to 450 * the gap inbetween the 1->0 transition and the 451 * final unlock. We cannot safely block on the 452 * mutex because lockcnt might go above 1. 453 * 454 * XXX Sleep for one tick if it takes too long. 
455 */ 456 if (++iter > 1000) { 457 if (iter > 1000 + hz) { 458 kprintf("hammer2: h2race1 %p\n", chain); 459 iter = 1000; 460 } 461 tsleep(&iter, 0, "h2race1", 1); 462 } 463 cpu_pause(); 464 } 465 } 466 } 467 468 void 469 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 470 { 471 hammer2_chain_unhold(chain); 472 hammer2_chain_drop(chain); 473 } 474 475 void 476 hammer2_chain_rehold(hammer2_chain_t *chain) 477 { 478 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 479 atomic_add_int(&chain->lockcnt, 1); 480 hammer2_chain_unlock(chain); 481 } 482 483 /* 484 * Handles the (potential) last drop of chain->refs from 1->0. Called with 485 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 486 * possible against refs and lockcnt. We must dispose of the mutex on chain. 487 * 488 * This function returns an unlocked chain for recursive drop or NULL. It 489 * can return the same chain if it determines it has raced another ref. 490 * 491 * -- 492 * 493 * When two chains need to be recursively dropped we use the chain we 494 * would otherwise free to placehold the additional chain. It's a bit 495 * convoluted but we can't just recurse without potentially blowing out 496 * the kernel stack. 497 * 498 * The chain cannot be freed if it has any children. 499 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 500 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 501 * Any dedup registration can remain intact. 502 * 503 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 504 */ 505 static 506 hammer2_chain_t * 507 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 508 { 509 hammer2_pfs_t *pmp; 510 hammer2_dev_t *hmp; 511 hammer2_chain_t *parent; 512 hammer2_chain_t *rdrop; 513 514 /* 515 * We need chain's spinlock to interlock the sub-tree test. 516 * We already have chain's mutex, protecting chain->parent. 517 * 518 * Remember that chain->refs can be in flux. 
519 */ 520 hammer2_spin_ex(&chain->core.spin); 521 522 if (chain->parent != NULL) { 523 /* 524 * If the chain has a parent the UPDATE bit prevents scrapping 525 * as the chain is needed to properly flush the parent. Try 526 * to complete the 1->0 transition and return NULL. Retry 527 * (return chain) if we are unable to complete the 1->0 528 * transition, else return NULL (nothing more to do). 529 * 530 * If the chain has a parent the MODIFIED bit prevents 531 * scrapping. 532 * 533 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 534 */ 535 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 536 HAMMER2_CHAIN_MODIFIED)) { 537 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 538 hammer2_spin_unex(&chain->core.spin); 539 hammer2_chain_assert_no_data(chain); 540 hammer2_mtx_unlock(&chain->lock); 541 chain = NULL; 542 } else { 543 hammer2_spin_unex(&chain->core.spin); 544 hammer2_mtx_unlock(&chain->lock); 545 } 546 return (chain); 547 } 548 /* spinlock still held */ 549 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 550 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 551 /* 552 * Retain the static vchain and fchain. Clear bits that 553 * are not relevant. Do not clear the MODIFIED bit, 554 * and certainly do not put it on the delayed-flush queue. 555 */ 556 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 557 } else { 558 /* 559 * The chain has no parent and can be flagged for destruction. 560 * Since it has no parent, UPDATE can also be cleared. 561 */ 562 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 563 if (chain->flags & HAMMER2_CHAIN_UPDATE) 564 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 565 566 /* 567 * If the chain has children we must propagate the DESTROY 568 * flag downward and rip the disconnected topology apart. 569 * This is accomplished by calling hammer2_flush() on the 570 * chain. 571 * 572 * Any dedup is already handled by the underlying DIO, so 573 * we do not have to specifically flush it here. 
574 */ 575 if (chain->core.chain_count) { 576 hammer2_spin_unex(&chain->core.spin); 577 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 578 HAMMER2_FLUSH_ALL); 579 hammer2_mtx_unlock(&chain->lock); 580 581 return(chain); /* retry drop */ 582 } 583 584 /* 585 * Otherwise we can scrap the MODIFIED bit if it is set, 586 * and continue along the freeing path. 587 * 588 * Be sure to clean-out any dedup bits. Without a parent 589 * this chain will no longer be visible to the flush code. 590 * Easy check data_off to avoid the volume root. 591 */ 592 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 593 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 594 atomic_add_long(&hammer2_count_modified_chains, -1); 595 if (chain->pmp) 596 hammer2_pfs_memory_wakeup(chain->pmp, -1); 597 } 598 /* spinlock still held */ 599 } 600 601 /* spinlock still held */ 602 603 /* 604 * If any children exist we must leave the chain intact with refs == 0. 605 * They exist because chains are retained below us which have refs or 606 * may require flushing. 607 * 608 * Retry (return chain) if we fail to transition the refs to 0, else 609 * return NULL indication nothing more to do. 610 * 611 * Chains with children are NOT put on the LRU list. 612 */ 613 if (chain->core.chain_count) { 614 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 615 hammer2_spin_unex(&chain->core.spin); 616 hammer2_chain_assert_no_data(chain); 617 hammer2_mtx_unlock(&chain->lock); 618 chain = NULL; 619 } else { 620 hammer2_spin_unex(&chain->core.spin); 621 hammer2_mtx_unlock(&chain->lock); 622 } 623 return (chain); 624 } 625 /* spinlock still held */ 626 /* no chains left under us */ 627 628 /* 629 * chain->core has no children left so no accessors can get to our 630 * chain from there. Now we have to lock the parent core to interlock 631 * remaining possible accessors that might bump chain's refs before 632 * we can safely drop chain's refs with intent to free the chain. 
633 */ 634 hmp = chain->hmp; 635 pmp = chain->pmp; /* can be NULL */ 636 rdrop = NULL; 637 638 parent = chain->parent; 639 640 /* 641 * WARNING! chain's spin lock is still held here, and other spinlocks 642 * will be acquired and released in the code below. We 643 * cannot be making fancy procedure calls! 644 */ 645 646 /* 647 * We can cache the chain if it is associated with a pmp 648 * and not flagged as being destroyed or requesting a full 649 * release. In this situation the chain is not removed 650 * from its parent, i.e. it can still be looked up. 651 * 652 * We intentionally do not cache DATA chains because these 653 * were likely used to load data into the logical buffer cache 654 * and will not be accessed again for some time. 655 */ 656 if ((chain->flags & 657 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 658 chain->pmp && 659 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 660 if (parent) 661 hammer2_spin_ex(&parent->core.spin); 662 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 663 /* 664 * 1->0 transition failed, retry. Do not drop 665 * the chain's data yet! 666 */ 667 if (parent) 668 hammer2_spin_unex(&parent->core.spin); 669 hammer2_spin_unex(&chain->core.spin); 670 hammer2_mtx_unlock(&chain->lock); 671 672 return(chain); 673 } 674 675 /* 676 * Success 677 */ 678 hammer2_chain_assert_no_data(chain); 679 680 /* 681 * Make sure we are on the LRU list, clean up excessive 682 * LRU entries. We can only really drop one but there might 683 * be other entries that we can remove from the lru_list 684 * without dropping. 685 * 686 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 687 * chain->core.spin AND pmp->lru_spin are held, but 688 * can be safely cleared only holding pmp->lru_spin. 
689 */ 690 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 691 hammer2_spin_ex(&pmp->lru_spin); 692 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 693 atomic_set_int(&chain->flags, 694 HAMMER2_CHAIN_ONLRU); 695 TAILQ_INSERT_TAIL(&pmp->lru_list, 696 chain, lru_node); 697 atomic_add_int(&pmp->lru_count, 1); 698 } 699 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 700 depth = 1; /* disable lru_list flush */ 701 hammer2_spin_unex(&pmp->lru_spin); 702 } else { 703 /* disable lru flush */ 704 depth = 1; 705 } 706 707 if (parent) { 708 hammer2_spin_unex(&parent->core.spin); 709 parent = NULL; /* safety */ 710 } 711 hammer2_spin_unex(&chain->core.spin); 712 hammer2_mtx_unlock(&chain->lock); 713 714 /* 715 * lru_list hysteresis (see above for depth overrides). 716 * Note that depth also prevents excessive lastdrop recursion. 717 */ 718 if (depth == 0) 719 hammer2_chain_lru_flush(pmp); 720 721 return NULL; 722 /* NOT REACHED */ 723 } 724 725 /* 726 * Make sure we are not on the LRU list. 727 */ 728 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 729 hammer2_spin_ex(&pmp->lru_spin); 730 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 731 atomic_add_int(&pmp->lru_count, -1); 732 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 733 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 734 } 735 hammer2_spin_unex(&pmp->lru_spin); 736 } 737 738 /* 739 * Spinlock the parent and try to drop the last ref on chain. 740 * On success determine if we should dispose of the chain 741 * (remove the chain from its parent, etc). 742 * 743 * (normal core locks are top-down recursive but we define 744 * core spinlocks as bottom-up recursive, so this is safe). 745 */ 746 if (parent) { 747 hammer2_spin_ex(&parent->core.spin); 748 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 749 /* 750 * 1->0 transition failed, retry. 
751 */ 752 hammer2_spin_unex(&parent->core.spin); 753 hammer2_spin_unex(&chain->core.spin); 754 hammer2_mtx_unlock(&chain->lock); 755 756 return(chain); 757 } 758 759 /* 760 * 1->0 transition successful, parent spin held to prevent 761 * new lookups, chain spinlock held to protect parent field. 762 * Remove chain from the parent. 763 * 764 * If the chain is being removed from the parent's btree but 765 * is not bmapped, we have to adjust live_count downward. If 766 * it is bmapped then the blockref is retained in the parent 767 * as is its associated live_count. This case can occur when 768 * a chain added to the topology is unable to flush and is 769 * then later deleted. 770 */ 771 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 772 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 773 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 774 atomic_add_int(&parent->core.live_count, -1); 775 } 776 RB_REMOVE(hammer2_chain_tree, 777 &parent->core.rbtree, chain); 778 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 779 --parent->core.chain_count; 780 chain->parent = NULL; 781 } 782 783 /* 784 * If our chain was the last chain in the parent's core the 785 * core is now empty and its parent might have to be 786 * re-dropped if it has 0 refs. 787 */ 788 if (parent->core.chain_count == 0) { 789 rdrop = parent; 790 atomic_add_int(&rdrop->refs, 1); 791 /* 792 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 793 rdrop = NULL; 794 */ 795 } 796 hammer2_spin_unex(&parent->core.spin); 797 parent = NULL; /* safety */ 798 /* FALL THROUGH */ 799 } else { 800 /* 801 * No-parent case. 802 */ 803 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 804 /* 805 * 1->0 transition failed, retry. 806 */ 807 hammer2_spin_unex(&parent->core.spin); 808 hammer2_spin_unex(&chain->core.spin); 809 hammer2_mtx_unlock(&chain->lock); 810 811 return(chain); 812 } 813 } 814 815 /* 816 * Successful 1->0 transition, no parent, no children... no way for 817 * anyone to ref this chain any more. 
We can clean-up and free it. 818 * 819 * We still have the core spinlock, and core's chain_count is 0. 820 * Any parent spinlock is gone. 821 */ 822 hammer2_spin_unex(&chain->core.spin); 823 hammer2_chain_assert_no_data(chain); 824 hammer2_mtx_unlock(&chain->lock); 825 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 826 chain->core.chain_count == 0); 827 828 /* 829 * All locks are gone, no pointers remain to the chain, finish 830 * freeing it. 831 */ 832 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 833 HAMMER2_CHAIN_MODIFIED)) == 0); 834 835 /* 836 * Once chain resources are gone we can use the now dead chain 837 * structure to placehold what might otherwise require a recursive 838 * drop, because we have potentially two things to drop and can only 839 * return one directly. 840 */ 841 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 842 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 843 chain->hmp = NULL; 844 kfree(chain, hmp->mchain); 845 } 846 847 /* 848 * Possible chaining loop when parent re-drop needed. 849 */ 850 return(rdrop); 851 } 852 853 /* 854 * Heuristical flush of the LRU, try to reduce the number of entries 855 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 856 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 857 */ 858 static 859 void 860 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 861 { 862 hammer2_chain_t *chain; 863 864 again: 865 chain = NULL; 866 hammer2_spin_ex(&pmp->lru_spin); 867 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 868 /* 869 * Pick a chain off the lru_list, just recycle it quickly 870 * if LRUHINT is set (the chain was ref'd but left on 871 * the lru_list, so cycle to the end). 
872 */ 873 chain = TAILQ_FIRST(&pmp->lru_list); 874 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 875 876 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 877 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 878 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 879 chain = NULL; 880 continue; 881 } 882 883 /* 884 * Ok, we are off the LRU. We must adjust refs before we 885 * can safely clear the ONLRU flag. 886 */ 887 atomic_add_int(&pmp->lru_count, -1); 888 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 889 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 890 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 891 break; 892 } 893 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 894 chain = NULL; 895 } 896 hammer2_spin_unex(&pmp->lru_spin); 897 if (chain == NULL) 898 return; 899 900 /* 901 * If we picked a chain off the lru list we may be able to lastdrop 902 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 903 */ 904 while (chain) { 905 u_int refs; 906 907 refs = chain->refs; 908 cpu_ccfence(); 909 KKASSERT(refs > 0); 910 911 if (refs == 1) { 912 if (hammer2_mtx_ex_try(&chain->lock) == 0) 913 chain = hammer2_chain_lastdrop(chain, 1); 914 /* retry the same chain, or chain from lastdrop */ 915 } else { 916 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 917 break; 918 /* retry the same chain */ 919 } 920 cpu_pause(); 921 } 922 goto again; 923 } 924 925 /* 926 * On last lock release. 
927 */ 928 static hammer2_io_t * 929 hammer2_chain_drop_data(hammer2_chain_t *chain) 930 { 931 hammer2_io_t *dio; 932 933 if ((dio = chain->dio) != NULL) { 934 chain->dio = NULL; 935 chain->data = NULL; 936 } else { 937 switch(chain->bref.type) { 938 case HAMMER2_BREF_TYPE_VOLUME: 939 case HAMMER2_BREF_TYPE_FREEMAP: 940 break; 941 default: 942 if (chain->data != NULL) { 943 hammer2_spin_unex(&chain->core.spin); 944 panic("chain data not null: " 945 "chain %p bref %016jx.%02x " 946 "refs %d parent %p dio %p data %p", 947 chain, chain->bref.data_off, 948 chain->bref.type, chain->refs, 949 chain->parent, 950 chain->dio, chain->data); 951 } 952 KKASSERT(chain->data == NULL); 953 break; 954 } 955 } 956 return dio; 957 } 958 959 /* 960 * Lock a referenced chain element, acquiring its data with I/O if necessary, 961 * and specify how you would like the data to be resolved. 962 * 963 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 964 * 965 * The lock is allowed to recurse, multiple locking ops will aggregate 966 * the requested resolve types. Once data is assigned it will not be 967 * removed until the last unlock. 968 * 969 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 970 * (typically used to avoid device/logical buffer 971 * aliasing for data) 972 * 973 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 974 * the INITIAL-create state (indirect blocks only). 975 * 976 * Do not resolve data elements for DATA chains. 977 * (typically used to avoid device/logical buffer 978 * aliasing for data) 979 * 980 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 981 * 982 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 983 * it will be locked exclusive. 984 * 985 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 986 * the lock fails, EAGAIN is returned. 987 * 988 * NOTE: Embedded elements (volume header, inodes) are always resolved 989 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expects the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * Only one thread at a time performs the I/O; other threads that
	 * race here set IOSIGNAL and sleep until the owner releases
	 * IOINPROG (see the 'done' label below).
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(&chain->bref, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact.  Bumping lockcnt before unlocking
 * prevents the 1->0 transition from dropping the chain's dio/data.
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
1445 */ 1446 static 1447 hammer2_blockref_t * 1448 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1449 { 1450 hammer2_blockref_t *base; 1451 int count; 1452 1453 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1454 base = NULL; 1455 1456 switch(parent->bref.type) { 1457 case HAMMER2_BREF_TYPE_INODE: 1458 count = HAMMER2_SET_COUNT; 1459 break; 1460 case HAMMER2_BREF_TYPE_INDIRECT: 1461 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1462 count = parent->bytes / sizeof(hammer2_blockref_t); 1463 break; 1464 case HAMMER2_BREF_TYPE_VOLUME: 1465 count = HAMMER2_SET_COUNT; 1466 break; 1467 case HAMMER2_BREF_TYPE_FREEMAP: 1468 count = HAMMER2_SET_COUNT; 1469 break; 1470 default: 1471 panic("hammer2_chain_base_and_count: " 1472 "unrecognized blockref type: %d", 1473 parent->bref.type); 1474 count = 0; 1475 break; 1476 } 1477 } else { 1478 switch(parent->bref.type) { 1479 case HAMMER2_BREF_TYPE_INODE: 1480 base = &parent->data->ipdata.u.blockset.blockref[0]; 1481 count = HAMMER2_SET_COUNT; 1482 break; 1483 case HAMMER2_BREF_TYPE_INDIRECT: 1484 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1485 base = &parent->data->npdata[0]; 1486 count = parent->bytes / sizeof(hammer2_blockref_t); 1487 break; 1488 case HAMMER2_BREF_TYPE_VOLUME: 1489 base = &parent->data->voldata. 1490 sroot_blockset.blockref[0]; 1491 count = HAMMER2_SET_COUNT; 1492 break; 1493 case HAMMER2_BREF_TYPE_FREEMAP: 1494 base = &parent->data->blkset.blockref[0]; 1495 count = HAMMER2_SET_COUNT; 1496 break; 1497 default: 1498 panic("hammer2_chain_base_and_count: " 1499 "unrecognized blockref type: %d", 1500 parent->bref.type); 1501 count = 0; 1502 break; 1503 } 1504 } 1505 *countp = count; 1506 1507 return base; 1508 } 1509 1510 /* 1511 * This counts the number of live blockrefs in a block array and 1512 * also calculates the point at which all remaining blockrefs are empty. 1513 * This routine can only be called on a live chain. 1514 * 1515 * Caller holds the chain locked, but possibly with a shared lock. 
 * We must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards to find live_zero: the index just
			 * past the last non-empty blockref.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			/*
			 * Accumulate the number of live (non-empty)
			 * blockrefs below live_zero into live_count.
			 */
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size.
	 *
	 * NOTE(review): (1U << nradix) is only valid for nradix < 32;
	 *	 presumably radices in use here are far smaller -- confirm
	 *	 against callers if nradix can ever approach the word size.
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
1673 */ 1674 int 1675 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1676 hammer2_off_t dedup_off, int flags) 1677 { 1678 hammer2_blockref_t obref; 1679 hammer2_dev_t *hmp; 1680 hammer2_io_t *dio; 1681 int error; 1682 int wasinitial; 1683 int setmodified; 1684 int setupdate; 1685 int newmod; 1686 char *bdata; 1687 1688 hmp = chain->hmp; 1689 obref = chain->bref; 1690 KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0); 1691 KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE); 1692 1693 /* 1694 * Data is not optional for freemap chains (we must always be sure 1695 * to copy the data on COW storage allocations). 1696 */ 1697 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1698 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1699 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1700 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1701 } 1702 1703 /* 1704 * Data must be resolved if already assigned, unless explicitly 1705 * flagged otherwise. If we cannot safety load the data the 1706 * modification fails and we return early. 1707 */ 1708 if (chain->data == NULL && chain->bytes != 0 && 1709 (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1710 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1711 hammer2_chain_load_data(chain); 1712 if (chain->error) 1713 return (chain->error); 1714 } 1715 error = 0; 1716 1717 /* 1718 * Set MODIFIED to indicate that the chain has been modified. A new 1719 * allocation is required when modifying a chain. 1720 * 1721 * Set UPDATE to ensure that the blockref is updated in the parent. 1722 * 1723 * If MODIFIED is already set determine if we can reuse the assigned 1724 * data block or if we need a new data block. 1725 */ 1726 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1727 /* 1728 * Must set modified bit. 
1729 */ 1730 atomic_add_long(&hammer2_count_modified_chains, 1); 1731 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1732 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1733 setmodified = 1; 1734 1735 /* 1736 * We may be able to avoid a copy-on-write if the chain's 1737 * check mode is set to NONE and the chain's current 1738 * modify_tid is beyond the last explicit snapshot tid. 1739 * 1740 * This implements HAMMER2's overwrite-in-place feature. 1741 * 1742 * NOTE! This data-block cannot be used as a de-duplication 1743 * source when the check mode is set to NONE. 1744 */ 1745 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1746 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) && 1747 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1748 (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 && 1749 HAMMER2_DEC_CHECK(chain->bref.methods) == 1750 HAMMER2_CHECK_NONE && 1751 chain->pmp && 1752 chain->bref.modify_tid > 1753 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1754 /* 1755 * Sector overwrite allowed. 1756 */ 1757 newmod = 0; 1758 } else if ((hmp->hflags & HMNT2_EMERG) && 1759 chain->pmp && 1760 chain->bref.modify_tid > 1761 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1762 /* 1763 * If in emergency delete mode then do a modify-in- 1764 * place on any chain type belonging to the PFS as 1765 * long as it doesn't mess up a snapshot. We might 1766 * be forced to do this anyway a little further down 1767 * in the code if the allocation fails. 1768 * 1769 * Also note that in emergency mode, these modify-in- 1770 * place operations are NOT SAFE. A storage failure, 1771 * power failure, or panic can corrupt the filesystem. 1772 */ 1773 newmod = 0; 1774 } else { 1775 /* 1776 * Sector overwrite not allowed, must copy-on-write. 1777 */ 1778 newmod = 1; 1779 } 1780 } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) { 1781 /* 1782 * If the modified chain was registered for dedup we need 1783 * a new allocation. This only happens for delayed-flush 1784 * chains (i.e. 
which run through the front-end buffer 1785 * cache). 1786 */ 1787 newmod = 1; 1788 setmodified = 0; 1789 } else { 1790 /* 1791 * Already flagged modified, no new allocation is needed. 1792 */ 1793 newmod = 0; 1794 setmodified = 0; 1795 } 1796 1797 /* 1798 * Flag parent update required. 1799 */ 1800 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1801 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1802 setupdate = 1; 1803 } else { 1804 setupdate = 0; 1805 } 1806 1807 /* 1808 * The XOP code returns held but unlocked focus chains. This 1809 * prevents the chain from being destroyed but does not prevent 1810 * it from being modified. diolk is used to interlock modifications 1811 * against XOP frontend accesses to the focus. 1812 * 1813 * This allows us to theoretically avoid deadlocking the frontend 1814 * if one of the backends lock up by not formally locking the 1815 * focused chain in the frontend. In addition, the synchronization 1816 * code relies on this mechanism to avoid deadlocking concurrent 1817 * synchronization threads. 1818 */ 1819 lockmgr(&chain->diolk, LK_EXCLUSIVE); 1820 1821 /* 1822 * The modification or re-modification requires an allocation and 1823 * possible COW. If an error occurs, the previous content and data 1824 * reference is retained and the modification fails. 1825 * 1826 * If dedup_off is non-zero, the caller is requesting a deduplication 1827 * rather than a modification. The MODIFIED bit is not set and the 1828 * data offset is set to the deduplication offset. The data cannot 1829 * be modified. 1830 * 1831 * NOTE: The dedup offset is allowed to be in a partially free state 1832 * and we must be sure to reset it to a fully allocated state 1833 * to force two bulkfree passes to free it again. 1834 * 1835 * NOTE: Only applicable when chain->bytes != 0. 1836 * 1837 * XXX can a chain already be marked MODIFIED without a data 1838 * assignment? If not, assert here instead of testing the case. 
1839 */ 1840 if (chain != &hmp->vchain && chain != &hmp->fchain && 1841 chain->bytes) { 1842 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1843 newmod 1844 ) { 1845 /* 1846 * NOTE: We do not have to remove the dedup 1847 * registration because the area is still 1848 * allocated and the underlying DIO will 1849 * still be flushed. 1850 */ 1851 if (dedup_off) { 1852 chain->bref.data_off = dedup_off; 1853 chain->bytes = 1 << (dedup_off & 1854 HAMMER2_OFF_MASK_RADIX); 1855 chain->error = 0; 1856 atomic_clear_int(&chain->flags, 1857 HAMMER2_CHAIN_MODIFIED); 1858 atomic_add_long(&hammer2_count_modified_chains, 1859 -1); 1860 if (chain->pmp) { 1861 hammer2_pfs_memory_wakeup( 1862 chain->pmp, -1); 1863 } 1864 hammer2_freemap_adjust(hmp, &chain->bref, 1865 HAMMER2_FREEMAP_DORECOVER); 1866 atomic_set_int(&chain->flags, 1867 HAMMER2_CHAIN_DEDUPABLE); 1868 } else { 1869 error = hammer2_freemap_alloc(chain, 1870 chain->bytes); 1871 atomic_clear_int(&chain->flags, 1872 HAMMER2_CHAIN_DEDUPABLE); 1873 1874 /* 1875 * If we are unable to allocate a new block 1876 * but we are in emergency mode, issue a 1877 * warning to the console and reuse the same 1878 * block. 1879 * 1880 * We behave as if the allocation were 1881 * successful. 1882 * 1883 * THIS IS IMPORTANT: These modifications 1884 * are virtually guaranteed to corrupt any 1885 * snapshots related to this filesystem. 1886 */ 1887 if (error && (hmp->hflags & HMNT2_EMERG)) { 1888 error = 0; 1889 chain->bref.flags |= 1890 HAMMER2_BREF_FLAG_EMERG_MIP; 1891 1892 krateprintf(&krate_h2em, 1893 "hammer2: Emergency Mode WARNING: " 1894 "Operation will likely corrupt " 1895 "related snapshot: " 1896 "%016jx.%02x key=%016jx\n", 1897 chain->bref.data_off, 1898 chain->bref.type, 1899 chain->bref.key); 1900 } else if (error == 0) { 1901 chain->bref.flags &= 1902 ~HAMMER2_BREF_FLAG_EMERG_MIP; 1903 } 1904 } 1905 } 1906 } 1907 1908 /* 1909 * Stop here if error. We have to undo any flag bits we might 1910 * have set above. 
1911 */ 1912 if (error) { 1913 if (setmodified) { 1914 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1915 atomic_add_long(&hammer2_count_modified_chains, -1); 1916 if (chain->pmp) 1917 hammer2_pfs_memory_wakeup(chain->pmp, -1); 1918 } 1919 if (setupdate) { 1920 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1921 } 1922 lockmgr(&chain->diolk, LK_RELEASE); 1923 1924 return error; 1925 } 1926 1927 /* 1928 * Update mirror_tid and modify_tid. modify_tid is only updated 1929 * if not passed as zero (during flushes, parent propagation passes 1930 * the value 0). 1931 * 1932 * NOTE: chain->pmp could be the device spmp. 1933 */ 1934 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1935 if (mtid) 1936 chain->bref.modify_tid = mtid; 1937 1938 /* 1939 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1940 * requires updating as well as to tell the delete code that the 1941 * chain's blockref might not exactly match (in terms of physical size 1942 * or block offset) the one in the parent's blocktable. The base key 1943 * of course will still match. 1944 */ 1945 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1946 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1947 1948 /* 1949 * Short-cut data block handling when the caller does not need an 1950 * actual data reference to (aka OPTDATA), as long as the chain does 1951 * not already have a data pointer to the data and no de-duplication 1952 * occurred. 1953 * 1954 * This generally means that the modifications are being done via the 1955 * logical buffer cache. 1956 * 1957 * NOTE: If deduplication occurred we have to run through the data 1958 * stuff to clear INITIAL, and the caller will likely want to 1959 * assign the check code anyway. Leaving INITIAL set on a 1960 * dedup can be deadly (it can cause the block to be zero'd!). 1961 * 1962 * This code also handles bytes == 0 (most dirents). 
1963 */ 1964 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1965 (flags & HAMMER2_MODIFY_OPTDATA) && 1966 chain->data == NULL) { 1967 if (dedup_off == 0) { 1968 KKASSERT(chain->dio == NULL); 1969 goto skip2; 1970 } 1971 } 1972 1973 /* 1974 * Clearing the INITIAL flag (for indirect blocks) indicates that 1975 * we've processed the uninitialized storage allocation. 1976 * 1977 * If this flag is already clear we are likely in a copy-on-write 1978 * situation but we have to be sure NOT to bzero the storage if 1979 * no data is present. 1980 * 1981 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set, 1982 */ 1983 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1984 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1985 wasinitial = 1; 1986 } else { 1987 wasinitial = 0; 1988 } 1989 1990 /* 1991 * Instantiate data buffer and possibly execute COW operation 1992 */ 1993 switch(chain->bref.type) { 1994 case HAMMER2_BREF_TYPE_VOLUME: 1995 case HAMMER2_BREF_TYPE_FREEMAP: 1996 /* 1997 * The data is embedded, no copy-on-write operation is 1998 * needed. 1999 */ 2000 KKASSERT(chain->dio == NULL); 2001 break; 2002 case HAMMER2_BREF_TYPE_DIRENT: 2003 /* 2004 * The data might be fully embedded. 2005 */ 2006 if (chain->bytes == 0) { 2007 KKASSERT(chain->dio == NULL); 2008 break; 2009 } 2010 /* fall through */ 2011 case HAMMER2_BREF_TYPE_INODE: 2012 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2013 case HAMMER2_BREF_TYPE_DATA: 2014 case HAMMER2_BREF_TYPE_INDIRECT: 2015 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2016 /* 2017 * Perform the copy-on-write operation 2018 * 2019 * zero-fill or copy-on-write depending on whether 2020 * chain->data exists or not and set the dirty state for 2021 * the new buffer. hammer2_io_new() will handle the 2022 * zero-fill. 2023 * 2024 * If a dedup_off was supplied this is an existing block 2025 * and no COW, copy, or further modification is required. 
2026 */ 2027 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 2028 2029 if (wasinitial && dedup_off == 0) { 2030 error = hammer2_io_new(hmp, chain->bref.type, 2031 chain->bref.data_off, 2032 chain->bytes, &dio); 2033 } else { 2034 error = hammer2_io_bread(hmp, chain->bref.type, 2035 chain->bref.data_off, 2036 chain->bytes, &dio); 2037 } 2038 hammer2_adjreadcounter(&chain->bref, chain->bytes); 2039 2040 /* 2041 * If an I/O error occurs make sure callers cannot accidently 2042 * modify the old buffer's contents and corrupt the filesystem. 2043 * 2044 * NOTE: hammer2_io_data() call issues bkvasync() 2045 */ 2046 if (error) { 2047 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 2048 hmp); 2049 chain->error = HAMMER2_ERROR_EIO; 2050 hammer2_io_brelse(&dio); 2051 hammer2_io_brelse(&chain->dio); 2052 chain->data = NULL; 2053 break; 2054 } 2055 chain->error = 0; 2056 bdata = hammer2_io_data(dio, chain->bref.data_off); 2057 2058 if (chain->data) { 2059 /* 2060 * COW (unless a dedup). 2061 */ 2062 KKASSERT(chain->dio != NULL); 2063 if (chain->data != (void *)bdata && dedup_off == 0) { 2064 bcopy(chain->data, bdata, chain->bytes); 2065 } 2066 } else if (wasinitial == 0 && dedup_off == 0) { 2067 /* 2068 * We have a problem. We were asked to COW but 2069 * we don't have any data to COW with! 2070 */ 2071 panic("hammer2_chain_modify: having a COW %p\n", 2072 chain); 2073 } 2074 2075 /* 2076 * Retire the old buffer, replace with the new. Dirty or 2077 * redirty the new buffer. 2078 * 2079 * WARNING! The system buffer cache may have already flushed 2080 * the buffer, so we must be sure to [re]dirty it 2081 * for further modification. 2082 * 2083 * If dedup_off was supplied, the caller is not 2084 * expected to make any further modification to the 2085 * buffer. 2086 * 2087 * WARNING! hammer2_get_gdata() assumes dio never transitions 2088 * through NULL in order to optimize away unnecessary 2089 * diolk operations. 
		 */
		{
			hammer2_io_t *tio;

			/*
			 * Swap in the new dio before releasing the old one
			 * so chain->dio never transitions through NULL
			 * (see hammer2_get_gdata() warning above).
			 */
			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.
 *
 * Flags the inode modified via hammer2_inode_modify() and then runs the
 * chain through the normal hammer2_chain_modify() path (with no dedup
 * offset).  Returns the chain's error code, 0 on success.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks.  Exclusive lockmgr lock protecting the
 * device's volume data (hmp->voldata).
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume header's chain (vchain) MODIFIED if it is not already,
 * accounting for it in the global modified-chain count and the pmp's
 * memory usage.
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.
 * *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING!  Must be called with child's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* iteration clamp (see above) */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * Scan the parent's in-memory RB tree for the chain nearest the requested
 * key range, updating *key_nextp for iteration.  The returned chain (if
 * any) is not additionally referenced or locked here.
 */
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN range comparator.  Returns -1 if the child's key range lies
 * entirely below the search range, +1 if entirely above, and 0 when the
 * ranges overlap (child is then handed to the callback).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback for overlapping candidates.  Tracks the candidate with
 * the lowest base key in info->best and clamps info->key_next accordingly.
 * (A key_next of 0 is treated as unset/overflowed -- see the iteration
 * notes above).
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root hangs off the super-root and gets a NULL pmp,
	 * otherwise the chain inherits the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced and locked (per how), or NULL
	 * if a race occurred.  (The lock acquired above is retained on
	 * success and only released in the race/error path).
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API
 *
 * References and locks (parent) as the starting point for a lookup
 * iteration, honoring HAMMER2_LOOKUP_SHARED.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

/*
 * Release the lock and ref acquired by hammer2_chain_lookup_init() (or
 * left on (*parentp) by a lookup iteration).  A NULL parent is tolerated.
 */
void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.
 * Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.  We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): no spinlock is visibly acquired on this
		 * path before the unex below; looks vestigial -- confirm
		 * against callers before removing.
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent.
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children may remain as-of when this is called so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
 */
static
void
hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
{
	struct hammer2_reptrack *reptrack;

	KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
	while (chain->core.reptrack) {
		hammer2_spin_ex(&parent->core.spin);
		hammer2_spin_ex(&chain->core.spin);
		reptrack = chain->core.reptrack;
		if (reptrack == NULL) {
			/* raced, re-test under the spinlocks */
			hammer2_spin_unex(&chain->core.spin);
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
		hammer2_spin_ex(&reptrack->spin);
		chain->core.reptrack = reptrack->next;
		reptrack->chain = parent;
		reptrack->next = parent->core.reptrack;
		parent->core.reptrack = reptrack;
		hammer2_chain_ref(parent);	/* reptrack */

		hammer2_spin_unex(&chain->core.spin);
		hammer2_spin_unex(&parent->core.spin);
		kprintf("hammer2: debug repchange %p %p->%p\n",
			reptrack, chain, parent);
		hammer2_chain_drop(chain);	/* reptrack */
	}
}

/*
 * Locate the first chain whose key range overlaps (key_beg, key_end)
 * inclusive.  (*parentp) typically points to an inode but can also point
 * to a related indirect block and this function will recurse upwards and
 * find the inode or the nearest undeleted indirect block covering the
 * key range.
 *
 * This function unconditionally sets *errorp, replacing any previous value.
 *
 * (*parentp) must be exclusive or shared locked (depending on flags) and
 * referenced and can be an inode or an existing indirect block within the
 * inode.
 *
 * If (*parent) is errored out, this function will not attempt to recurse
 * the radix tree and will return NULL along with an appropriate *errorp.
 * If NULL is returned and *errorp is 0, the requested lookup could not be
 * located.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.
 *
 * The new (*parentp) will be locked shared or exclusive (depending on flags),
 * and referenced, and the old will be unlocked and dereferenced (no change
 * if they are both the same).  This is particularly important if the caller
 * wishes to insert a new chain, (*parentp) will be set properly even if NULL
 * is returned, as long as no error occurred.
 *
 * The matching chain will be returned locked according to flags.
 *
 * --
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the HAMMER2_RESOLVE_*
	 * lock/data-resolution modes used below.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	/*
	 * Re-validate against the saved blockref after acquiring the
	 * chain lock; retry the whole scan if we raced a relink or a
	 * blockref change.
	 */
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * NOTE(review): save_mtx is written but never read in
		 * this function -- looks vestigial; confirm before
		 * removing.
		 */
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.
 * This chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* resume just past the previous element's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow
	 * (wrap to 0), which terminates the scan.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		/*
		 * Lost a race with a concurrent modification, retry.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
3351 * 3352 * (*parentp) must not be errored or this function will assert. 3353 * 3354 * (*chainp) usually starts out NULL and returns the newly created chain, 3355 * but if the caller desires the caller may allocate a disconnected chain 3356 * and pass it in instead. 3357 * 3358 * This function should NOT be used to insert INDIRECT blocks. It is 3359 * typically used to create/insert inodes and data blocks. 3360 * 3361 * Caller must pass-in an exclusively locked parent the new chain is to 3362 * be inserted under, and optionally pass-in a disconnected, exclusively 3363 * locked chain to insert (else we create a new chain). The function will 3364 * adjust (*parentp) as necessary, create or connect the chain, and 3365 * return an exclusively locked chain in *chainp. 3366 * 3367 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3368 * and will be reassigned. 3369 * 3370 * NOTE: returns HAMMER_ERROR_* flags 3371 */ 3372 int 3373 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3374 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3375 hammer2_key_t key, int keybits, int type, size_t bytes, 3376 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3377 { 3378 hammer2_chain_t *chain; 3379 hammer2_chain_t *parent; 3380 hammer2_blockref_t *base; 3381 hammer2_blockref_t dummy; 3382 int allocated = 0; 3383 int error = 0; 3384 int count; 3385 int maxloops = 300000; 3386 3387 /* 3388 * Topology may be crossing a PFS boundary. 3389 */ 3390 parent = *parentp; 3391 if (parent) { 3392 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3393 KKASSERT(parent->error == 0); 3394 hmp = parent->hmp; 3395 } 3396 chain = *chainp; 3397 3398 if (chain == NULL) { 3399 /* 3400 * First allocate media space and construct the dummy bref, 3401 * then allocate the in-memory chain structure. Set the 3402 * INITIAL flag for fresh chains which do not have embedded 3403 * data. 
3404 * 3405 * XXX for now set the check mode of the child based on 3406 * the parent or, if the parent is an inode, the 3407 * specification in the inode. 3408 */ 3409 bzero(&dummy, sizeof(dummy)); 3410 dummy.type = type; 3411 dummy.key = key; 3412 dummy.keybits = keybits; 3413 dummy.data_off = hammer2_getradix(bytes); 3414 3415 /* 3416 * Inherit methods from parent by default. Primarily used 3417 * for BREF_TYPE_DATA. Non-data types *must* be set to 3418 * a non-NONE check algorithm. 3419 */ 3420 if (methods == -1) 3421 dummy.methods = parent->bref.methods; 3422 else 3423 dummy.methods = (uint8_t)methods; 3424 3425 if (type != HAMMER2_BREF_TYPE_DATA && 3426 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3427 dummy.methods |= 3428 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3429 } 3430 3431 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3432 3433 /* 3434 * Lock the chain manually, chain_lock will load the chain 3435 * which we do NOT want to do. (note: chain->refs is set 3436 * to 1 by chain_alloc() for us, but lockcnt is not). 3437 */ 3438 chain->lockcnt = 1; 3439 hammer2_mtx_ex(&chain->lock); 3440 allocated = 1; 3441 3442 /* 3443 * Set INITIAL to optimize I/O. The flag will generally be 3444 * processed when we call hammer2_chain_modify(). 3445 * 3446 * Recalculate bytes to reflect the actual media block 3447 * allocation. Handle special case radix 0 == 0 bytes. 
3448 */ 3449 bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3450 if (bytes) 3451 bytes = (hammer2_off_t)1 << bytes; 3452 chain->bytes = bytes; 3453 3454 switch(type) { 3455 case HAMMER2_BREF_TYPE_VOLUME: 3456 case HAMMER2_BREF_TYPE_FREEMAP: 3457 panic("hammer2_chain_create: called with volume type"); 3458 break; 3459 case HAMMER2_BREF_TYPE_INDIRECT: 3460 panic("hammer2_chain_create: cannot be used to" 3461 "create indirect block"); 3462 break; 3463 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3464 panic("hammer2_chain_create: cannot be used to" 3465 "create freemap root or node"); 3466 break; 3467 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3468 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3469 /* fall through */ 3470 case HAMMER2_BREF_TYPE_DIRENT: 3471 case HAMMER2_BREF_TYPE_INODE: 3472 case HAMMER2_BREF_TYPE_DATA: 3473 default: 3474 /* 3475 * leave chain->data NULL, set INITIAL 3476 */ 3477 KKASSERT(chain->data == NULL); 3478 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3479 break; 3480 } 3481 } else { 3482 /* 3483 * We are reattaching a previously deleted chain, possibly 3484 * under a new parent and possibly with a new key/keybits. 3485 * The chain does not have to be in a modified state. The 3486 * UPDATE flag will be set later on in this routine. 3487 * 3488 * Do NOT mess with the current state of the INITIAL flag. 3489 */ 3490 chain->bref.key = key; 3491 chain->bref.keybits = keybits; 3492 if (chain->flags & HAMMER2_CHAIN_DELETED) 3493 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3494 KKASSERT(chain->parent == NULL); 3495 } 3496 3497 /* 3498 * Set the appropriate bref flag if requested. 3499 * 3500 * NOTE! Callers can call this function to move chains without 3501 * knowing about special flags, so don't clear bref flags 3502 * here! 
3503 */ 3504 if (flags & HAMMER2_INSERT_PFSROOT) 3505 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3506 3507 if (parent == NULL) 3508 goto skip; 3509 3510 /* 3511 * Calculate how many entries we have in the blockref array and 3512 * determine if an indirect block is required when inserting into 3513 * the parent. 3514 */ 3515 again: 3516 if (--maxloops == 0) 3517 panic("hammer2_chain_create: maxloops"); 3518 3519 switch(parent->bref.type) { 3520 case HAMMER2_BREF_TYPE_INODE: 3521 if ((parent->data->ipdata.meta.op_flags & 3522 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3523 kprintf("hammer2: parent set for direct-data! " 3524 "pkey=%016jx ckey=%016jx\n", 3525 parent->bref.key, 3526 chain->bref.key); 3527 } 3528 KKASSERT((parent->data->ipdata.meta.op_flags & 3529 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3530 KKASSERT(parent->data != NULL); 3531 base = &parent->data->ipdata.u.blockset.blockref[0]; 3532 count = HAMMER2_SET_COUNT; 3533 break; 3534 case HAMMER2_BREF_TYPE_INDIRECT: 3535 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3536 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3537 base = NULL; 3538 else 3539 base = &parent->data->npdata[0]; 3540 count = parent->bytes / sizeof(hammer2_blockref_t); 3541 break; 3542 case HAMMER2_BREF_TYPE_VOLUME: 3543 KKASSERT(parent->data != NULL); 3544 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3545 count = HAMMER2_SET_COUNT; 3546 break; 3547 case HAMMER2_BREF_TYPE_FREEMAP: 3548 KKASSERT(parent->data != NULL); 3549 base = &parent->data->blkset.blockref[0]; 3550 count = HAMMER2_SET_COUNT; 3551 break; 3552 default: 3553 panic("hammer2_chain_create: unrecognized blockref type: %d", 3554 parent->bref.type); 3555 base = NULL; 3556 count = 0; 3557 break; 3558 } 3559 3560 /* 3561 * Make sure we've counted the brefs 3562 */ 3563 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3564 hammer2_chain_countbrefs(parent, base, count); 3565 3566 KASSERT(parent->core.live_count >= 0 && 3567 parent->core.live_count <= count, 3568 ("bad live_count %d/%d 
(%02x, %d)", 3569 parent->core.live_count, count, 3570 parent->bref.type, parent->bytes)); 3571 3572 /* 3573 * If no free blockref could be found we must create an indirect 3574 * block and move a number of blockrefs into it. With the parent 3575 * locked we can safely lock each child in order to delete+duplicate 3576 * it without causing a deadlock. 3577 * 3578 * This may return the new indirect block or the old parent depending 3579 * on where the key falls. NULL is returned on error. 3580 */ 3581 if (parent->core.live_count == count) { 3582 hammer2_chain_t *nparent; 3583 3584 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3585 3586 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3587 mtid, type, &error); 3588 if (nparent == NULL) { 3589 if (allocated) 3590 hammer2_chain_drop(chain); 3591 chain = NULL; 3592 goto done; 3593 } 3594 if (parent != nparent) { 3595 hammer2_chain_unlock(parent); 3596 hammer2_chain_drop(parent); 3597 parent = *parentp = nparent; 3598 } 3599 goto again; 3600 } 3601 3602 /* 3603 * fall through if parent, or skip to here if no parent. 3604 */ 3605 skip: 3606 if (chain->flags & HAMMER2_CHAIN_DELETED) 3607 kprintf("Inserting deleted chain @%016jx\n", 3608 chain->bref.key); 3609 3610 /* 3611 * Link the chain into its parent. 3612 */ 3613 if (chain->parent != NULL) 3614 panic("hammer2: hammer2_chain_create: chain already connected"); 3615 KKASSERT(chain->parent == NULL); 3616 if (parent) { 3617 KKASSERT(parent->core.live_count < count); 3618 hammer2_chain_insert(parent, chain, 3619 HAMMER2_CHAIN_INSERT_SPIN | 3620 HAMMER2_CHAIN_INSERT_LIVE, 3621 0); 3622 } 3623 3624 if (allocated) { 3625 /* 3626 * Mark the newly created chain modified. This will cause 3627 * UPDATE to be set and process the INITIAL flag. 3628 * 3629 * Device buffers are not instantiated for DATA elements 3630 * as these are handled by logical buffers. 
3631 * 3632 * Indirect and freemap node indirect blocks are handled 3633 * by hammer2_chain_create_indirect() and not by this 3634 * function. 3635 * 3636 * Data for all other bref types is expected to be 3637 * instantiated (INODE, LEAF). 3638 */ 3639 switch(chain->bref.type) { 3640 case HAMMER2_BREF_TYPE_DATA: 3641 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3642 case HAMMER2_BREF_TYPE_DIRENT: 3643 case HAMMER2_BREF_TYPE_INODE: 3644 error = hammer2_chain_modify(chain, mtid, dedup_off, 3645 HAMMER2_MODIFY_OPTDATA); 3646 break; 3647 default: 3648 /* 3649 * Remaining types are not supported by this function. 3650 * In particular, INDIRECT and LEAF_NODE types are 3651 * handled by create_indirect(). 3652 */ 3653 panic("hammer2_chain_create: bad type: %d", 3654 chain->bref.type); 3655 /* NOT REACHED */ 3656 break; 3657 } 3658 } else { 3659 /* 3660 * When reconnecting a chain we must set UPDATE and 3661 * setflush so the flush recognizes that it must update 3662 * the bref in the parent. 3663 */ 3664 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3665 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3666 } 3667 3668 /* 3669 * We must setflush(parent) to ensure that it recurses through to 3670 * chain. setflush(chain) might not work because ONFLUSH is possibly 3671 * already set in the chain (so it won't recurse up to set it in the 3672 * parent). 3673 */ 3674 if (parent) 3675 hammer2_chain_setflush(parent); 3676 3677 done: 3678 *chainp = chain; 3679 3680 return (error); 3681 } 3682 3683 /* 3684 * Move the chain from its old parent to a new parent. The chain must have 3685 * already been deleted or already disconnected (or never associated) with 3686 * a parent. The chain is reassociated with the new parent and the deleted 3687 * flag will be cleared (no longer deleted). The chain's modification state 3688 * is not altered. 
3689 * 3690 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3691 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3692 * FULL. This typically means that the caller is creating the chain after 3693 * doing a hammer2_chain_lookup(). 3694 * 3695 * Neither (parent) or (chain) can be errored. 3696 * 3697 * If (parent) is non-NULL then the chain is inserted under the parent. 3698 * 3699 * If (parent) is NULL then the newly duplicated chain is not inserted 3700 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3701 * passing into hammer2_chain_create() after this function returns). 3702 * 3703 * WARNING! This function calls create which means it can insert indirect 3704 * blocks. This can cause other unrelated chains in the parent to 3705 * be moved to a newly inserted indirect block in addition to the 3706 * specific chain. 3707 */ 3708 void 3709 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3710 hammer2_tid_t mtid, int flags) 3711 { 3712 hammer2_blockref_t *bref; 3713 hammer2_dev_t *hmp; 3714 hammer2_chain_t *parent; 3715 size_t bytes; 3716 3717 /* 3718 * WARNING! We should never resolve DATA to device buffers 3719 * (XXX allow it if the caller did?), and since 3720 * we currently do not have the logical buffer cache 3721 * buffer in-hand to fix its cached physical offset 3722 * we also force the modify code to not COW it. XXX 3723 * 3724 * NOTE! We allow error'd chains to be renamed. The bref itself 3725 * is good and can be renamed. The content, however, may 3726 * be inaccessible. 3727 */ 3728 hmp = chain->hmp; 3729 KKASSERT(chain->parent == NULL); 3730 /*KKASSERT(chain->error == 0); allow */ 3731 3732 /* 3733 * Now create a duplicate of the chain structure, associating 3734 * it with the same core, making it the same size, pointing it 3735 * to the same bref (the same media block). 3736 * 3737 * NOTE: Handle special radix == 0 case (means 0 bytes). 
3738 */ 3739 bref = &chain->bref; 3740 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3741 if (bytes) 3742 bytes = (hammer2_off_t)1 << bytes; 3743 3744 /* 3745 * If parent is not NULL the duplicated chain will be entered under 3746 * the parent and the UPDATE bit set to tell flush to update 3747 * the blockref. 3748 * 3749 * We must setflush(parent) to ensure that it recurses through to 3750 * chain. setflush(chain) might not work because ONFLUSH is possibly 3751 * already set in the chain (so it won't recurse up to set it in the 3752 * parent). 3753 * 3754 * Having both chains locked is extremely important for atomicy. 3755 */ 3756 if (parentp && (parent = *parentp) != NULL) { 3757 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3758 KKASSERT(parent->refs > 0); 3759 KKASSERT(parent->error == 0); 3760 3761 hammer2_chain_create(parentp, &chain, NULL, chain->pmp, 3762 HAMMER2_METH_DEFAULT, 3763 bref->key, bref->keybits, bref->type, 3764 chain->bytes, mtid, 0, flags); 3765 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3766 hammer2_chain_setflush(*parentp); 3767 } 3768 } 3769 3770 /* 3771 * This works in tandem with delete_obref() to install a blockref in 3772 * (typically) an indirect block that is associated with the chain being 3773 * moved to *parentp. 3774 * 3775 * The reason we need this function is that the caller needs to maintain 3776 * the blockref as it was, and not generate a new blockref for what might 3777 * be a modified chain. Otherwise stuff will leak into the flush that 3778 * the flush code's FLUSH_INODE_STOP flag is unable to catch. 3779 * 3780 * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and 3781 * CHAIN_UPDATE. We must set BMAPUPD if the bref does not match, and 3782 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if 3783 * it does. Otherwise we can end up in a situation where H2 is unable to 3784 * clean up the in-memory chain topology. 
3785 * 3786 * The reason for this is that flushes do not generally flush through 3787 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq 3788 * or sideq to properly flush and dispose of the related inode chain's flags. 3789 * Situations where the inode is not actually modified by the frontend, 3790 * but where we have to move the related chains around as we insert or cleanup 3791 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory 3792 * inode chain that does not have a hammer2_inode_t associated with it. 3793 */ 3794 void 3795 hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3796 hammer2_tid_t mtid, int flags, 3797 hammer2_blockref_t *obref) 3798 { 3799 hammer2_chain_rename(parentp, chain, mtid, flags); 3800 3801 if (obref->type != HAMMER2_BREF_TYPE_EMPTY) { 3802 hammer2_blockref_t *tbase; 3803 int tcount; 3804 3805 KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0); 3806 hammer2_chain_modify(*parentp, mtid, 0, 0); 3807 tbase = hammer2_chain_base_and_count(*parentp, &tcount); 3808 hammer2_base_insert(*parentp, tbase, tcount, chain, obref); 3809 if (bcmp(obref, &chain->bref, sizeof(chain->bref))) { 3810 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD | 3811 HAMMER2_CHAIN_UPDATE); 3812 } else { 3813 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3814 } 3815 } 3816 } 3817 3818 /* 3819 * Helper function for deleting chains. 3820 * 3821 * The chain is removed from the live view (the RBTREE) as well as the parent's 3822 * blockmap. Both chain and its parent must be locked. 3823 * 3824 * parent may not be errored. chain can be errored. 
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	hammer2_dev_t *hmp;
	int error = 0;

	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
				  HAMMER2_CHAIN_FICTITIOUS)) == 0);
	KKASSERT(chain->parent == parent);
	hmp = chain->hmp;

	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer.
		 *
		 * NOTE: chain's spinlock is acquired before parent's,
		 *	 matching the unlock order below (parent first).
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow three rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
3981 * 3982 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 3983 * keyspace the the current parent, but lookup/iteration rules will 3984 * ensure (and must ensure) that rule (2) for all parents leading up 3985 * to the nearest inode or the root volume header is adhered to. This 3986 * is accomplished by always recursing through matching keyspaces in 3987 * the hammer2_chain_lookup() and hammer2_chain_next() API. 3988 * 3989 * The current implementation calculates the current worst-case keyspace by 3990 * iterating the current parent and then divides it into two halves, choosing 3991 * whichever half has the most elements (not necessarily the half containing 3992 * the requested key). 3993 * 3994 * We can also opt to use the half with the least number of elements. This 3995 * causes lower-numbered keys (aka logical file offsets) to recurse through 3996 * fewer indirect blocks and higher-numbered keys to recurse through more. 3997 * This also has the risk of not moving enough elements to the new indirect 3998 * block and being forced to create several indirect blocks before the element 3999 * can be inserted. 4000 * 4001 * Must be called with an exclusively locked parent. 
4002 * 4003 * NOTE: *errorp set to HAMMER_ERROR_* flags 4004 */ 4005 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 4006 hammer2_key_t *keyp, int keybits, 4007 hammer2_blockref_t *base, int count); 4008 static int hammer2_chain_indkey_file(hammer2_chain_t *parent, 4009 hammer2_key_t *keyp, int keybits, 4010 hammer2_blockref_t *base, int count, 4011 int ncount); 4012 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent, 4013 hammer2_key_t *keyp, int keybits, 4014 hammer2_blockref_t *base, int count, 4015 int ncount); 4016 static 4017 hammer2_chain_t * 4018 hammer2_chain_create_indirect(hammer2_chain_t *parent, 4019 hammer2_key_t create_key, int create_bits, 4020 hammer2_tid_t mtid, int for_type, int *errorp) 4021 { 4022 hammer2_dev_t *hmp; 4023 hammer2_blockref_t *base; 4024 hammer2_blockref_t *bref; 4025 hammer2_blockref_t bsave; 4026 hammer2_blockref_t dummy; 4027 hammer2_chain_t *chain; 4028 hammer2_chain_t *ichain; 4029 hammer2_key_t key = create_key; 4030 hammer2_key_t key_beg; 4031 hammer2_key_t key_end; 4032 hammer2_key_t key_next; 4033 int keybits = create_bits; 4034 int count; 4035 int ncount; 4036 int nbytes; 4037 int loops; 4038 int error; 4039 int reason; 4040 int generation; 4041 int maxloops = 300000; 4042 4043 /* 4044 * Calculate the base blockref pointer or NULL if the chain 4045 * is known to be empty. We need to calculate the array count 4046 * for RB lookups either way. 4047 */ 4048 hmp = parent->hmp; 4049 KKASSERT(hammer2_mtx_owned(&parent->lock)); 4050 4051 /* 4052 * Pre-modify the parent now to avoid having to deal with error 4053 * processing if we tried to later (in the middle of our loop). 4054 * 4055 * We are going to be moving bref's around, the indirect blocks 4056 * cannot be in an initial state. Do not pass MODIFY_OPTDATA. 
4057 */ 4058 *errorp = hammer2_chain_modify(parent, mtid, 0, 0); 4059 if (*errorp) { 4060 kprintf("hammer2_create_indirect: error %08x %s\n", 4061 *errorp, hammer2_error_str(*errorp)); 4062 return NULL; 4063 } 4064 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 4065 4066 /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/ 4067 base = hammer2_chain_base_and_count(parent, &count); 4068 4069 /* 4070 * How big should our new indirect block be? It has to be at least 4071 * as large as its parent for splits to work properly. 4072 * 4073 * The freemap uses a specific indirect block size. The number of 4074 * levels are built dynamically and ultimately depend on the size 4075 * volume. Because freemap blocks are taken from the reserved areas 4076 * of the volume our goal is efficiency (fewer levels) and not so 4077 * much to save disk space. 4078 * 4079 * The first indirect block level for a directory usually uses 4080 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). Due to 4081 * the hash mechanism, this typically gives us a nominal 4082 * 32 * 4 entries with one level of indirection. 4083 * 4084 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE 4085 * indirect blocks. The initial 4 entries in the inode gives us 4086 * 256KB. Up to 4 indirect blocks gives us 32MB. Three levels 4087 * of indirection gives us 137GB, and so forth. H2 can support 4088 * huge file sizes but they are not typical, so we try to stick 4089 * with compactness and do not use a larger indirect block size. 4090 * 4091 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but 4092 * due to the way indirect blocks are created this usually winds 4093 * up being extremely inefficient for small files. Even though 4094 * 16KB requires more levels of indirection for very large files, 4095 * the 16KB records can be ganged together into 64KB DIOs. 
4096 */ 4097 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4098 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4099 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE; 4100 } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4101 if (parent->data->ipdata.meta.type == 4102 HAMMER2_OBJTYPE_DIRECTORY) 4103 nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */ 4104 else 4105 nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */ 4106 4107 } else { 4108 nbytes = HAMMER2_IND_BYTES_NOM; 4109 } 4110 if (nbytes < count * sizeof(hammer2_blockref_t)) { 4111 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 4112 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF); 4113 nbytes = count * sizeof(hammer2_blockref_t); 4114 } 4115 ncount = nbytes / sizeof(hammer2_blockref_t); 4116 4117 /* 4118 * When creating an indirect block for a freemap node or leaf 4119 * the key/keybits must be fitted to static radix levels because 4120 * particular radix levels use particular reserved blocks in the 4121 * related zone. 4122 * 4123 * This routine calculates the key/radix of the indirect block 4124 * we need to create, and whether it is on the high-side or the 4125 * low-side. 4126 */ 4127 switch(for_type) { 4128 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4129 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 4130 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 4131 base, count); 4132 break; 4133 case HAMMER2_BREF_TYPE_DATA: 4134 keybits = hammer2_chain_indkey_file(parent, &key, keybits, 4135 base, count, ncount); 4136 break; 4137 case HAMMER2_BREF_TYPE_DIRENT: 4138 case HAMMER2_BREF_TYPE_INODE: 4139 keybits = hammer2_chain_indkey_dir(parent, &key, keybits, 4140 base, count, ncount); 4141 break; 4142 default: 4143 panic("illegal indirect block for bref type %d", for_type); 4144 break; 4145 } 4146 4147 /* 4148 * Normalize the key for the radix being represented, keeping the 4149 * high bits and throwing away the low bits. 
4150 */ 4151 key &= ~(((hammer2_key_t)1 << keybits) - 1); 4152 4153 /* 4154 * Ok, create our new indirect block 4155 */ 4156 bzero(&dummy, sizeof(dummy)); 4157 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4158 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4159 dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 4160 } else { 4161 dummy.type = HAMMER2_BREF_TYPE_INDIRECT; 4162 } 4163 dummy.key = key; 4164 dummy.keybits = keybits; 4165 dummy.data_off = hammer2_getradix(nbytes); 4166 dummy.methods = 4167 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) | 4168 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 4169 4170 ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy); 4171 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 4172 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 4173 /* ichain has one ref at this point */ 4174 4175 /* 4176 * We have to mark it modified to allocate its block, but use 4177 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 4178 * it won't be acted upon by the flush code. 4179 * 4180 * XXX remove OPTDATA, we need a fully initialized indirect block to 4181 * be able to move the original blockref. 4182 */ 4183 *errorp = hammer2_chain_modify(ichain, mtid, 0, 0); 4184 if (*errorp) { 4185 kprintf("hammer2_alloc_indirect: error %08x %s\n", 4186 *errorp, hammer2_error_str(*errorp)); 4187 hammer2_chain_unlock(ichain); 4188 hammer2_chain_drop(ichain); 4189 return NULL; 4190 } 4191 KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4192 4193 /* 4194 * Iterate the original parent and move the matching brefs into 4195 * the new indirect block. 4196 * 4197 * XXX handle flushes. 4198 */ 4199 key_beg = 0; 4200 key_end = HAMMER2_KEY_MAX; 4201 key_next = 0; /* avoid gcc warnings */ 4202 hammer2_spin_ex(&parent->core.spin); 4203 loops = 0; 4204 reason = 0; 4205 4206 for (;;) { 4207 /* 4208 * Parent may have been modified, relocating its block array. 4209 * Reload the base pointer. 
4210 */ 4211 base = hammer2_chain_base_and_count(parent, &count); 4212 4213 if (++loops > 100000) { 4214 hammer2_spin_unex(&parent->core.spin); 4215 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 4216 reason, parent, base, count, key_next); 4217 } 4218 4219 /* 4220 * NOTE: spinlock stays intact, returned chain (if not NULL) 4221 * is not referenced or locked which means that we 4222 * cannot safely check its flagged / deletion status 4223 * until we lock it. 4224 */ 4225 chain = hammer2_combined_find(parent, base, count, 4226 &key_next, 4227 key_beg, key_end, 4228 &bref); 4229 generation = parent->core.generation; 4230 if (bref == NULL) 4231 break; 4232 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4233 4234 /* 4235 * Skip keys that are not within the key/radix of the new 4236 * indirect block. They stay in the parent. 4237 */ 4238 if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) { 4239 goto next_key_spinlocked; 4240 } 4241 4242 /* 4243 * Load the new indirect block by acquiring the related 4244 * chains (potentially from media as it might not be 4245 * in-memory). Then move it to the new parent (ichain). 4246 * 4247 * chain is referenced but not locked. We must lock the 4248 * chain to obtain definitive state. 4249 */ 4250 bsave = *bref; 4251 if (chain) { 4252 /* 4253 * Use chain already present in the RBTREE 4254 */ 4255 hammer2_chain_ref(chain); 4256 hammer2_spin_unex(&parent->core.spin); 4257 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 4258 } else { 4259 /* 4260 * Get chain for blockref element. _get returns NULL 4261 * on insertion race. 4262 */ 4263 hammer2_spin_unex(&parent->core.spin); 4264 chain = hammer2_chain_get(parent, generation, &bsave, 4265 HAMMER2_RESOLVE_NEVER); 4266 if (chain == NULL) { 4267 reason = 1; 4268 hammer2_spin_ex(&parent->core.spin); 4269 continue; 4270 } 4271 } 4272 4273 /* 4274 * This is always live so if the chain has been deleted 4275 * we raced someone and we have to retry. 
4276 * 4277 * NOTE: Lookups can race delete-duplicate because 4278 * delete-duplicate does not lock the parent's core 4279 * (they just use the spinlock on the core). 4280 * 4281 * (note reversed logic for this one) 4282 */ 4283 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) || 4284 chain->parent != parent || 4285 (chain->flags & HAMMER2_CHAIN_DELETED)) { 4286 hammer2_chain_unlock(chain); 4287 hammer2_chain_drop(chain); 4288 if (hammer2_debug & 0x0040) { 4289 kprintf("LOST PARENT RETRY " 4290 "RETRY (%p,%p)->%p %08x\n", 4291 parent, chain->parent, chain, chain->flags); 4292 } 4293 hammer2_spin_ex(&parent->core.spin); 4294 continue; 4295 } 4296 4297 /* 4298 * Shift the chain to the indirect block. 4299 * 4300 * WARNING! No reason for us to load chain data, pass NOSTATS 4301 * to prevent delete/insert from trying to access 4302 * inode stats (and thus asserting if there is no 4303 * chain->data loaded). 4304 * 4305 * WARNING! The (parent, chain) deletion may modify the parent 4306 * and invalidate the base pointer. 4307 * 4308 * WARNING! Parent must already be marked modified, so we 4309 * can assume that chain_delete always suceeds. 4310 * 4311 * WARNING! hammer2_chain_repchange() does not have to be 4312 * called (and doesn't work anyway because we are 4313 * only doing a partial shift). A recursion that is 4314 * in-progress can continue at the current parent 4315 * and will be able to properly find its next key. 
4316 */ 4317 error = hammer2_chain_delete_obref(parent, chain, mtid, 0, 4318 &bsave); 4319 KKASSERT(error == 0); 4320 hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave); 4321 hammer2_chain_unlock(chain); 4322 hammer2_chain_drop(chain); 4323 KKASSERT(parent->refs > 0); 4324 chain = NULL; 4325 base = NULL; /* safety */ 4326 hammer2_spin_ex(&parent->core.spin); 4327 next_key_spinlocked: 4328 if (--maxloops == 0) 4329 panic("hammer2_chain_create_indirect: maxloops"); 4330 reason = 4; 4331 if (key_next == 0 || key_next > key_end) 4332 break; 4333 key_beg = key_next; 4334 /* loop */ 4335 } 4336 hammer2_spin_unex(&parent->core.spin); 4337 4338 /* 4339 * Insert the new indirect block into the parent now that we've 4340 * cleared out some entries in the parent. We calculated a good 4341 * insertion index in the loop above (ichain->index). 4342 * 4343 * We don't have to set UPDATE here because we mark ichain 4344 * modified down below (so the normal modified -> flush -> set-moved 4345 * sequence applies). 4346 * 4347 * The insertion shouldn't race as this is a completely new block 4348 * and the parent is locked. 4349 */ 4350 base = NULL; /* safety, parent modify may change address */ 4351 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 4352 KKASSERT(parent->core.live_count < count); 4353 hammer2_chain_insert(parent, ichain, 4354 HAMMER2_CHAIN_INSERT_SPIN | 4355 HAMMER2_CHAIN_INSERT_LIVE, 4356 0); 4357 4358 /* 4359 * Make sure flushes propogate after our manual insertion. 4360 */ 4361 hammer2_chain_setflush(ichain); 4362 hammer2_chain_setflush(parent); 4363 4364 /* 4365 * Figure out what to return. 4366 */ 4367 if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) { 4368 /* 4369 * Key being created is outside the key range, 4370 * return the original parent. 4371 */ 4372 hammer2_chain_unlock(ichain); 4373 hammer2_chain_drop(ichain); 4374 } else { 4375 /* 4376 * Otherwise its in the range, return the new parent. 
4377 * (leave both the new and old parent locked). 4378 */ 4379 parent = ichain; 4380 } 4381 4382 return(parent); 4383 } 4384 4385 /* 4386 * Do maintenance on an indirect chain. Both parent and chain are locked. 4387 * 4388 * Returns non-zero if (chain) is deleted, either due to being empty or 4389 * because its children were safely moved into the parent. 4390 */ 4391 int 4392 hammer2_chain_indirect_maintenance(hammer2_chain_t *parent, 4393 hammer2_chain_t *chain) 4394 { 4395 hammer2_blockref_t *chain_base; 4396 hammer2_blockref_t *base; 4397 hammer2_blockref_t *bref; 4398 hammer2_blockref_t bsave; 4399 hammer2_key_t key_next; 4400 hammer2_key_t key_beg; 4401 hammer2_key_t key_end; 4402 hammer2_chain_t *sub; 4403 int chain_count; 4404 int count; 4405 int error; 4406 int generation; 4407 4408 /* 4409 * Make sure we have an accurate live_count 4410 */ 4411 if ((chain->flags & (HAMMER2_CHAIN_INITIAL | 4412 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4413 base = &chain->data->npdata[0]; 4414 count = chain->bytes / sizeof(hammer2_blockref_t); 4415 hammer2_chain_countbrefs(chain, base, count); 4416 } 4417 4418 /* 4419 * If the indirect block is empty we can delete it. 4420 * (ignore deletion error) 4421 */ 4422 if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) { 4423 hammer2_chain_delete(parent, chain, 4424 chain->bref.modify_tid, 4425 HAMMER2_DELETE_PERMANENT); 4426 hammer2_chain_repchange(parent, chain); 4427 return 1; 4428 } 4429 4430 base = hammer2_chain_base_and_count(parent, &count); 4431 4432 if ((parent->flags & (HAMMER2_CHAIN_INITIAL | 4433 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4434 hammer2_chain_countbrefs(parent, base, count); 4435 } 4436 4437 /* 4438 * Determine if we can collapse chain into parent, calculate 4439 * hysteresis for chain emptiness. 
4440 */ 4441 if (parent->core.live_count + chain->core.live_count - 1 > count) 4442 return 0; 4443 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4444 if (chain->core.live_count > chain_count * 3 / 4) 4445 return 0; 4446 4447 /* 4448 * Ok, theoretically we can collapse chain's contents into 4449 * parent. chain is locked, but any in-memory children of chain 4450 * are not. For this to work, we must be able to dispose of any 4451 * in-memory children of chain. 4452 * 4453 * For now require that there are no in-memory children of chain. 4454 * 4455 * WARNING! Both chain and parent must remain locked across this 4456 * entire operation. 4457 */ 4458 4459 /* 4460 * Parent must be marked modified. Don't try to collapse it if we 4461 * can't mark it modified. Once modified, destroy chain to make room 4462 * and to get rid of what will be a conflicting key (this is included 4463 * in the calculation above). Finally, move the children of chain 4464 * into chain's parent. 4465 * 4466 * This order creates an accounting problem for bref.embed.stats 4467 * because we destroy chain before we remove its children. Any 4468 * elements whos blockref is already synchronized will be counted 4469 * twice. To deal with the problem we clean out chain's stats prior 4470 * to deleting it. 
4471 */ 4472 error = hammer2_chain_modify(parent, 0, 0, 0); 4473 if (error) { 4474 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4475 hammer2_error_str(error)); 4476 return 0; 4477 } 4478 error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0); 4479 if (error) { 4480 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4481 hammer2_error_str(error)); 4482 return 0; 4483 } 4484 4485 chain->bref.embed.stats.inode_count = 0; 4486 chain->bref.embed.stats.data_count = 0; 4487 error = hammer2_chain_delete(parent, chain, 4488 chain->bref.modify_tid, 4489 HAMMER2_DELETE_PERMANENT); 4490 KKASSERT(error == 0); 4491 4492 /* 4493 * The combined_find call requires core.spin to be held. One would 4494 * think there wouldn't be any conflicts since we hold chain 4495 * exclusively locked, but the caching mechanism for 0-ref children 4496 * does not require a chain lock. 4497 */ 4498 hammer2_spin_ex(&chain->core.spin); 4499 4500 key_next = 0; 4501 key_beg = 0; 4502 key_end = HAMMER2_KEY_MAX; 4503 for (;;) { 4504 chain_base = &chain->data->npdata[0]; 4505 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4506 sub = hammer2_combined_find(chain, chain_base, chain_count, 4507 &key_next, 4508 key_beg, key_end, 4509 &bref); 4510 generation = chain->core.generation; 4511 if (bref == NULL) 4512 break; 4513 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4514 4515 bsave = *bref; 4516 if (sub) { 4517 hammer2_chain_ref(sub); 4518 hammer2_spin_unex(&chain->core.spin); 4519 hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); 4520 } else { 4521 hammer2_spin_unex(&chain->core.spin); 4522 sub = hammer2_chain_get(chain, generation, &bsave, 4523 HAMMER2_RESOLVE_NEVER); 4524 if (sub == NULL) { 4525 hammer2_spin_ex(&chain->core.spin); 4526 continue; 4527 } 4528 } 4529 if (bcmp(&bsave, &sub->bref, sizeof(bsave)) || 4530 sub->parent != chain || 4531 (sub->flags & HAMMER2_CHAIN_DELETED)) { 4532 hammer2_chain_unlock(sub); 4533 hammer2_chain_drop(sub); 4534 
hammer2_spin_ex(&chain->core.spin); 4535 sub = NULL; /* safety */ 4536 continue; 4537 } 4538 error = hammer2_chain_delete_obref(chain, sub, 4539 sub->bref.modify_tid, 0, 4540 &bsave); 4541 KKASSERT(error == 0); 4542 hammer2_chain_rename_obref(&parent, sub, 4543 sub->bref.modify_tid, 4544 HAMMER2_INSERT_SAMEPARENT, &bsave); 4545 hammer2_chain_unlock(sub); 4546 hammer2_chain_drop(sub); 4547 hammer2_spin_ex(&chain->core.spin); 4548 4549 if (key_next == 0) 4550 break; 4551 key_beg = key_next; 4552 } 4553 hammer2_spin_unex(&chain->core.spin); 4554 4555 hammer2_chain_repchange(parent, chain); 4556 4557 return 1; 4558 } 4559 4560 /* 4561 * Freemap indirect blocks 4562 * 4563 * Calculate the keybits and highside/lowside of the freemap node the 4564 * caller is creating. 4565 * 4566 * This routine will specify the next higher-level freemap key/radix 4567 * representing the lowest-ordered set. By doing so, eventually all 4568 * low-ordered sets will be moved one level down. 4569 * 4570 * We have to be careful here because the freemap reserves a limited 4571 * number of blocks for a limited number of levels. So we can't just 4572 * push indiscriminately. 4573 */ 4574 int 4575 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 4576 int keybits, hammer2_blockref_t *base, int count) 4577 { 4578 hammer2_chain_t *chain; 4579 hammer2_blockref_t *bref; 4580 hammer2_key_t key; 4581 hammer2_key_t key_beg; 4582 hammer2_key_t key_end; 4583 hammer2_key_t key_next; 4584 int locount; 4585 int hicount; 4586 int maxloops = 300000; 4587 4588 key = *keyp; 4589 locount = 0; 4590 hicount = 0; 4591 keybits = 64; 4592 4593 /* 4594 * Calculate the range of keys in the array being careful to skip 4595 * slots which are overridden with a deletion. 
4596 */ 4597 key_beg = 0; 4598 key_end = HAMMER2_KEY_MAX; 4599 hammer2_spin_ex(&parent->core.spin); 4600 4601 for (;;) { 4602 if (--maxloops == 0) { 4603 panic("indkey_freemap shit %p %p:%d\n", 4604 parent, base, count); 4605 } 4606 chain = hammer2_combined_find(parent, base, count, 4607 &key_next, 4608 key_beg, key_end, 4609 &bref); 4610 4611 /* 4612 * Exhausted search 4613 */ 4614 if (bref == NULL) 4615 break; 4616 4617 /* 4618 * Skip deleted chains. 4619 */ 4620 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4621 if (key_next == 0 || key_next > key_end) 4622 break; 4623 key_beg = key_next; 4624 continue; 4625 } 4626 4627 /* 4628 * Use the full live (not deleted) element for the scan 4629 * iteration. HAMMER2 does not allow partial replacements. 4630 * 4631 * XXX should be built into hammer2_combined_find(). 4632 */ 4633 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4634 4635 if (keybits > bref->keybits) { 4636 key = bref->key; 4637 keybits = bref->keybits; 4638 } else if (keybits == bref->keybits && bref->key < key) { 4639 key = bref->key; 4640 } 4641 if (key_next == 0) 4642 break; 4643 key_beg = key_next; 4644 } 4645 hammer2_spin_unex(&parent->core.spin); 4646 4647 /* 4648 * Return the keybits for a higher-level FREEMAP_NODE covering 4649 * this node. 
4650 */ 4651 switch(keybits) { 4652 case HAMMER2_FREEMAP_LEVEL0_RADIX: 4653 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 4654 break; 4655 case HAMMER2_FREEMAP_LEVEL1_RADIX: 4656 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 4657 break; 4658 case HAMMER2_FREEMAP_LEVEL2_RADIX: 4659 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 4660 break; 4661 case HAMMER2_FREEMAP_LEVEL3_RADIX: 4662 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 4663 break; 4664 case HAMMER2_FREEMAP_LEVEL4_RADIX: 4665 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 4666 break; 4667 case HAMMER2_FREEMAP_LEVEL5_RADIX: 4668 panic("hammer2_chain_indkey_freemap: level too high"); 4669 break; 4670 default: 4671 panic("hammer2_chain_indkey_freemap: bad radix"); 4672 break; 4673 } 4674 *keyp = key; 4675 4676 return (keybits); 4677 } 4678 4679 /* 4680 * File indirect blocks 4681 * 4682 * Calculate the key/keybits for the indirect block to create by scanning 4683 * existing keys. The key being created is also passed in *keyp and can be 4684 * inside or outside the indirect block. Regardless, the indirect block 4685 * must hold at least two keys in order to guarantee sufficient space. 4686 * 4687 * We use a modified version of the freemap's fixed radix tree, but taylored 4688 * for file data. Basically we configure an indirect block encompassing the 4689 * smallest key. 4690 */ 4691 static int 4692 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 4693 int keybits, hammer2_blockref_t *base, int count, 4694 int ncount) 4695 { 4696 hammer2_chain_t *chain; 4697 hammer2_blockref_t *bref; 4698 hammer2_key_t key; 4699 hammer2_key_t key_beg; 4700 hammer2_key_t key_end; 4701 hammer2_key_t key_next; 4702 int nradix; 4703 int locount; 4704 int hicount; 4705 int maxloops = 300000; 4706 4707 key = *keyp; 4708 locount = 0; 4709 hicount = 0; 4710 keybits = 64; 4711 4712 /* 4713 * Calculate the range of keys in the array being careful to skip 4714 * slots which are overridden with a deletion. 4715 * 4716 * Locate the smallest key. 
4717 */ 4718 key_beg = 0; 4719 key_end = HAMMER2_KEY_MAX; 4720 hammer2_spin_ex(&parent->core.spin); 4721 4722 for (;;) { 4723 if (--maxloops == 0) { 4724 panic("indkey_freemap shit %p %p:%d\n", 4725 parent, base, count); 4726 } 4727 chain = hammer2_combined_find(parent, base, count, 4728 &key_next, 4729 key_beg, key_end, 4730 &bref); 4731 4732 /* 4733 * Exhausted search 4734 */ 4735 if (bref == NULL) 4736 break; 4737 4738 /* 4739 * Skip deleted chains. 4740 */ 4741 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4742 if (key_next == 0 || key_next > key_end) 4743 break; 4744 key_beg = key_next; 4745 continue; 4746 } 4747 4748 /* 4749 * Use the full live (not deleted) element for the scan 4750 * iteration. HAMMER2 does not allow partial replacements. 4751 * 4752 * XXX should be built into hammer2_combined_find(). 4753 */ 4754 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4755 4756 if (keybits > bref->keybits) { 4757 key = bref->key; 4758 keybits = bref->keybits; 4759 } else if (keybits == bref->keybits && bref->key < key) { 4760 key = bref->key; 4761 } 4762 if (key_next == 0) 4763 break; 4764 key_beg = key_next; 4765 } 4766 hammer2_spin_unex(&parent->core.spin); 4767 4768 /* 4769 * Calculate the static keybits for a higher-level indirect block 4770 * that contains the key. 4771 */ 4772 *keyp = key; 4773 4774 switch(ncount) { 4775 case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t): 4776 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 4777 break; 4778 case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t): 4779 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 4780 break; 4781 case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t): 4782 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 4783 break; 4784 default: 4785 panic("bad ncount %d\n", ncount); 4786 nradix = 0; 4787 break; 4788 } 4789 4790 /* 4791 * The largest radix that can be returned for an indirect block is 4792 * 63 bits. 
(The largest practical indirect block radix is actually 4793 * 62 bits because the top-level inode or volume root contains four 4794 * entries, but allow 63 to be returned). 4795 */ 4796 if (nradix >= 64) 4797 nradix = 63; 4798 4799 return keybits + nradix; 4800 } 4801 4802 #if 1 4803 4804 /* 4805 * Directory indirect blocks. 4806 * 4807 * Covers both the inode index (directory of inodes), and directory contents 4808 * (filenames hardlinked to inodes). 4809 * 4810 * Because directory keys are hashed we generally try to cut the space in 4811 * half. We accomodate the inode index (which tends to have linearly 4812 * increasing inode numbers) by ensuring that the keyspace is at least large 4813 * enough to fill up the indirect block being created. 4814 */ 4815 static int 4816 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4817 int keybits, hammer2_blockref_t *base, int count, 4818 int ncount) 4819 { 4820 hammer2_blockref_t *bref; 4821 hammer2_chain_t *chain; 4822 hammer2_key_t key_beg; 4823 hammer2_key_t key_end; 4824 hammer2_key_t key_next; 4825 hammer2_key_t key; 4826 int nkeybits; 4827 int locount; 4828 int hicount; 4829 int maxloops = 300000; 4830 4831 /* 4832 * NOTE: We can't take a shortcut here anymore for inodes because 4833 * the root directory can contain a mix of inodes and directory 4834 * entries (we used to just return 63 if parent->bref.type was 4835 * HAMMER2_BREF_TYPE_INODE. 4836 */ 4837 key = *keyp; 4838 locount = 0; 4839 hicount = 0; 4840 4841 /* 4842 * Calculate the range of keys in the array being careful to skip 4843 * slots which are overridden with a deletion. 
4844 */ 4845 key_beg = 0; 4846 key_end = HAMMER2_KEY_MAX; 4847 hammer2_spin_ex(&parent->core.spin); 4848 4849 for (;;) { 4850 if (--maxloops == 0) { 4851 panic("indkey_freemap shit %p %p:%d\n", 4852 parent, base, count); 4853 } 4854 chain = hammer2_combined_find(parent, base, count, 4855 &key_next, 4856 key_beg, key_end, 4857 &bref); 4858 4859 /* 4860 * Exhausted search 4861 */ 4862 if (bref == NULL) 4863 break; 4864 4865 /* 4866 * Deleted object 4867 */ 4868 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4869 if (key_next == 0 || key_next > key_end) 4870 break; 4871 key_beg = key_next; 4872 continue; 4873 } 4874 4875 /* 4876 * Use the full live (not deleted) element for the scan 4877 * iteration. HAMMER2 does not allow partial replacements. 4878 * 4879 * XXX should be built into hammer2_combined_find(). 4880 */ 4881 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4882 4883 /* 4884 * Expand our calculated key range (key, keybits) to fit 4885 * the scanned key. nkeybits represents the full range 4886 * that we will later cut in half (two halves @ nkeybits - 1). 4887 */ 4888 nkeybits = keybits; 4889 if (nkeybits < bref->keybits) { 4890 if (bref->keybits > 64) { 4891 kprintf("bad bref chain %p bref %p\n", 4892 chain, bref); 4893 Debugger("fubar"); 4894 } 4895 nkeybits = bref->keybits; 4896 } 4897 while (nkeybits < 64 && 4898 rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) { 4899 ++nkeybits; 4900 } 4901 4902 /* 4903 * If the new key range is larger we have to determine 4904 * which side of the new key range the existing keys fall 4905 * under by checking the high bit, then collapsing the 4906 * locount into the hicount or vise-versa. 
4907 */ 4908 if (keybits != nkeybits) { 4909 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4910 hicount += locount; 4911 locount = 0; 4912 } else { 4913 locount += hicount; 4914 hicount = 0; 4915 } 4916 keybits = nkeybits; 4917 } 4918 4919 /* 4920 * The newly scanned key will be in the lower half or the 4921 * upper half of the (new) key range. 4922 */ 4923 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4924 ++hicount; 4925 else 4926 ++locount; 4927 4928 if (key_next == 0) 4929 break; 4930 key_beg = key_next; 4931 } 4932 hammer2_spin_unex(&parent->core.spin); 4933 bref = NULL; /* now invalid (safety) */ 4934 4935 /* 4936 * Adjust keybits to represent half of the full range calculated 4937 * above (radix 63 max) for our new indirect block. 4938 */ 4939 --keybits; 4940 4941 /* 4942 * Expand keybits to hold at least ncount elements. ncount will be 4943 * a power of 2. This is to try to completely fill leaf nodes (at 4944 * least for keys which are not hashes). 4945 * 4946 * We aren't counting 'in' or 'out', we are counting 'high side' 4947 * and 'low side' based on the bit at (1LL << keybits). We want 4948 * everything to be inside in these cases so shift it all to 4949 * the low or high side depending on the new high bit. 4950 */ 4951 while (((hammer2_key_t)1 << keybits) < ncount) { 4952 ++keybits; 4953 if (key & ((hammer2_key_t)1 << keybits)) { 4954 hicount += locount; 4955 locount = 0; 4956 } else { 4957 locount += hicount; 4958 hicount = 0; 4959 } 4960 } 4961 4962 if (hicount > locount) 4963 key |= (hammer2_key_t)1 << keybits; 4964 else 4965 key &= ~(hammer2_key_t)1 << keybits; 4966 4967 *keyp = key; 4968 4969 return (keybits); 4970 } 4971 4972 #else 4973 4974 /* 4975 * Directory indirect blocks. 4976 * 4977 * Covers both the inode index (directory of inodes), and directory contents 4978 * (filenames hardlinked to inodes). 4979 * 4980 * Because directory keys are hashed we generally try to cut the space in 4981 * half. 
We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 *
 * Returns the radix (keybits) for the new indirect block and adjusts
 * *keyp to the base key the new indirect block should use.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;	/* sanity brake on the scan loop */

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen until (key, bref->key) share the same prefix */
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, NULL);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
	 *
	 * NOTE: We do not setflush the chain unless the deletion is
	 *	 permanent, since the deletion of a chain does not actually
	 *	 require it to be flushed.
	 */
	if (error == 0) {
		if (flags & HAMMER2_DELETE_PERMANENT) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
			hammer2_chain_setflush(chain);
		}
	}

	return error;
}

/*
 * As hammer2_chain_delete() but also returns the old blockref in *obref.
 * *obref is pre-set to type EMPTY so the caller can detect the case where
 * no blockref was actually removed from the parent.
 */
static int
hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	obref->type = HAMMER2_BREF_TYPE_EMPTY;
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, obref);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
	 *
	 * NOTE: We do not setflush the chain unless the deletion is
	 *	 permanent, since the deletion of a chain does not actually
	 *	 require it to be flushed.
	 */
	if (error == 0) {
		if (flags & HAMMER2_DELETE_PERMANENT) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
			hammer2_chain_setflush(chain);
		}
	}

	return error;
}

/*
 * Returns the index of the nearest element in the blockref array >= elm.
 * Returns (count) if no element could be found.
 *
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * WARNING!
Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chains already have their cores counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 *
	 * cache_index is read unlocked; a stale value only costs us a
	 * longer scan, never an incorrect result.
	 */
	i = parent->cache_index;		/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found, a non-NULL bref may still be
 * returned in *bresp (the match came from the media blockref array).
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * INODEs deduct their embedded stats; DATA brefs were
		 * already accounted via data_off above.
		 */
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through - default is a no-op */
	default:
		break;
	}

	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* which insertion path was taken (debug panic only) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats and zero the entry
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		/* saturate leaf_count at BLOCKREF_LEAF_MAX */
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements over to make room at index i.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging - verify the array remains sorted and non-overlapping.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
		}
	}
}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * kqsort() comparison callback: empty elements sort to the end,
 * everything else sorts ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/* locate the blockref array embedded in this chain type */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	/* brief sleep used by callers polling for flush progress */
	tsleep(chain, 0, "chnflw", 1);
}

const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * SHA192 is a SHA256 digest folded down to 192
			 * bits: the 4th 64-bit word is xor'd into the
			 * 3rd and only the first 24 bytes are stored.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Characterize a failed check code and try to trace back to the inode.
5879 */ 5880 static void 5881 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check, 5882 int bits) 5883 { 5884 hammer2_chain_t *lchain; 5885 hammer2_chain_t *ochain; 5886 int did; 5887 5888 did = krateprintf(&krate_h2chk, 5889 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL " 5890 "(flags=%08x, bref/data ", 5891 chain->bref.data_off, 5892 chain->bref.type, 5893 hammer2_bref_type_str(&chain->bref), 5894 chain->bref.methods, 5895 chain->flags); 5896 if (did == 0) 5897 return; 5898 5899 if (bits == 32) { 5900 kprintf("%08x/%08x)\n", 5901 chain->bref.check.iscsi32.value, 5902 (uint32_t)check); 5903 } else { 5904 kprintf("%016jx/%016jx)\n", 5905 chain->bref.check.xxhash64.value, 5906 check); 5907 } 5908 5909 /* 5910 * Run up the chains to try to find the governing inode so we 5911 * can report it. 5912 * 5913 * XXX This error reporting is not really MPSAFE 5914 */ 5915 ochain = chain; 5916 lchain = chain; 5917 while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) { 5918 lchain = chain; 5919 chain = chain->parent; 5920 } 5921 5922 if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE && 5923 ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 || 5924 (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) { 5925 kprintf(" Resides at/in inode %ld\n", 5926 chain->bref.key); 5927 } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5928 kprintf(" Resides in inode index - CRITICAL!!!\n"); 5929 } else { 5930 kprintf(" Resides in root index - CRITICAL!!!\n"); 5931 } 5932 if (ochain->hmp) { 5933 const char *pfsname = "UNKNOWN"; 5934 int i; 5935 5936 if (ochain->pmp) { 5937 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 5938 if (ochain->pmp->pfs_hmps[i] == ochain->hmp && 5939 ochain->pmp->pfs_names[i]) { 5940 pfsname = ochain->pmp->pfs_names[i]; 5941 break; 5942 } 5943 } 5944 } 5945 kprintf(" In pfs %s on device %s\n", 5946 pfsname, ochain->hmp->devrepname); 5947 } 5948 } 5949 5950 /* 5951 * Returns non-zero on success, 0 on failure. 
5952 */ 5953 int 5954 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5955 { 5956 uint32_t check32; 5957 uint64_t check64; 5958 int r; 5959 5960 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5961 return 1; 5962 5963 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5964 case HAMMER2_CHECK_NONE: 5965 r = 1; 5966 break; 5967 case HAMMER2_CHECK_DISABLED: 5968 r = 1; 5969 break; 5970 case HAMMER2_CHECK_ISCSI32: 5971 check32 = hammer2_icrc32(bdata, chain->bytes); 5972 r = (chain->bref.check.iscsi32.value == check32); 5973 if (r == 0) { 5974 hammer2_characterize_failed_chain(chain, check32, 32); 5975 } 5976 hammer2_process_icrc32 += chain->bytes; 5977 break; 5978 case HAMMER2_CHECK_XXHASH64: 5979 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5980 r = (chain->bref.check.xxhash64.value == check64); 5981 if (r == 0) { 5982 hammer2_characterize_failed_chain(chain, check64, 64); 5983 } 5984 hammer2_process_xxhash64 += chain->bytes; 5985 break; 5986 case HAMMER2_CHECK_SHA192: 5987 { 5988 SHA256_CTX hash_ctx; 5989 union { 5990 uint8_t digest[SHA256_DIGEST_LENGTH]; 5991 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5992 } u; 5993 5994 SHA256_Init(&hash_ctx); 5995 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5996 SHA256_Final(u.digest, &hash_ctx); 5997 u.digest64[2] ^= u.digest64[3]; 5998 if (bcmp(u.digest, 5999 chain->bref.check.sha192.data, 6000 sizeof(chain->bref.check.sha192.data)) == 0) { 6001 r = 1; 6002 } else { 6003 r = 0; 6004 krateprintf(&krate_h2chk, 6005 "chain %016jx.%02x meth=%02x " 6006 "CHECK FAIL\n", 6007 chain->bref.data_off, 6008 chain->bref.type, 6009 chain->bref.methods); 6010 } 6011 } 6012 break; 6013 case HAMMER2_CHECK_FREEMAP: 6014 r = (chain->bref.check.freemap.icrc32 == 6015 hammer2_icrc32(bdata, chain->bytes)); 6016 if (r == 0) { 6017 int did; 6018 6019 did = krateprintf(&krate_h2chk, 6020 "chain %016jx.%02x meth=%02x " 6021 "CHECK FAIL\n", 6022 chain->bref.data_off, 6023 chain->bref.type, 6024 chain->bref.methods); 6025 if (did) { 
6026 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 6027 chain->bref.check.freemap.icrc32, 6028 hammer2_icrc32(bdata, chain->bytes), 6029 chain->bytes); 6030 if (chain->dio) { 6031 kprintf("dio %p buf %016jx,%d " 6032 "bdata %p/%p\n", 6033 chain->dio, 6034 chain->dio->bp->b_loffset, 6035 chain->dio->bp->b_bufsize, 6036 bdata, 6037 chain->dio->bp->b_data); 6038 } 6039 } 6040 } 6041 break; 6042 default: 6043 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 6044 chain->bref.methods); 6045 r = 1; 6046 break; 6047 } 6048 return r; 6049 } 6050 6051 /* 6052 * Acquire the chain and parent representing the specified inode for the 6053 * device at the specified cluster index. 6054 * 6055 * The flags passed in are LOOKUP flags, not RESOLVE flags. 6056 * 6057 * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and 6058 * *chainp will be NULL. *parentp may still be set error or not, or NULL 6059 * if the parent itself could not be resolved. 6060 * 6061 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 6062 * They will be unlocked and released by this function. The *parentp and 6063 * *chainp representing the located inode are returned locked. 6064 */ 6065 int 6066 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 6067 int clindex, int flags, 6068 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 6069 { 6070 hammer2_chain_t *parent; 6071 hammer2_chain_t *rchain; 6072 hammer2_key_t key_dummy; 6073 hammer2_inode_t *ip; 6074 int resolve_flags; 6075 int error; 6076 6077 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 6078 HAMMER2_RESOLVE_SHARED : 0; 6079 6080 /* 6081 * Caller expects us to replace these. 
6082 */ 6083 if (*chainp) { 6084 hammer2_chain_unlock(*chainp); 6085 hammer2_chain_drop(*chainp); 6086 *chainp = NULL; 6087 } 6088 if (*parentp) { 6089 hammer2_chain_unlock(*parentp); 6090 hammer2_chain_drop(*parentp); 6091 *parentp = NULL; 6092 } 6093 6094 /* 6095 * Be very careful, this is a backend function and we CANNOT 6096 * lock any frontend inode structure we find. But we have to 6097 * look the inode up this way first in case it exists but is 6098 * detached from the radix tree. 6099 */ 6100 ip = hammer2_inode_lookup(pmp, inum); 6101 if (ip) { 6102 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 6103 parentp, 6104 resolve_flags); 6105 hammer2_inode_drop(ip); 6106 if (*chainp) 6107 return 0; 6108 hammer2_chain_unlock(*chainp); 6109 hammer2_chain_drop(*chainp); 6110 *chainp = NULL; 6111 if (*parentp) { 6112 hammer2_chain_unlock(*parentp); 6113 hammer2_chain_drop(*parentp); 6114 *parentp = NULL; 6115 } 6116 } 6117 6118 /* 6119 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6120 * inodes from root directory entries in the key lookup). 6121 */ 6122 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6123 rchain = NULL; 6124 if (parent) { 6125 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6126 inum, inum, 6127 &error, flags); 6128 } else { 6129 error = HAMMER2_ERROR_EIO; 6130 } 6131 *parentp = parent; 6132 *chainp = rchain; 6133 6134 return error; 6135 } 6136 6137 /* 6138 * Used by the bulkscan code to snapshot the synchronized storage for 6139 * a volume, allowing it to be scanned concurrently against normal 6140 * operation. 
 */
hammer2_chain_t *
hammer2_chain_bulksnap(hammer2_dev_t *hmp)
{
	hammer2_chain_t *copy;

	/*
	 * Allocate a throw-away chain with a private copy of the last
	 * synchronized volume header (hmp->volsync).  The copy is not
	 * inserted into the topology.
	 */
	copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
	copy->data = kmalloc(sizeof(copy->data->voldata),
			     hmp->mchain,
			     M_WAITOK | M_ZERO);
	hammer2_voldata_lock(hmp);
	copy->data->voldata = hmp->volsync;
	hammer2_voldata_unlock(hmp);

	return copy;
}

/*
 * Release a snapshot chain obtained via hammer2_chain_bulksnap().
 */
void
hammer2_chain_bulkdrop(hammer2_chain_t *copy)
{
	KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
	KKASSERT(copy->data);
	kfree(copy->data, copy->hmp->mchain);
	copy->data = NULL;
	atomic_add_long(&hammer2_chain_allocs, -1);
	hammer2_chain_drop(copy);
}

/*
 * Returns non-zero if the chain (INODE or DIRENT) matches the
 * filename.
 */
int
hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
			  size_t name_len)
{
	const hammer2_inode_data_t *ripdata;
	const hammer2_dirent_head_t *den;

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		ripdata = &chain->data->ipdata;
		if (ripdata->meta.name_len == name_len &&
		    bcmp(ripdata->filename, name, name_len) == 0) {
			return 1;
		}
	}
	if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
	    chain->bref.embed.dirent.namlen == name_len) {
		den = &chain->bref.embed.dirent;
		/*
		 * Short names are embedded in bref.check.buf; longer
		 * names spill into the chain's data block.
		 */
		if (name_len > sizeof(chain->bref.check.buf) &&
		    bcmp(chain->data->buf, name, name_len) == 0) {
			return 1;
		}
		if (name_len <= sizeof(chain->bref.check.buf) &&
		    bcmp(chain->bref.check.buf, name, name_len) == 0) {
			return 1;
		}
	}
	return 0;
}