1 /* 2 * Copyright (c) 2011-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version on media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/buf.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **brefp);
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
		int depth);
static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);

/*
 * There are many degenerate situations where an extreme rate of console
 * output can occur from warnings and errors.  Make sure this output does
 * not impede operations.  Each krate limits the reported event frequency.
 */
static struct krate krate_h2chk = { .freq = 5 };
static struct krate krate_h2me = { .freq = 1 };
static struct krate krate_h2em = { .freq = 1 };

/*
 * Basic RBTree for chains (core.rbtree).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB-tree comparison function ordering chains by their (inclusive) key
 * range.  Returns -1 if chain1 lies entirely below chain2, 1 if entirely
 * above, and 0 on overlap.  Overlaps are not supposed to happen; to catch
 * any software issues early we count overlaps as a match.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Compute the inclusive key range [beg, end] covered by each chain
	 * from its base key and keybits radix.
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)	/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)	/* fully to the right */
		return(1);
	return(0);		/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.
 *
 * The volume header and freemap chains embed their data and are exempt
 * from the data==NULL requirement.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_chain_assert_no_data: chain %p still has data",
		      chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher operates using a
 * top-down recursion based on the ONFLUSH flag.  It locates MODIFIED and
 * UPDATE chains, flushes them, and updates blocks back to the volume root.
 *
 * This routine sets the ONFLUSH flag upward from the triggering chain until
 * it hits an inode root or the volume root.  Inode chains serve as
 * inflection points, requiring the flusher to bridge across trees.  Inodes
 * include regular inodes, PFS roots (pmp->iroot), and the media super root
 * (spmp->iroot).
 *
 * The upward walk takes the shared spinlock of each parent before
 * releasing the child's, so the parent pointer remains stable at each hop.
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			/* inodes are inflection points, stop here */
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/* hand-over-hand: lock parent before unlocking child */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
				    M_WAITOK | M_ZERO);
		atomic_add_long(&hammer2_chain_allocs, 1);
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
	default:
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
		break;
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_init(chain);

	return (chain);
}

/*
 * A common function to initialize chains including fchain and vchain.
 * Sets up the live-chain RB tree, the chain mutex and the core spinlock.
 */
void
hammer2_chain_init(hammer2_chain_t *chain)
{
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
	hammer2_spin_init(&chain->core.spin, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if it encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* acquire parent core spin */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* bump live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* fail on generation race */

/*
 * Insert chain under parent, optionally interlocked against concurrent
 * modifications of the parent via its generation count.
 *
 * Returns 0 on success or HAMMER2_ERROR_EAGAIN if INSERT_RACE was
 * requested and the parent's generation no longer matches.
 */
static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain __debugvar;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursely drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
375 */ 376 void 377 hammer2_chain_drop(hammer2_chain_t *chain) 378 { 379 u_int refs; 380 381 KKASSERT(chain->refs > 0); 382 383 while (chain) { 384 refs = chain->refs; 385 cpu_ccfence(); 386 KKASSERT(refs > 0); 387 388 if (refs == 1) { 389 if (hammer2_mtx_ex_try(&chain->lock) == 0) 390 chain = hammer2_chain_lastdrop(chain, 0); 391 /* retry the same chain, or chain from lastdrop */ 392 } else { 393 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 394 break; 395 /* retry the same chain */ 396 } 397 cpu_pause(); 398 } 399 } 400 401 /* 402 * Unhold a held and probably not-locked chain, ensure that the data is 403 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 404 * lock and then simply unlocking the chain. 405 */ 406 void 407 hammer2_chain_unhold(hammer2_chain_t *chain) 408 { 409 u_int lockcnt; 410 int iter = 0; 411 412 for (;;) { 413 lockcnt = chain->lockcnt; 414 cpu_ccfence(); 415 if (lockcnt > 1) { 416 if (atomic_cmpset_int(&chain->lockcnt, 417 lockcnt, lockcnt - 1)) { 418 break; 419 } 420 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 421 hammer2_chain_unlock(chain); 422 break; 423 } else { 424 /* 425 * This situation can easily occur on SMP due to 426 * the gap inbetween the 1->0 transition and the 427 * final unlock. We cannot safely block on the 428 * mutex because lockcnt might go above 1. 429 * 430 * XXX Sleep for one tick if it takes too long. 
431 */ 432 if (++iter > 1000) { 433 if (iter > 1000 + hz) { 434 kprintf("hammer2: h2race1 %p\n", chain); 435 iter = 1000; 436 } 437 tsleep(&iter, 0, "h2race1", 1); 438 } 439 cpu_pause(); 440 } 441 } 442 } 443 444 void 445 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 446 { 447 hammer2_chain_unhold(chain); 448 hammer2_chain_drop(chain); 449 } 450 451 void 452 hammer2_chain_rehold(hammer2_chain_t *chain) 453 { 454 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 455 atomic_add_int(&chain->lockcnt, 1); 456 hammer2_chain_unlock(chain); 457 } 458 459 /* 460 * Handles the (potential) last drop of chain->refs from 1->0. Called with 461 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 462 * possible against refs and lockcnt. We must dispose of the mutex on chain. 463 * 464 * This function returns an unlocked chain for recursive drop or NULL. It 465 * can return the same chain if it determines it has raced another ref. 466 * 467 * -- 468 * 469 * When two chains need to be recursively dropped we use the chain we 470 * would otherwise free to placehold the additional chain. It's a bit 471 * convoluted but we can't just recurse without potentially blowing out 472 * the kernel stack. 473 * 474 * The chain cannot be freed if it has any children. 475 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 476 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 477 * Any dedup registration can remain intact. 478 * 479 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 480 */ 481 static 482 hammer2_chain_t * 483 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 484 { 485 hammer2_pfs_t *pmp; 486 hammer2_dev_t *hmp; 487 hammer2_chain_t *parent; 488 hammer2_chain_t *rdrop; 489 490 /* 491 * We need chain's spinlock to interlock the sub-tree test. 492 * We already have chain's mutex, protecting chain->parent. 493 * 494 * Remember that chain->refs can be in flux. 
495 */ 496 hammer2_spin_ex(&chain->core.spin); 497 498 if (chain->parent != NULL) { 499 /* 500 * If the chain has a parent the UPDATE bit prevents scrapping 501 * as the chain is needed to properly flush the parent. Try 502 * to complete the 1->0 transition and return NULL. Retry 503 * (return chain) if we are unable to complete the 1->0 504 * transition, else return NULL (nothing more to do). 505 * 506 * If the chain has a parent the MODIFIED bit prevents 507 * scrapping. 508 * 509 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 510 */ 511 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 512 HAMMER2_CHAIN_MODIFIED)) { 513 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 514 hammer2_spin_unex(&chain->core.spin); 515 hammer2_chain_assert_no_data(chain); 516 hammer2_mtx_unlock(&chain->lock); 517 chain = NULL; 518 } else { 519 hammer2_spin_unex(&chain->core.spin); 520 hammer2_mtx_unlock(&chain->lock); 521 } 522 return (chain); 523 } 524 /* spinlock still held */ 525 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 526 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 527 /* 528 * Retain the static vchain and fchain. Clear bits that 529 * are not relevant. Do not clear the MODIFIED bit, 530 * and certainly do not put it on the delayed-flush queue. 531 */ 532 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 533 } else { 534 /* 535 * The chain has no parent and can be flagged for destruction. 536 * Since it has no parent, UPDATE can also be cleared. 537 */ 538 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 539 if (chain->flags & HAMMER2_CHAIN_UPDATE) 540 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 541 542 /* 543 * If the chain has children we must propagate the DESTROY 544 * flag downward and rip the disconnected topology apart. 545 * This is accomplished by calling hammer2_flush() on the 546 * chain. 547 * 548 * Any dedup is already handled by the underlying DIO, so 549 * we do not have to specifically flush it here. 
550 */ 551 if (chain->core.chain_count) { 552 hammer2_spin_unex(&chain->core.spin); 553 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 554 HAMMER2_FLUSH_ALL); 555 hammer2_mtx_unlock(&chain->lock); 556 557 return(chain); /* retry drop */ 558 } 559 560 /* 561 * Otherwise we can scrap the MODIFIED bit if it is set, 562 * and continue along the freeing path. 563 * 564 * Be sure to clean-out any dedup bits. Without a parent 565 * this chain will no longer be visible to the flush code. 566 * Easy check data_off to avoid the volume root. 567 */ 568 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 569 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 570 atomic_add_long(&hammer2_count_modified_chains, -1); 571 if (chain->pmp) 572 hammer2_pfs_memory_wakeup(chain->pmp, -1); 573 } 574 /* spinlock still held */ 575 } 576 577 /* spinlock still held */ 578 579 /* 580 * If any children exist we must leave the chain intact with refs == 0. 581 * They exist because chains are retained below us which have refs or 582 * may require flushing. 583 * 584 * Retry (return chain) if we fail to transition the refs to 0, else 585 * return NULL indication nothing more to do. 586 * 587 * Chains with children are NOT put on the LRU list. 588 */ 589 if (chain->core.chain_count) { 590 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 591 hammer2_spin_unex(&chain->core.spin); 592 hammer2_chain_assert_no_data(chain); 593 hammer2_mtx_unlock(&chain->lock); 594 chain = NULL; 595 } else { 596 hammer2_spin_unex(&chain->core.spin); 597 hammer2_mtx_unlock(&chain->lock); 598 } 599 return (chain); 600 } 601 /* spinlock still held */ 602 /* no chains left under us */ 603 604 /* 605 * chain->core has no children left so no accessors can get to our 606 * chain from there. Now we have to lock the parent core to interlock 607 * remaining possible accessors that might bump chain's refs before 608 * we can safely drop chain's refs with intent to free the chain. 
609 */ 610 hmp = chain->hmp; 611 pmp = chain->pmp; /* can be NULL */ 612 rdrop = NULL; 613 614 parent = chain->parent; 615 616 /* 617 * WARNING! chain's spin lock is still held here, and other spinlocks 618 * will be acquired and released in the code below. We 619 * cannot be making fancy procedure calls! 620 */ 621 622 /* 623 * We can cache the chain if it is associated with a pmp 624 * and not flagged as being destroyed or requesting a full 625 * release. In this situation the chain is not removed 626 * from its parent, i.e. it can still be looked up. 627 * 628 * We intentionally do not cache DATA chains because these 629 * were likely used to load data into the logical buffer cache 630 * and will not be accessed again for some time. 631 */ 632 if ((chain->flags & 633 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 634 chain->pmp && 635 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 636 if (parent) 637 hammer2_spin_ex(&parent->core.spin); 638 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 639 /* 640 * 1->0 transition failed, retry. Do not drop 641 * the chain's data yet! 642 */ 643 if (parent) 644 hammer2_spin_unex(&parent->core.spin); 645 hammer2_spin_unex(&chain->core.spin); 646 hammer2_mtx_unlock(&chain->lock); 647 648 return(chain); 649 } 650 651 /* 652 * Success 653 */ 654 hammer2_chain_assert_no_data(chain); 655 656 /* 657 * Make sure we are on the LRU list, clean up excessive 658 * LRU entries. We can only really drop one but there might 659 * be other entries that we can remove from the lru_list 660 * without dropping. 661 * 662 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 663 * chain->core.spin AND pmp->lru_spin are held, but 664 * can be safely cleared only holding pmp->lru_spin. 
665 */ 666 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 667 hammer2_spin_ex(&pmp->lru_spin); 668 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 669 atomic_set_int(&chain->flags, 670 HAMMER2_CHAIN_ONLRU); 671 TAILQ_INSERT_TAIL(&pmp->lru_list, 672 chain, lru_node); 673 atomic_add_int(&pmp->lru_count, 1); 674 } 675 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 676 depth = 1; /* disable lru_list flush */ 677 hammer2_spin_unex(&pmp->lru_spin); 678 } else { 679 /* disable lru flush */ 680 depth = 1; 681 } 682 683 if (parent) { 684 hammer2_spin_unex(&parent->core.spin); 685 parent = NULL; /* safety */ 686 } 687 hammer2_spin_unex(&chain->core.spin); 688 hammer2_mtx_unlock(&chain->lock); 689 690 /* 691 * lru_list hysteresis (see above for depth overrides). 692 * Note that depth also prevents excessive lastdrop recursion. 693 */ 694 if (depth == 0) 695 hammer2_chain_lru_flush(pmp); 696 697 return NULL; 698 /* NOT REACHED */ 699 } 700 701 /* 702 * Make sure we are not on the LRU list. 703 */ 704 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 705 hammer2_spin_ex(&pmp->lru_spin); 706 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 707 atomic_add_int(&pmp->lru_count, -1); 708 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 709 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 710 } 711 hammer2_spin_unex(&pmp->lru_spin); 712 } 713 714 /* 715 * Spinlock the parent and try to drop the last ref on chain. 716 * On success determine if we should dispose of the chain 717 * (remove the chain from its parent, etc). 718 * 719 * (normal core locks are top-down recursive but we define 720 * core spinlocks as bottom-up recursive, so this is safe). 721 */ 722 if (parent) { 723 hammer2_spin_ex(&parent->core.spin); 724 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 725 /* 726 * 1->0 transition failed, retry. 
727 */ 728 hammer2_spin_unex(&parent->core.spin); 729 hammer2_spin_unex(&chain->core.spin); 730 hammer2_mtx_unlock(&chain->lock); 731 732 return(chain); 733 } 734 735 /* 736 * 1->0 transition successful, parent spin held to prevent 737 * new lookups, chain spinlock held to protect parent field. 738 * Remove chain from the parent. 739 * 740 * If the chain is being removed from the parent's rbtree but 741 * is not blkmapped, we have to adjust live_count downward. If 742 * it is blkmapped then the blockref is retained in the parent 743 * as is its associated live_count. This case can occur when 744 * a chain added to the topology is unable to flush and is 745 * then later deleted. 746 */ 747 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 748 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 749 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) { 750 atomic_add_int(&parent->core.live_count, -1); 751 } 752 RB_REMOVE(hammer2_chain_tree, 753 &parent->core.rbtree, chain); 754 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 755 --parent->core.chain_count; 756 chain->parent = NULL; 757 } 758 759 /* 760 * If our chain was the last chain in the parent's core the 761 * core is now empty and its parent might have to be 762 * re-dropped if it has 0 refs. 763 */ 764 if (parent->core.chain_count == 0) { 765 rdrop = parent; 766 atomic_add_int(&rdrop->refs, 1); 767 /* 768 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 769 rdrop = NULL; 770 */ 771 } 772 hammer2_spin_unex(&parent->core.spin); 773 parent = NULL; /* safety */ 774 /* FALL THROUGH */ 775 } else { 776 /* 777 * No-parent case. 778 */ 779 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 780 /* 781 * 1->0 transition failed, retry. 782 */ 783 hammer2_spin_unex(&parent->core.spin); 784 hammer2_spin_unex(&chain->core.spin); 785 hammer2_mtx_unlock(&chain->lock); 786 787 return(chain); 788 } 789 } 790 791 /* 792 * Successful 1->0 transition, no parent, no children... no way for 793 * anyone to ref this chain any more. 
We can clean-up and free it. 794 * 795 * We still have the core spinlock, and core's chain_count is 0. 796 * Any parent spinlock is gone. 797 */ 798 hammer2_spin_unex(&chain->core.spin); 799 hammer2_chain_assert_no_data(chain); 800 hammer2_mtx_unlock(&chain->lock); 801 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 802 chain->core.chain_count == 0); 803 804 /* 805 * All locks are gone, no pointers remain to the chain, finish 806 * freeing it. 807 */ 808 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 809 HAMMER2_CHAIN_MODIFIED)) == 0); 810 811 /* 812 * Once chain resources are gone we can use the now dead chain 813 * structure to placehold what might otherwise require a recursive 814 * drop, because we have potentially two things to drop and can only 815 * return one directly. 816 */ 817 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 818 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 819 chain->hmp = NULL; 820 kfree_obj(chain, hmp->mchain); 821 atomic_add_long(&hammer2_chain_allocs, -1); 822 } 823 824 /* 825 * Possible chaining loop when parent re-drop needed. 826 */ 827 return(rdrop); 828 } 829 830 /* 831 * Heuristical flush of the LRU, try to reduce the number of entries 832 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 833 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 834 */ 835 static 836 void 837 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 838 { 839 hammer2_chain_t *chain; 840 841 again: 842 chain = NULL; 843 hammer2_spin_ex(&pmp->lru_spin); 844 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 845 /* 846 * Pick a chain off the lru_list, just recycle it quickly 847 * if LRUHINT is set (the chain was ref'd but left on 848 * the lru_list, so cycle to the end). 
849 */ 850 chain = TAILQ_FIRST(&pmp->lru_list); 851 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 852 853 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 854 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 855 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 856 chain = NULL; 857 continue; 858 } 859 860 /* 861 * Ok, we are off the LRU. We must adjust refs before we 862 * can safely clear the ONLRU flag. 863 */ 864 atomic_add_int(&pmp->lru_count, -1); 865 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 866 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 867 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 868 break; 869 } 870 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 871 chain = NULL; 872 } 873 hammer2_spin_unex(&pmp->lru_spin); 874 if (chain == NULL) 875 return; 876 877 /* 878 * If we picked a chain off the lru list we may be able to lastdrop 879 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 880 */ 881 while (chain) { 882 u_int refs; 883 884 refs = chain->refs; 885 cpu_ccfence(); 886 KKASSERT(refs > 0); 887 888 if (refs == 1) { 889 if (hammer2_mtx_ex_try(&chain->lock) == 0) 890 chain = hammer2_chain_lastdrop(chain, 1); 891 /* retry the same chain, or chain from lastdrop */ 892 } else { 893 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 894 break; 895 /* retry the same chain */ 896 } 897 cpu_pause(); 898 } 899 goto again; 900 } 901 902 /* 903 * On last lock release. 
 */
static hammer2_io_t *
hammer2_chain_drop_data(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;

	/*
	 * Detach the DIO (if any) from the chain and return it to the
	 * caller for disposal.  Chains without a DIO must have no data
	 * pointer, except for the volume header and freemap whose data
	 * is embedded.
	 */
	if ((dio = chain->dio) != NULL) {
		chain->dio = NULL;
		chain->data = NULL;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			break;
		default:
			if (chain->data != NULL) {
				/* release spin before panic diagnostics */
				hammer2_spin_unex(&chain->core.spin);
				panic("chain data not null: "
				      "chain %p bref %016jx.%02x "
				      "refs %d parent %p dio %p data %p",
				      chain, chain->bref.data_off,
				      chain->bref.type, chain->refs,
				      chain->parent,
				      chain->dio, chain->data);
			}
			KKASSERT(chain->data == NULL);
			break;
		}
	}
	return dio;
}

/*
 * Lock a referenced chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking.  If
 *			   the lock fails, EAGAIN is returned.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

#if 0
/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
1132 */ 1133 KKASSERT(chain->lock.mtx_lock & MTX_MASK); 1134 if (chain->data) { 1135 if (chain->dio) 1136 hammer2_io_bkvasync(chain->dio); 1137 return; 1138 } 1139 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 1140 return; 1141 1142 hmp = chain->hmp; 1143 KKASSERT(hmp != NULL); 1144 1145 /* 1146 * Gain the IOINPROG bit, interlocked block. 1147 */ 1148 for (;;) { 1149 u_int oflags; 1150 u_int nflags; 1151 1152 oflags = chain->flags; 1153 cpu_ccfence(); 1154 if (oflags & HAMMER2_CHAIN_IOINPROG) { 1155 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 1156 tsleep_interlock(&chain->flags, 0); 1157 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1158 tsleep(&chain->flags, PINTERLOCKED, 1159 "h2iocw", 0); 1160 } 1161 /* retry */ 1162 } else { 1163 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 1164 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1165 break; 1166 } 1167 /* retry */ 1168 } 1169 } 1170 1171 /* 1172 * We own CHAIN_IOINPROG 1173 * 1174 * Degenerate case if we raced another load. 1175 */ 1176 if (chain->data) { 1177 if (chain->dio) 1178 hammer2_io_bkvasync(chain->dio); 1179 goto done; 1180 } 1181 1182 /* 1183 * We must resolve to a device buffer, either by issuing I/O or 1184 * by creating a zero-fill element. We do not mark the buffer 1185 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1186 * API must still be used to do that). 1187 * 1188 * The device buffer is variable-sized in powers of 2 down 1189 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1190 * chunk always contains buffers of the same size. (XXX) 1191 * 1192 * The minimum physical IO size may be larger than the variable 1193 * block size. 1194 */ 1195 bref = &chain->bref; 1196 1197 /* 1198 * The getblk() optimization can only be used on newly created 1199 * elements if the physical block size matches the request. 
1200 */ 1201 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1202 error = hammer2_io_new(hmp, bref->type, 1203 bref->data_off, chain->bytes, 1204 &chain->dio); 1205 } else { 1206 error = hammer2_io_bread(hmp, bref->type, 1207 bref->data_off, chain->bytes, 1208 &chain->dio); 1209 hammer2_adjreadcounter(chain->bref.type, chain->bytes); 1210 } 1211 if (error) { 1212 chain->error = HAMMER2_ERROR_EIO; 1213 kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n", 1214 (intmax_t)bref->data_off, error); 1215 hammer2_io_bqrelse(&chain->dio); 1216 goto done; 1217 } 1218 chain->error = 0; 1219 1220 /* 1221 * This isn't perfect and can be ignored on OSs which do not have 1222 * an indication as to whether a buffer is coming from cache or 1223 * if I/O was actually issued for the read. TESTEDGOOD will work 1224 * pretty well without the B_IOISSUED logic because chains are 1225 * cached, but in that situation (without B_IOISSUED) it will not 1226 * detect whether a re-read via I/O is corrupted verses the original 1227 * read. 1228 * 1229 * We can't re-run the CRC on every fresh lock. That would be 1230 * insanely expensive. 1231 * 1232 * If the underlying kernel buffer covers the entire chain we can 1233 * use the B_IOISSUED indication to determine if we have to re-run 1234 * the CRC on chain data for chains that managed to stay cached 1235 * across the kernel disposal of the original buffer. 1236 */ 1237 if ((dio = chain->dio) != NULL && dio->bp) { 1238 struct buf *bp = dio->bp; 1239 1240 if (dio->psize == chain->bytes && 1241 (bp->b_flags & B_IOISSUED)) { 1242 atomic_clear_int(&chain->flags, 1243 HAMMER2_CHAIN_TESTEDGOOD); 1244 bp->b_flags &= ~B_IOISSUED; 1245 } 1246 } 1247 1248 /* 1249 * NOTE: A locked chain's data cannot be modified without first 1250 * calling hammer2_chain_modify(). 
1251 */ 1252 1253 /* 1254 * NOTE: hammer2_io_data() call issues bkvasync() 1255 */ 1256 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1257 1258 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1259 /* 1260 * Clear INITIAL. In this case we used io_new() and the 1261 * buffer has been zero'd and marked dirty. 1262 * 1263 * CHAIN_MODIFIED has not been set yet, and we leave it 1264 * that way for now. Set a temporary CHAIN_NOTTESTED flag 1265 * to prevent hammer2_chain_testcheck() from trying to match 1266 * a check code that has not yet been generated. This bit 1267 * should NOT end up on the actual media. 1268 */ 1269 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1270 atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED); 1271 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1272 /* 1273 * check data not currently synchronized due to 1274 * modification. XXX assumes data stays in the buffer 1275 * cache, which might not be true (need biodep on flush 1276 * to calculate crc? or simple crc?). 1277 */ 1278 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1279 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1280 chain->error = HAMMER2_ERROR_CHECK; 1281 } else { 1282 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1283 } 1284 } 1285 1286 /* 1287 * Setup the data pointer by pointing it into the buffer. 1288 * WARNING! Other threads can start using the data the instant we 1289 * set chain->data non-NULL. 
1290 */ 1291 switch (bref->type) { 1292 case HAMMER2_BREF_TYPE_VOLUME: 1293 case HAMMER2_BREF_TYPE_FREEMAP: 1294 panic("hammer2_chain_load_data: unresolved volume header"); 1295 break; 1296 case HAMMER2_BREF_TYPE_DIRENT: 1297 KKASSERT(chain->bytes != 0); 1298 /* fall through */ 1299 case HAMMER2_BREF_TYPE_INODE: 1300 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1301 case HAMMER2_BREF_TYPE_INDIRECT: 1302 case HAMMER2_BREF_TYPE_DATA: 1303 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1304 default: 1305 /* 1306 * Point data at the device buffer and leave dio intact. 1307 */ 1308 chain->data = (void *)bdata; 1309 break; 1310 } 1311 1312 /* 1313 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 1314 */ 1315 done: 1316 for (;;) { 1317 u_int oflags; 1318 u_int nflags; 1319 1320 oflags = chain->flags; 1321 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1322 HAMMER2_CHAIN_IOSIGNAL); 1323 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1324 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1325 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1326 wakeup(&chain->flags); 1327 break; 1328 } 1329 } 1330 } 1331 1332 /* 1333 * Unlock and deref a chain element. 1334 * 1335 * Remember that the presence of children under chain prevent the chain's 1336 * destruction but do not add additional references, so the dio will still 1337 * be dropped. 1338 */ 1339 void 1340 hammer2_chain_unlock(hammer2_chain_t *chain) 1341 { 1342 hammer2_io_t *dio; 1343 u_int lockcnt; 1344 int iter = 0; 1345 1346 /* 1347 * If multiple locks are present (or being attempted) on this 1348 * particular chain we can just unlock, drop refs, and return. 1349 * 1350 * Otherwise fall-through on the 1->0 transition. 
1351 */ 1352 for (;;) { 1353 lockcnt = chain->lockcnt; 1354 KKASSERT(lockcnt > 0); 1355 cpu_ccfence(); 1356 if (lockcnt > 1) { 1357 if (atomic_cmpset_int(&chain->lockcnt, 1358 lockcnt, lockcnt - 1)) { 1359 hammer2_mtx_unlock(&chain->lock); 1360 return; 1361 } 1362 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) { 1363 /* while holding the mutex exclusively */ 1364 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1365 break; 1366 } else { 1367 /* 1368 * This situation can easily occur on SMP due to 1369 * the gap inbetween the 1->0 transition and the 1370 * final unlock. We cannot safely block on the 1371 * mutex because lockcnt might go above 1. 1372 * 1373 * XXX Sleep for one tick if it takes too long. 1374 */ 1375 if (++iter > 1000) { 1376 if (iter > 1000 + hz) { 1377 kprintf("hammer2: h2race2 %p\n", chain); 1378 iter = 1000; 1379 } 1380 tsleep(&iter, 0, "h2race2", 1); 1381 } 1382 cpu_pause(); 1383 } 1384 /* retry */ 1385 } 1386 1387 /* 1388 * Last unlock / mutex upgraded to exclusive. Drop the data 1389 * reference. 1390 */ 1391 dio = hammer2_chain_drop_data(chain); 1392 if (dio) 1393 hammer2_io_bqrelse(&dio); 1394 hammer2_mtx_unlock(&chain->lock); 1395 } 1396 1397 #if 0 1398 /* 1399 * Unlock and hold chain data intact 1400 */ 1401 void 1402 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1403 { 1404 atomic_add_int(&chain->lockcnt, 1); 1405 hammer2_chain_unlock(chain); 1406 } 1407 #endif 1408 1409 /* 1410 * Helper to obtain the blockref[] array base and count for a chain. 1411 * 1412 * XXX Not widely used yet, various use cases need to be validated and 1413 * converted to use this function. 
1414 */ 1415 static 1416 hammer2_blockref_t * 1417 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1418 { 1419 hammer2_blockref_t *base; 1420 int count; 1421 1422 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1423 base = NULL; 1424 1425 switch(parent->bref.type) { 1426 case HAMMER2_BREF_TYPE_INODE: 1427 count = HAMMER2_SET_COUNT; 1428 break; 1429 case HAMMER2_BREF_TYPE_INDIRECT: 1430 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1431 count = parent->bytes / sizeof(hammer2_blockref_t); 1432 break; 1433 case HAMMER2_BREF_TYPE_VOLUME: 1434 count = HAMMER2_SET_COUNT; 1435 break; 1436 case HAMMER2_BREF_TYPE_FREEMAP: 1437 count = HAMMER2_SET_COUNT; 1438 break; 1439 default: 1440 panic("hammer2_chain_base_and_count: " 1441 "unrecognized blockref type: %d", 1442 parent->bref.type); 1443 count = 0; 1444 break; 1445 } 1446 } else { 1447 switch(parent->bref.type) { 1448 case HAMMER2_BREF_TYPE_INODE: 1449 base = &parent->data->ipdata.u.blockset.blockref[0]; 1450 count = HAMMER2_SET_COUNT; 1451 break; 1452 case HAMMER2_BREF_TYPE_INDIRECT: 1453 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1454 base = &parent->data->npdata[0]; 1455 count = parent->bytes / sizeof(hammer2_blockref_t); 1456 break; 1457 case HAMMER2_BREF_TYPE_VOLUME: 1458 base = &parent->data->voldata. 1459 sroot_blockset.blockref[0]; 1460 count = HAMMER2_SET_COUNT; 1461 break; 1462 case HAMMER2_BREF_TYPE_FREEMAP: 1463 base = &parent->data->blkset.blockref[0]; 1464 count = HAMMER2_SET_COUNT; 1465 break; 1466 default: 1467 panic("hammer2_chain_base_and_count: " 1468 "unrecognized blockref type: %d", 1469 parent->bref.type); 1470 base = NULL; 1471 count = 0; 1472 break; 1473 } 1474 } 1475 *countp = count; 1476 1477 return base; 1478 } 1479 1480 /* 1481 * This counts the number of live blockrefs in a block array and 1482 * also calculates the point at which all remaining blockrefs are empty. 1483 * This routine can only be called on a live chain. 
1484 * 1485 * Caller holds the chain locked, but possibly with a shared lock. We 1486 * must use an exclusive spinlock to prevent corruption. 1487 * 1488 * NOTE: Flag is not set until after the count is complete, allowing 1489 * callers to test the flag without holding the spinlock. 1490 * 1491 * NOTE: If base is NULL the related chain is still in the INITIAL 1492 * state and there are no blockrefs to count. 1493 * 1494 * NOTE: live_count may already have some counts accumulated due to 1495 * creation and deletion and could even be initially negative. 1496 */ 1497 void 1498 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1499 hammer2_blockref_t *base, int count) 1500 { 1501 hammer2_spin_ex(&chain->core.spin); 1502 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1503 if (base) { 1504 while (--count >= 0) { 1505 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1506 break; 1507 } 1508 chain->core.live_zero = count + 1; 1509 while (count >= 0) { 1510 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1511 atomic_add_int(&chain->core.live_count, 1512 1); 1513 --count; 1514 } 1515 } else { 1516 chain->core.live_zero = 0; 1517 } 1518 /* else do not modify live_count */ 1519 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1520 } 1521 hammer2_spin_unex(&chain->core.spin); 1522 } 1523 1524 /* 1525 * Resize the chain's physical storage allocation in-place. This function does 1526 * not usually adjust the data pointer and must be followed by (typically) a 1527 * hammer2_chain_modify() call to copy any old data over and adjust the 1528 * data pointer. 1529 * 1530 * Chains can be resized smaller without reallocating the storage. Resizing 1531 * larger will reallocate the storage. Excess or prior storage is reclaimed 1532 * asynchronously at a later time. 1533 * 1534 * An nradix value of 0 is special-cased to mean that the storage should 1535 * be disassociated, that is the chain is being resized to 0 bytes (not 1 1536 * byte). 
1537 * 1538 * Must be passed an exclusively locked parent and chain. 1539 * 1540 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 1541 * to avoid instantiating a device buffer that conflicts with the vnode data 1542 * buffer. However, because H2 can compress or encrypt data, the chain may 1543 * have a dio assigned to it in those situations, and they do not conflict. 1544 * 1545 * XXX return error if cannot resize. 1546 */ 1547 int 1548 hammer2_chain_resize(hammer2_chain_t *chain, 1549 hammer2_tid_t mtid, hammer2_off_t dedup_off, 1550 int nradix, int flags) 1551 { 1552 hammer2_dev_t *hmp; 1553 size_t obytes; 1554 size_t nbytes; 1555 int error; 1556 1557 hmp = chain->hmp; 1558 1559 /* 1560 * Only data and indirect blocks can be resized for now. 1561 * (The volu root, inodes, and freemap elements use a fixed size). 1562 */ 1563 KKASSERT(chain != &hmp->vchain); 1564 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1565 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1566 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1567 1568 /* 1569 * Nothing to do if the element is already the proper size 1570 */ 1571 obytes = chain->bytes; 1572 nbytes = (nradix) ? (1U << nradix) : 0; 1573 if (obytes == nbytes) 1574 return (chain->error); 1575 1576 /* 1577 * Make sure the old data is instantiated so we can copy it. If this 1578 * is a data block, the device data may be superfluous since the data 1579 * might be in a logical block, but compressed or encrypted data is 1580 * another matter. 1581 * 1582 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set. 1583 */ 1584 error = hammer2_chain_modify(chain, mtid, dedup_off, 0); 1585 if (error) 1586 return error; 1587 1588 /* 1589 * Reallocate the block, even if making it smaller (because different 1590 * block sizes may be in different regions). 
1591 * 1592 * NOTE: Operation does not copy the data and may only be used 1593 * to resize data blocks in-place, or directory entry blocks 1594 * which are about to be modified in some manner. 1595 */ 1596 error = hammer2_freemap_alloc(chain, nbytes); 1597 if (error) 1598 return error; 1599 1600 chain->bytes = nbytes; 1601 1602 /* 1603 * We don't want the followup chain_modify() to try to copy data 1604 * from the old (wrong-sized) buffer. It won't know how much to 1605 * copy. This case should only occur during writes when the 1606 * originator already has the data to write in-hand. 1607 */ 1608 if (chain->dio) { 1609 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1610 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1611 hammer2_io_brelse(&chain->dio); 1612 chain->data = NULL; 1613 } 1614 return (chain->error); 1615 } 1616 1617 /* 1618 * Set the chain modified so its data can be changed by the caller, or 1619 * install deduplicated data. The caller must call this routine for each 1620 * set of modifications it makes, even if the chain is already flagged 1621 * MODIFIED. 1622 * 1623 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid 1624 * is a CLC (cluster level change) field and is not updated by parent 1625 * propagation during a flush. 1626 * 1627 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect 1628 * chain->error except for HAMMER2_ERROR_ENOSPC. If the allocation fails 1629 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain 1630 * remains unmodified with its old data ref intact and chain->error 1631 * unchanged. 1632 * 1633 * Dedup Handling 1634 * 1635 * If the DEDUPABLE flag is set in the chain the storage must be reallocated 1636 * even if the chain is still flagged MODIFIED. In this case the chain's 1637 * DEDUPABLE flag will be cleared once the new storage has been assigned. 
1638 * 1639 * If the caller passes a non-zero dedup_off we will use it to assign the 1640 * new storage. The MODIFIED flag will be *CLEARED* in this case, and 1641 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set). The caller 1642 * must not modify the data content upon return. 1643 */ 1644 int 1645 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1646 hammer2_off_t dedup_off, int flags) 1647 { 1648 hammer2_dev_t *hmp; 1649 hammer2_io_t *dio; 1650 int error; 1651 int wasinitial; 1652 int setmodified; 1653 int setupdate; 1654 int newmod; 1655 char *bdata; 1656 1657 hmp = chain->hmp; 1658 KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE); 1659 1660 /* 1661 * Data is not optional for freemap chains (we must always be sure 1662 * to copy the data on COW storage allocations). 1663 */ 1664 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1665 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1666 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1667 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1668 } 1669 1670 /* 1671 * Data must be resolved if already assigned, unless explicitly 1672 * flagged otherwise. If we cannot safety load the data the 1673 * modification fails and we return early. 1674 */ 1675 if (chain->data == NULL && chain->bytes != 0 && 1676 (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1677 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1678 hammer2_chain_load_data(chain); 1679 if (chain->error) 1680 return (chain->error); 1681 } 1682 error = 0; 1683 1684 /* 1685 * Set MODIFIED to indicate that the chain has been modified. A new 1686 * allocation is required when modifying a chain. 1687 * 1688 * Set UPDATE to ensure that the blockref is updated in the parent. 1689 * 1690 * If MODIFIED is already set determine if we can reuse the assigned 1691 * data block or if we need a new data block. 1692 */ 1693 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1694 /* 1695 * Must set modified bit. 
1696 */ 1697 atomic_add_long(&hammer2_count_modified_chains, 1); 1698 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1699 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1700 setmodified = 1; 1701 1702 /* 1703 * We may be able to avoid a copy-on-write if the chain's 1704 * check mode is set to NONE and the chain's current 1705 * modify_tid is beyond the last explicit snapshot tid. 1706 * 1707 * This implements HAMMER2's overwrite-in-place feature. 1708 * 1709 * NOTE! This data-block cannot be used as a de-duplication 1710 * source when the check mode is set to NONE. 1711 */ 1712 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1713 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) && 1714 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1715 (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 && 1716 HAMMER2_DEC_CHECK(chain->bref.methods) == 1717 HAMMER2_CHECK_NONE && 1718 chain->pmp && 1719 chain->bref.modify_tid > 1720 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1721 /* 1722 * Sector overwrite allowed. 1723 */ 1724 newmod = 0; 1725 } else if ((hmp->hflags & HMNT2_EMERG) && 1726 chain->pmp && 1727 chain->bref.modify_tid > 1728 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1729 /* 1730 * If in emergency delete mode then do a modify-in- 1731 * place on any chain type belonging to the PFS as 1732 * long as it doesn't mess up a snapshot. We might 1733 * be forced to do this anyway a little further down 1734 * in the code if the allocation fails. 1735 * 1736 * Also note that in emergency mode, these modify-in- 1737 * place operations are NOT SAFE. A storage failure, 1738 * power failure, or panic can corrupt the filesystem. 1739 */ 1740 newmod = 0; 1741 } else { 1742 /* 1743 * Sector overwrite not allowed, must copy-on-write. 1744 */ 1745 newmod = 1; 1746 } 1747 } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) { 1748 /* 1749 * If the modified chain was registered for dedup we need 1750 * a new allocation. This only happens for delayed-flush 1751 * chains (i.e. 
which run through the front-end buffer 1752 * cache). 1753 */ 1754 newmod = 1; 1755 setmodified = 0; 1756 } else { 1757 /* 1758 * Already flagged modified, no new allocation is needed. 1759 */ 1760 newmod = 0; 1761 setmodified = 0; 1762 } 1763 1764 /* 1765 * Flag parent update required. 1766 */ 1767 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1768 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1769 setupdate = 1; 1770 } else { 1771 setupdate = 0; 1772 } 1773 1774 /* 1775 * The XOP code returns held but unlocked focus chains. This 1776 * prevents the chain from being destroyed but does not prevent 1777 * it from being modified. diolk is used to interlock modifications 1778 * against XOP frontend accesses to the focus. 1779 * 1780 * This allows us to theoretically avoid deadlocking the frontend 1781 * if one of the backends lock up by not formally locking the 1782 * focused chain in the frontend. In addition, the synchronization 1783 * code relies on this mechanism to avoid deadlocking concurrent 1784 * synchronization threads. 1785 */ 1786 lockmgr(&chain->diolk, LK_EXCLUSIVE); 1787 1788 /* 1789 * The modification or re-modification requires an allocation and 1790 * possible COW. If an error occurs, the previous content and data 1791 * reference is retained and the modification fails. 1792 * 1793 * If dedup_off is non-zero, the caller is requesting a deduplication 1794 * rather than a modification. The MODIFIED bit is not set and the 1795 * data offset is set to the deduplication offset. The data cannot 1796 * be modified. 1797 * 1798 * NOTE: The dedup offset is allowed to be in a partially free state 1799 * and we must be sure to reset it to a fully allocated state 1800 * to force two bulkfree passes to free it again. 1801 * 1802 * NOTE: Only applicable when chain->bytes != 0. 1803 * 1804 * XXX can a chain already be marked MODIFIED without a data 1805 * assignment? If not, assert here instead of testing the case. 
1806 */ 1807 if (chain != &hmp->vchain && chain != &hmp->fchain && 1808 chain->bytes) { 1809 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1810 newmod 1811 ) { 1812 /* 1813 * NOTE: We do not have to remove the dedup 1814 * registration because the area is still 1815 * allocated and the underlying DIO will 1816 * still be flushed. 1817 */ 1818 if (dedup_off) { 1819 chain->bref.data_off = dedup_off; 1820 if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX)) 1821 chain->bytes = 1 << 1822 (int)(dedup_off & 1823 HAMMER2_OFF_MASK_RADIX); 1824 else 1825 chain->bytes = 0; 1826 chain->error = 0; 1827 atomic_clear_int(&chain->flags, 1828 HAMMER2_CHAIN_MODIFIED); 1829 atomic_add_long(&hammer2_count_modified_chains, 1830 -1); 1831 if (chain->pmp) { 1832 hammer2_pfs_memory_wakeup( 1833 chain->pmp, -1); 1834 } 1835 hammer2_freemap_adjust(hmp, &chain->bref, 1836 HAMMER2_FREEMAP_DORECOVER); 1837 atomic_set_int(&chain->flags, 1838 HAMMER2_CHAIN_DEDUPABLE); 1839 } else { 1840 error = hammer2_freemap_alloc(chain, 1841 chain->bytes); 1842 atomic_clear_int(&chain->flags, 1843 HAMMER2_CHAIN_DEDUPABLE); 1844 1845 /* 1846 * If we are unable to allocate a new block 1847 * but we are in emergency mode, issue a 1848 * warning to the console and reuse the same 1849 * block. 1850 * 1851 * We behave as if the allocation were 1852 * successful. 1853 * 1854 * THIS IS IMPORTANT: These modifications 1855 * are virtually guaranteed to corrupt any 1856 * snapshots related to this filesystem. 
1857 */ 1858 if (error && (hmp->hflags & HMNT2_EMERG)) { 1859 error = 0; 1860 chain->bref.flags |= 1861 HAMMER2_BREF_FLAG_EMERG_MIP; 1862 1863 krateprintf(&krate_h2em, 1864 "hammer2: Emergency Mode WARNING: " 1865 "Operation will likely corrupt " 1866 "related snapshot: " 1867 "%016jx.%02x key=%016jx\n", 1868 chain->bref.data_off, 1869 chain->bref.type, 1870 chain->bref.key); 1871 } else if (error == 0) { 1872 chain->bref.flags &= 1873 ~HAMMER2_BREF_FLAG_EMERG_MIP; 1874 } 1875 } 1876 } 1877 } 1878 1879 /* 1880 * Stop here if error. We have to undo any flag bits we might 1881 * have set above. 1882 */ 1883 if (error) { 1884 if (setmodified) { 1885 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1886 atomic_add_long(&hammer2_count_modified_chains, -1); 1887 if (chain->pmp) 1888 hammer2_pfs_memory_wakeup(chain->pmp, -1); 1889 } 1890 if (setupdate) { 1891 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1892 } 1893 lockmgr(&chain->diolk, LK_RELEASE); 1894 1895 return error; 1896 } 1897 1898 /* 1899 * Update mirror_tid and modify_tid. modify_tid is only updated 1900 * if not passed as zero (during flushes, parent propagation passes 1901 * the value 0). 1902 * 1903 * NOTE: chain->pmp could be the device spmp. 1904 */ 1905 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1906 if (mtid) 1907 chain->bref.modify_tid = mtid; 1908 1909 /* 1910 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry 1911 * requires updating as well as to tell the delete code that the 1912 * chain's blockref might not exactly match (in terms of physical size 1913 * or block offset) the one in the parent's blocktable. The base key 1914 * of course will still match. 
1915 */ 1916 if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) 1917 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD); 1918 1919 /* 1920 * Short-cut data block handling when the caller does not need an 1921 * actual data reference to (aka OPTDATA), as long as the chain does 1922 * not already have a data pointer to the data and no de-duplication 1923 * occurred. 1924 * 1925 * This generally means that the modifications are being done via the 1926 * logical buffer cache. 1927 * 1928 * NOTE: If deduplication occurred we have to run through the data 1929 * stuff to clear INITIAL, and the caller will likely want to 1930 * assign the check code anyway. Leaving INITIAL set on a 1931 * dedup can be deadly (it can cause the block to be zero'd!). 1932 * 1933 * This code also handles bytes == 0 (most dirents). 1934 */ 1935 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1936 (flags & HAMMER2_MODIFY_OPTDATA) && 1937 chain->data == NULL) { 1938 if (dedup_off == 0) { 1939 KKASSERT(chain->dio == NULL); 1940 goto skip2; 1941 } 1942 } 1943 1944 /* 1945 * Clearing the INITIAL flag (for indirect blocks) indicates that 1946 * we've processed the uninitialized storage allocation. 1947 * 1948 * If this flag is already clear we are likely in a copy-on-write 1949 * situation but we have to be sure NOT to bzero the storage if 1950 * no data is present. 1951 * 1952 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set, 1953 */ 1954 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1955 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1956 wasinitial = 1; 1957 } else { 1958 wasinitial = 0; 1959 } 1960 1961 /* 1962 * Instantiate data buffer and possibly execute COW operation 1963 */ 1964 switch(chain->bref.type) { 1965 case HAMMER2_BREF_TYPE_VOLUME: 1966 case HAMMER2_BREF_TYPE_FREEMAP: 1967 /* 1968 * The data is embedded, no copy-on-write operation is 1969 * needed. 
1970 */ 1971 KKASSERT(chain->dio == NULL); 1972 break; 1973 case HAMMER2_BREF_TYPE_DIRENT: 1974 /* 1975 * The data might be fully embedded. 1976 */ 1977 if (chain->bytes == 0) { 1978 KKASSERT(chain->dio == NULL); 1979 break; 1980 } 1981 /* fall through */ 1982 case HAMMER2_BREF_TYPE_INODE: 1983 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1984 case HAMMER2_BREF_TYPE_DATA: 1985 case HAMMER2_BREF_TYPE_INDIRECT: 1986 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1987 /* 1988 * Perform the copy-on-write operation 1989 * 1990 * zero-fill or copy-on-write depending on whether 1991 * chain->data exists or not and set the dirty state for 1992 * the new buffer. hammer2_io_new() will handle the 1993 * zero-fill. 1994 * 1995 * If a dedup_off was supplied this is an existing block 1996 * and no COW, copy, or further modification is required. 1997 */ 1998 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 1999 2000 if (wasinitial && dedup_off == 0) { 2001 error = hammer2_io_new(hmp, chain->bref.type, 2002 chain->bref.data_off, 2003 chain->bytes, &dio); 2004 } else { 2005 error = hammer2_io_bread(hmp, chain->bref.type, 2006 chain->bref.data_off, 2007 chain->bytes, &dio); 2008 } 2009 hammer2_adjreadcounter(chain->bref.type, chain->bytes); 2010 2011 /* 2012 * If an I/O error occurs make sure callers cannot accidently 2013 * modify the old buffer's contents and corrupt the filesystem. 2014 * 2015 * NOTE: hammer2_io_data() call issues bkvasync() 2016 */ 2017 if (error) { 2018 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 2019 hmp); 2020 chain->error = HAMMER2_ERROR_EIO; 2021 hammer2_io_brelse(&dio); 2022 hammer2_io_brelse(&chain->dio); 2023 chain->data = NULL; 2024 break; 2025 } 2026 chain->error = 0; 2027 bdata = hammer2_io_data(dio, chain->bref.data_off); 2028 2029 if (chain->data) { 2030 /* 2031 * COW (unless a dedup). 
2032 */ 2033 KKASSERT(chain->dio != NULL); 2034 if (chain->data != (void *)bdata && dedup_off == 0) { 2035 bcopy(chain->data, bdata, chain->bytes); 2036 } 2037 } else if (wasinitial == 0 && dedup_off == 0) { 2038 /* 2039 * We have a problem. We were asked to COW but 2040 * we don't have any data to COW with! 2041 */ 2042 panic("hammer2_chain_modify: having a COW %p\n", 2043 chain); 2044 } 2045 2046 /* 2047 * Retire the old buffer, replace with the new. Dirty or 2048 * redirty the new buffer. 2049 * 2050 * WARNING! The system buffer cache may have already flushed 2051 * the buffer, so we must be sure to [re]dirty it 2052 * for further modification. 2053 * 2054 * If dedup_off was supplied, the caller is not 2055 * expected to make any further modification to the 2056 * buffer. 2057 * 2058 * WARNING! hammer2_get_gdata() assumes dio never transitions 2059 * through NULL in order to optimize away unnecessary 2060 * diolk operations. 2061 */ 2062 { 2063 hammer2_io_t *tio; 2064 2065 if ((tio = chain->dio) != NULL) 2066 hammer2_io_bqrelse(&tio); 2067 chain->data = (void *)bdata; 2068 chain->dio = dio; 2069 if (dedup_off == 0) 2070 hammer2_io_setdirty(dio); 2071 } 2072 break; 2073 default: 2074 panic("hammer2_chain_modify: illegal non-embedded type %d", 2075 chain->bref.type); 2076 break; 2077 2078 } 2079 skip2: 2080 /* 2081 * setflush on parent indicating that the parent must recurse down 2082 * to us. Do not call on chain itself which might already have it 2083 * set. 2084 */ 2085 if (chain->parent) 2086 hammer2_chain_setflush(chain->parent); 2087 lockmgr(&chain->diolk, LK_RELEASE); 2088 2089 return (chain->error); 2090 } 2091 2092 /* 2093 * Modify the chain associated with an inode. 
2094 */ 2095 int 2096 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 2097 hammer2_tid_t mtid, int flags) 2098 { 2099 int error; 2100 2101 hammer2_inode_modify(ip); 2102 error = hammer2_chain_modify(chain, mtid, 0, flags); 2103 2104 return error; 2105 } 2106 2107 /* 2108 * This function returns the chain at the nearest key within the specified 2109 * range. The returned chain will be referenced but not locked. 2110 * 2111 * This function will recurse through chain->rbtree as necessary and will 2112 * return a *key_nextp suitable for iteration. *key_nextp is only set if 2113 * the iteration value is less than the current value of *key_nextp. 2114 * 2115 * The caller should use (*key_nextp) to calculate the actual range of 2116 * the returned element, which will be (key_beg to *key_nextp - 1), because 2117 * there might be another element which is superior to the returned element 2118 * and overlaps it. 2119 * 2120 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 2121 * chains continue to be returned. On EOF (*key_nextp) may overflow since 2122 * it will wind up being (key_end + 1). 2123 * 2124 * WARNING! Must be called with child's spinlock held. Spinlock remains 2125 * held through the operation. 
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* nearest match so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* iteration helper, truncated */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN range comparator: returns -1 if the child's key range lies
 * entirely below the search range, +1 if entirely above, 0 on overlap.
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback: select the overlapping child whose base key is
 * nearest key_beg, truncating info->key_next for the caller's iteration.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * PFS roots are allocated with a NULL pmp (assigned later);
	 * everything else inherits the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced and locked (per (how)), or NULL
	 * if a race occurred (in which case the ref and lock were already
	 * released above).
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.
 *
 * hammer2_chain_lookup_init() refs and locks (parent) per flags and
 * returns it; hammer2_chain_lookup_done() undoes both.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Because of this, (chain) must be stable and cannot be
 * deleted while it is temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.  We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent.
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children remain as-of when this is called, so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
2515 */ 2516 static 2517 void 2518 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2519 { 2520 struct hammer2_reptrack *reptrack; 2521 2522 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2523 while (chain->core.reptrack) { 2524 hammer2_spin_ex(&parent->core.spin); 2525 hammer2_spin_ex(&chain->core.spin); 2526 reptrack = chain->core.reptrack; 2527 if (reptrack == NULL) { 2528 hammer2_spin_unex(&chain->core.spin); 2529 hammer2_spin_unex(&parent->core.spin); 2530 break; 2531 } 2532 hammer2_spin_ex(&reptrack->spin); 2533 chain->core.reptrack = reptrack->next; 2534 reptrack->chain = parent; 2535 reptrack->next = parent->core.reptrack; 2536 parent->core.reptrack = reptrack; 2537 hammer2_chain_ref(parent); /* reptrack */ 2538 2539 hammer2_spin_unex(&chain->core.spin); 2540 hammer2_spin_unex(&parent->core.spin); 2541 kprintf("hammer2: debug repchange %p %p->%p\n", 2542 reptrack, chain, parent); 2543 hammer2_chain_drop(chain); /* reptrack */ 2544 } 2545 } 2546 2547 /* 2548 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2549 * (*parentp) typically points to an inode but can also point to a related 2550 * indirect block and this function will recurse upwards and find the inode 2551 * or the nearest undeleted indirect block covering the key range. 2552 * 2553 * This function unconditionally sets *errorp, replacing any previous value. 2554 * 2555 * (*parentp) must be exclusive or shared locked (depending on flags) and 2556 * referenced and can be an inode or an existing indirect block within the 2557 * inode. 2558 * 2559 * If (*parent) is errored out, this function will not attempt to recurse 2560 * the radix tree and will return NULL along with an appropriate *errorp. 2561 * If NULL is returned and *errorp is 0, the requested lookup could not be 2562 * located. 
2563 * 2564 * On return (*parentp) will be modified to point at the deepest parent chain 2565 * element encountered during the search, as a helper for an insertion or 2566 * deletion. 2567 * 2568 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2569 * and referenced, and the old will be unlocked and dereferenced (no change 2570 * if they are both the same). This is particularly important if the caller 2571 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2572 * is returned, as long as no error occurred. 2573 * 2574 * The matching chain will be returned locked according to flags. 2575 * 2576 * -- 2577 * 2578 * NULL is returned if no match was found, but (*parentp) will still 2579 * potentially be adjusted. 2580 * 2581 * On return (*key_nextp) will point to an iterative value for key_beg. 2582 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2583 * 2584 * This function will also recurse up the chain if the key is not within the 2585 * current parent's range. (*parentp) can never be set to NULL. An iteration 2586 * can simply allow (*parentp) to float inside the loop. 2587 * 2588 * NOTE! chain->data is not always resolved. By default it will not be 2589 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2590 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2591 * BREF_TYPE_DATA as the device buffer can alias the logical file 2592 * buffer). 
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity limit on the again: retry loop */

	/*
	 * Translate HAMMER2_LOOKUP_* flags into HAMMER2_RESOLVE_* lock
	 * modes for leaf chains (how), indirect/freemap nodes (how_maybe),
	 * and the embedded-data inode shortcut (how_always).
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicking on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.  Snapshot the bref
	 * before releasing the spinlock so we can re-validate after the
	 * (potentially blocking) chain_get/lock below.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE!  A chain->error must be tested by the caller upon return.
	 *	  *errorp is only set based on issues which occur while
	 *	  trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how;
	int generation;
	int maxloops = 300000;	/* guard against runaway retry loops */
	int error;

	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		/* key == 0 means the increment wrapped: iteration exhausted */
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/*
	 * Snapshot the parent's generation while still holding the
	 * spinlock; it is passed to hammer2_chain_get() below
	 * (NOTE(review): presumably so chain_get can detect a parent
	 * modification race — confirm against chain_get).
	 */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	/*
	 * Re-check after the lock: if the chain's bref changed or it was
	 * reparented while we were blocked, retry the scan from the same
	 * key.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
3287 * 3288 * Caller must pass-in an exclusively locked parent the new chain is to 3289 * be inserted under, and optionally pass-in a disconnected, exclusively 3290 * locked chain to insert (else we create a new chain). The function will 3291 * adjust (*parentp) as necessary, create or connect the chain, and 3292 * return an exclusively locked chain in *chainp. 3293 * 3294 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3295 * and will be reassigned. 3296 * 3297 * NOTE: returns HAMMER_ERROR_* flags 3298 */ 3299 int 3300 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3301 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3302 hammer2_key_t key, int keybits, int type, size_t bytes, 3303 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3304 { 3305 hammer2_chain_t *chain; 3306 hammer2_chain_t *parent; 3307 hammer2_blockref_t *base; 3308 hammer2_blockref_t dummy; 3309 int allocated = 0; 3310 int error = 0; 3311 int count; 3312 int maxloops = 300000; 3313 3314 /* 3315 * Topology may be crossing a PFS boundary. 3316 */ 3317 parent = *parentp; 3318 if (parent) { 3319 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3320 KKASSERT(parent->error == 0); 3321 hmp = parent->hmp; 3322 } 3323 chain = *chainp; 3324 3325 if (chain == NULL) { 3326 /* 3327 * First allocate media space and construct the dummy bref, 3328 * then allocate the in-memory chain structure. Set the 3329 * INITIAL flag for fresh chains which do not have embedded 3330 * data. 3331 */ 3332 bzero(&dummy, sizeof(dummy)); 3333 dummy.type = type; 3334 dummy.key = key; 3335 dummy.keybits = keybits; 3336 dummy.data_off = hammer2_getradix(bytes); 3337 3338 /* 3339 * Inherit methods from parent by default. Primarily used 3340 * for BREF_TYPE_DATA. Non-data types *must* be set to 3341 * a non-NONE check algorithm. 
3342 */ 3343 if (methods == HAMMER2_METH_DEFAULT) 3344 dummy.methods = parent->bref.methods; 3345 else 3346 dummy.methods = (uint8_t)methods; 3347 3348 if (type != HAMMER2_BREF_TYPE_DATA && 3349 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3350 dummy.methods |= 3351 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3352 } 3353 3354 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3355 3356 /* 3357 * Lock the chain manually, chain_lock will load the chain 3358 * which we do NOT want to do. (note: chain->refs is set 3359 * to 1 by chain_alloc() for us, but lockcnt is not). 3360 */ 3361 chain->lockcnt = 1; 3362 hammer2_mtx_ex(&chain->lock); 3363 allocated = 1; 3364 3365 /* 3366 * Set INITIAL to optimize I/O. The flag will generally be 3367 * processed when we call hammer2_chain_modify(). 3368 */ 3369 switch(type) { 3370 case HAMMER2_BREF_TYPE_VOLUME: 3371 case HAMMER2_BREF_TYPE_FREEMAP: 3372 panic("hammer2_chain_create: called with volume type"); 3373 break; 3374 case HAMMER2_BREF_TYPE_INDIRECT: 3375 panic("hammer2_chain_create: cannot be used to" 3376 "create indirect block"); 3377 break; 3378 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3379 panic("hammer2_chain_create: cannot be used to" 3380 "create freemap root or node"); 3381 break; 3382 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3383 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3384 /* fall through */ 3385 case HAMMER2_BREF_TYPE_DIRENT: 3386 case HAMMER2_BREF_TYPE_INODE: 3387 case HAMMER2_BREF_TYPE_DATA: 3388 default: 3389 /* 3390 * leave chain->data NULL, set INITIAL 3391 */ 3392 KKASSERT(chain->data == NULL); 3393 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3394 break; 3395 } 3396 } else { 3397 /* 3398 * We are reattaching a previously deleted chain, possibly 3399 * under a new parent and possibly with a new key/keybits. 3400 * The chain does not have to be in a modified state. The 3401 * UPDATE flag will be set later on in this routine. 3402 * 3403 * Do NOT mess with the current state of the INITIAL flag. 
3404 */ 3405 chain->bref.key = key; 3406 chain->bref.keybits = keybits; 3407 if (chain->flags & HAMMER2_CHAIN_DELETED) 3408 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3409 KKASSERT(chain->parent == NULL); 3410 } 3411 3412 /* 3413 * Set the appropriate bref flag if requested. 3414 * 3415 * NOTE! Callers can call this function to move chains without 3416 * knowing about special flags, so don't clear bref flags 3417 * here! 3418 */ 3419 if (flags & HAMMER2_INSERT_PFSROOT) 3420 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3421 3422 if (parent == NULL) 3423 goto skip; 3424 3425 /* 3426 * Calculate how many entries we have in the blockref array and 3427 * determine if an indirect block is required when inserting into 3428 * the parent. 3429 */ 3430 again: 3431 if (--maxloops == 0) 3432 panic("hammer2_chain_create: maxloops"); 3433 3434 switch(parent->bref.type) { 3435 case HAMMER2_BREF_TYPE_INODE: 3436 if ((parent->data->ipdata.meta.op_flags & 3437 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3438 kprintf("hammer2: parent set for direct-data! 
" 3439 "pkey=%016jx ckey=%016jx\n", 3440 parent->bref.key, 3441 chain->bref.key); 3442 } 3443 KKASSERT((parent->data->ipdata.meta.op_flags & 3444 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3445 KKASSERT(parent->data != NULL); 3446 base = &parent->data->ipdata.u.blockset.blockref[0]; 3447 count = HAMMER2_SET_COUNT; 3448 break; 3449 case HAMMER2_BREF_TYPE_INDIRECT: 3450 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3451 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3452 base = NULL; 3453 else 3454 base = &parent->data->npdata[0]; 3455 count = parent->bytes / sizeof(hammer2_blockref_t); 3456 break; 3457 case HAMMER2_BREF_TYPE_VOLUME: 3458 KKASSERT(parent->data != NULL); 3459 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3460 count = HAMMER2_SET_COUNT; 3461 break; 3462 case HAMMER2_BREF_TYPE_FREEMAP: 3463 KKASSERT(parent->data != NULL); 3464 base = &parent->data->blkset.blockref[0]; 3465 count = HAMMER2_SET_COUNT; 3466 break; 3467 default: 3468 panic("hammer2_chain_create: unrecognized blockref type: %d", 3469 parent->bref.type); 3470 base = NULL; 3471 count = 0; 3472 break; 3473 } 3474 3475 /* 3476 * Make sure we've counted the brefs 3477 */ 3478 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3479 hammer2_chain_countbrefs(parent, base, count); 3480 3481 KASSERT(parent->core.live_count >= 0 && 3482 parent->core.live_count <= count, 3483 ("bad live_count %d/%d (%02x, %d)", 3484 parent->core.live_count, count, 3485 parent->bref.type, parent->bytes)); 3486 3487 /* 3488 * If no free blockref could be found we must create an indirect 3489 * block and move a number of blockrefs into it. With the parent 3490 * locked we can safely lock each child in order to delete+duplicate 3491 * it without causing a deadlock. 3492 * 3493 * This may return the new indirect block or the old parent depending 3494 * on where the key falls. NULL is returned on error. 
3495 */ 3496 if (parent->core.live_count == count) { 3497 hammer2_chain_t *nparent; 3498 3499 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3500 3501 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3502 mtid, type, &error); 3503 if (nparent == NULL) { 3504 if (allocated) 3505 hammer2_chain_drop(chain); 3506 chain = NULL; 3507 goto done; 3508 } 3509 if (parent != nparent) { 3510 hammer2_chain_unlock(parent); 3511 hammer2_chain_drop(parent); 3512 parent = *parentp = nparent; 3513 } 3514 goto again; 3515 } 3516 3517 /* 3518 * fall through if parent, or skip to here if no parent. 3519 */ 3520 skip: 3521 if (chain->flags & HAMMER2_CHAIN_DELETED) 3522 kprintf("Inserting deleted chain @%016jx\n", 3523 chain->bref.key); 3524 3525 /* 3526 * Link the chain into its parent. 3527 */ 3528 if (chain->parent != NULL) 3529 panic("hammer2: hammer2_chain_create: chain already connected"); 3530 KKASSERT(chain->parent == NULL); 3531 if (parent) { 3532 KKASSERT(parent->core.live_count < count); 3533 hammer2_chain_insert(parent, chain, 3534 HAMMER2_CHAIN_INSERT_SPIN | 3535 HAMMER2_CHAIN_INSERT_LIVE, 3536 0); 3537 } 3538 3539 if (allocated) { 3540 /* 3541 * Mark the newly created chain modified. This will cause 3542 * UPDATE to be set and process the INITIAL flag. 3543 * 3544 * Device buffers are not instantiated for DATA elements 3545 * as these are handled by logical buffers. 3546 * 3547 * Indirect and freemap node indirect blocks are handled 3548 * by hammer2_chain_create_indirect() and not by this 3549 * function. 3550 * 3551 * Data for all other bref types is expected to be 3552 * instantiated (INODE, LEAF). 
3553 */ 3554 switch(chain->bref.type) { 3555 case HAMMER2_BREF_TYPE_DATA: 3556 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3557 case HAMMER2_BREF_TYPE_DIRENT: 3558 case HAMMER2_BREF_TYPE_INODE: 3559 error = hammer2_chain_modify(chain, mtid, dedup_off, 3560 HAMMER2_MODIFY_OPTDATA); 3561 break; 3562 default: 3563 /* 3564 * Remaining types are not supported by this function. 3565 * In particular, INDIRECT and LEAF_NODE types are 3566 * handled by create_indirect(). 3567 */ 3568 panic("hammer2_chain_create: bad type: %d", 3569 chain->bref.type); 3570 /* NOT REACHED */ 3571 break; 3572 } 3573 } else { 3574 /* 3575 * When reconnecting a chain we must set UPDATE and 3576 * setflush so the flush recognizes that it must update 3577 * the bref in the parent. 3578 */ 3579 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3580 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3581 } 3582 3583 /* 3584 * We must setflush(parent) to ensure that it recurses through to 3585 * chain. setflush(chain) might not work because ONFLUSH is possibly 3586 * already set in the chain (so it won't recurse up to set it in the 3587 * parent). 3588 */ 3589 if (parent) 3590 hammer2_chain_setflush(parent); 3591 3592 done: 3593 *chainp = chain; 3594 3595 return (error); 3596 } 3597 3598 /* 3599 * Move the chain from its old parent to a new parent. The chain must have 3600 * already been deleted or already disconnected (or never associated) with 3601 * a parent. The chain is reassociated with the new parent and the deleted 3602 * flag will be cleared (no longer deleted). The chain's modification state 3603 * is not altered. 3604 * 3605 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3606 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3607 * FULL. This typically means that the caller is creating the chain after 3608 * doing a hammer2_chain_lookup(). 3609 * 3610 * Neither (parent) or (chain) can be errored. 
3611 * 3612 * If (parent) is non-NULL then the chain is inserted under the parent. 3613 * 3614 * If (parent) is NULL then the newly duplicated chain is not inserted 3615 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3616 * passing into hammer2_chain_create() after this function returns). 3617 * 3618 * WARNING! This function calls create which means it can insert indirect 3619 * blocks. This can cause other unrelated chains in the parent to 3620 * be moved to a newly inserted indirect block in addition to the 3621 * specific chain. 3622 */ 3623 void 3624 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3625 hammer2_tid_t mtid, int flags) 3626 { 3627 hammer2_blockref_t *bref; 3628 hammer2_chain_t *parent; 3629 3630 /* 3631 * WARNING! We should never resolve DATA to device buffers 3632 * (XXX allow it if the caller did?), and since 3633 * we currently do not have the logical buffer cache 3634 * buffer in-hand to fix its cached physical offset 3635 * we also force the modify code to not COW it. XXX 3636 * 3637 * NOTE! We allow error'd chains to be renamed. The bref itself 3638 * is good and can be renamed. The content, however, may 3639 * be inaccessible. 3640 */ 3641 KKASSERT(chain->parent == NULL); 3642 /*KKASSERT(chain->error == 0); allow */ 3643 bref = &chain->bref; 3644 3645 /* 3646 * If parent is not NULL the duplicated chain will be entered under 3647 * the parent and the UPDATE bit set to tell flush to update 3648 * the blockref. 3649 * 3650 * We must setflush(parent) to ensure that it recurses through to 3651 * chain. setflush(chain) might not work because ONFLUSH is possibly 3652 * already set in the chain (so it won't recurse up to set it in the 3653 * parent). 3654 * 3655 * Having both chains locked is extremely important for atomicy. 
3656 */ 3657 if (parentp && (parent = *parentp) != NULL) { 3658 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3659 KKASSERT(parent->refs > 0); 3660 KKASSERT(parent->error == 0); 3661 3662 hammer2_chain_create(parentp, &chain, NULL, chain->pmp, 3663 HAMMER2_METH_DEFAULT, 3664 bref->key, bref->keybits, bref->type, 3665 chain->bytes, mtid, 0, flags); 3666 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3667 hammer2_chain_setflush(*parentp); 3668 } 3669 } 3670 3671 /* 3672 * This works in tandem with delete_obref() to install a blockref in 3673 * (typically) an indirect block that is associated with the chain being 3674 * moved to *parentp. 3675 * 3676 * The reason we need this function is that the caller needs to maintain 3677 * the blockref as it was, and not generate a new blockref for what might 3678 * be a modified chain. Otherwise stuff will leak into the flush that 3679 * the flush code's FLUSH_INODE_STOP flag is unable to catch. 3680 * 3681 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and 3682 * CHAIN_UPDATE. We must set BLKMAPUPD if the bref does not match, and 3683 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if 3684 * it does. Otherwise we can end up in a situation where H2 is unable to 3685 * clean up the in-memory chain topology. 3686 * 3687 * The reason for this is that flushes do not generally flush through 3688 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq 3689 * or sideq to properly flush and dispose of the related inode chain's flags. 3690 * Situations where the inode is not actually modified by the frontend, 3691 * but where we have to move the related chains around as we insert or cleanup 3692 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory 3693 * inode chain that does not have a hammer2_inode_t associated with it. 
 */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	hammer2_chain_rename(parentp, chain, mtid, flags);

	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		/* re-install the preserved (old) blockref in the parent */
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/* bref differs: flush must rewrite the blockmap */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			/* bref matches: undo UPDATE set by chain_rename */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		/* lock order: child spinlock before parent spinlock */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.
This
 * is accomplished by always recursing through matching keyspaces in
 * the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER2_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * of the volume.  Because freemap blocks are taken from the reserved
	 * areas of the volume our goal is efficiency (fewer levels) and not
	 * so much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/* never create an indirect block smaller than the parent's table */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);

	/*
	 * The new indirect block inherits the parent's check method but
	 * is never compressed.
	 */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		/* snapshot generation for the race check in chain_get */
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bsave = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits) != 0) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise it's in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	/*
	 * Scan all of chain's children and move each one up into parent,
	 * retrying on any detected race (same pattern as the move loop in
	 * hammer2_chain_create_indirect()).
	 */
	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a delete or rename, retry this key */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the lowest-ordered element: smallest keybits wins,
		 * ties broken by the smaller key.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int maxloops = 300000;

	key = *keyp;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest element: smallest keybits wins, ties
		 * broken by the smaller key.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_COUNT_MIN:
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_NOM:
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_MAX:
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE).
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen until key and bref->key share the same prefix */
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the more heavily populated half for the new indirect block */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen until key and bref->key share the same prefix */
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the more heavily populated half for the new indirect block */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, NULL);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
5079 * 5080 * NOTE: We do not setflush the chain unless the deletion is 5081 * permanent, since the deletion of a chain does not actually 5082 * require it to be flushed. 5083 */ 5084 if (error == 0) { 5085 if (flags & HAMMER2_DELETE_PERMANENT) { 5086 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5087 hammer2_chain_setflush(chain); 5088 } 5089 } 5090 5091 return error; 5092 } 5093 5094 static int 5095 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5096 hammer2_tid_t mtid, int flags, 5097 hammer2_blockref_t *obref) 5098 { 5099 int error = 0; 5100 5101 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5102 5103 /* 5104 * Nothing to do if already marked. 5105 * 5106 * We need the spinlock on the core whos RBTREE contains chain 5107 * to protect against races. 5108 */ 5109 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5110 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5111 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5112 chain->parent == parent); 5113 error = _hammer2_chain_delete_helper(parent, chain, 5114 mtid, flags, obref); 5115 } 5116 5117 /* 5118 * Permanent deletions mark the chain as destroyed. 5119 * 5120 * NOTE: We do not setflush the chain unless the deletion is 5121 * permanent, since the deletion of a chain does not actually 5122 * require it to be flushed. 5123 */ 5124 if (error == 0) { 5125 if (flags & HAMMER2_DELETE_PERMANENT) { 5126 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5127 hammer2_chain_setflush(chain); 5128 } 5129 } 5130 5131 return error; 5132 } 5133 5134 /* 5135 * Returns the index of the nearest element in the blockref array >= elm. 5136 * Returns (count) if no element could be found. 5137 * 5138 * Sets *key_nextp to the next key for loop purposes but does not modify 5139 * it if the next key would be higher than the current value of *key_nextp. 5140 * Note that *key_nexp can overflow to 0, which should be tested by the 5141 * caller. 5142 * 5143 * WARNING! 
Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index until we find an element
	 * whose key is <= key_beg (or hit the front of the array).
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Tighten *key_nextp to the end of the found
			 * element's range, unless that would move it
			 * backwards (scan_end of 0 means it overflowed).
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *brefp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain matches, the matching bref from the on-media
 * block array (if any) is still returned in *brefp.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through (default only breaks) */
	default:
		break;
	}

	/* Optionally return a copy of the deleted blockref to the caller */
	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
					HAMMER2_CHAIN_BLKMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * Update stats and zero the entry
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		/* saturate leaf_count at HAMMER2_BLOCKREF_LEAF_MAX */
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.  j walks backwards,
	 * k walks forwards; whichever hits an EMPTY slot first determines
	 * the direction the intervening elements are shifted.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array remains sorted with no overlapping
	 * key ranges after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * qsort comparator: EMPTY elements sort to the end, everything else
 * sorts ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/*
	 * Locate the embedded blockref array for this chain type.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * SHA-256 folded down to 192 bits: the 4th 64-bit
			 * word is xor'd into the 3rd, then the first 192
			 * bits are stored.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Characterize a failed check code and try to trace back to the inode.
 */
static void
hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
				  int bits)
{
	hammer2_chain_t *lchain;
	hammer2_chain_t *ochain;
	int did;

	/*
	 * Rate-limited to avoid console flooding on degenerate media.
	 * krateprintf returning 0 means the message was suppressed, in
	 * which case we skip the remainder of the report too.
	 */
	did = krateprintf(&krate_h2chk,
			  "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
			  "(flags=%08x, bref/data ",
			  chain->bref.data_off,
			  chain->bref.type,
			  hammer2_bref_type_str(chain->bref.type),
			  chain->bref.methods,
			  chain->flags);
	if (did == 0)
		return;

	if (bits == 32) {
		kprintf("%08x/%08x)\n",
			chain->bref.check.iscsi32.value,
			(uint32_t)check);
	} else {
		kprintf("%016jx/%016jx)\n",
			chain->bref.check.xxhash64.value,
			check);
	}

	/*
	 * Run up the chains to try to find the governing inode so we
	 * can report it.
	 *
	 * XXX This error reporting is not really MPSAFE
	 */
	ochain = chain;
	lchain = chain;
	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
		lchain = chain;
		chain = chain->parent;
	}

	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
		kprintf("   Resides at/in inode %ld\n",
			(long)chain->bref.key);
	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		kprintf("   Resides in inode index - CRITICAL!!!\n");
	} else {
		kprintf("   Resides in root index - CRITICAL!!!\n");
	}
	if (ochain->hmp) {
		const char *pfsname = "UNKNOWN";
		int i;

		if (ochain->pmp) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
				    ochain->pmp->pfs_names[i]) {
					pfsname = ochain->pmp->pfs_names[i];
					break;
				}
			}
		}
		kprintf("   In pfs %s on device %s\n",
			pfsname, ochain->hmp->devrepname);
	}
}

/*
 * Returns non-zero on success, 0 on failure.
5816 */ 5817 int 5818 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5819 { 5820 uint32_t check32; 5821 uint64_t check64; 5822 int r; 5823 5824 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5825 return 1; 5826 5827 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5828 case HAMMER2_CHECK_NONE: 5829 r = 1; 5830 break; 5831 case HAMMER2_CHECK_DISABLED: 5832 r = 1; 5833 break; 5834 case HAMMER2_CHECK_ISCSI32: 5835 check32 = hammer2_icrc32(bdata, chain->bytes); 5836 r = (chain->bref.check.iscsi32.value == check32); 5837 if (r == 0) { 5838 hammer2_characterize_failed_chain(chain, check32, 32); 5839 } 5840 hammer2_process_icrc32 += chain->bytes; 5841 break; 5842 case HAMMER2_CHECK_XXHASH64: 5843 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5844 r = (chain->bref.check.xxhash64.value == check64); 5845 if (r == 0) { 5846 hammer2_characterize_failed_chain(chain, check64, 64); 5847 } 5848 hammer2_process_xxhash64 += chain->bytes; 5849 break; 5850 case HAMMER2_CHECK_SHA192: 5851 { 5852 SHA256_CTX hash_ctx; 5853 union { 5854 uint8_t digest[SHA256_DIGEST_LENGTH]; 5855 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5856 } u; 5857 5858 SHA256_Init(&hash_ctx); 5859 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5860 SHA256_Final(u.digest, &hash_ctx); 5861 u.digest64[2] ^= u.digest64[3]; 5862 if (bcmp(u.digest, 5863 chain->bref.check.sha192.data, 5864 sizeof(chain->bref.check.sha192.data)) == 0) { 5865 r = 1; 5866 } else { 5867 r = 0; 5868 krateprintf(&krate_h2chk, 5869 "chain %016jx.%02x meth=%02x " 5870 "CHECK FAIL\n", 5871 chain->bref.data_off, 5872 chain->bref.type, 5873 chain->bref.methods); 5874 } 5875 } 5876 break; 5877 case HAMMER2_CHECK_FREEMAP: 5878 r = (chain->bref.check.freemap.icrc32 == 5879 hammer2_icrc32(bdata, chain->bytes)); 5880 if (r == 0) { 5881 int did; 5882 5883 did = krateprintf(&krate_h2chk, 5884 "chain %016jx.%02x meth=%02x " 5885 "CHECK FAIL\n", 5886 chain->bref.data_off, 5887 chain->bref.type, 5888 chain->bref.methods); 5889 if (did) { 
5890 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5891 chain->bref.check.freemap.icrc32, 5892 hammer2_icrc32(bdata, chain->bytes), 5893 chain->bytes); 5894 if (chain->dio) { 5895 kprintf("dio %p buf %016jx,%d " 5896 "bdata %p/%p\n", 5897 chain->dio, 5898 (intmax_t)chain->dio->bp->b_loffset, 5899 chain->dio->bp->b_bufsize, 5900 bdata, 5901 chain->dio->bp->b_data); 5902 } 5903 } 5904 } 5905 break; 5906 default: 5907 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 5908 chain->bref.methods); 5909 r = 1; 5910 break; 5911 } 5912 return r; 5913 } 5914 5915 /* 5916 * Acquire the chain and parent representing the specified inode for the 5917 * device at the specified cluster index. 5918 * 5919 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5920 * 5921 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK 5922 * is returned. In case of error, *chainp and/or *parentp may still be returned 5923 * non-NULL. 5924 * 5925 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 5926 * They will be unlocked and released by this function. The *parentp and 5927 * *chainp representing the located inode are returned locked. 5928 * 5929 * The returned error includes any error on the returned chain in addition to 5930 * errors incurred while trying to lookup the inode. However, a chain->error 5931 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may 5932 * not be passed to this function. 5933 */ 5934 int 5935 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5936 int clindex, int flags, 5937 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5938 { 5939 hammer2_chain_t *parent; 5940 hammer2_chain_t *rchain; 5941 hammer2_key_t key_dummy; 5942 hammer2_inode_t *ip; 5943 int resolve_flags; 5944 int error; 5945 5946 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0); 5947 5948 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 
5949 HAMMER2_RESOLVE_SHARED : 0; 5950 5951 /* 5952 * Caller expects us to replace these. 5953 */ 5954 if (*chainp) { 5955 hammer2_chain_unlock(*chainp); 5956 hammer2_chain_drop(*chainp); 5957 *chainp = NULL; 5958 } 5959 if (*parentp) { 5960 hammer2_chain_unlock(*parentp); 5961 hammer2_chain_drop(*parentp); 5962 *parentp = NULL; 5963 } 5964 5965 /* 5966 * Be very careful, this is a backend function and we CANNOT 5967 * lock any frontend inode structure we find. But we have to 5968 * look the inode up this way first in case it exists but is 5969 * detached from the radix tree. 5970 */ 5971 ip = hammer2_inode_lookup(pmp, inum); 5972 if (ip) { 5973 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 5974 parentp, 5975 resolve_flags); 5976 hammer2_inode_drop(ip); 5977 if (*chainp) 5978 return (*chainp)->error; 5979 hammer2_chain_unlock(*chainp); 5980 hammer2_chain_drop(*chainp); 5981 *chainp = NULL; 5982 if (*parentp) { 5983 hammer2_chain_unlock(*parentp); 5984 hammer2_chain_drop(*parentp); 5985 *parentp = NULL; 5986 } 5987 } 5988 5989 /* 5990 * Inodes hang off of the iroot (bit 63 is clear, differentiating 5991 * inodes from root directory entries in the key lookup). 5992 */ 5993 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 5994 rchain = NULL; 5995 if (parent) { 5996 /* 5997 * NOTE: rchain can be returned as NULL even if error == 0 5998 * (i.e. not found) 5999 */ 6000 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6001 inum, inum, 6002 &error, flags); 6003 /* 6004 * Propagate a chain-specific error to caller. 6005 * 6006 * If the chain is not errored, we must still validate that the inode 6007 * number is correct, because all hell will break loose if it isn't 6008 * correct. It should always be correct so print to the console and 6009 * simulate a CHECK error if it is not. 
6010 */ 6011 if (error == 0 && rchain) { 6012 error = rchain->error; 6013 if (error == 0 && rchain->data) { 6014 if (inum != rchain->data->ipdata.meta.inum) { 6015 kprintf("hammer2_chain_inode_find: lookup inum %ld, " 6016 "got valid inode but with inum %ld\n", 6017 (long)inum, (long)rchain->data->ipdata.meta.inum); 6018 error = HAMMER2_ERROR_CHECK; 6019 rchain->error = error; 6020 } 6021 } 6022 } 6023 } else { 6024 error = HAMMER2_ERROR_EIO; 6025 } 6026 *parentp = parent; 6027 *chainp = rchain; 6028 6029 return error; 6030 } 6031 6032 /* 6033 * Used by the bulkscan code to snapshot the synchronized storage for 6034 * a volume, allowing it to be scanned concurrently against normal 6035 * operation. 6036 */ 6037 hammer2_chain_t * 6038 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6039 { 6040 hammer2_chain_t *copy; 6041 6042 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6043 copy->data = kmalloc(sizeof(copy->data->voldata), 6044 hmp->mmsg, M_WAITOK | M_ZERO); 6045 hammer2_voldata_lock(hmp); 6046 copy->data->voldata = hmp->volsync; 6047 hammer2_voldata_unlock(hmp); 6048 6049 return copy; 6050 } 6051 6052 void 6053 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6054 { 6055 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6056 KKASSERT(copy->data); 6057 kfree(copy->data, copy->hmp->mmsg); 6058 copy->data = NULL; 6059 hammer2_chain_drop(copy); 6060 } 6061 6062 /* 6063 * Returns non-zero if the chain (INODE or DIRENT) matches the 6064 * filename. 
6065 */ 6066 int 6067 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6068 size_t name_len) 6069 { 6070 const hammer2_inode_data_t *ripdata; 6071 6072 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6073 ripdata = &chain->data->ipdata; 6074 if (ripdata->meta.name_len == name_len && 6075 bcmp(ripdata->filename, name, name_len) == 0) { 6076 return 1; 6077 } 6078 } 6079 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6080 chain->bref.embed.dirent.namlen == name_len) { 6081 if (name_len > sizeof(chain->bref.check.buf) && 6082 bcmp(chain->data->buf, name, name_len) == 0) { 6083 return 1; 6084 } 6085 if (name_len <= sizeof(chain->bref.check.buf) && 6086 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6087 return 1; 6088 } 6089 } 6090 return 0; 6091 } 6092 6093 /* 6094 * Debugging 6095 */ 6096 void 6097 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp, 6098 char pfx, u_int flags) 6099 { 6100 hammer2_chain_t *scan; 6101 hammer2_chain_t *parent; 6102 6103 --*countp; 6104 if (*countp == 0) { 6105 kprintf("%*.*s...\n", tab, tab, ""); 6106 return; 6107 } 6108 if (*countp < 0) 6109 return; 6110 kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n", 6111 tab, tab, "", pfx, chain, 6112 hammer2_bref_type_str(chain->bref.type), bi, 6113 chain->bref.data_off, chain->bref.key, chain->bref.keybits, 6114 chain->bref.mirror_tid); 6115 6116 kprintf("%*.*s [%08x] (%s) refs=%d", 6117 tab, tab, "", 6118 chain->flags, 6119 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 6120 chain->data) ? 
(char *)chain->data->ipdata.filename : "?"), 6121 chain->refs); 6122 6123 parent = chain->parent; 6124 if (parent) 6125 kprintf("\n%*.*s p=%p [pflags %08x prefs %d]", 6126 tab, tab, "", 6127 parent, parent->flags, parent->refs); 6128 if (RB_EMPTY(&chain->core.rbtree)) { 6129 kprintf("\n"); 6130 } else { 6131 int bi = 0; 6132 kprintf(" {\n"); 6133 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 6134 if ((scan->flags & flags) || flags == (u_int)-1) { 6135 hammer2_dump_chain(scan, tab + 4, bi, countp, 6136 'a', flags); 6137 } 6138 bi++; 6139 } 6140 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 6141 kprintf("%*.*s}(%s)\n", tab, tab, "", 6142 chain->data->ipdata.filename); 6143 else 6144 kprintf("%*.*s}\n", tab, tab, ""); 6145 } 6146 } 6147